diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 6c97923c..82d34e46 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -56,27 +56,11 @@ EXPAND_COUNTER(dentry_revalidate_root) \ EXPAND_COUNTER(dentry_revalidate_valid) \ EXPAND_COUNTER(dir_backref_excessive_retries) \ - EXPAND_COUNTER(forest_add_root) \ EXPAND_COUNTER(forest_bloom_fail) \ EXPAND_COUNTER(forest_bloom_pass) \ - EXPAND_COUNTER(forest_clear_lock) \ - EXPAND_COUNTER(forest_delete) \ - EXPAND_COUNTER(forest_insert) \ - EXPAND_COUNTER(forest_iter) \ - EXPAND_COUNTER(forest_lookup) \ - EXPAND_COUNTER(forest_read_lock_log) \ - EXPAND_COUNTER(forest_read_lock_rotated) \ - EXPAND_COUNTER(forest_refresh_bloom_roots) \ - EXPAND_COUNTER(forest_refresh_dirty_log) \ - EXPAND_COUNTER(forest_refresh_skip_log) \ EXPAND_COUNTER(forest_read_items) \ EXPAND_COUNTER(forest_roots_next_hint) \ - EXPAND_COUNTER(forest_roots_lock) \ - EXPAND_COUNTER(forest_roots_server) \ - EXPAND_COUNTER(forest_saw_stale) \ EXPAND_COUNTER(forest_set_bloom_bits) \ - EXPAND_COUNTER(forest_set_dirtied) \ - EXPAND_COUNTER(forest_trigger_refresh) \ EXPAND_COUNTER(item_clear_dirty) \ EXPAND_COUNTER(item_create) \ EXPAND_COUNTER(item_delete) \ diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 284f1f1c..2d53a1d9 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -12,7 +12,7 @@ */ #include #include -#include +#include #include #include "super.h" @@ -41,34 +41,21 @@ * for the item. Readers check log btrees for the most recent version * that it should use. * - * From a mount's perspective, the only btree whose blocks are actively - * changing is the mount's own log btree in memory. Every other btree - * it reads is stable (but could be stale) on disk. They don't need to - * be locked, but we might have to retry reads if we hit blocks that - * have been overwritten. + * The item cache reads items in bulk from stable btrees, and writes a + * transaction's worth of dirty items into the item log btree. 
* * Log btrees are typically very sparse. It would be wasteful for * readers to read every log btree looking for an item. Each log btree * contains a bloom filter keyed on the starting key of locks. This * lets lock holders quickly eliminate log trees that cannot contain - * keys protected by their lock. Since reads have to be done under - * locks, we cache the list of trees that could contain items in the - * lock. - * - * The list of roots in the locks can get out of date. Item - * modification in the current transactoin requires that the list - * contain the dirty log tree. Transaction commits mean that we can - * read from the stale log tree instead of the dirty one. And getting - * stale block reads from any of the trees means we need to rebuild the - * list from scratch. + * keys protected by their lock. */ struct forest_info { - struct rw_semaphore rwsem; + struct mutex mutex; struct scoutfs_radix_allocator *alloc; struct scoutfs_block_writer *wri; struct scoutfs_log_trees our_log; - atomic64_t commit_seq; struct mutex srch_mutex; struct scoutfs_srch_file srch_file; @@ -78,198 +65,20 @@ struct forest_info { #define DECLARE_FOREST_INFO(sb, name) \ struct forest_info *name = SCOUTFS_SB(sb)->forest_info -struct forest_root { - struct list_head entry; - struct scoutfs_btree_root item_root; - u64 rid; - u64 nr; - u8 our_dirty:1; -}; - struct forest_refs { struct scoutfs_btree_ref fs_ref; struct scoutfs_btree_ref logs_ref; } __packed; +/* initialize some refs that initially aren't equal */ +#define DECLARE_STALE_TRACKING_SUPER_REFS(a, b) \ + struct forest_refs a = {{cpu_to_le64(0),}}; \ + struct forest_refs b = {{cpu_to_le64(1),}} + struct forest_bloom_nrs { unsigned int nrs[SCOUTFS_FOREST_BLOOM_NRS]; }; -struct forest_lock_private { - u64 last_refreshed; - struct rw_semaphore rwsem; - unsigned int used_lock_roots:1; - struct list_head roots; - u64 set_bloom_nr; - atomic64_t dirtied_cseq; - u64 refreshed_cseq; - u64 refreshed_dirtied; -}; - -static struct 
forest_lock_private *get_lock_private(struct scoutfs_lock *lock) -{ - struct forest_lock_private *lpriv = ACCESS_ONCE(lock->forest_private); - - if (lpriv == NULL) { - lpriv = kzalloc(sizeof(struct forest_lock_private), GFP_NOFS); - if (lpriv) { - init_rwsem(&lpriv->rwsem); - INIT_LIST_HEAD(&lpriv->roots); - atomic64_set(&lpriv->dirtied_cseq, 0); - - if (cmpxchg(&lock->forest_private, NULL, lpriv) != NULL) - kfree(lpriv); - lpriv = lock->forest_private; - } - } - - return lpriv; -} - -static bool is_fs_root(struct forest_root *fr) -{ - return fr->rid == 0 && fr->nr == 0; -} - -/* - * We can be sure that we have the most recent version of an item if we - * have it write locked with the version of the lock. There can be no - * greater versions of the item in the system. - */ -static bool is_write_locked_version(struct scoutfs_lock *lock, u64 vers) -{ - return lock->mode == SCOUTFS_LOCK_WRITE && - vers == lock->write_version; -} - -static void free_roots(struct forest_lock_private *lpriv) -{ - struct forest_root *fr; - struct forest_root *tmp; - - list_for_each_entry_safe(fr, tmp, &lpriv->roots, entry) { - list_del_init(&fr->entry); - kfree(fr); - } -} - -/* - * Add a *copy* of the root to the list of roots to read. If our_dirty - * is set then later readers will acquire the lock to serialize writers - * and update the root from the current dirty version. 
- */ -static int add_root(struct super_block *sb, struct scoutfs_lock *lock, - struct forest_lock_private *lpriv, - struct scoutfs_btree_root *item_root, u64 rid, u64 nr, - bool our_dirty) -{ - struct forest_root *fr; - - BUG_ON(!rwsem_is_locked(&lpriv->rwsem)); - - fr = kmalloc(sizeof(struct forest_root), GFP_NOFS); - if (!fr) - return -ENOMEM; - - fr->item_root = *item_root; - fr->rid = rid; - fr->nr = nr; - fr->our_dirty = !!our_dirty; - list_add_tail(&fr->entry, &lpriv->roots); - - scoutfs_inc_counter(sb, forest_add_root); - trace_scoutfs_forest_add_root(sb, &lock->start, fr->rid, fr->nr, - le64_to_cpu(fr->item_root.ref.blkno), - le64_to_cpu(fr->item_root.ref.seq)); - - return 0; -} - -/* - * The caller has dirtied the current log tree and still holds the - * transaction. We need to make sure that future reads know to check - * this dirty tree in particular. The tree can be committed (and - * rotated out!) before the next refresh so we use a commit sequence - * which will identify that it can find this tree either still dirty or - * can trust that it will find an item for it. - */ -static void set_dirtied_cseq(struct super_block *sb, struct forest_info *finf, - struct scoutfs_lock *lock, - struct forest_lock_private *lpriv) -{ - u64 cseq = atomic64_read(&finf->commit_seq); - - BUG_ON(!rwsem_is_locked(&finf->rwsem)); - - if (atomic64_read(&lpriv->dirtied_cseq) != cseq) { - atomic64_set(&lpriv->dirtied_cseq, cseq); - scoutfs_inc_counter(sb, forest_set_dirtied); - - trace_scoutfs_forest_set_dirtied(sb, &lock->start, - le64_to_cpu(finf->our_log.rid), - le64_to_cpu(finf->our_log.nr), - cseq); - } -} - -/* - * This is called by the locking code while it's excluding users of the - * lock. 
- */ -void scoutfs_forest_clear_lock(struct super_block *sb, - struct scoutfs_lock *lock) -{ - struct forest_lock_private *lpriv = ACCESS_ONCE(lock->forest_private); - - if (lpriv) { - scoutfs_inc_counter(sb, forest_clear_lock); - free_roots(lpriv); - kfree(lpriv); - lock->forest_private = NULL; - } -} - -/* - * Usually we're reading from persistent btrees that won't be changing. - * But refresh can add a root that references the current dirty log root - * so that readers can see items which haven't yet been committed. Once - * we get the lock we make sure to give the forest root the current - * version of the tree which could have changed since it was added. - * Acquiring the lock also serializes commit responses updating the log - * and we can see if a commit has rotated in a new tree and we need to - * refresh the list. - */ -static int read_lock_forest_root(struct super_block *sb, - struct forest_info *finf, - struct forest_lock_private *lpriv, - struct forest_root *fr) -{ - int ret = 0; - - BUG_ON(!rwsem_is_locked(&lpriv->rwsem)); - - if (fr->our_dirty) { - down_read(&finf->rwsem); - if (fr->nr == le64_to_cpu(finf->our_log.nr)) { - scoutfs_inc_counter(sb, forest_read_lock_log); - fr->item_root = finf->our_log.item_root; - } else { - scoutfs_inc_counter(sb, forest_read_lock_rotated); - up_read(&finf->rwsem); - ret = -EUCLEAN; - } - } - - return ret; -} - -static void read_unlock_forest_root(struct forest_info *finf, - struct forest_root *fr) -{ - if (fr->our_dirty) - up_read(&finf->rwsem); -} - static void calc_bloom_nrs(struct forest_bloom_nrs *bloom, struct scoutfs_key *key) { @@ -306,792 +115,6 @@ static struct scoutfs_block *read_bloom_ref(struct super_block *sb, return bl; } -/* - * Empty the list of btrees currently stored in the lock and walk the - * current fs image looking for btrees whose bloom filters indicate that - * the btree may contain items covered by the lock. 
- * - * We ensure that the our log btree is always first and that the fs - * btree is always last because those positions offer short-circuiting - * optimizations. - * - * This doesn't deal with rereading stale blocks itself.. it returns - * ESTALE to the caller who already has to deal with retrying stale - * blocks from their btree reads. We give them the refs we read so that - * they can identify persistent stale block errors that come from - * corruption. - * - * Because we're starting all the reads from stable refs from the - * server, this will not see any dirty blocks we have in memory. We - * don't have to lock any of the btree reads. It also won't find the - * currently dirty version of our log btree. Writers record the version - * of the current dirty log tree that must be added if it's still dirty - * when we refresh. - */ -static int refresh_bloom_roots(struct super_block *sb, - struct scoutfs_lock *lock, - struct forest_refs *refs) -{ - DECLARE_FOREST_INFO(sb, finf); - struct forest_lock_private *lpriv = ACCESS_ONCE(lock->forest_private); - struct scoutfs_net_roots roots; - struct scoutfs_log_trees_val ltv; - struct scoutfs_log_trees *lt; - SCOUTFS_BTREE_ITEM_REF(iref); - struct forest_bloom_nrs bloom; - struct scoutfs_bloom_block *bb; - struct scoutfs_block *bl; - struct scoutfs_key key; - u64 our_rid = 0; - u64 our_nr = 0; - u64 dirtied; - u64 cseq; - int ret; - int i; - - scoutfs_inc_counter(sb, forest_refresh_bloom_roots); - - memset(refs, 0, sizeof(*refs)); - - down_write(&lpriv->rwsem); - - /* empty the list so no one iterates until someone's added */ - free_roots(lpriv); - - /* make sure readers see writer's in-memory dirty items */ - cseq = atomic64_read(&finf->commit_seq); - dirtied = atomic64_read(&lpriv->dirtied_cseq); - - if (dirtied == cseq) { - down_read(&finf->rwsem); - cseq = atomic64_read(&finf->commit_seq); - dirtied = atomic64_read(&lpriv->dirtied_cseq); - if (dirtied == cseq) { - scoutfs_inc_counter(sb, forest_refresh_dirty_log); - 
lt = &finf->our_log; - our_rid = le64_to_cpu(lt->rid); - our_nr = le64_to_cpu(lt->nr); - /* root be updated before reads, but nice to trace */ - ret = add_root(sb, lock, lpriv, <->item_root, - our_rid, our_nr, true); - } else { - ret = 0; - /* must get roots from network to see committed */ - lpriv->used_lock_roots = 1; - } - up_read(&finf->rwsem); - if (ret < 0) - goto out; - } - - trace_scoutfs_forest_refresh_seqs(sb, &lock->start, our_rid, our_nr, - dirtied, lpriv->refreshed_dirtied, - cseq, lpriv->refreshed_cseq); - - /* first use the lock's constant roots, then sample newer roots */ - if (!lpriv->used_lock_roots) { - lpriv->used_lock_roots = 1; - roots = lock->roots; - scoutfs_inc_counter(sb, forest_roots_lock); - } else { - ret = scoutfs_client_get_roots(sb, &roots); - if (ret) - goto out; - scoutfs_inc_counter(sb, forest_roots_server); - } - - trace_scoutfs_forest_using_roots(sb, &roots.fs_root, &roots.logs_root); - refs->fs_ref = roots.fs_root.ref; - refs->logs_ref = roots.logs_root.ref; - - calc_bloom_nrs(&bloom, &lock->start); - - scoutfs_key_init_log_trees(&key, 0, 0); - for (;; scoutfs_key_inc(&key)) { - - ret = scoutfs_btree_next(sb, &roots.logs_root, &key, &iref); - if (ret == 0) { - if (iref.val_len == sizeof(ltv)) { - key = *iref.key; - memcpy(<v, iref.val, iref.val_len); - } else { - ret = -EIO; - } - scoutfs_btree_put_iref(&iref); - } - if (ret < 0) { - if (ret == -ENOENT) { - ret = 0; - break; - } - goto out; - } - - if (ltv.bloom_ref.blkno == 0) - continue; - - bl = read_bloom_ref(sb, <v.bloom_ref); - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - goto out; - } - bb = bl->data; - - for (i = 0; i < ARRAY_SIZE(bloom.nrs); i++) { - if (!test_bit_le(bloom.nrs[i], bb->bits)) - break; - } - - scoutfs_block_put(sb, bl); - - trace_scoutfs_forest_bloom_search(sb, &lock->start, - le64_to_cpu(key.sklt_rid), - le64_to_cpu(key.sklt_nr), - le64_to_cpu(ltv.bloom_ref.blkno), - le64_to_cpu(ltv.bloom_ref.seq), - i); - - /* one of the bloom bits wasn't set */ - if (i != 
ARRAY_SIZE(bloom.nrs)) { - scoutfs_inc_counter(sb, forest_bloom_fail); - continue; - } - - scoutfs_inc_counter(sb, forest_bloom_pass); - - /* we've added our dirty log, skip old committed versions */ - if (le64_to_cpu(key.sklt_rid) == our_rid && - le64_to_cpu(key.sklt_nr) == our_nr) { - scoutfs_inc_counter(sb, forest_refresh_skip_log); - continue; - } - - ret = add_root(sb, lock, lpriv, <v.item_root, - le64_to_cpu(key.sklt_rid), - le64_to_cpu(key.sklt_nr), false); - if (ret < 0) - goto out; - } - - /* always add final fs tree last */ - ret = add_root(sb, lock, lpriv, &roots.fs_root, 0, 0, false); - if (ret < 0) - goto out; - - lpriv->refreshed_cseq = cseq; - lpriv->refreshed_dirtied = dirtied; - lpriv->last_refreshed = lock->refresh_gen; - - ret = 0; - -out: - if (ret < 0) - free_roots(lpriv); - - up_write(&lpriv->rwsem); - return ret; -} - -/* initialize some refs that initially aren't equal */ -#define DECLARE_STALE_TRACKING_SUPER_REFS(a, b) \ - struct forest_refs a = {{cpu_to_le64(0),}}; \ - struct forest_refs b = {{cpu_to_le64(1),}} - -/* - * If the caller got our magic errnos we refresh the roots and return - * -EAGAIN so they retry. If we get -ESTALE from block reference - * inconsistency with the same root refs then it's consistent corruption - * and we return an error. We pass through all other errnos that aren't - * our magic retry errnos. 
- */ -static int refresh_check(struct super_block *sb, struct scoutfs_lock *lock, - struct forest_refs *prev_refs, - struct forest_refs *refs, int err) -{ - int ret; - - /* don't want to get in a loop passing eagain through, not expected */ - if (WARN_ON_ONCE(err == -EAGAIN)) - return -EINVAL; - - if (!(err == -ESTALE || err == -EUCLEAN)) - return err; - - if (err == -ESTALE) { - scoutfs_inc_counter(sb, forest_saw_stale); - if (memcmp(prev_refs, refs, sizeof(*refs)) == 0) - return -EIO; - } - *prev_refs = *refs; - - ret = refresh_bloom_roots(sb, lock, refs); - if (ret == 0 || ret == -ESTALE) - ret = -EAGAIN; - - return ret; -} - -/* - * Iterate over all the roots that could contain items covered by the - * caller's lock. The caller starts iteration by passing in a NULL fr. - * We return -EUCLEAN if the caller needs to refresh the bloom roots. - * We use the lock's refresh gen to find out when the lock was - * invalidated and the contents of the trees could have changed. - * - * The commit_seqs are keeping the list of roots in sync with our log - * root. As writers modify it we make sure we have a root that will - * lock and check our in-memory dirty log tre. Once that's committed we - * refresh again so we read the stable committed version without locks. 
- */ -static int for_each_forest_root(struct super_block *sb, - struct scoutfs_lock *lock, - struct forest_info *finf, - struct forest_lock_private *lpriv, - struct forest_root **fr) -{ - u64 cseq = atomic64_read(&finf->commit_seq); - u64 dirtied = atomic64_read(&lpriv->dirtied_cseq); - - if (WARN_ON_ONCE(!rwsem_is_locked(&lpriv->rwsem))) - return -EIO; - - if (list_empty(&lpriv->roots) || - lock->refresh_gen != lpriv->last_refreshed || - dirtied > lpriv->refreshed_dirtied || - (dirtied == lpriv->refreshed_cseq && - cseq > lpriv->refreshed_cseq)) { - scoutfs_inc_counter(sb, forest_trigger_refresh); - trace_scoutfs_forest_trigger_refresh(sb, - &lock->start, - !!list_empty(&lpriv->roots), - lock->refresh_gen, - lpriv->last_refreshed, - dirtied, lpriv->refreshed_dirtied, - cseq, lpriv->refreshed_cseq); - return -EUCLEAN; - } - - if (*fr == NULL) - *fr = list_prepare_entry((*fr), &lpriv->roots, entry); - - list_for_each_entry_continue((*fr), &lpriv->roots, entry) - return 0; - - *fr = NULL; - return 0; -} - -/* - * We fake 1 as the version for the fs items. The least valid log item - * version is also 1, but we guarantee that we check the log trees first - * so they'll always be found before the fs items. 
- */ -static u64 item_vers(struct forest_root *fr, void *val) -{ - struct scoutfs_log_item_value *liv; - - if (is_fs_root(fr)) - return 1; - - liv = val; - return le64_to_cpu(liv->vers); -} - -static bool item_flags(struct forest_root *fr, void *val) -{ - struct scoutfs_log_item_value *liv; - - if (is_fs_root(fr)) - return 0; - - liv = val; - return liv->flags; -} - -static bool item_is_deletion(struct forest_root *fr, void *val) -{ - return item_flags(fr, val) & SCOUTFS_LOG_ITEM_FLAG_DELETION; -} - -/* just a little helper to slim down all the call sites */ -static int lock_safe(struct scoutfs_lock *lock, struct scoutfs_key *key, - int mode) -{ - if (WARN_ON_ONCE(!scoutfs_lock_protected(lock, key, mode))) - return -EINVAL; - else - return 0; -} - -/* - * Copy the cached item's value into the caller's single value vector. - * The number of bytes that fit in the vec and were copied is returned. - * A null val returns 0. Items in log trees have a value header that - * needs to be skipped. 
- */ -static int copy_val(struct forest_root *fr, struct kvec *val, void *item_val, - int item_val_len) -{ - void *val_start = item_val; - unsigned int val_len = item_val_len; - int ret; - - if (!is_fs_root(fr)) { - val_start += sizeof(struct scoutfs_log_item_value); - val_len -= sizeof(struct scoutfs_log_item_value); - } - - if (val) { - ret = min_t(size_t, val_len, val->iov_len); - memcpy(val->iov_base, val_start, ret); - } else { - ret = 0; - } - - return ret; -} - -int scoutfs_forest_lookup(struct super_block *sb, struct scoutfs_key *key, - struct kvec *val, struct scoutfs_lock *lock) -{ - DECLARE_FOREST_INFO(sb, finf); - DECLARE_STALE_TRACKING_SUPER_REFS(prev_refs, refs); - struct forest_lock_private *lpriv; - SCOUTFS_BTREE_ITEM_REF(iref); - struct forest_root *fr; - u64 found_vers; - u64 vers; - int ret; - int err; - - scoutfs_inc_counter(sb, forest_lookup); - - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_READ)) < 0) - goto out; - - lpriv = get_lock_private(lock); - if (!lpriv) { - ret = -ENOMEM; - goto out; - } - -retry: - down_read(&lpriv->rwsem); - - found_vers = 0; - ret = -ENOENT; - fr = NULL; - - while (!(err = for_each_forest_root(sb, lock, finf, lpriv, &fr)) && fr){ - - /* done if we found log items before fs root */ - if (found_vers > 0 && is_fs_root(fr)) - break; - - err = read_lock_forest_root(sb, finf, lpriv, fr); - if (err < 0) - break; - err = scoutfs_btree_lookup(sb, &fr->item_root, key, &iref); - if (err < 0) - read_unlock_forest_root(finf, fr); - if (err == -ENOENT) - continue; - if (err < 0) - break; - - vers = item_vers(fr, iref.val); - - if (vers > found_vers) { - found_vers = vers; - - if (item_is_deletion(fr, iref.val)) - ret = -ENOENT; - else - ret = copy_val(fr, val, iref.val, iref.val_len); - } - scoutfs_btree_put_iref(&iref); - read_unlock_forest_root(finf, fr); - - /* done if we have the most recent locked dirty version */ - if (is_write_locked_version(lock, vers)) - break; - } - - up_read(&lpriv->rwsem); - - err = 
refresh_check(sb, lock, &prev_refs, &refs, err); - if (err == -EAGAIN) - goto retry; - if (err < 0) - ret = err; -out: - return ret; -} - -int scoutfs_forest_lookup_exact(struct super_block *sb, - struct scoutfs_key *key, struct kvec *val, - struct scoutfs_lock *lock) -{ - int ret; - - ret = scoutfs_forest_lookup(sb, key, val, lock); - if (ret == val->iov_len) - ret = 0; - else if (ret >= 0) - ret = -EIO; - - return ret; -} - -static inline void forest_iter_set_max(struct scoutfs_key *key, bool forward) -{ - if (forward) - scoutfs_key_set_ones(key); - else - scoutfs_key_set_zeros(key); -} - -static inline void forest_iter_set_min(struct scoutfs_key *key, bool forward) -{ - return forest_iter_set_max(key, !forward); -} - -static inline void forest_iter_key_advance(struct scoutfs_key *key, bool forward) -{ - if (forward) - scoutfs_key_inc(key); - else - scoutfs_key_dec(key); -} - -static inline int forest_iter_key_cmp(struct scoutfs_key *a, - struct scoutfs_key *b, bool forward) -{ - int cmp = scoutfs_key_compare(a, b); - if (cmp == 0 || forward) - return cmp; - return -cmp; -} - -/* returns true if a is before b in the direction of iteration */ -static inline bool forest_iter_key_before(struct scoutfs_key *a, - struct scoutfs_key *b, bool forward) -{ - int cmp = scoutfs_key_compare(a, b); - - return forward ? cmp < 0 : cmp > 0; -} - -/* returns true if a is before or equal to b in the direction of iteration */ -static inline bool forest_iter_key_within(struct scoutfs_key *a, - struct scoutfs_key *b, bool forward) -{ - int cmp = scoutfs_key_compare(a, b); - - return forward ? 
cmp <= 0 : cmp >= 0; -} - -static inline int forest_iter_btree_search(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_item_ref *iref, - bool forward) -{ - if (forward) - return scoutfs_btree_next(sb, root, key, iref); - else - return scoutfs_btree_prev(sb, root, key, iref); -} - -struct forest_iter_pos { - struct rb_node node; - struct forest_root *fr; - struct scoutfs_key key; - u64 vers; - bool deletion; - void *val; - int val_len; -}; - -static struct forest_iter_pos *first_iter_pos(struct rb_root *root) -{ - return rb_entry_safe(rb_first(root), struct forest_iter_pos, node); -} - -static struct forest_iter_pos *next_iter_pos(struct forest_iter_pos *ip) -{ - return rb_entry_safe(rb_next(&ip->node), struct forest_iter_pos, node); -} - -/* - * Sort root iter positions first by missing items, then by key in the - * direction if iteration, and then by reverse version. Thus the first - * iter_pos in the rbtree is either a root that needs to check the next - * item, a deletion that removes all older versions of the key, or is - * the item that iteration should return. - */ -static int cmp_iter_pos(struct forest_iter_pos *a, struct forest_iter_pos *b, - bool fwd) -{ - int cmp; - - if (a->vers == 0) - return -1; - if (b->vers == 0) - return 1; - - cmp = forest_iter_key_cmp(&a->key, &b->key, fwd); - if (cmp) - return cmp; - - return scoutfs_cmp_u64s(b->vers, a->vers); -} - -/* - * There's a sneaky subtlety here. The fs items have a fake verison of - * 1 which can equal a log tree version of 1. We always iterate over - * the fs root last so we try to insert the fake fs item last. It will - * compare equal to the version and will be inserted to the right of the - * existing log item. 
- */ -static void insert_iter_pos(struct forest_iter_pos *ins, struct rb_root *root, - bool fwd) -{ - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - struct forest_iter_pos *ip; - int cmp; - - while (*node) { - parent = *node; - ip = container_of(*node, struct forest_iter_pos, node); - - cmp = cmp_iter_pos(ins, ip, fwd); - if (cmp < 0) - node = &(*node)->rb_left; - else - node = &(*node)->rb_right; - } - - rb_link_node(&ins->node, parent, node); - rb_insert_color(&ins->node, root); -} - -/* - * clear the version and re-insert the iter_pos so that the next - * iteration will search for the next item in the root. - */ -static void advance_iter_pos(struct forest_iter_pos *ip, struct rb_root *root, - bool fwd) -{ - ip->vers = 0; - forest_iter_key_advance(&ip->key, fwd); - kfree(ip->val); - ip->val = NULL; - rb_erase(&ip->node, root); - insert_iter_pos(ip, root, fwd); -} - -static void destroy_iter_pos(struct forest_iter_pos *ip, struct rb_root *root) -{ - kfree(ip->val); - rb_erase(&ip->node, root); - kfree(ip); -} - -/* - * Iterate over items in all the roots looking for the next least - * non-deletion item in the direction of iteration. The roots can have - * any combination of item keys, versions, and deletions so we have to - * be very careful. - * - * We store the next item in each root in a node in an rbtree. The - * nodes are sorted by needing to be read, key, then reverse version. - * The first node in the rbtree is always a root to search, a deletion - * item to remove, or the item that iteration should return. - * - * btree locking prevents us from holding references to the items in all - * the roots so we store copies of the items in the nodes. 
- */ -static int forest_iter(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_key *end, struct kvec *val, - struct scoutfs_lock *lock, bool fwd) -{ - DECLARE_STALE_TRACKING_SUPER_REFS(prev_refs, refs); - struct forest_lock_private *lpriv; - DECLARE_FOREST_INFO(sb, finf); - SCOUTFS_BTREE_ITEM_REF(iref); - struct rb_root iter_root = RB_ROOT; - struct scoutfs_key found_key; - struct forest_iter_pos *nip; - struct forest_iter_pos *ip; - struct forest_root *fr; - u64 found_vers = 0; - int found_ret = 0; - int ret; - - scoutfs_inc_counter(sb, forest_iter); - scoutfs_key_set_zeros(&found_key); - - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_READ)) < 0) - goto out; - - /* use the end key as the end key if it's closer to reduce compares */ - if (forest_iter_key_before(&lock->end, end, fwd)) - end = &lock->end; - - /* convenience to avoid searching if caller iterates past their end */ - if (!forest_iter_key_within(key, end, fwd)) { - ret = -ENOENT; - goto out; - } - - lpriv = get_lock_private(lock); - if (!lpriv) { - ret = -ENOMEM; - goto out; - } - -retry: - down_read(&lpriv->rwsem); - - /* initialize iter position for each tree */ - fr = NULL; - while (!(ret = for_each_forest_root(sb, lock, finf, lpriv, &fr)) && fr){ - ip = kmalloc(sizeof(struct forest_iter_pos), GFP_NOFS); - if (!ip) { - ret = -ENOMEM; - goto unlock; - } - - ip->fr = fr; - ip->key = *key; - ip->vers = 0; - ip->deletion = false; - ip->val = NULL; - insert_iter_pos(ip, &iter_root, fwd); - } - if (ret < 0) - goto unlock; - - scoutfs_key_set_zeros(&found_key); - found_vers = 0; - found_ret = -ENOENT; - - /* search until we hit the end key on all roots */ - while ((ip = first_iter_pos(&iter_root))) { - fr = ip->fr; - - /* search for the next item in the root */ - if (ip->vers == 0) { - ret = read_lock_forest_root(sb, finf, lpriv, fr); - if (ret < 0) - goto unlock; - ret = forest_iter_btree_search(sb, &fr->item_root, - &ip->key, &iref, fwd); - if (ret < 0) - read_unlock_forest_root(finf, fr); 
- if (ret == -ENOENT) { - destroy_iter_pos(ip, &iter_root); - continue; - } - if (ret < 0) - goto unlock; - - ip->key = *iref.key; - ip->vers = item_vers(fr, iref.val); - ip->deletion = item_is_deletion(fr, iref.val); - - trace_scoutfs_forest_iter_search(sb, fr->rid, fr->nr, - ip->vers, - item_flags(fr, iref.val), - &ip->key); - - if (!forest_iter_key_within(&ip->key, end, fwd)) { - /* root is done if next is past end */ - destroy_iter_pos(ip, &iter_root); - } else { - kfree(ip->val); - ip->val = kmalloc(iref.val_len, GFP_NOFS); - if (!ip->val) { - ret = -ENOMEM; - } else { - /* copy item and re-sort its node */ - memcpy(ip->val, iref.val, iref.val_len); - ip->val_len = iref.val_len; - rb_erase(&ip->node, &iter_root); - insert_iter_pos(ip, &iter_root, fwd); - } - } - - scoutfs_btree_put_iref(&iref); - read_unlock_forest_root(finf, fr); - - if (ret < 0) - goto unlock; - continue; - } - - /* deletions remove all earlier versions and themselves */ - if (ip->deletion) { - while ((nip = next_iter_pos(ip)) && - !scoutfs_key_compare(&ip->key, &nip->key)) { - advance_iter_pos(nip, &iter_root, fwd); - } - advance_iter_pos(ip, &iter_root, fwd); - continue; - } - - /* use the first non-deletion across all roots */ - found_key = ip->key; - found_vers = ip->vers; - found_ret = copy_val(ip->fr, val, ip->val, ip->val_len); - break; - } - - ret = 0; -unlock: - up_read(&lpriv->rwsem); - - /* destroy_ rebalances so postorder traversal could skip nodes */ - for (ip = first_iter_pos(&iter_root); - ip && (nip = next_iter_pos(ip), 1); - ip = nip) { - destroy_iter_pos(ip, &iter_root); - } - - ret = refresh_check(sb, lock, &prev_refs, &refs, ret); - if (ret == -EAGAIN) - goto retry; - -out: - trace_scoutfs_forest_iter_ret(sb, key, end, fwd, ret, - found_vers, found_ret, &found_key); - - if (ret == 0) { - ret = found_ret; - /* _next/_prev interfaces modify caller's key :/ */ - if (ret >= 0) - *key = found_key; - } - - return ret; -} - -int scoutfs_forest_next(struct super_block *sb, struct 
scoutfs_key *key, - struct scoutfs_key *last, struct kvec *val, - struct scoutfs_lock *lock) -{ - return forest_iter(sb, key, last, val, lock, true); -} - -int scoutfs_forest_prev(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_key *first, struct kvec *val, - struct scoutfs_lock *lock) -{ - return forest_iter(sb, key, first, val, lock, false); -} - /* * This is an unlocked iteration across all the btrees to find a hint at * the next key that the caller could read. It's used to find out what @@ -1120,8 +143,9 @@ int scoutfs_forest_next_hint(struct super_block *sb, struct scoutfs_key *key, bool have_next; int ret; -retry: scoutfs_inc_counter(sb, forest_roots_next_hint); + +retry: ret = scoutfs_client_get_roots(sb, &roots); if (ret) goto out; @@ -1253,11 +277,11 @@ int scoutfs_forest_read_items(struct super_block *sb, int ret; int i; + scoutfs_inc_counter(sb, forest_read_items); calc_bloom_nrs(&bloom, &lock->start); roots = lock->roots; retry: - scoutfs_inc_counter(sb, forest_read_items); ret = scoutfs_client_get_roots(sb, &roots); if (ret) goto out; @@ -1347,19 +371,18 @@ out: /* * Make sure that the bloom bits for the lock's start key are all set in * the current log's bloom block. We record the nr of our log tree in - * the lock so that we only try to cow and set the bits once per tree. + * the lock so that we only try to cow and set the bits once per tree + * across multiple commits as long as the lock isn't purged. * - * The caller already gets the big finf write rwsem lock to modify the - * dirty log btree, might as well use it to protect the bloom ref and - * the lpriv field. We'll need finer grained locking once the btrees - * get block locks. + * This is using a coarse mutex to serialize cowing the block. It could + * be much finer grained, but it's infrequent. We'll keep an eye on whether + * it gets expensive enough to warrant fixing. 
*/ -static int set_lock_bloom_bits(struct super_block *sb, - struct scoutfs_lock *lock, u64 nr) +int scoutfs_forest_set_bloom_bits(struct super_block *sb, + struct scoutfs_lock *lock) { struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; DECLARE_FOREST_INFO(sb, finf); - struct forest_lock_private *lpriv; struct scoutfs_block *new_bl = NULL; struct scoutfs_block *bl = NULL; struct scoutfs_bloom_block *bb; @@ -1367,24 +390,21 @@ static int set_lock_bloom_bits(struct super_block *sb, struct forest_bloom_nrs bloom; int nr_set = 0; u64 blkno; + u64 nr; int ret; int err; int i; - BUG_ON(!rwsem_is_locked(&finf->rwsem)); - - lpriv = get_lock_private(lock); - if (!lpriv) { - ret = -ENOMEM; - goto out; - } + nr = le64_to_cpu(finf->our_log.nr); /* our rid is constant */ - if (lpriv->set_bloom_nr == nr) { + if (atomic64_read(&lock->forest_bloom_nr) == nr) { ret = 0; goto out; } + mutex_lock(&finf->mutex); + scoutfs_inc_counter(sb, forest_set_bloom_bits); calc_bloom_nrs(&bloom, &lock->start); @@ -1394,7 +414,7 @@ static int set_lock_bloom_bits(struct super_block *sb, bl = read_bloom_ref(sb, ref); if (IS_ERR(bl)) { ret = PTR_ERR(bl); - goto out; + goto unlock; } bb = bl->data; } @@ -1403,7 +423,7 @@ static int set_lock_bloom_bits(struct super_block *sb, ret = scoutfs_radix_alloc(sb, finf->alloc, finf->wri, &blkno); if (ret < 0) - goto out; + goto unlock; new_bl = scoutfs_block_create(sb, blkno); if (IS_ERR(new_bl)) { @@ -1411,7 +431,7 @@ static int set_lock_bloom_bits(struct super_block *sb, blkno); BUG_ON(err); /* could have dirtied */ ret = PTR_ERR(new_bl); - goto out; + goto unlock; } if (bl) { @@ -1452,21 +472,15 @@ static int set_lock_bloom_bits(struct super_block *sb, le64_to_cpu(finf->our_log.bloom_ref.seq), nr_set); - lpriv->set_bloom_nr = nr; + atomic64_set(&lock->forest_bloom_nr, nr); ret = 0; +unlock: + mutex_unlock(&finf->mutex); out: scoutfs_block_put(sb, bl); return ret; } -int scoutfs_forest_set_bloom_bits(struct super_block *sb, - struct scoutfs_lock 
*lock) -{ - DECLARE_FOREST_INFO(sb, finf); - - return set_lock_bloom_bits(sb, lock, le64_to_cpu(finf->our_log.nr)); -} - int scoutfs_forest_insert_list(struct super_block *sb, struct scoutfs_btree_item_list *lst) { @@ -1476,253 +490,6 @@ int scoutfs_forest_insert_list(struct super_block *sb, &finf->our_log.item_root, lst); } -/* - * The btree code takes a single value buffer. When we're working with - * the log btrees we want to add a log item value metadata header. In - * the interest of expedience we're just allocating a new contiguous - * buffer that prepends the header. We could make the btree ops take - * vectored values or we could make all btree items have the metadata. - */ -static struct kvec *alloc_log_item_value(struct kvec *val, __u8 flags, - struct scoutfs_lock *lock) -{ - struct scoutfs_log_item_value *liv; - struct kvec *kv; - unsigned int val_len = val ? val->iov_len : 0; - - kv = kmalloc(sizeof(*kv) + sizeof(*liv) + val_len, GFP_NOFS); - if (kv) { - liv = (void *)kv + sizeof(*kv); - - kv->iov_base = liv; - kv->iov_len = sizeof(*liv) + val_len; - - liv->vers = cpu_to_le64(lock->write_version); - liv->flags = flags; - if (val) - memcpy(liv->data, val->iov_base, val->iov_len); - } - - return kv; -} - -/* - * Create a new dirty item. Can return -EEXIST if the item already - * exists or will just force createion the caller's item, overwriting - * any existing item. We can be overwriting an existing deletion item - * in our log root. 
- */ -static int forest_insert(struct super_block *sb, struct scoutfs_key *key, - struct kvec *val, struct scoutfs_lock *lock, - bool check_eexist, bool check_enoent, bool could_read) -{ - DECLARE_FOREST_INFO(sb, finf); - struct forest_lock_private *lpriv; - struct kvec *iv = NULL; - int ret; - - scoutfs_inc_counter(sb, forest_insert); - - lpriv = get_lock_private(lock); - if (!lpriv) { - ret = -ENOMEM; - goto out; - } - - if (check_eexist || check_enoent) { - ret = scoutfs_forest_lookup(sb, key, NULL, lock); - if (ret == 0 && check_eexist) { - ret = -EEXIST; - goto out; - } - if (ret == -ENOENT) { - if (check_enoent) - goto out; - ret = 0; - } - if (ret < 0) - goto out; - - } - - iv = alloc_log_item_value(val, 0, lock); - if (iv == NULL) { - ret = -ENOMEM; - goto out; - } - - down_write(&finf->rwsem); - - ret = set_lock_bloom_bits(sb, lock, le64_to_cpu(finf->our_log.nr)); - if (ret < 0) - goto unlock; - - ret = scoutfs_btree_force(sb, finf->alloc, finf->wri, - &finf->our_log.item_root, key, - iv->iov_base, iv->iov_len); - if (ret == 0 && could_read) - set_dirtied_cseq(sb, finf, lock, lpriv); -unlock: - up_write(&finf->rwsem); - kfree(iv); - -out: - return ret; -} - -/* - * Insert an item, returning -EEXIST if it already exists. - */ -int scoutfs_forest_create(struct super_block *sb, struct scoutfs_key *key, - struct kvec *val, struct scoutfs_lock *lock) -{ - int ret; - - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE)) < 0) - return ret; - - return forest_insert(sb, key, val, lock, true, false, true); -} - -/* - * Insert an item, ignoring whether it exists or not. 
- */ -int scoutfs_forest_create_force(struct super_block *sb, - struct scoutfs_key *key, struct kvec *val, - struct scoutfs_lock *lock) -{ - int ret; - - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE_ONLY)) < 0) - return ret; - - return forest_insert(sb, key, val, lock, false, false, false); -} - -/* - * Overwrite an existing item, possibly changing its value length, - * returning -ENOENT if it didn't already exist. - */ -int scoutfs_forest_update(struct super_block *sb, struct scoutfs_key *key, - struct kvec *val, struct scoutfs_lock *lock) -{ - int ret; - - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE)) < 0) - return ret; - - return forest_insert(sb, key, val, lock, false, true, true); -} - -/* XXX not yet supported, idea is btree op that only uses dirty blocks */ -int scoutfs_forest_delete_dirty(struct super_block *sb, - struct scoutfs_key *key) -{ - BUG(); - return 0; -} - -static int forest_delete(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_lock *lock, bool check_enoent, - bool could_read) -{ - DECLARE_FOREST_INFO(sb, finf); - struct forest_lock_private *lpriv; - struct scoutfs_log_item_value liv; - int ret; - - scoutfs_inc_counter(sb, forest_delete); - - lpriv = get_lock_private(lock); - if (!lpriv) { - ret = -ENOMEM; - goto out; - } - - if (check_enoent) { - ret = scoutfs_forest_lookup(sb, key, NULL, lock); - if (ret < 0) - goto out; - } - - liv.vers = cpu_to_le64(lock->write_version); - liv.flags = SCOUTFS_LOG_ITEM_FLAG_DELETION; - - down_write(&finf->rwsem); - - ret = set_lock_bloom_bits(sb, lock, le64_to_cpu(finf->our_log.nr)); - if (ret < 0) - goto unlock; - - ret = scoutfs_btree_force(sb, finf->alloc, finf->wri, - &finf->our_log.item_root, - key, &liv, sizeof(liv)); - if (ret == 0 && could_read) - set_dirtied_cseq(sb, finf, lock, lpriv); -unlock: - up_write(&finf->rwsem); -out: - return ret; -} - -/* - * Delete an item from the forest of btrees. 
This interface returns - * -ENOENT if the item doesn't exist (may already be deleted). We have - * to first read from the forest to see if it exists. If we get -ENOENT - * it might be because it exists in our log tree. We force our deletion - * item regardless of the current state of the item in our log tree. - */ -int scoutfs_forest_delete(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_lock *lock) -{ - int ret; - - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE)) < 0) - return ret; - - return forest_delete(sb, key, lock, true, true); -} - -/* - * Like deletion, but we don't have to read the current item to return - * -ENOENT. We just force a deletion item. - */ -int scoutfs_forest_delete_force(struct super_block *sb, - struct scoutfs_key *key, - struct scoutfs_lock *lock) -{ - int ret; - - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE_ONLY)) < 0) - return ret; - - return forest_delete(sb, key, lock, false, false); -} - -/* XXX not supported, just for initial demo */ -int scoutfs_forest_delete_save(struct super_block *sb, - struct scoutfs_key *key, - struct list_head *list, - struct scoutfs_lock *lock) -{ - int ret = scoutfs_forest_delete(sb, key, lock); - BUG_ON(ret != 0); - return ret; -} - -/* XXX not supported, just for initial demo */ -int scoutfs_forest_restore(struct super_block *sb, struct list_head *list, - struct scoutfs_lock *lock) -{ - BUG(); - return 0; -} - -/* XXX not supported, just for initial demo */ -void scoutfs_forest_free_batch(struct super_block *sb, struct list_head *list) -{ -} - /* * Add a srch entry to the current transaction's log file. 
It will be * committed in a transaction along with the dirty btree blocks that @@ -1756,7 +523,7 @@ void scoutfs_forest_init_btrees(struct super_block *sb, { DECLARE_FOREST_INFO(sb, finf); - down_write(&finf->rwsem); + mutex_lock(&finf->mutex); finf->alloc = alloc; finf->wri = wri; @@ -1767,7 +534,6 @@ void scoutfs_forest_init_btrees(struct super_block *sb, finf->our_log.bloom_ref = lt->bloom_ref; finf->our_log.rid = lt->rid; finf->our_log.nr = lt->nr; - atomic64_inc(&finf->commit_seq); finf->srch_file = lt->srch_file; WARN_ON_ONCE(finf->srch_bl); /* commiting should have put the block */ finf->srch_bl = NULL; @@ -1775,10 +541,9 @@ void scoutfs_forest_init_btrees(struct super_block *sb, trace_scoutfs_forest_init_our_log(sb, le64_to_cpu(lt->rid), le64_to_cpu(lt->nr), le64_to_cpu(lt->item_root.ref.blkno), - le64_to_cpu(lt->item_root.ref.seq), - atomic64_read(&finf->commit_seq)); + le64_to_cpu(lt->item_root.ref.seq)); - up_write(&finf->rwsem); + mutex_unlock(&finf->mutex); } /* @@ -1816,9 +581,8 @@ int scoutfs_forest_setup(struct super_block *sb) } /* the finf fields will be setup as we open a transaction */ - init_rwsem(&finf->rwsem); + mutex_init(&finf->mutex); mutex_init(&finf->srch_mutex); - atomic64_set(&finf->commit_seq, 0); sbi->forest_info = finf; ret = 0; diff --git a/kmod/src/forest.h b/kmod/src/forest.h index b480c971..6d0c0c8c 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -13,41 +13,8 @@ typedef int (*scoutfs_forest_item_cb)(struct super_block *sb, struct scoutfs_log_item_value *liv, void *val, int val_len, void *arg); -int scoutfs_forest_lookup(struct super_block *sb, struct scoutfs_key *key, - struct kvec *val, struct scoutfs_lock *lock); -int scoutfs_forest_lookup_exact(struct super_block *sb, - struct scoutfs_key *key, struct kvec *val, - struct scoutfs_lock *lock); -int scoutfs_forest_next(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_key *last, struct kvec *val, - struct scoutfs_lock *lock); int 
scoutfs_forest_next_hint(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_key *next); -int scoutfs_forest_prev(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_key *first, struct kvec *val, - struct scoutfs_lock *lock); -int scoutfs_forest_create(struct super_block *sb, struct scoutfs_key *key, - struct kvec *val, struct scoutfs_lock *lock); -int scoutfs_forest_create_force(struct super_block *sb, - struct scoutfs_key *key, struct kvec *val, - struct scoutfs_lock *lock); -int scoutfs_forest_update(struct super_block *sb, struct scoutfs_key *key, - struct kvec *val, struct scoutfs_lock *lock); -int scoutfs_forest_delete_dirty(struct super_block *sb, - struct scoutfs_key *key); -int scoutfs_forest_delete(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_lock *lock); -int scoutfs_forest_delete_force(struct super_block *sb, - struct scoutfs_key *key, - struct scoutfs_lock *lock); -int scoutfs_forest_delete_save(struct super_block *sb, - struct scoutfs_key *key, - struct list_head *list, - struct scoutfs_lock *lock); -int scoutfs_forest_restore(struct super_block *sb, struct list_head *list, - struct scoutfs_lock *lock); -void scoutfs_forest_free_batch(struct super_block *sb, struct list_head *list); - int scoutfs_forest_read_items(struct super_block *sb, struct scoutfs_lock *lock, struct scoutfs_key *key, @@ -67,9 +34,6 @@ void scoutfs_forest_init_btrees(struct super_block *sb, void scoutfs_forest_get_btrees(struct super_block *sb, struct scoutfs_log_trees *lt); -void scoutfs_forest_clear_lock(struct super_block *sb, - struct scoutfs_lock *lock); - int scoutfs_forest_setup(struct super_block *sb); void scoutfs_forest_destroy(struct super_block *sb); diff --git a/kmod/src/lock.c b/kmod/src/lock.c index 19413458..7775b1ad 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -21,7 +21,6 @@ #include "super.h" #include "lock.h" -#include "forest.h" #include "scoutfs_trace.h" #include "msg.h" #include "cmp.h" @@ -230,7 +229,6 
@@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock) BUG_ON(!list_empty(&lock->shrink_head)); BUG_ON(!list_empty(&lock->cov_list)); - scoutfs_forest_clear_lock(sb, lock); kfree(lock); } @@ -265,6 +263,8 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb, init_waitqueue_head(&lock->waitq); lock->mode = SCOUTFS_LOCK_NULL; + atomic64_set(&lock->forest_bloom_nr, 0); + trace_scoutfs_lock_alloc(sb, lock); return lock; diff --git a/kmod/src/lock.h b/kmod/src/lock.h index f659e50b..cb9bb3d6 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -46,8 +46,8 @@ struct scoutfs_lock { struct scoutfs_tseq_entry tseq_entry; - /* the forest btree code stores data per lock */ - struct forest_lock_private *forest_private; + /* the forest tracks which log tree last saw bloom bit updates */ + atomic64_t forest_bloom_nr; }; struct scoutfs_lock_coverage { diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index eeafd395..f26525bd 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2104,138 +2104,15 @@ TRACE_EVENT(scoutfs_forest_using_roots, __entry->logs_blkno, __entry->logs_seq) ); -TRACE_EVENT(scoutfs_forest_add_root, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, u64 rid, - u64 nr, u64 blkno, u64 seq), - TP_ARGS(sb, key, rid, nr, blkno, seq), - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - sk_trace_define(key) - __field(__u64, b_rid) - __field(__u64, nr) - __field(__u64, blkno) - __field(__u64, seq) - ), - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - sk_trace_assign(key, key); - __entry->b_rid = rid; - __entry->nr = nr; - __entry->blkno = blkno; - __entry->seq = seq; - ), - TP_printk(SCSBF" key "SK_FMT" rid %016llx nr %llu blkno %llu seq %llx", - SCSB_TRACE_ARGS, sk_trace_args(key), - __entry->b_rid, __entry->nr, __entry->blkno, __entry->seq) -); - -TRACE_EVENT(scoutfs_forest_set_dirtied, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, u64 rid, - u64 nr, u64 cseq), - TP_ARGS(sb, key, rid, nr, 
cseq), - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - sk_trace_define(key) - __field(__u64, b_rid) - __field(__u64, nr) - __field(__u64, cseq) - ), - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - sk_trace_assign(key, key); - __entry->b_rid = rid; - __entry->nr = nr; - __entry->cseq = cseq; - ), - TP_printk(SCSBF" key "SK_FMT" rid %016llx nr %llu cseq %llu", - SCSB_TRACE_ARGS, sk_trace_args(key), - __entry->b_rid, __entry->nr, __entry->cseq) -); - -TRACE_EVENT(scoutfs_forest_trigger_refresh, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, - bool empty_roots, u64 refresh_gen, u64 last_refreshed, - u64 dirtied_cseq, u64 refreshed_dirtied, - u64 commit_seq, u64 refreshed_cseq), - TP_ARGS(sb, key, empty_roots, refresh_gen, last_refreshed, - dirtied_cseq, refreshed_dirtied, commit_seq, refreshed_cseq), - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - sk_trace_define(key) - __field(int, empty_roots) - __field(__u64, refresh_gen) - __field(__u64, last_refreshed) - __field(__u64, dirtied_cseq) - __field(__u64, refreshed_dirtied) - __field(__u64, commit_seq) - __field(__u64, refreshed_cseq) - ), - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - sk_trace_assign(key, key); - __entry->empty_roots = !!empty_roots; - __entry->refresh_gen = refresh_gen; - __entry->last_refreshed = last_refreshed; - __entry->dirtied_cseq = dirtied_cseq; - __entry->refreshed_dirtied = refreshed_dirtied; - __entry->commit_seq = commit_seq; - __entry->refreshed_cseq = refreshed_cseq; - ), - TP_printk(SCSBF" key "SK_FMT" empty %u refg %llu last_refg %llu dirt %llu refdir %llu cseq %llu refcseq %llu", - SCSB_TRACE_ARGS, sk_trace_args(key), - __entry->empty_roots, - __entry->refresh_gen, - __entry->last_refreshed, - __entry->dirtied_cseq, - __entry->refreshed_dirtied, - __entry->commit_seq, - __entry->refreshed_cseq) -); - -TRACE_EVENT(scoutfs_forest_refresh_seqs, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, u64 rid, - u64 nr, u64 dirtied_cseq, u64 refreshed_dirtied, - u64 commit_seq, u64 
refreshed_cseq), - TP_ARGS(sb, key, rid, nr, dirtied_cseq, refreshed_dirtied, commit_seq, - refreshed_cseq), - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - sk_trace_define(key) - __field(__u64, b_rid) - __field(__u64, nr) - __field(__u64, dirtied_cseq) - __field(__u64, refreshed_dirtied) - __field(__u64, commit_seq) - __field(__u64, refreshed_cseq) - ), - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - sk_trace_assign(key, key); - __entry->b_rid = rid; - __entry->nr = nr; - __entry->dirtied_cseq = dirtied_cseq; - __entry->refreshed_dirtied = refreshed_dirtied; - __entry->commit_seq = commit_seq; - __entry->refreshed_cseq = refreshed_cseq; - ), - TP_printk(SCSBF" key "SK_FMT" rid %016llx nr %llu dirt %llu refdir %llu cseq %llu refcseq %llu", - SCSB_TRACE_ARGS, sk_trace_args(key), __entry->b_rid, - __entry->nr, __entry->dirtied_cseq, - __entry->refreshed_dirtied, __entry->commit_seq, - __entry->refreshed_cseq) -); - TRACE_EVENT(scoutfs_forest_init_our_log, - TP_PROTO(struct super_block *sb, u64 rid, u64 nr, u64 blkno, u64 seq, - u64 cseq), - TP_ARGS(sb, rid, nr, blkno, seq, cseq), + TP_PROTO(struct super_block *sb, u64 rid, u64 nr, u64 blkno, u64 seq), + TP_ARGS(sb, rid, nr, blkno, seq), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(__u64, b_rid) __field(__u64, nr) __field(__u64, blkno) __field(__u64, seq) - __field(__u64, cseq) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); @@ -2243,67 +2120,10 @@ TRACE_EVENT(scoutfs_forest_init_our_log, __entry->nr = nr; __entry->blkno = blkno; __entry->seq = seq; - __entry->cseq = cseq; ), - TP_printk(SCSBF" rid %016llx nr %llu blkno %llu seq %llx cseq %llu", + TP_printk(SCSBF" rid %016llx nr %llu blkno %llu seq %llx", SCSB_TRACE_ARGS, __entry->b_rid, __entry->nr, - __entry->blkno, __entry->seq, __entry->cseq) -); - -TRACE_EVENT(scoutfs_forest_iter_search, - TP_PROTO(struct super_block *sb, u64 rid, u64 nr, u64 vers, - u8 flags, struct scoutfs_key *key), - TP_ARGS(sb, rid, nr, vers, flags, key), - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - 
__field(__u64, b_rid) - __field(__u64, nr) - __field(__u64, vers) - __field(__u8, flags) - sk_trace_define(key) - ), - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - __entry->b_rid = rid; - __entry->nr = nr; - __entry->vers = vers; - __entry->flags = flags; - sk_trace_assign(key, key); - ), - TP_printk(SCSBF" rid %016llx nr %llu vers %llu flags %x key "SK_FMT, - SCSB_TRACE_ARGS, __entry->b_rid, __entry->nr, - __entry->vers, __entry->flags, sk_trace_args(key)) -); - -TRACE_EVENT(scoutfs_forest_iter_ret, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_key *end, bool forward, int ret, - u64 found_vers, int found_ret, struct scoutfs_key *found), - TP_ARGS(sb, key, end, forward, ret, found_vers, found_ret, found), - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - sk_trace_define(key) - sk_trace_define(end) - __field(char, forward) - __field(int, ret) - __field(__u64, found_vers) - __field(int, found_ret) - sk_trace_define(found) - ), - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - sk_trace_assign(key, key); - sk_trace_assign(end, end); - __entry->forward = !!forward; - __entry->ret = ret; - __entry->found_vers = found_vers; - __entry->found_ret = found_ret; - sk_trace_assign(found, found); - ), - TP_printk(SCSBF" key "SK_FMT" end "SK_FMT" fwd %u ret %d fv %llu fc %d f "SK_FMT, - SCSB_TRACE_ARGS, sk_trace_args(key), sk_trace_args(end), - __entry->forward, __entry->ret, __entry->found_vers, - __entry->found_ret, sk_trace_args(found)) + __entry->blkno, __entry->seq) ); DECLARE_EVENT_CLASS(scoutfs_block_class,