From 8fe683dab8990f14b23f3f57b91caf1a6bbcb4fa Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 11 May 2020 15:56:34 -0700
Subject: [PATCH] scoutfs: cow dirty radix blocks instead of moving

The radix allocator has to be careful not to get lost in recursion,
allocating metadata blocks for its own dirty radix blocks while it is
allocating metadata blocks for others.

The first pass used path data structures to record references to all
the blocks we'd need to modify to reflect the frees and allocations
performed while dirtying radix blocks.  Once it had all the path
blocks it moved the old clean blocks into new dirty block locations so
that the dirtying couldn't fail.

This had two very bad performance implications.  First, reading the
clean version of a dirtied tree always had to re-read the old blocks
from disk because the clean blocks had been moved to their dirty
blknos.  Typically this wouldn't matter, but the server does exactly
this every time it tries to merge freed blocks back into its avail
allocator, which created a significant IO load on the server.  Second,
the block cache move not being allowed to fail motivated the move to a
locked rbtree for the block cache instead of the lockless rcu
radix_tree.

This changes the recursion avoidance to use per-block private metadata
to track every block that we allocate and cow rather than move.  Each
dirty block knows its parent ref and the blknos it would clear and
set.  If dirtying fails we can walk back through all the blocks we
dirtied and restore their original references before dropping all the
dirty blocks and returning an error.  This lets us get rid of the path
structure entirely and results in a much cleaner system.

This change meant tracking free blocks without clearing them as
they're used to satisfy dirty block allocations.  The change now has a
cursor that walks the avail metadata tree without modifying it.

While building this it became clear that tracking the first set bits
in blocks and refs doesn't provide any value when we're always
searching from a cursor: the cursor already avoids constantly
re-searching empty initial bits and refs, so maintaining the first
metadata was just overhead.
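Roughly, the new bookkeeping remembers, for each block we cow, which
parent ref was rewritten and what it originally held, so a failed
dirtying pass can walk the records newest-first and put the refs back.
The following is a simplified, hypothetical sketch of that idea in
plain C; struct tracked_block, track_cow() and undo_cows() are invented
names for illustration only, not the actual code in radix.c (which
tracks this in struct radix_block_private on kernel lists):

#include <stdint.h>
#include <stdlib.h>

struct ref {
	uint64_t blkno;
	uint64_t seq;
};

struct tracked_block {
	struct tracked_block *next;	/* newest record first */
	struct ref *parent_ref;		/* ref that now points at the cow copy */
	struct ref orig_ref;		/* what the ref held before the cow */
	uint64_t new_blkno;		/* blkno of the dirty copy */
};

/* record a cow before rewriting the parent ref; newest at the head */
static struct tracked_block *track_cow(struct tracked_block *head,
				       struct ref *parent_ref,
				       uint64_t new_blkno)
{
	struct tracked_block *tb = calloc(1, sizeof(*tb));

	if (!tb)
		return NULL;
	tb->parent_ref = parent_ref;
	tb->orig_ref = *parent_ref;
	tb->new_blkno = new_blkno;
	tb->next = head;
	return tb;
}

/* on failure, walk newest to oldest restoring the original refs */
static void undo_cows(struct tracked_block *head)
{
	while (head) {
		struct tracked_block *tb = head;

		head = tb->next;
		/* only undo refs that still point at our dirty copy */
		if (tb->parent_ref->blkno == tb->new_blkno)
			*tb->parent_ref = tb->orig_ref;
		free(tb);
	}
}

On success the records are simply dropped and the leaf bits for each
old/new blkno pair are set and cleared in one pass, which is what
apply_change_bits() in the patch below does.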
Signed-off-by: Zach Brown --- kmod/src/block.h | 1 + kmod/src/counters.h | 17 +- kmod/src/format.h | 2 - kmod/src/radix.c | 1467 ++++++++++++++++++-------------------- kmod/src/scoutfs_trace.h | 102 +-- 5 files changed, 743 insertions(+), 846 deletions(-) diff --git a/kmod/src/block.h b/kmod/src/block.h index 57e849a5..22e2437d 100644 --- a/kmod/src/block.h +++ b/kmod/src/block.h @@ -10,6 +10,7 @@ struct scoutfs_block_writer { struct scoutfs_block { u64 blkno; void *data; + void *priv; }; __le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr, u32 size); diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 06314b40..79da35cc 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -88,10 +88,23 @@ EXPAND_COUNTER(quorum_write_block) \ EXPAND_COUNTER(quorum_write_block_error) \ EXPAND_COUNTER(quorum_fenced) \ + EXPAND_COUNTER(radix_alloc) \ + EXPAND_COUNTER(radix_alloc_data) \ + EXPAND_COUNTER(radix_block_cow) \ + EXPAND_COUNTER(radix_block_read) \ + EXPAND_COUNTER(radix_complete_dirty_block) \ + EXPAND_COUNTER(radix_create_synth) \ + EXPAND_COUNTER(radix_free) \ + EXPAND_COUNTER(radix_free_data) \ EXPAND_COUNTER(radix_enospc_data) \ - EXPAND_COUNTER(radix_enospc_paths) \ + EXPAND_COUNTER(radix_enospc_meta) \ EXPAND_COUNTER(radix_enospc_synth) \ - EXPAND_COUNTER(radix_merge_retry) \ + EXPAND_COUNTER(radix_inconsistent_eio) \ + EXPAND_COUNTER(radix_inconsistent_ref) \ + EXPAND_COUNTER(radix_merge) \ + EXPAND_COUNTER(radix_merge_empty) \ + EXPAND_COUNTER(radix_undo_ref) \ + EXPAND_COUNTER(radix_walk) \ EXPAND_COUNTER(trans_commit_data_alloc_low) \ EXPAND_COUNTER(trans_commit_fsync) \ EXPAND_COUNTER(trans_commit_full) \ diff --git a/kmod/src/format.h b/kmod/src/format.h index dc83c506..8418a638 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -167,8 +167,6 @@ struct scoutfs_key { struct scoutfs_radix_block { struct scoutfs_block_header hdr; - __le32 sm_first; - __le32 lg_first; union { struct scoutfs_radix_ref { __le64 blkno; diff --git a/kmod/src/radix.c b/kmod/src/radix.c index c8940d3f..5d233c2a 100644 --- a/kmod/src/radix.c +++ b/kmod/src/radix.c @@ -52,9 +52,7 @@ * merge process. * * Allocations search for the next free bit from a cursor that's stored - * in the root of each tree. We track the first set parent ref or leaf - * bit in references to blocks to avoid searching entire blocks every - * time. + * in the root of each tree. * * The radix isn't always fully populated. References can contain * blknos with 0 or ~0 to indicate that its referenced subtree is either @@ -64,10 +62,14 @@ * descends. This lets mkfs initialize a tree with a large contigious * set region without having to populate all its blocks. * - * The radix is used to allocate and free blocks when performing cow - * updates of the blocks that make up radix itself. Recursion is - * carefully avoided by building up references to all the blocks needed - * for the operation and then dirtying and modifying them all at once. + * The metadata allocator radix tree itself is used to allocate and free + * its own blocks as it makes cow updates to itself. Recursion is + * avoided by tracking all the blocks we dirty with their parents, + * making sure we have dirty leaves to record frees and allocs for all + * the dirtied blocks, and using a read-only cursor to find blknos for + * each new dirty block. This lets us either atomically set and clear + * all the leaf bits once we have all the dirty blocks or unwind all the + * dirty blocks and restore their parent references. 
* * Radix block references contain totals of bits set in its referenced * subtree. This helps us balance the number of free bits stored across @@ -78,6 +80,16 @@ * tracked in the metadata allocator trees but aren't used. */ +/* + * This is just a sanity test at run time. It's log base + * SCOUTFS_RADIX_BITS of SCOUTFS_BLOCK_SM_MAX, but we can come close by + * dividing bit widths by shifts if we under-estimate the number of bits + * in a leaf by rounding it down to a power of two. In practice the + * trees are sized for the capacity of the device and are very short. + */ +#define RADIX_MAX_HEIGHT (((64 - SCOUTFS_BLOCK_SM_SHIFT) % \ + (SCOUTFS_BLOCK_LG_SHIFT + 2)) + 2) + /* * We create temporary synthetic blocks past possible blocks to populate * stubbed out refs that reference entirely empty or full subtrees. @@ -85,141 +97,49 @@ */ #define RADIX_SYNTH_BLKNO (SCOUTFS_BLOCK_LG_MAX + 1) -struct radix_path { - struct rb_node node; - struct list_head head; - struct list_head alloc_head; - u8 height; - struct scoutfs_radix_root *root; - u64 leaf_bit; - /* path and index arrays indexed by level, [0] is leaf */ - struct scoutfs_block **bls; - unsigned int *inds; +static bool is_synth(u64 blkno) +{ + return blkno >= RADIX_SYNTH_BLKNO; +} + +/* we use fake blknos to indicate subtrees either entirely empty or full */ +static bool is_stub(u64 blkno) +{ + return blkno == 0 || blkno == U64_MAX; +} + +struct radix_block_private { + struct scoutfs_block *bl; + struct list_head entry; + struct list_head dirtied_entry; + struct scoutfs_block *parent; + struct scoutfs_radix_ref *ref; + struct scoutfs_radix_ref orig_ref; + struct scoutfs_block *blkno_bl; + struct scoutfs_block *old_blkno_bl; + int blkno_ind; + int old_blkno_ind; }; +static bool was_dirtied(struct radix_block_private *priv) +{ + return !list_empty(&priv->dirtied_entry); +} + struct radix_change { - struct list_head paths; - struct list_head new_paths; - struct list_head alloc_paths; - struct rb_root rbroot; - u64 block_allocs; - u64 caller_allocs; - u64 alloc_bits; + struct scoutfs_radix_root *avail; + struct list_head blocks; + struct list_head dirtied_blocks; u64 next_synth; + u64 next_find_bit; + u64 first_free; + struct scoutfs_block *free_bl; + u64 free_leaf_bit; + unsigned int free_ind; }; -static struct radix_path *alloc_path(struct scoutfs_radix_root *root) -{ - struct radix_path *path; - u8 height = root->height; - - path = kzalloc(sizeof(struct radix_path) + - (member_sizeof(struct radix_path, inds[0]) * height) + - (member_sizeof(struct radix_path, bls[0]) * height), - GFP_NOFS); - if (path) { - RB_CLEAR_NODE(&path->node); - INIT_LIST_HEAD(&path->head); - INIT_LIST_HEAD(&path->alloc_head); - path->height = root->height; - path->root = root; - path->bls = (void *)(path + 1); - path->inds = (void *)(&path->bls[height]); - } - return path; -} - -/* Return a pointer to a reference in the path to a block at the given level. */ -static struct scoutfs_radix_ref *path_ref(struct radix_path *path, int level) -{ - struct scoutfs_radix_block *rdx; - - BUG_ON(level < 0 || level >= path->height); - - if (level == path->height - 1) { - return &path->root->ref; - } else { - rdx = path->bls[level + 1]->data; - return &rdx->refs[path->inds[level + 1]]; - } -} - -static bool paths_share_blocks(struct radix_path *a, struct radix_path *b) -{ - int i; - - for (i = 0; i < min(a->height, b->height); i++) { - if (a->bls[i] == b->bls[i]) - return true; - } - - return false; -} - -/* - * Drop a path's reference to blocks and free its memory. 
If we still - * have synthetic blocks then we reset their references to the original - * empty or full blknos. Ref sequence numbers aren't updated when we - * initially reference synthetic blocks. - */ -static void free_path(struct super_block *sb, struct radix_path *path) -{ - struct scoutfs_radix_ref *ref; - struct scoutfs_block *bl; - __le64 orig; - int i; - - if (!IS_ERR_OR_NULL(path)) { - for (i = 0; i < path->height; i++) { - bl = path->bls[i]; - if (bl == NULL) - continue; - - if (bl->blkno >= RADIX_SYNTH_BLKNO) { - ref = path_ref(path, i); - if (bl->blkno & 1) - orig = cpu_to_le64(U64_MAX); - else - orig = 0; - - if (ref->blkno != orig) - ref->blkno = orig; - } - scoutfs_block_put(sb, bl); - } - kfree(path); - } -} - -static struct radix_change *alloc_change(void) -{ - struct radix_change *chg; - - chg = kzalloc(sizeof(struct radix_change), GFP_NOFS); - if (chg) { - INIT_LIST_HEAD(&chg->paths); - INIT_LIST_HEAD(&chg->new_paths); - INIT_LIST_HEAD(&chg->alloc_paths); - chg->rbroot = RB_ROOT; - chg->next_synth = RADIX_SYNTH_BLKNO; - } - return chg; -} - -static void free_change(struct super_block *sb, struct radix_change *chg) -{ - struct radix_path *path; - struct radix_path *tmp; - - if (!IS_ERR_OR_NULL(chg)) { - list_splice_init(&chg->new_paths, &chg->paths); - list_for_each_entry_safe(path, tmp, &chg->paths, head) { - list_del_init(&path->head); - free_path(sb, path); - } - kfree(chg); - } -} +#define DECLARE_RADIX_CHANGE(a) \ + struct radix_change a = {NULL, } /* * We can use native longs to set full aligned regions, but we have to @@ -375,14 +295,14 @@ static int find_next_lg(__le64 *map, int ind) return SCOUTFS_RADIX_BITS; } -static u64 bit_from_inds(struct radix_path *path) +static u64 bit_from_inds(u32 *level_inds, u8 height) { - u64 bit = path->inds[0]; + u64 bit = level_inds[0]; u64 mult = SCOUTFS_RADIX_BITS; int i; - for (i = 1; i < path->height; i++) { - bit += (u64)path->inds[i] * mult; + for (i = 1; i < height; i++) { + bit += (u64)level_inds[i] * mult; mult *= SCOUTFS_RADIX_REFS; } @@ -428,17 +348,17 @@ static u64 full_subtree_total(int level) return total; } -static void calc_level_inds(struct radix_path *path, u64 bit) +static void calc_level_inds(u32 *level_inds, u8 height, u64 bit) { u32 ind; int i; bit = div_u64_rem(bit, SCOUTFS_RADIX_BITS, &ind); - path->inds[0] = ind; + level_inds[0] = ind; - for (i = 1; i < path->height; i++) { + for (i = 1; i < height; i++) { bit = div_u64_rem(bit, SCOUTFS_RADIX_REFS, &ind); - path->inds[i] = ind; + level_inds[i] = ind; } } @@ -450,279 +370,127 @@ static u64 calc_leaf_bit(u64 bit) return bit - ind; } -static int compare_path(struct scoutfs_radix_root *root, u64 leaf_bit, - struct radix_path *path) -{ - return scoutfs_cmp((unsigned long)root, (unsigned long)path->root) ?: - scoutfs_cmp(leaf_bit, path->leaf_bit); -} - -static struct radix_path *walk_paths(struct rb_root *rbroot, - struct scoutfs_radix_root *root, - u64 leaf_bit, struct radix_path *ins) -{ - struct rb_node **node = &rbroot->rb_node; - struct rb_node *parent = NULL; - struct radix_path *path; - int cmp; - - while (*node) { - parent = *node; - path = container_of(*node, struct radix_path, node); - - cmp = compare_path(root, leaf_bit, path); - if (cmp < 0) - node = &(*node)->rb_left; - else if (cmp > 0) - node = &(*node)->rb_right; - else - return path; - } - - if (ins) { - rb_link_node(&ins->node, parent, node); - rb_insert_color(&ins->node, rbroot); - return ins; - } - - return NULL; -} - /* - * Make sure radix metadata is consistent. 
+ * Make sure ref total tracking is correct after having modified a leaf + * and updated all the parent refs. */ -static void check_first_total(struct radix_path *path) +static void check_totals(struct scoutfs_block *leaf) { + struct radix_block_private *priv; + struct scoutfs_block *bl = leaf; struct scoutfs_radix_block *rdx; struct scoutfs_radix_ref *ref; int level; u64 st; u64 lt; - u32 sf; - u32 lf; int i; - for (level = 0; level < path->height; level++) { - rdx = path->bls[level]->data; - ref = path_ref(path, level); + for (level = 0; bl; level++, bl = priv->parent) { + priv = bl->priv; + rdx = bl->data; + ref = priv->ref; if (level == 0) { st = bitmap_weight((long *)rdx->bits, SCOUTFS_RADIX_BITS); lt = count_lg_bits(rdx->bits, 0, SCOUTFS_RADIX_BITS); - - sf = find_next_bit_le(rdx->bits, SCOUTFS_RADIX_BITS, 0); - lf = find_next_lg(rdx->bits, 0); } else { st = 0; lt = 0; - sf = SCOUTFS_RADIX_REFS; - lf = SCOUTFS_RADIX_REFS; for (i = 0; i < SCOUTFS_RADIX_REFS; i++) { st += le64_to_cpu(rdx->refs[i].sm_total); lt += le64_to_cpu(rdx->refs[i].lg_total); - if (rdx->refs[i].sm_total != 0 && i < sf) - sf = i; - if (rdx->refs[i].lg_total != 0 && i < lf) - lf = i; } } if (le64_to_cpu(ref->sm_total) != st || - le64_to_cpu(ref->lg_total) != lt || - le32_to_cpu(rdx->sm_first) > sf || - le32_to_cpu(rdx->lg_first) > lf) { - printk("radix inconsistency: level %u calced sf %u st %llu lf %u lt %llu, stored sf %u st %llu lf %u lt %llu\n", - level, sf, st, lf, lt, - le32_to_cpu(rdx->sm_first), + le64_to_cpu(ref->lg_total) != lt) { + printk("radix inconsistency: level %u calced st %llu lt %llu, stored st %llu lt %llu\n", + level, st, lt, le64_to_cpu(ref->sm_total), - le32_to_cpu(rdx->lg_first), le64_to_cpu(ref->lg_total)); BUG(); } + + bl = priv->parent; } } -#define set_first_nonzero_ref(rdx, ind, first, total) \ -do { \ - int _ind = min_t(u32, le32_to_cpu(rdx->first), (ind)); \ - \ - while (_ind < SCOUTFS_RADIX_REFS && rdx->refs[_ind].total == 0) \ - _ind++; \ - \ - rdx->first = cpu_to_le32(_ind); \ -} while (0) - /* - * The caller has changed bits in a leaf block and updated the block's - * first tracking. We update the first tracking and totals in parent - * blocks and refs up to the root ref. We do this after modifying - * leaves, instead of during descent, because we descend through clean - * blocks and then dirty all he blocks in all the paths before modifying - * leaves. + * The caller has changed bits in a leaf block. We update the totals in + * refs up to the root ref. 
*/ -static void fixup_parent_refs(struct radix_path *path, +static void fixup_parent_refs(struct super_block *sb, + struct scoutfs_block *leaf, s64 sm_delta, s64 lg_delta) { - struct scoutfs_radix_block *rdx; + struct radix_block_private *priv; struct scoutfs_radix_ref *ref; - int level; - int ind; + struct scoutfs_block *bl; - for (level = 0; level < path->height; level++) { - rdx = path->bls[level]->data; - ref = path_ref(path, level); + for (bl = leaf; bl; bl = priv->parent) { + priv = bl->priv; + ref = priv->ref; le64_add_cpu(&ref->sm_total, sm_delta); le64_add_cpu(&ref->lg_total, lg_delta); - if (level > 0) { - ind = path->inds[level]; - set_first_nonzero_ref(rdx, ind, sm_first, sm_total); - set_first_nonzero_ref(rdx, ind, lg_first, lg_total); - } } if (0) /* expensive, would be nice to make conditional */ - check_first_total(path); + check_totals(leaf); +} + +/* return 0 if the bit is past the last bit for the device */ +static u64 wrap_bit(struct super_block *sb, bool meta, u64 bit) +{ + return bit > last_from_super(sb, meta) ? 0 : bit; } static void store_next_find_bit(struct super_block *sb, bool meta, struct scoutfs_radix_root *root, u64 bit) { - if (bit > last_from_super(sb, meta)) - bit = 0; - root->next_find_bit = cpu_to_le64(bit); + root->next_find_bit = cpu_to_le64(wrap_bit(sb, meta, bit)); } -/* - * Allocate (clear and return) a region of bits from the leaf block of a - * path. The leaf walk has ensured that we have at least one block free. - * - * We always try to allocate smaller multi-block allocations from the - * start of the small region. This at least gets a single task extending - * a file one large extent. Multiple tasks extending writes will interleave. - * It'll do for now. - * - * We always search for free bits from the start of the leaf. - * This means that we can return recently freed blocks just behind the - * next free cursor. I'm not sure if that's much of a problem. 
- */ -static void alloc_leaf_bits(struct super_block *sb, bool meta, - struct radix_path *path, - int nbits, u64 *bit_ret, int *nbits_ret) +static void bug_on_bad_bits(int ind, int nbits) { - struct scoutfs_radix_block *rdx = path->bls[0]->data; - struct scoutfs_radix_ref *ref = path_ref(path, 0); - u32 sm_first; - u32 lg_first; + BUG_ON(ind < 0 || ind > SCOUTFS_RADIX_BITS); + BUG_ON(nbits < 0 || nbits > SCOUTFS_RADIX_BITS); + BUG_ON(ind + nbits > SCOUTFS_RADIX_BITS); +} + +static void set_leaf_bits(struct super_block *sb, struct scoutfs_block *bl, + int ind, int nbits) +{ + struct scoutfs_radix_block *rdx = bl->data; int lg_nbits; - int ind; - int end; - if (nbits >= SCOUTFS_RADIX_LG_BITS && ref->lg_total != 0) { - /* always allocate large allocs from full large regions */ - ind = le32_to_cpu(rdx->lg_first); - ind = find_next_lg(rdx->bits, ind); - sm_first = le32_to_cpu(rdx->sm_first); - lg_first = round_up(ind + nbits, SCOUTFS_RADIX_LG_BITS); + bug_on_bad_bits(ind, nbits); - } else { - /* otherwise alloc as much as we can from the next small */ - ind = le32_to_cpu(rdx->sm_first); - ind = find_next_bit_le(rdx->bits, SCOUTFS_RADIX_BITS, ind); + /* must never double-free bits */ + BUG_ON(!bitmap_empty_region_le(rdx->bits, ind, nbits)); + bitmap_set_le(rdx->bits, ind, nbits); + lg_nbits = count_lg_bits(rdx->bits, ind, nbits); - if (nbits > 1) { - end = find_next_zero_bit_le(rdx->bits, SCOUTFS_RADIX_BITS, ind); - nbits = min(nbits, end - ind); - } + fixup_parent_refs(sb, bl, nbits, lg_nbits); + trace_scoutfs_radix_set_bits(sb, bl->blkno, ind, nbits); +} - sm_first = ind + nbits; - lg_first = le32_to_cpu(rdx->lg_first); - } +static void clear_leaf_bits(struct super_block *sb, struct scoutfs_block *bl, + int ind, int nbits) +{ + struct scoutfs_radix_block *rdx = bl->data; + int lg_nbits; - /* callers and structures should have ensured success */ - BUG_ON(ind >= SCOUTFS_RADIX_BITS); + bug_on_bad_bits(ind, nbits); + /* must never alloc in-use bits */ + BUG_ON(!bitmap_full_region_le(rdx->bits, ind, nbits)); lg_nbits = count_lg_bits(rdx->bits, ind, nbits); bitmap_clear_le(rdx->bits, ind, nbits); - /* always update the first we searched through */ - rdx->sm_first = cpu_to_le32(sm_first); - rdx->lg_first = cpu_to_le32(lg_first); - fixup_parent_refs(path, -nbits, -lg_nbits); - - *bit_ret = path->leaf_bit + ind; - *nbits_ret = nbits; - - store_next_find_bit(sb, meta, path->root, path->leaf_bit + ind + nbits); -} - -/* - * Allocate a metadata blkno for the caller from the leaves of paths - * which were stored in the change for metadata allocation. 
- */ -static u64 change_alloc_meta(struct super_block *sb, struct radix_change *chg) -{ - struct scoutfs_radix_ref *ref; - struct radix_path *path; - int nbits_ret; - u64 bit; - - path = list_first_entry_or_null(&chg->alloc_paths, struct radix_path, - alloc_head); - BUG_ON(!path); /* shouldn't be possible */ - - alloc_leaf_bits(sb, true, path, 1, &bit, &nbits_ret); - - /* remove the path from the alloc list once its empty */ - ref = path_ref(path, 0); - if (ref->sm_total == 0) - list_del_init(&path->alloc_head); - - return bit; -} - -static void set_path_leaf_bits(struct super_block *sb, struct radix_path *path, - u64 bit, int nbits) -{ - struct scoutfs_radix_block *rdx; - int lg_ind; - int ind; - - BUG_ON(nbits <= 0); - BUG_ON(calc_leaf_bit(bit) != calc_leaf_bit(bit + nbits - 1)); - BUG_ON(calc_leaf_bit(bit) != path->leaf_bit); - - rdx = path->bls[0]->data; - ind = bit - path->leaf_bit; - lg_ind = round_down(ind, SCOUTFS_RADIX_LG_BITS); - - /* should have returned an error if it was set while we got paths */ - BUG_ON(!bitmap_empty_region_le(rdx->bits, ind, nbits)); - bitmap_set_le(rdx->bits, ind, nbits); - - if (ind < le32_to_cpu(rdx->sm_first)) - rdx->sm_first = cpu_to_le32(ind); - if (lg_ind < le32_to_cpu(rdx->lg_first) && - lg_is_full(rdx->bits, lg_ind)) - rdx->lg_first = cpu_to_le32(lg_ind); - fixup_parent_refs(path, nbits, count_lg_bits(rdx->bits, ind, nbits)); - - trace_scoutfs_radix_set(sb, path->root, path->bls[0]->blkno, - bit, ind, nbits); -} - -/* Find the path for the root and bit in the change and set the region */ -static void set_change_leaf_bits(struct super_block *sb, - struct radix_change *chg, - struct scoutfs_radix_root *root, - u64 bit, int nbits) -{ - struct radix_path *path; - - path = walk_paths(&chg->rbroot, root, calc_leaf_bit(bit), NULL); - BUG_ON(!path); /* should have gotten paths for all leaves to set */ - set_path_leaf_bits(sb, path, bit, nbits); + fixup_parent_refs(sb, bl, -nbits, -lg_nbits); + trace_scoutfs_radix_clear_bits(sb, bl->blkno, ind, nbits); } /* @@ -754,7 +522,6 @@ static void init_block(struct super_block *sb, struct scoutfs_radix_block *rdx, { struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; struct scoutfs_radix_ref ref; - u32 first = full ? 0 : level ? SCOUTFS_RADIX_REFS : SCOUTFS_RADIX_BITS; int tail; int i; @@ -766,8 +533,6 @@ static void init_block(struct super_block *sb, struct scoutfs_radix_block *rdx, rdx->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_RADIX); rdx->hdr.blkno = cpu_to_le64(blkno); rdx->hdr.seq = seq; - rdx->sm_first = cpu_to_le32(first); - rdx->lg_first = cpu_to_le32(first); if (level == 0) { if (full) @@ -794,76 +559,62 @@ static void init_block(struct super_block *sb, struct scoutfs_radix_block *rdx, memset((void *)rdx + SCOUTFS_BLOCK_LG_SIZE - tail, 0, tail); } -/* get path flags */ +static int find_next_change_blkno(struct super_block *sb, + struct radix_change *chg, + u64 *blkno); + enum { - GPF_NEXT_SM = (1 << 0), - GPF_NEXT_LG = (1 << 1), + GLF_NEXT_SM = (1 << 0), + GLF_NEXT_LG = (1 << 1), + GLF_DIRTY = (1 << 2), }; + /* - * Give the caller an allocated path that holds references to the blocks - * traversed to the leaf of the given root. + * Get the caller their block for walking down the radix. We can have + * to populate synthetic blocks, read existing blocks, and cow new dirty + * copies of either of those for callers who need to modify. We update + * references and record the blocks and references in the change for + * callers to further build atomic changes with. 
*/ -static int get_path(struct super_block *sb, struct scoutfs_radix_root *root, - struct radix_change *chg, int gpf, u64 bit, - struct radix_path **path_ret) +static int get_radix_block(struct super_block *sb, + struct scoutfs_radix_allocator *alloc, + struct scoutfs_block_writer *wri, + struct radix_change *chg, + struct scoutfs_radix_root *root, int glf, + struct scoutfs_block *parent, + struct scoutfs_radix_ref *ref, int level, + struct scoutfs_block **bl_ret) { - struct scoutfs_radix_block *rdx; - struct scoutfs_radix_ref *ref; - struct radix_path *path = NULL; - struct scoutfs_block *bl; + struct radix_block_private *priv = NULL; bool saw_inconsistent = false; + struct scoutfs_radix_block *rdx; + struct scoutfs_block *bl = NULL; + struct scoutfs_block *dirty; + bool put_block = true; u64 blkno; u64 synth; - int level; - int ind; int ret; - int i; - /* can't operate outside radix until we support growing devices */ - if (WARN_ON_ONCE(root->height < height_from_last(bit)) || - WARN_ON_ONCE((gpf & GPF_NEXT_SM) && (gpf & GPF_NEXT_LG))) - return -EINVAL; - - path = alloc_path(root); - if (!path) { - ret = -ENOMEM; - goto out; - } - - /* switch to searching for small bits if no large found */ - if ((gpf & GPF_NEXT_LG) && le64_to_cpu(root->ref.lg_total) == 0) - gpf ^= GPF_NEXT_LG | GPF_NEXT_SM; - - calc_level_inds(path, bit); - - for (level = root->height - 1; level >= 0; level--) { - ref = path_ref(path, level); - - blkno = le64_to_cpu(ref->blkno); - if (blkno == U64_MAX || blkno == 0) { - synth = chg->next_synth++; - if ((blkno & 1) != (synth & 1)) - synth = chg->next_synth++; - /* careful not to go too high or wrap */ - if (synth == U64_MAX || synth < RADIX_SYNTH_BLKNO) { - scoutfs_inc_counter(sb, radix_enospc_synth); - ret = -ENOSPC; - goto out; - } - bl = scoutfs_block_create(sb, synth); - if (!IS_ERR_OR_NULL(bl)) { - init_block(sb, bl->data, synth, ref->seq, level, - blkno == U64_MAX); - ref->blkno = cpu_to_le64(bl->blkno); - - } - } else { - bl = scoutfs_block_read(sb, blkno); - } - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); + /* create a synthetic block or read an existing block */ + blkno = le64_to_cpu(ref->blkno); + if (is_stub(blkno)) { + synth = chg->next_synth++; + /* don't create synth mistaken for all-full */ + if (synth == U64_MAX) { + scoutfs_inc_counter(sb, radix_enospc_synth); + ret = -ENOSPC; goto out; } + bl = scoutfs_block_create(sb, synth); + if (!IS_ERR_OR_NULL(bl)) { + init_block(sb, bl->data, synth, ref->seq, level, + blkno == U64_MAX); + scoutfs_inc_counter(sb, radix_create_synth); + } + } else { + bl = scoutfs_block_read(sb, blkno); + if (!IS_ERR_OR_NULL(bl)) + scoutfs_inc_counter(sb, radix_block_read); /* * We can have a stale block in the cache but the tree @@ -872,46 +623,153 @@ static int get_path(struct super_block *sb, struct scoutfs_radix_root *root, * consistent block after reading from the device then * we've found corruption. 
*/ - if (!scoutfs_block_consistent_ref(sb, bl, ref->seq, ref->blkno, + while (!IS_ERR(bl) && + !scoutfs_block_consistent_ref(sb, bl, ref->seq, + ref->blkno, SCOUTFS_BLOCK_MAGIC_RADIX)) { + scoutfs_inc_counter(sb, radix_inconsistent_ref); + scoutfs_block_writer_forget(sb, wri, bl); + scoutfs_block_invalidate(sb, bl); + BUG_ON(bl->priv != NULL); + scoutfs_block_put(sb, bl); + bl = NULL; if (!saw_inconsistent) { - scoutfs_block_invalidate(sb, bl); - scoutfs_block_put(sb, bl); saw_inconsistent = true; - level++; - continue; + bl = scoutfs_block_read(sb, blkno); + } else { + bl = ERR_PTR(-EIO); + scoutfs_inc_counter(sb, radix_inconsistent_eio); } - ret = -EIO; - goto out; } saw_inconsistent = false; + } + if (IS_ERR(bl)) { + ret = PTR_ERR(bl); + goto out; + } + + if ((glf & GLF_DIRTY) && !scoutfs_block_writer_is_dirty(sb, bl)) { + /* make a cow copy for the caller that needs a dirty block */ + ret = find_next_change_blkno(sb, chg, &blkno); + if (ret < 0) + goto out; + + dirty = scoutfs_block_create(sb, blkno); + if (IS_ERR(dirty)) { + ret = PTR_ERR(dirty); + goto out; + } + + memcpy(dirty->data, bl->data, SCOUTFS_BLOCK_LG_SIZE); + scoutfs_block_put(sb, bl); + bl = dirty; + scoutfs_inc_counter(sb, radix_block_cow); + } + + priv = bl->priv; + if (!priv) { + priv = kzalloc(sizeof(struct radix_block_private), GFP_NOFS); + if (!priv) { + ret = -ENOMEM; + goto out; + } + + bl->priv = priv; + priv->bl = bl; + INIT_LIST_HEAD(&priv->dirtied_entry); + priv->parent = parent; + priv->ref = ref; + priv->orig_ref = *ref; + /* put at head so for_each restores refs in reverse */ + list_add(&priv->entry, &chg->blocks); + /* priv holds bl get, put as change is completed */ + put_block = false; + } + + if ((glf & GLF_DIRTY) && !scoutfs_block_writer_is_dirty(sb, bl)) { + scoutfs_block_writer_mark_dirty(sb, wri, bl); + list_add(&priv->dirtied_entry, &chg->dirtied_blocks); + } + + trace_scoutfs_radix_get_block(sb, root, glf, level, + parent ? 
parent->blkno : 0, + le64_to_cpu(ref->blkno), bl->blkno); + + /* update refs to new synth or dirty blocks */ + if (le64_to_cpu(ref->blkno) != bl->blkno) { + rdx = bl->data; + rdx->hdr.blkno = cpu_to_le64(bl->blkno); + prandom_bytes(&rdx->hdr.seq, sizeof(rdx->hdr.seq)); + ref->blkno = rdx->hdr.blkno; + ref->seq = rdx->hdr.seq; + } + + ret = 0; +out: + if (put_block) + scoutfs_block_put(sb, bl); + if (ret < 0) + bl = NULL; + + *bl_ret = bl; + return ret; +} + +static int get_leaf_walk(struct super_block *sb, + struct scoutfs_radix_allocator *alloc, + struct scoutfs_block_writer *wri, + struct radix_change *chg, + struct scoutfs_radix_root *root, + int glf, u64 bit, u64 *leaf_bit_ret, + struct scoutfs_block **bl_ret) +{ + struct scoutfs_radix_block *rdx; + struct scoutfs_radix_ref *ref; + struct scoutfs_block *parent = NULL; + struct scoutfs_block *bl; + u32 level_inds[RADIX_MAX_HEIGHT]; + int level; + int ind = 0; + int ret; + int i; + + /* can't operate outside radix until we support growing devices */ + if (WARN_ON_ONCE(root->height < height_from_last(bit)) || + WARN_ON_ONCE(root->height > RADIX_MAX_HEIGHT) || + WARN_ON_ONCE((glf & GLF_NEXT_SM) && (glf & GLF_NEXT_LG))) + return -EINVAL; + + calc_level_inds(level_inds, root->height, bit); + ref = &root->ref; + + for (level = root->height - 1; level >= 0; level--) { + ret = get_radix_block(sb, alloc, wri, chg, root, glf, parent, + ref, level, &bl); + if (ret) + goto out; + + trace_scoutfs_radix_walk(sb, root, glf, level, bl->blkno, ind, + bit); - path->bls[level] = bl; if (level == 0) { - /* path's leaf_bit is first in the leaf block */ - path->inds[0] = 0; + /* returned leaf_bit is first in the leaf block */ + level_inds[0] = 0; break; } rdx = bl->data; - ind = path->inds[level]; + ind = level_inds[level]; - /* search for a path to a leaf with a set large region */ - while ((gpf & GPF_NEXT_LG) && ind < SCOUTFS_RADIX_REFS && + /* search for a ref to a child with a set large region */ + while ((glf & GLF_NEXT_LG) && ind < SCOUTFS_RADIX_REFS && le64_to_cpu(rdx->refs[ind].lg_total) == 0) { - if (ind < le32_to_cpu(rdx->lg_first)) - ind = le32_to_cpu(rdx->lg_first); - else - ind++; + ind++; } - /* search for a path to a leaf with a any bits set */ - while ((gpf & GPF_NEXT_SM) && ind < SCOUTFS_RADIX_REFS && + /* search for a ref to a child with any bits set */ + while ((glf & GLF_NEXT_SM) && ind < SCOUTFS_RADIX_REFS && le64_to_cpu(rdx->refs[ind].sm_total) == 0) { - if (ind < le32_to_cpu(rdx->sm_first)) - ind = le32_to_cpu(rdx->sm_first); - else - ind++; + ind++; } /* @@ -928,224 +786,332 @@ static int get_path(struct super_block *sb, struct scoutfs_radix_root *root, ret = -ENOENT; goto out; } - path->inds[level + 1]++; + level_inds[level + 1]++; for (i = level; i >= 0; i--) - path->inds[i] = 0; - for (i = level; i <= level + 1; i++) { - scoutfs_block_put(sb, path->bls[i]); - path->bls[i] = NULL; - } + level_inds[i] = 0; level += 2; continue; } /* reset all lower indices if we searched */ - if (ind != path->inds[level]) { + if (ind != level_inds[level]) { for (i = level - 1; i >= 0; i--) - path->inds[i] = 0; - path->inds[level] = ind; + level_inds[i] = 0; + level_inds[level] = ind; + } + + parent = bl; + ref = &rdx->refs[ind]; + } + + *leaf_bit_ret = bit_from_inds(level_inds, root->height); + ret = 0; + scoutfs_inc_counter(sb, radix_walk); +out: + if (ret < 0) + *bl_ret = NULL; + else + *bl_ret = bl; + return ret; +} + +/* + * Get the caller their leaf block in which they'll set or clear bits. 
+ * If they're asking for a dirty block then the leaf walk might dirty + * blocks. For each newly dirtied block we also make sure we have dirty + * blocks for the leaves that contain the bits for each newly dirtied + * block's old blkno and new blkno. + */ +static int get_leaf(struct super_block *sb, + struct scoutfs_radix_allocator *alloc, + struct scoutfs_block_writer *wri, struct radix_change *chg, + struct scoutfs_radix_root *root, int glf, u64 bit, + u64 *leaf_bit_ret, struct scoutfs_block **bl_ret) +{ + struct radix_block_private *priv; + struct scoutfs_block *bl; + u64 leaf_bit; + u64 old_blkno; + int ret; + + ret = get_leaf_walk(sb, alloc, wri, chg, root, glf, bit, leaf_bit_ret, + bl_ret); + if (ret < 0 || !(glf & GLF_DIRTY)) + goto out; + + /* walk to leaves containing bits of newly dirtied block's blknos */ + while ((priv = list_first_entry_or_null(&chg->dirtied_blocks, + struct radix_block_private, + dirtied_entry))) { + /* done when we see tail blocks with their blkno_bl set */ + if (priv->blkno_bl != NULL) + break; + + old_blkno = le64_to_cpu(priv->orig_ref.blkno); + if (!is_stub(old_blkno) && !is_synth(old_blkno)) { + ret = get_leaf_walk(sb, alloc, wri, chg, &alloc->freed, + GLF_DIRTY, old_blkno, &leaf_bit, + &bl); + if (ret < 0) + break; + priv->old_blkno_ind = old_blkno - leaf_bit; + priv->old_blkno_bl = bl; + } + + ret = get_leaf_walk(sb, alloc, wri, chg, &alloc->avail, + GLF_DIRTY, priv->bl->blkno, &leaf_bit, + &bl); + if (ret < 0) + break; + + priv->blkno_ind = priv->bl->blkno - leaf_bit; + priv->blkno_bl = bl; + + list_move_tail(&priv->dirtied_entry, &chg->dirtied_blocks); + } +out: + return ret; +} + +/* + * Find the next region of set bits of the given size starting from the + * given bit. This only finds the bits, it doesn't change anything. We + * always try to return regions past the starting bit. We can search to + * a leaf that has bits that are all past the starting bit and we'll + * retry. This will wrap around to the start of the tree and fall back + * to satisfying large regions with small regions. + */ +static int find_next_set_bits(struct super_block *sb, struct radix_change *chg, + struct scoutfs_radix_root *root, bool meta, + u64 start, int nbits, u64 *bit_ret, + int *nbits_ret, struct scoutfs_block **bl_ret) +{ + struct scoutfs_radix_block *rdx; + struct scoutfs_block *bl; + u64 leaf_bit; + u64 bit; + int end; + int ind; + int glf; + int ret; + + bit = start; + glf = nbits > 1 ? 
GLF_NEXT_LG : GLF_NEXT_SM; +retry: + ret = get_leaf(sb, NULL, NULL, chg, root, glf, bit, &leaf_bit, &bl); + if (ret == -ENOENT) { + if (bit != 0) { + bit = 0; + goto retry; + } + + /* switch to searching for small bits if no large found */ + if (glf == GLF_NEXT_LG) { + glf = GLF_NEXT_SM; + bit = start; + goto retry; + } + ret = -ENOSPC; + goto out; + } + rdx = bl->data; + + /* start from search bit if it's in the leaf, otherwise 0 */ + if (leaf_bit < bit && ((bit - leaf_bit) < SCOUTFS_RADIX_BITS)) + ind = bit - leaf_bit; + else + ind = 0; + + /* large allocs are always aligned from large regions */ + if (nbits >= SCOUTFS_RADIX_LG_BITS && (glf == GLF_NEXT_LG)) { + ind = find_next_lg(rdx->bits, ind); + if (ind == SCOUTFS_RADIX_BITS) { + bit = wrap_bit(sb, meta, leaf_bit + SCOUTFS_RADIX_BITS); + goto retry; + } + nbits = SCOUTFS_RADIX_LG_BITS; + ret = 0; + goto out; + } + + /* otherwise use as much of the next set region as we can */ + ind = find_next_bit_le(rdx->bits, SCOUTFS_RADIX_BITS, ind); + if (ind == SCOUTFS_RADIX_BITS) { + bit = wrap_bit(sb, meta, leaf_bit + SCOUTFS_RADIX_BITS); + goto retry; + } + + if (nbits > 1) { + end = find_next_zero_bit_le(rdx->bits, min_t(int, ind + nbits, + SCOUTFS_RADIX_BITS), ind); + nbits = end - ind; + } + ret = 0; + +out: + *bit_ret = leaf_bit + ind; + *nbits_ret = nbits; + if (bl_ret) + *bl_ret = bl; + + return ret; +} + +static void prepare_change(struct radix_change *chg, + struct scoutfs_radix_root *avail) +{ + memset(chg, 0, sizeof(struct radix_change)); + chg->avail = avail; + INIT_LIST_HEAD(&chg->blocks); + INIT_LIST_HEAD(&chg->dirtied_blocks); + chg->next_synth = RADIX_SYNTH_BLKNO; + chg->next_find_bit = le64_to_cpu(avail->next_find_bit); +} + +/* + * We successfully got all the dirty block references we need to make + * the change. Set their old blkno's freed bits and clear all their new + * dirty blkno's avail bits. We drop the blocks from the dirtied_blocks + * list here as we go so we won't attempt to do this all over again + * as we complete the change. + */ +static void apply_change_bits(struct super_block *sb, struct radix_change *chg) +{ + struct radix_block_private *priv; + struct scoutfs_block *bl; + + /* first update the contents of the blocks */ + list_for_each_entry(priv, &chg->blocks, entry) { + bl = priv->bl; + + /* complete cow allocations for dirtied blocks */ + if (was_dirtied(priv)) { + /* can't try to write to synth blknos */ + BUG_ON(is_synth(bl->blkno)); + + clear_leaf_bits(sb, priv->blkno_bl, priv->blkno_ind, 1); + if (priv->old_blkno_bl) { + set_leaf_bits(sb, priv->old_blkno_bl, + priv->old_blkno_ind, 1); + } + scoutfs_inc_counter(sb, radix_complete_dirty_block); + + list_del_init(&priv->dirtied_entry); + } + } +} + +/* + * Drop all references to the blocks that we held as we worked with the + * radix blocks. + * + * If the operation failed then we drop the blocks we dirtied during + * this change and restore their refs. Nothing can update a ref to a + * dirty block so these will always be current. + * + * We always drop synthetic blocks. They could have been cowed so they might + * not be currently referenced. Blocks are added to the head of the + * blocks list as they're first used so we're undoing ref changes in + * reverse order. This means that the error case will always first + * unwind synthetic cows then the synthetic source block itself. 
+ */ +static void complete_change(struct super_block *sb, + struct scoutfs_block_writer *wri, + struct radix_change *chg, int err) +{ + struct radix_block_private *priv; + struct radix_block_private *tmp; + struct scoutfs_block *bl; + + /* only complete once for each call to prepare */ + if (!chg->avail) + return; + + /* finish dirty block frees and allocs on success */ + if (err == 0 && !list_empty(&chg->dirtied_blocks)) + apply_change_bits(sb, chg); + + /* replace refs and remove blocks from the cache */ + list_for_each_entry(priv, &chg->blocks, entry) { + bl = priv->bl; + + if (is_synth(bl->blkno) || (err < 0 && was_dirtied(priv))) { + if (le64_to_cpu(priv->ref->blkno) == bl->blkno) { + *priv->ref = priv->orig_ref; + scoutfs_inc_counter(sb, radix_undo_ref); + } + scoutfs_block_writer_forget(sb, wri, bl); + scoutfs_block_invalidate(sb, bl); } } - path->leaf_bit = bit_from_inds(path); + /* finally put all blocks now that we're done with contents */ + list_for_each_entry_safe(priv, tmp, &chg->blocks, entry) { + bl = priv->bl; + + bl->priv = NULL; + scoutfs_block_put(sb, bl); + list_del(&priv->entry); + kfree(priv); + } + + if (err == 0) + store_next_find_bit(sb, true, chg->avail, chg->next_find_bit); + chg->avail = NULL; +} + +/* + * Find the next free metadata blkno from the metadata allocator that + * the change is tracking. This is used to find the next free blkno for + * the next cowed block without modifying the allocator. Because it's + * not modifying the allocator it can wrap and find the same block + * twice, we watch for that. + */ +static int find_next_change_blkno(struct super_block *sb, + struct radix_change *chg, u64 *blkno) +{ + struct scoutfs_radix_block *rdx; + u64 bit; + int nbits; + int ret; + + if (chg->free_bl == NULL) { + ret = find_next_set_bits(sb, chg, chg->avail, true, + chg->next_find_bit, 1, &bit, &nbits, + &chg->free_bl); + if (ret < 0) + goto out; + chg->free_leaf_bit = calc_leaf_bit(bit); + chg->free_ind = bit - chg->free_leaf_bit; + } + + bit = chg->free_leaf_bit + chg->free_ind; + if (chg->first_free == 0) { + chg->first_free = bit; + } else if (chg->first_free == bit) { + ret = -ENOSPC; + goto out; + } + + *blkno = bit; + + rdx = chg->free_bl->data; + chg->free_ind = find_next_bit_le(rdx->bits, SCOUTFS_RADIX_BITS, + chg->free_ind + 1); + if (chg->free_ind >= SCOUTFS_RADIX_BITS) { + chg->free_ind = SCOUTFS_RADIX_BITS; + chg->free_bl = NULL; + } + chg->next_find_bit = wrap_bit(sb, true, + chg->free_leaf_bit + chg->free_ind); + ret = 0; out: - if (ret < 0) { - free_path(sb, path); - path = NULL; - } - - *path_ret = path; + if (ret == -ENOSPC) + scoutfs_inc_counter(sb, radix_enospc_meta); return ret; } -/* - * Get all the paths we're going to need to dirty all the blocks in all - * the paths in the change. The caller has added their path to the leaf - * that they want to change to start the process off. - * - * For every clean block in paths we can have to set a bit in a leaf to - * free the old blkno and clear a bit in a leaf to allocate a new dirty - * blkno. We keep checking new paths for clean blocks until eventually - * all the paths only contain blocks whose blknos are in leaves that we - * already have paths to. 
- */ -static int get_all_paths(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, - struct radix_change *chg) -{ - struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct scoutfs_radix_block *rdx; - struct scoutfs_radix_ref *ref; - struct scoutfs_block *bl; - struct radix_path *path; - struct radix_path *adding; - struct radix_path *found; - bool meta_wrapped; - bool stable; - u64 start_meta; - u64 next_meta; - u64 last_meta; - u64 leaf_bit; - int ind; - int ret; - int i; - - start_meta = calc_leaf_bit(le64_to_cpu(alloc->avail.next_find_bit)); - next_meta = start_meta; - last_meta = le64_to_cpu(super->last_meta_blkno); - meta_wrapped = false; - - do { - stable = true; - - /* get paths to leaves to allocate dirty blknos from */ - if (chg->alloc_bits < chg->block_allocs + chg->caller_allocs) { - stable = false; - - /* we're not modifying as we go, check for wrapping */ - if (next_meta >= start_meta && meta_wrapped) { - scoutfs_inc_counter(sb, radix_enospc_paths); - ret = -ENOSPC; - break; - } - - ret = get_path(sb, &alloc->avail, chg, GPF_NEXT_SM, - next_meta, &adding); - if (ret < 0) { - if (ret == -ENOENT) { - meta_wrapped = true; - next_meta = 0; - continue; - } - break; - } - - next_meta = adding->leaf_bit + SCOUTFS_RADIX_BITS; - if (next_meta > last_meta) { - meta_wrapped = true; - next_meta = 0; - } - - /* might already have path, maybe add it to alloc */ - found = walk_paths(&chg->rbroot, adding->root, - adding->leaf_bit, adding); - if (found != adding) { - free_path(sb, adding); - adding = found; - } else { - list_add_tail(&adding->head, &chg->new_paths); - } - if (list_empty(&adding->alloc_head)) { - ref = path_ref(adding, 0); - chg->alloc_bits += le64_to_cpu(ref->sm_total); - list_add_tail(&adding->alloc_head, - &chg->alloc_paths); - } - } - - if ((path = list_first_entry_or_null(&chg->new_paths, - struct radix_path, - head))) { - list_move_tail(&path->head, &chg->paths); - stable = false; - - /* check all the blocks in all new paths */ - for (i = path->height - 1; i >= 0; i--) { - bl = path->bls[i]; - - /* dirty are done, only visit each block once */ - if (scoutfs_block_writer_is_dirty(sb, bl) || - scoutfs_block_tas_visited(sb, bl)) - continue; - - /* record the number of allocs we'll need */ - chg->block_allocs++; - - /* don't need to free synth blknos */ - if (bl->blkno >= RADIX_SYNTH_BLKNO) - continue; - - /* see if we already a path to this leaf */ - leaf_bit = calc_leaf_bit(bl->blkno); - if (walk_paths(&chg->rbroot, &alloc->freed, - leaf_bit, NULL)) - continue; - - /* get a new path to freed leaf to set */ - ret = get_path(sb, &alloc->freed, chg, 0, - bl->blkno, &adding); - if (ret < 0) - break; - - rdx = adding->bls[0]->data; - ind = bl->blkno - adding->leaf_bit; - if (test_bit_le(ind, rdx->bits)) { - /* XXX corruption, bit already set? */ - ret = -EIO; - break; - } - - walk_paths(&chg->rbroot, adding->root, - adding->leaf_bit, adding); - list_add_tail(&adding->head, &chg->new_paths); - } - } - - ret = 0; - } while (!stable); - - return ret; -} - -/* - * We have pinned blocks in paths to all the leaves that we need to - * modify to make a change to radix trees. Walk through the paths - * moving blocks to their new allocated blknos, freeing the old stable - * blknos. 
- */ -static void dirty_all_path_blocks(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, - struct scoutfs_block_writer *wri, - struct radix_change *chg) -{ - struct scoutfs_radix_block *rdx; - struct scoutfs_radix_ref *ref; - struct scoutfs_block *bl; - struct radix_path *path; - u64 blkno; - int level; - - BUG_ON(!list_empty(&chg->new_paths)); - - list_for_each_entry(path, &chg->paths, head) { - - for (level = path->height - 1; level >= 0; level--) { - bl = path->bls[level]; - - if (scoutfs_block_writer_is_dirty(sb, bl)) - continue; - - if (bl->blkno < RADIX_SYNTH_BLKNO) - set_change_leaf_bits(sb, chg, &alloc->freed, - bl->blkno, 1); - - blkno = change_alloc_meta(sb, chg); - scoutfs_block_clear_visited(sb, bl); - scoutfs_block_move(sb, wri, bl, blkno); - scoutfs_block_writer_mark_dirty(sb, wri, bl); - - rdx = bl->data; - rdx->hdr.blkno = cpu_to_le64(bl->blkno); - prandom_bytes(&rdx->hdr.seq, sizeof(rdx->hdr.seq)); - - ref = path_ref(path, level); - ref->blkno = rdx->hdr.blkno; - ref->seq = rdx->hdr.seq; - } - } -} - static bool valid_free_bit_range(struct super_block *sb, bool meta, u64 bit, int nbits) { @@ -1166,9 +1132,9 @@ static int radix_free(struct super_block *sb, struct scoutfs_radix_root *root, bool meta, u64 bit, int nbits) { - struct scoutfs_radix_block *rdx; - struct radix_change *chg; - struct radix_path *path; + struct scoutfs_block *bl; + DECLARE_RADIX_CHANGE(chg); + u64 leaf_bit; int ind; int ret; @@ -1178,36 +1144,19 @@ static int radix_free(struct super_block *sb, return -EINVAL; mutex_lock(&alloc->mutex); + prepare_change(&chg, &alloc->avail); - chg = alloc_change(); - if (!chg) { - ret = -ENOMEM; - goto out; - } - - ret = get_path(sb, root, chg, 0, bit, &path); - if (ret < 0) - goto out; - list_add_tail(&path->head, &chg->new_paths); - - ind = bit - path->leaf_bit; - rdx = path->bls[0]->data; - if (!bitmap_empty_region_le(rdx->bits, ind, nbits)) { - /* XXX corruption, trying to free set bits */ - ret = -EIO; - goto out; - } - - ret = get_all_paths(sb, alloc, chg); + ret = get_leaf(sb, alloc, wri, &chg, root, GLF_DIRTY, bit, + &leaf_bit, &bl); if (ret < 0) goto out; - dirty_all_path_blocks(sb, alloc, wri, chg); - set_path_leaf_bits(sb, path, bit, nbits); - ret = 0; + ind = bit - leaf_bit; + set_leaf_bits(sb, bl, ind, nbits); out: - free_change(sb, chg); + complete_change(sb, wri, &chg, ret); mutex_unlock(&alloc->mutex); + return ret; } @@ -1219,27 +1168,33 @@ int scoutfs_radix_alloc(struct super_block *sb, struct scoutfs_radix_allocator *alloc, struct scoutfs_block_writer *wri, u64 *blkno) { - struct radix_change *chg; + struct scoutfs_block *bl; + DECLARE_RADIX_CHANGE(chg); + u64 leaf_bit; + u64 bit; + int ind; int ret; + scoutfs_inc_counter(sb, radix_alloc); + mutex_lock(&alloc->mutex); + prepare_change(&chg, &alloc->avail); - chg = alloc_change(); - if (!chg) { - ret = -ENOMEM; - goto out; - } - - chg->caller_allocs = 1; - ret = get_all_paths(sb, alloc, chg); + ret = find_next_change_blkno(sb, &chg, &bit); if (ret < 0) goto out; - dirty_all_path_blocks(sb, alloc, wri, chg); - *blkno = change_alloc_meta(sb, chg); + ret = get_leaf(sb, alloc, wri, &chg, &alloc->avail, GLF_DIRTY, bit, + &leaf_bit, &bl); + if (ret < 0) + goto out; + + ind = bit - leaf_bit; + clear_leaf_bits(sb, bl, ind, 1); + *blkno = bit; ret = 0; out: - free_change(sb, chg); + complete_change(sb, wri, &chg, ret); mutex_unlock(&alloc->mutex); return ret; @@ -1257,13 +1212,16 @@ int scoutfs_radix_alloc_data(struct super_block *sb, struct scoutfs_radix_root *root, int count, u64 *blkno_ret, 
int *count_ret) { - struct radix_change *chg; - struct radix_path *path; + struct scoutfs_block *bl; + DECLARE_RADIX_CHANGE(chg); + u64 leaf_bit; u64 bit; int nbits; - int gpf; + int ind; int ret; + scoutfs_inc_counter(sb, radix_alloc_data); + *blkno_ret = 0; *count_ret = 0; @@ -1271,41 +1229,32 @@ int scoutfs_radix_alloc_data(struct super_block *sb, return -EINVAL; nbits = min(count, SCOUTFS_RADIX_LG_BITS); - gpf = nbits > 1 ? GPF_NEXT_LG : GPF_NEXT_SM; mutex_lock(&alloc->mutex); + prepare_change(&chg, &alloc->avail); - chg = alloc_change(); - if (!chg) { - ret = -ENOMEM; - goto out; - } - -find_next: - bit = le64_to_cpu(root->next_find_bit); - ret = get_path(sb, root, chg, gpf, bit, &path); - if (ret) { - if (ret == -ENOENT) { - if (root->next_find_bit != 0) { - root->next_find_bit = 0; - goto find_next; - } + ret = find_next_set_bits(sb, &chg, root, false, + le64_to_cpu(root->next_find_bit), nbits, + &bit, &nbits, NULL); + if (ret < 0) { + if (ret == -ENOSPC) scoutfs_inc_counter(sb, radix_enospc_data); - ret = -ENOSPC; - } goto out; } - list_add_tail(&path->head, &chg->new_paths); - ret = get_all_paths(sb, alloc, chg); + ret = get_leaf(sb, alloc, wri, &chg, root, GLF_DIRTY, bit, + &leaf_bit, &bl); if (ret < 0) goto out; - dirty_all_path_blocks(sb, alloc, wri, chg); - alloc_leaf_bits(sb, false, path, nbits, blkno_ret, count_ret); + ind = bit - leaf_bit; + clear_leaf_bits(sb, bl, ind, nbits); + *blkno_ret = bit; + *count_ret = nbits; + store_next_find_bit(sb, false, root, bit + nbits); ret = 0; out: - free_change(sb, chg); + complete_change(sb, wri, &chg, ret); mutex_unlock(&alloc->mutex); return ret; @@ -1319,6 +1268,7 @@ int scoutfs_radix_free(struct super_block *sb, struct scoutfs_radix_allocator *alloc, struct scoutfs_block_writer *wri, u64 blkno) { + scoutfs_inc_counter(sb, radix_free); return radix_free(sb, alloc, wri, &alloc->freed, true, blkno, 1); } @@ -1332,6 +1282,7 @@ int scoutfs_radix_free_data(struct super_block *sb, struct scoutfs_radix_root *root, u64 blkno, int count) { + scoutfs_inc_counter(sb, radix_free_data); return radix_free(sb, alloc, wri, root, false, blkno, count); } @@ -1353,9 +1304,9 @@ int scoutfs_radix_free_data(struct super_block *sb, * read the old blocks. * * We can also be called with a src tree that is the current allocator - * avail tree. In this case dirtying the blocks in all the paths can - * consume bits in the source tree. We notice when dirtying allocation - * empties the src block and we retry finding a new leaf to merge. + * avail tree. In this case dirtying the leaf blocks can consume bits + * in the source tree. We notice when dirtying the src block and we + * retry finding a new leaf to merge. * * The caller specifies the minimum count to move. 
-ENOENT will be * returned if the source tree runs out of bits, potentially after @@ -1377,20 +1328,21 @@ int scoutfs_radix_merge(struct super_block *sb, struct scoutfs_radix_block *inp_rdx; struct scoutfs_radix_block *src_rdx; struct scoutfs_radix_block *dst_rdx; - struct radix_change *chg = NULL; - struct radix_path *inp_path = NULL; - struct radix_path *src_path; - struct radix_path *dst_path; + struct scoutfs_block *inp_bl; + struct scoutfs_block *src_bl; + struct scoutfs_block *dst_bl; + DECLARE_RADIX_CHANGE(chg); s64 src_lg_delta; s64 dst_lg_delta; + u64 leaf_bit; u64 bit; int merge_size; int merged; - int inp_sm; - int lg_ind; int ind; int ret; + scoutfs_inc_counter(sb, radix_merge); + mutex_lock(&alloc->mutex); /* can't try to free too much when inp is read-only */ @@ -1402,15 +1354,11 @@ int scoutfs_radix_merge(struct super_block *sb, while (count > 0) { - chg = alloc_change(); - if (!chg) { - ret = -ENOMEM; - goto out; - } - + prepare_change(&chg, &alloc->avail); bit = le64_to_cpu(src->next_find_bit); wrapped: - ret = get_path(sb, inp, chg, GPF_NEXT_SM, bit, &inp_path); + ret = get_leaf(sb, NULL, NULL, &chg, inp, GLF_NEXT_SM, bit, + &leaf_bit, &inp_bl); if (ret < 0) { if (ret == -ENOENT) { if (bit != 0) { @@ -1422,43 +1370,28 @@ wrapped: } goto out; } - /* unique input is not modified, not stored in the change */ - bit = inp_path->leaf_bit; + bit = leaf_bit; + inp_rdx = inp_bl->data; - ret = get_path(sb, src, chg, 0, bit, &src_path); + ret = get_leaf(sb, alloc, wri, &chg, src, GLF_DIRTY, bit, + &leaf_bit, &src_bl); if (ret < 0) goto out; - list_add_tail(&src_path->head, &chg->new_paths); + src_rdx = src_bl->data; - ret = get_path(sb, dst, chg, 0, bit, &dst_path); + ret = get_leaf(sb, alloc, wri, &chg, dst, GLF_DIRTY, bit, + &leaf_bit, &dst_bl); if (ret < 0) goto out; - list_add_tail(&dst_path->head, &chg->new_paths); + dst_rdx = dst_bl->data; - ret = get_all_paths(sb, alloc, chg); - if (ret < 0) - goto out; + apply_change_bits(sb, &chg); - /* this can modify src/dst when they're alloc trees */ - dirty_all_path_blocks(sb, alloc, wri, chg); - - inp_rdx = inp_path->bls[0]->data; - src_rdx = src_path->bls[0]->data; - dst_rdx = dst_path->bls[0]->data; - - inp_sm = le64_to_cpu(path_ref(inp_path, 0)->sm_total); - ind = find_next_bit_le(inp_rdx->bits, SCOUTFS_RADIX_BITS, - le32_to_cpu(inp_rdx->sm_first)); - lg_ind = round_down(ind, SCOUTFS_RADIX_LG_BITS); - - /* back out and retry if no input left, or inp not ro */ - if (inp_sm == 0 || - (inp != src && paths_share_blocks(inp_path, src_path))) { - scoutfs_inc_counter(sb, radix_merge_retry); - free_path(sb, inp_path); - inp_path = NULL; - free_change(sb, chg); - chg = NULL; + /* change allocs could have cleared all of inp if its avail */ + ind = find_next_bit_le(inp_rdx->bits, SCOUTFS_RADIX_BITS, 0); + if (ind == SCOUTFS_RADIX_BITS) { + scoutfs_inc_counter(sb, radix_merge_empty); + complete_change(sb, wri, &chg, -EAGAIN); continue; } @@ -1481,8 +1414,7 @@ wrapped: /* carefully modify src last, it might also be inp */ merged = bitmap_xor_bitmap_le(dst_rdx->bits, inp_rdx->bits, - ind, min_t(u64, inp_sm, count), - &merge_size); + ind, count, &merge_size); dst_lg_delta = count_lg_from_set(dst_rdx->bits, inp_rdx->bits, ind, merge_size); @@ -1491,25 +1423,15 @@ wrapped: bitmap_xor_bitmap_le(src_rdx->bits, inp_rdx->bits, ind, merged, NULL); - if (ind < le32_to_cpu(dst_rdx->sm_first)) - dst_rdx->sm_first = cpu_to_le32(ind); - /* first doesn't have to be precise, search will cleanup */ - if (lg_ind < le32_to_cpu(dst_rdx->lg_first)) - 
dst_rdx->lg_first = cpu_to_le32(lg_ind); + fixup_parent_refs(sb, src_bl, -merged, -src_lg_delta); + fixup_parent_refs(sb, dst_bl, merged, dst_lg_delta); - fixup_parent_refs(src_path, -merged, -src_lg_delta); - fixup_parent_refs(dst_path, merged, dst_lg_delta); + trace_scoutfs_radix_merge(sb, inp, inp_bl->blkno, src, + src_bl->blkno, dst, dst_bl->blkno, + count, bit, ind, merged, + src_lg_delta, dst_lg_delta); - trace_scoutfs_radix_merge(sb, inp, inp_path->bls[0]->blkno, - src, src_path->bls[0]->blkno, - dst, dst_path->bls[0]->blkno, count, - bit, ind, merged, src_lg_delta, - dst_lg_delta); - - free_path(sb, inp_path); - inp_path = NULL; - free_change(sb, chg); - chg = NULL; + complete_change(sb, wri, &chg, 0); store_next_find_bit(sb, meta, src, bit + SCOUTFS_RADIX_BITS); count -= min_t(u64, count, merged); @@ -1517,8 +1439,7 @@ wrapped: ret = 0; out: - free_path(sb, inp_path); - free_change(sb, chg); + complete_change(sb, wri, &chg, ret); mutex_unlock(&alloc->mutex); return ret; diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 0fad61cb..05c1427b 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2226,123 +2226,87 @@ DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink, TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) ); -TRACE_EVENT(scoutfs_radix_dirty, +TRACE_EVENT(scoutfs_radix_get_block, TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root, - u64 orig_blkno, u64 dirty_blkno, u64 par_blkno), - TP_ARGS(sb, root, orig_blkno, dirty_blkno, par_blkno), + int glf, int level, u64 par_blkno, u64 ref_blkno, u64 blkno), + TP_ARGS(sb, root, glf, level, par_blkno, ref_blkno, blkno), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(__u64, root_blkno) - __field(__u64, orig_blkno) - __field(__u64, dirty_blkno) + __field(int, glf) + __field(int, level) __field(__u64, par_blkno) + __field(__u64, ref_blkno) + __field(__u64, blkno) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); __entry->root_blkno = le64_to_cpu(root->ref.blkno); - __entry->orig_blkno = orig_blkno; - __entry->dirty_blkno = dirty_blkno; + __entry->glf = glf; + __entry->level = level; __entry->par_blkno = par_blkno; + __entry->ref_blkno = ref_blkno; + __entry->blkno = blkno; ), - TP_printk(SCSBF" root_blkno %llu orig_blkno %llu dirty_blkno %llu par_blkno %llu", - SCSB_TRACE_ARGS, __entry->root_blkno, __entry->orig_blkno, - __entry->dirty_blkno, __entry->par_blkno) + TP_printk(SCSBF" root_blkno %llu glf 0x%x level %u par_blkno %llu ref_blkno %llu blkno %llu", + SCSB_TRACE_ARGS, __entry->root_blkno, __entry->glf, + __entry->level, __entry->par_blkno, __entry->ref_blkno, + __entry->blkno) ); TRACE_EVENT(scoutfs_radix_walk, TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root, - int grl, int level, u64 blkno, int ind, u64 bit, u64 next), - TP_ARGS(sb, root, grl, level, blkno, ind, bit, next), + int glf, int level, u64 blkno, int ind, u64 bit), + TP_ARGS(sb, root, glf, level, blkno, ind, bit), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(__u64, root_blkno) - __field(unsigned int, grl) + __field(unsigned int, glf) __field(__u64, blkno) __field(int, level) __field(int, ind) __field(__u64, bit) - __field(__u64, next) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); __entry->root_blkno = le64_to_cpu(root->ref.blkno); - __entry->grl = grl; + __entry->glf = glf; __entry->blkno = blkno; __entry->level = level; __entry->ind = ind; __entry->bit = bit; - __entry->next = next; ), - TP_printk(SCSBF" root_blkno %llu grl 0x%x blkno %llu level %d ind %d bit %llu next %llu", - SCSB_TRACE_ARGS, 
__entry->root_blkno, __entry->grl, - __entry->blkno, __entry->level, __entry->ind, __entry->bit, - __entry->next) -); - -TRACE_EVENT(scoutfs_radix_fixup_refs, - TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root, - u32 sm_first, u64 sm_total, u16 lg_first, u64 lg_total, - u64 blkno, int level), - TP_ARGS(sb, root, sm_first, sm_total, lg_first, lg_total, blkno, level), - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - __field(__u64, root_blkno) - __field(__u32, sm_first) - __field(__u64, sm_total) - __field(__u16, lg_first) - __field(__u64, lg_total) - __field(__u64, blkno) - __field(int, level) - ), - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - __entry->root_blkno = le64_to_cpu(root->ref.blkno); - __entry->sm_first = sm_first; - __entry->sm_total = sm_total; - __entry->lg_first = lg_first; - __entry->lg_total = lg_total; - __entry->blkno = blkno; - __entry->level = level; - ), - TP_printk(SCSBF" root_blkno %llu sm_first %u sm_total %llu lg_first %u lg_total %llu blkno %llu level %u", - SCSB_TRACE_ARGS, __entry->root_blkno, __entry->sm_first, - __entry->sm_total, __entry->lg_first, __entry->lg_total, - __entry->blkno, __entry->level) + TP_printk(SCSBF" root_blkno %llu glf 0x%x blkno %llu level %d par_ind %d bit %llu", + SCSB_TRACE_ARGS, __entry->root_blkno, __entry->glf, + __entry->blkno, __entry->level, __entry->ind, __entry->bit) ); DECLARE_EVENT_CLASS(scoutfs_radix_bitop, - TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root, - u64 blkno, u64 bit, int ind, int nbits), - TP_ARGS(sb, root, blkno, bit, ind, nbits), + TP_PROTO(struct super_block *sb, u64 blkno, int ind, int nbits), + TP_ARGS(sb, blkno, ind, nbits), TP_STRUCT__entry( SCSB_TRACE_FIELDS - __field(__u64, root_blkno) __field(__u64, blkno) - __field(__u64, bit) __field(int, ind) __field(int, nbits) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); - __entry->root_blkno = le64_to_cpu(root->ref.blkno); __entry->blkno = blkno; - __entry->bit = bit; __entry->ind = ind; __entry->nbits = nbits; ), - TP_printk(SCSBF" root_blkno %llu blkno %llu bit %llu ind %d nbits %d", - SCSB_TRACE_ARGS, __entry->root_blkno, __entry->blkno, - __entry->bit, __entry->ind, __entry->nbits) + TP_printk(SCSBF" blkno %llu ind %d nbits %d", + SCSB_TRACE_ARGS, __entry->blkno, __entry->ind, + __entry->nbits) ); -DEFINE_EVENT(scoutfs_radix_bitop, scoutfs_radix_clear, - TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root, - u64 blkno, u64 bit, int ind, int nbits), - TP_ARGS(sb, root, blkno, bit, ind, nbits) +DEFINE_EVENT(scoutfs_radix_bitop, scoutfs_radix_clear_bits, + TP_PROTO(struct super_block *sb, u64 blkno, int ind, int nbits), + TP_ARGS(sb, blkno, ind, nbits) ); -DEFINE_EVENT(scoutfs_radix_bitop, scoutfs_radix_set, - TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root, - u64 blkno, u64 bit, int ind, int nbits), - TP_ARGS(sb, root, blkno, bit, ind, nbits) +DEFINE_EVENT(scoutfs_radix_bitop, scoutfs_radix_set_bits, + TP_PROTO(struct super_block *sb, u64 blkno, int ind, int nbits), + TP_ARGS(sb, blkno, ind, nbits) ); TRACE_EVENT(scoutfs_radix_merge,