diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c index 92276395..d556112e 100644 --- a/kmod/src/alloc.c +++ b/kmod/src/alloc.c @@ -1272,9 +1272,15 @@ int scoutfs_alloc_foreach(struct super_block *sb, struct scoutfs_block_ref refs[2] = {{0,}}; struct scoutfs_super_block *super = NULL; struct scoutfs_srch_compact *sc; + struct scoutfs_log_merge_request *lmreq; + struct scoutfs_log_merge_complete *lmcomp; struct scoutfs_log_trees lt; SCOUTFS_BTREE_ITEM_REF(iref); struct scoutfs_key key; + int expected; + u64 avail_tot; + u64 freed_tot; + u64 id; int ret; super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS); @@ -1381,6 +1387,57 @@ retry: scoutfs_key_inc(&key); } + /* log merge allocators */ + memset(&key, 0, sizeof(key)); + key.sk_zone = SCOUTFS_LOG_MERGE_REQUEST_ZONE; + expected = sizeof(*lmreq); + id = 0; + avail_tot = 0; + freed_tot = 0; + + for (;;) { + ret = scoutfs_btree_next(sb, &super->log_merge, &key, &iref); + if (ret == 0) { + if (iref.key->sk_zone != key.sk_zone) { + ret = -ENOENT; + } else if (iref.val_len == expected) { + key = *iref.key; + if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) { + lmreq = iref.val; + id = le64_to_cpu(lmreq->rid); + avail_tot = le64_to_cpu(lmreq->meta_avail.total_nr); + freed_tot = le64_to_cpu(lmreq->meta_freed.total_nr); + } else { + lmcomp = iref.val; + id = le64_to_cpu(lmcomp->rid); + avail_tot = le64_to_cpu(lmcomp->meta_avail.total_nr); + freed_tot = le64_to_cpu(lmcomp->meta_freed.total_nr); + } + } else { + ret = -EIO; + } + scoutfs_btree_put_iref(&iref); + } + if (ret == -ENOENT) { + if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) { + memset(&key, 0, sizeof(key)); + key.sk_zone = SCOUTFS_LOG_MERGE_COMPLETE_ZONE; + expected = sizeof(*lmcomp); + continue; + } + break; + } + if (ret < 0) + goto out; + + ret = cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, true, avail_tot) ?: + cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, false, freed_tot); + if (ret < 0) + goto out; + + scoutfs_key_inc(&key); + } + ret = 0; out: if (ret == -ESTALE) { diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h index 1e245c5f..9130d086 100644 --- a/kmod/src/alloc.h +++ b/kmod/src/alloc.h @@ -55,6 +55,16 @@ #define SCOUTFS_SERVER_DATA_FILL_LO \ (1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT) +/* + * Log merge meta allocations are only used for one request and will + * never use more than the dirty limit. + */ +#define SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT (64ULL * 1024 * 1024) +/* a few extra blocks for alloc blocks */ +#define SCOUTFS_SERVER_MERGE_FILL_TARGET \ + ((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4) +#define SCOUTFS_SERVER_MERGE_FILL_LO SCOUTFS_SERVER_MERGE_FILL_TARGET + /* * Each of the server meta_alloc roots will try to keep a minimum amount * of free blocks. 
The server will swap roots when its current avail diff --git a/kmod/src/btree.c b/kmod/src/btree.c index b9b02696..46989385 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -83,6 +83,10 @@ enum btree_walk_flags { BTW_ALLOC = (1 << 3), /* allocate a new block for 0 ref, requires dirty */ BTW_INSERT = (1 << 4), /* walking to insert, try splitting */ BTW_DELETE = (1 << 5), /* walking to delete, try joining */ + BTW_PAR_RNG = (1 << 6), /* return range through final parent */ + BTW_GET_PAR = (1 << 7), /* get reference to final parent */ + BTW_SET_PAR = (1 << 8), /* override reference to final parent */ + BTW_SUBTREE = (1 << 9), /* root is parent subtree, return -ERANGE if split/join */ }; /* total length of the value payload */ @@ -104,16 +108,22 @@ static inline unsigned int item_bytes(struct scoutfs_btree_item *item) } /* - * Join blocks when they both are 1/4 full. This puts some distance - * between the join threshold and the full threshold for splitting. - * Blocks that just split or joined need to undergo a reasonable amount - * of item modification before they'll split or join again. + * Refill blocks from their siblings when they're under 1/4 full. This + * puts some distance between the join threshold and the full threshold + * for splitting. Blocks that just split or joined need to undergo a + * reasonable amount of item modification before they'll split or join + * again. */ static unsigned int join_low_watermark(void) { return (SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) / 4; } +static bool total_above_join_low_water(struct scoutfs_btree_block *bt) +{ + return le16_to_cpu(bt->total_item_bytes) >= join_low_watermark(); +} + /* * return the integer percentages of total space the block could have * consumed by items that is currently consumed. 
@@ -512,6 +522,7 @@ static void create_item(struct scoutfs_btree_block *bt, item->val_off = insert_value(bt, ptr_off(bt, item), val, val_len); item->val_len = cpu_to_le16(val_len); + memset(item->__pad, 0, sizeof(item->__pad)); le16_add_cpu(&bt->total_item_bytes, item_bytes(item)); } @@ -805,12 +816,13 @@ static int try_join(struct super_block *sb, struct scoutfs_btree_block *sib; struct scoutfs_block *sib_bl; struct scoutfs_block_ref *ref; + const unsigned int lwm = join_low_watermark(); unsigned int sib_tot; bool move_right; int to_move; int ret; - if (le16_to_cpu(bt->total_item_bytes) >= join_low_watermark()) + if (total_above_join_low_water(bt)) return 0; scoutfs_inc_counter(sb, btree_join); @@ -830,18 +842,23 @@ static int try_join(struct super_block *sb, return ret; sib = sib_bl->data; - sib_tot = le16_to_cpu(bt->total_item_bytes); - if (sib_tot < join_low_watermark()) + /* combine if resulting block would be up to 75% full, move big chunk otherwise */ + sib_tot = le16_to_cpu(sib->total_item_bytes); + if (sib_tot <= lwm * 2) to_move = sib_tot; else - to_move = sib_tot - join_low_watermark(); + to_move = lwm; - if (le16_to_cpu(bt->mid_free_len) < to_move) { + /* compact to make room for over-estimate of worst case move overrun */ + if (le16_to_cpu(bt->mid_free_len) < + (to_move + item_len_bytes(SCOUTFS_BTREE_MAX_VAL_LEN))) { ret = compact_values(sb, bt); - if (ret < 0) + if (ret < 0) { scoutfs_block_put(sb, sib_bl); - return ret; + return ret; + } } + move_items(bt, sib, move_right, to_move); /* update our parent's item */ @@ -904,20 +921,21 @@ static bool bad_avl_node_off(__le16 node_off, int nr) * - call after leaf modification * - padding is zero */ -static void verify_btree_block(struct super_block *sb, +__attribute__((unused)) +static void verify_btree_block(struct super_block *sb, char *str, struct scoutfs_btree_block *bt, int level, - struct scoutfs_key *start, + bool last_ref, struct scoutfs_key *start, struct scoutfs_key *end) { __le16 *buckets = leaf_item_hash_buckets(bt); struct scoutfs_btree_item *item; + struct scoutfs_avl_node *node; char *reason = NULL; int first_val = 0; int hashed = 0; int end_off; int tot = 0; int i = 0; - int j = 0; int nr; if (bt->level != level) { @@ -956,8 +974,9 @@ static void verify_btree_block(struct super_block *sb, goto out; } - for (j = 0; j < sizeof(item->__pad); j++) { - WARN_ON_ONCE(item->__pad[j] != 0); + if (memchr_inv(item->__pad, '\0', sizeof(item->__pad))) { + reason = "item struct __pad isn't zero"; + goto out; } if (scoutfs_key_compare(&item->key, start) < 0 || @@ -972,19 +991,29 @@ static void verify_btree_block(struct super_block *sb, goto out; } + if (level > 0 && le16_to_cpu(item->val_len) != + sizeof(struct scoutfs_block_ref)) { + reason = "parent item val not sizeof ref"; + goto out; + } + if (le16_to_cpu(item->val_len) > SCOUTFS_BTREE_MAX_VAL_LEN) { reason = "bad item val len"; goto out; } + if (le16_to_cpu(item->val_off) % SCOUTFS_BTREE_VALUE_ALIGN) { + reason = "item value not aligned"; + goto out; + } + if (((int)le16_to_cpu(item->val_off) + le16_to_cpu(item->val_len)) > end_off) { reason = "item value outside valid"; goto out; } - tot += sizeof(struct scoutfs_btree_item) + - le16_to_cpu(item->val_len); + tot += item_len_bytes(le16_to_cpu(item->val_len)); if (item->val_len != 0) { first_val = min_t(int, first_val, @@ -992,6 +1021,15 @@ static void verify_btree_block(struct super_block *sb, } } + if (last_ref && level > 0 && + (node = scoutfs_avl_last(&bt->item_root)) != NULL) { + item = node_item(node); + if 
(scoutfs_key_compare(&item->key, end) != 0) { + reason = "final ref item key not range end"; + goto out; + } + } + for (i = 0; level == 0 && i < SCOUTFS_BTREE_LEAF_ITEM_HASH_NR; i++) { if (buckets[i] == 0) continue; @@ -1024,17 +1062,18 @@ out: if (!reason) return; - printk("found btree block inconsistency: %s\n", reason); - printk("start "SK_FMT" end "SK_FMT"\n", SK_ARG(start), SK_ARG(end)); + printk("verifying btree %s: %s\n", str, reason); + printk("args: level %u last_ref %u start "SK_FMT" end "SK_FMT"\n", + level, last_ref, SK_ARG(start), SK_ARG(end)); printk("calced: i %u tot %u hashed %u fv %u\n", i, tot, hashed, first_val); - printk("hdr: crc %x magic %x fsid %llx seq %llx blkno %llu\n", + printk("bt hdr: crc %x magic %x fsid %llx seq %llx blkno %llu\n", le32_to_cpu(bt->hdr.crc), le32_to_cpu(bt->hdr.magic), le64_to_cpu(bt->hdr.fsid), le64_to_cpu(bt->hdr.seq), le64_to_cpu(bt->hdr.blkno)); printk("item_root: node %u\n", le16_to_cpu(bt->item_root.node)); - printk("nr %u tib %u mfl %u lvl %u\n", + printk("bt: nr %u tib %u mfl %u lvl %u\n", le16_to_cpu(bt->nr_items), le16_to_cpu(bt->total_item_bytes), le16_to_cpu(bt->mid_free_len), bt->level); @@ -1051,6 +1090,92 @@ out: BUG(); } +/* + * Walk from the root to the leaf, verifying the blocks traversed. + */ +__attribute__((unused)) +static void verify_btree_walk(struct super_block *sb, char *str, + struct scoutfs_btree_root *root, + struct scoutfs_key *key) +{ + struct scoutfs_avl_node *next_node; + struct scoutfs_avl_node *node; + struct scoutfs_btree_item *item; + struct scoutfs_btree_item *prev; + struct scoutfs_block *bl = NULL; + struct scoutfs_btree_block *bt; + struct scoutfs_block_ref ref; + struct scoutfs_key start; + struct scoutfs_key end; + bool last_ref; + int level; + int ret; + + if (root->height == 0 && root->ref.blkno != 0) { + WARN_ONCE(1, "invalid btree root height %u blkno %llu seq %016llx\n", + root->height, le64_to_cpu(root->ref.blkno), + le64_to_cpu(root->ref.seq)); + return; + } + + if (root->height == 0) + return; + + scoutfs_key_set_zeros(&start); + scoutfs_key_set_ones(&end); + level = root->height; + ref = root->ref; + /* first parent last ref isn't all ones in subtrees */ + last_ref = false; + + while(level-- > 0) { + scoutfs_block_put(sb, bl); + bl = NULL; + ret = get_ref_block(sb, NULL, NULL, 0, &ref, &bl); + if (ret) { + printk("verifying btree %s: read error %d\n", + str, ret); + break; + } + bt = bl->data; + + verify_btree_block(sb, str, bt, level, last_ref, &start, &end); + + if (level == 0) + break; + + node = scoutfs_avl_search(&bt->item_root, cmp_key_item, key, + NULL, NULL, &next_node, NULL); + item = node_item(node ?: next_node); + + if (item == NULL) { + printk("verifying btree %s: no ref item\n", str); + printk("root: height %u blkno %llu seq %016llx\n", + root->height, le64_to_cpu(root->ref.blkno), + le64_to_cpu(root->ref.seq)); + printk("walk level %u start "SK_FMT" end "SK_FMT"\n", + level, SK_ARG(&start), SK_ARG(&end)); + + printk("block: level %u blkno %llu seq %016llx\n", + bt->level, le64_to_cpu(bt->hdr.blkno), + le64_to_cpu(bt->hdr.seq)); + printk("key: "SK_FMT"\n", SK_ARG(key)); + BUG(); + } + + if ((prev = prev_item(bt, item))) { + start = *item_key(prev); + scoutfs_key_inc(&start); + } + end = *item_key(item); + + memcpy(&ref, item_val(bt, item), sizeof(ref)); + last_ref = !next_item(bt, item); + } + + scoutfs_block_put(sb, bl); +} + struct btree_walk_key_range { struct scoutfs_key start; struct scoutfs_key end; @@ -1082,7 +1207,8 @@ static int btree_walk(struct super_block *sb, int 
flags, struct scoutfs_key *key, unsigned int val_len, struct scoutfs_block **bl_ret, - struct btree_walk_key_range *kr) + struct btree_walk_key_range *kr, + struct scoutfs_btree_root *par_root) { struct scoutfs_block *par_bl = NULL; struct scoutfs_block *bl = NULL; @@ -1098,7 +1224,9 @@ static int btree_walk(struct super_block *sb, unsigned int nr; int ret; - if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri))) + if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri)) || + WARN_ON_ONCE((flags & BTW_PAR_RNG) && !kr) || + WARN_ON_ONCE((flags & (BTW_GET_PAR|BTW_SET_PAR)) && !par_root)) return -EINVAL; /* all ops come through walk and walk calls all reads */ @@ -1125,7 +1253,14 @@ restart: ret = 0; if (!root->height) { - if (!(flags & BTW_INSERT)) { + if (flags & BTW_GET_PAR) { + memset(par_root, 0, sizeof(*par_root)); + *root = *par_root; + ret = 0; + } else if (flags & BTW_SET_PAR) { + *root = *par_root; + ret = 0; + } else if (!(flags & BTW_INSERT)) { ret = -ENOENT; } else { ret = get_ref_block(sb, alloc, wri, BTW_ALLOC | BTW_DIRTY, &root->ref, &bl); @@ -1144,14 +1279,40 @@ restart: trace_scoutfs_btree_walk(sb, root, key, flags, level, ref); + /* par range set by ref to last parent block */ + if (level < 2 && (flags & BTW_PAR_RNG)) { + ret = 0; + break; + } + + if (level < 2 && (flags & BTW_GET_PAR)) { + par_root->ref = *ref; + par_root->height = level + 1; + ret = 0; + break; + } + + if (level < 2 && (flags & BTW_SET_PAR)) { + if (ref == &root->ref) { + /* single parent block is replaced, can shrink/grow */ + *root = *par_root; + } else { + /* subtree replacing one of parents must match height */ + if (par_root->height != level + 1) { + ret = -EINVAL; + break; + } + *ref = par_root->ref; + } + ret = 0; + break; + } + ret = get_ref_block(sb, alloc, wri, flags, ref, &bl); if (ret) break; bt = bl->data; - if (0 && kr) - verify_btree_block(sb, bt, level, &kr->start, &kr->end); - /* XXX more aggressive block verification, before ref updates? */ if (bt->level != level) { scoutfs_corruption(sb, SC_BTREE_BLOCK_LEVEL, @@ -1167,6 +1328,17 @@ restart: break; } + /* + * join/split won't check the subtree parent root; let the + * caller know when it needs to be split or joined. 
+ */ + if ((flags & BTW_SUBTREE) && level == 1 && + (!total_above_join_low_water(bt) || + !mid_free_item_room(bt, sizeof(struct scoutfs_block_ref)))) { + ret = -ERANGE; + break; + } + /* * Splitting and joining can add or remove parents or * change the parent item we use to reach the child @@ -1292,7 +1464,7 @@ int scoutfs_btree_lookup(struct super_block *sb, if (WARN_ON_ONCE(iref->key)) return -EINVAL; - ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL); + ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1344,7 +1516,7 @@ int scoutfs_btree_insert(struct super_block *sb, return -EINVAL; ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key, - val_len, &bl, NULL); + val_len, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1406,7 +1578,7 @@ int scoutfs_btree_update(struct super_block *sb, return -EINVAL; ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key, - val_len, &bl, NULL); + val_len, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1448,7 +1620,7 @@ int scoutfs_btree_force(struct super_block *sb, return -EINVAL; ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key, - val_len, &bl, NULL); + val_len, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1486,7 +1658,7 @@ int scoutfs_btree_delete(struct super_block *sb, scoutfs_inc_counter(sb, btree_delete); ret = btree_walk(sb, alloc, wri, root, BTW_DELETE | BTW_DIRTY, key, - 0, &bl, NULL); + 0, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1550,7 +1722,7 @@ static int btree_iter(struct super_block *sb,struct scoutfs_btree_root *root, for (;;) { ret = btree_walk(sb, NULL, NULL, root, flags, &walk_key, - 0, &bl, &kr); + 0, &bl, &kr, NULL); if (ret < 0) break; bt = bl->data; @@ -1623,7 +1795,8 @@ int scoutfs_btree_dirty(struct super_block *sb, scoutfs_inc_counter(sb, btree_dirty); - ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl, NULL); + ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl, + NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1659,7 +1832,7 @@ int scoutfs_btree_read_items(struct super_block *sb, struct scoutfs_block *bl; int ret; - ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, &kr); + ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, &kr, NULL); if (ret < 0) goto out; bt = bl->data; @@ -1714,7 +1887,7 @@ int scoutfs_btree_insert_list(struct super_block *sb, while (lst) { ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, - &lst->key, lst->val_len, &bl, &kr); + &lst->key, lst->val_len, &bl, &kr, NULL); if (ret < 0) goto out; bt = bl->data; @@ -1742,3 +1915,542 @@ int scoutfs_btree_insert_list(struct super_block *sb, out: return ret; } + +/* + * Descend towards the leaf that would contain the key. As we arrive at + * the last parent block, set start and end to the range of keys that + * could be found through traversal of that last parent. + * + * If the tree is too short for parent blocks then the max key range + * is returned. 
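+ * + * A hedged usage sketch, not part of this patch -- the root and key + * pointers are assumed to come from the caller's context and error + * handling is elided: + * + * struct scoutfs_key start; + * struct scoutfs_key end; + * ret = scoutfs_btree_parent_range(sb, root, key, &start, &end); + * + * On success every key reachable through the final parent block falls + * within [start, end], so a caller can carve up work along parent + * block boundaries.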
+ */ +int scoutfs_btree_parent_range(struct super_block *sb, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_key *start, + struct scoutfs_key *end) +{ + struct btree_walk_key_range kr; + int ret; + + ret = btree_walk(sb, NULL, NULL, root, BTW_PAR_RNG, key, 0, NULL, + &kr, NULL); + if (ret == -ENOENT) + ret = 0; + + *start = kr.start; + *end = kr.end; + return ret; +} + +/* + * Initialize the caller's root as a subtree whose ref points to the + * last parent found as we traverse towards the leaf containing the key. + * If the tree is too small to have multiple blocks at the final parent + * level then the caller's root will be initialized to equal the full + * input root. If the tree is empty then the par root will also be empty. + */ +int scoutfs_btree_get_parent(struct super_block *sb, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_btree_root *par_root) +{ + return btree_walk(sb, NULL, NULL, root, BTW_GET_PAR, key, 0, NULL, + NULL, par_root); +} + +/* + * Dirty a path towards the leaf block containing the key. As we reach + * the reference to the final parent block override it with the ref in + * the caller's root. If the tree only has a single block at the final + * parent level, or a single leaf block, then the entire tree is + * replaced with the caller's root. + * + * This manages allocs and frees while dirtying blocks in the path to + * the ref, but it doesn't account for allocating the blocks that are + * referenced by the ref nor freeing blocks referenced by the old ref + * that's overwritten. Keeping allocators in sync with the result of + * the ref override is the responsibility of the caller. + */ +int scoutfs_btree_set_parent(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_btree_root *par_root) +{ + + trace_scoutfs_btree_set_parent(sb, root, key, par_root); + + return btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_SET_PAR, + key, 0, NULL, NULL, par_root); +} + +/* + * Descend to the leaf, making sure that all the blocks conform to the + * balance constraints. Blocks below the low threshold will be joined. + * This is called to split blocks that were too large for insertions, + * but those insertions were in a distant context and we don't bother + * communicating the val_len back here. We just try to insert a max + * value. + * + * This always dirties all the way to the leaf. It could be made more + * efficient with more btree walk flags to walk and check for blocks + * that need balancing, and then walks that don't dirty unless they need + * to join/split. + */ +int scoutfs_btree_rebalance(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_btree_root *root, + struct scoutfs_key *key) +{ + return btree_walk(sb, alloc, wri, root, + BTW_DIRTY | BTW_INSERT | BTW_DELETE, + key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL); +} + +struct merge_pos { + struct rb_node node; + struct scoutfs_btree_root *root; + struct scoutfs_key key; + unsigned int val_len; + u8 val[SCOUTFS_BTREE_MAX_VAL_LEN]; +}; + +/* + * Find the next item in the mpos's root after its key and make sure + * that it's in its sorted position in the rbtree. We're responsible + * for freeing the mpos if we don't put it back in the pos_root. This + * happens naturally when its item_root has no more items to + * merge. 
+ */ +static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, + struct merge_pos *mpos, struct scoutfs_key *end, + scoutfs_btree_merge_cmp_t merge_cmp) +{ + SCOUTFS_BTREE_ITEM_REF(iref); + struct merge_pos *walk; + struct rb_node *parent; + struct rb_node **node; + int key_cmp; + int val_cmp; + int ret; + +restart: + if (!RB_EMPTY_NODE(&mpos->node)) { + rb_erase(&mpos->node, pos_root); + RB_CLEAR_NODE(&mpos->node); + } + + /* find the next item in the root within end */ + ret = scoutfs_btree_next(sb, mpos->root, &mpos->key, &iref); + if (ret == 0) { + if (scoutfs_key_compare(iref.key, end) > 0) { + ret = -ENOENT; + } else { + mpos->key = *iref.key; + mpos->val_len = iref.val_len; + memcpy(mpos->val, iref.val, iref.val_len); + } + scoutfs_btree_put_iref(&iref); + } + if (ret < 0) { + kfree(mpos); + if (ret == -ENOENT) + ret = 0; + goto out; + } + +rewalk: + /* sort merge items by key then oldest to newest */ + node = &pos_root->rb_node; + parent = NULL; + while (*node) { + parent = *node; + walk = container_of(*node, struct merge_pos, node); + + key_cmp = scoutfs_key_compare(&mpos->key, &walk->key); + val_cmp = merge_cmp(mpos->val, mpos->val_len, + walk->val, walk->val_len); + + /* drop old versions of logged keys as we discover them */ + if (key_cmp == 0) { + scoutfs_inc_counter(sb, btree_merge_drop_old); + if (val_cmp < 0) { + scoutfs_key_inc(&mpos->key); + goto restart; + } else { + BUG_ON(val_cmp == 0); + rb_erase(&walk->node, pos_root); + kfree(walk); + goto rewalk; + } + } + + if ((key_cmp ?: val_cmp) < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + + rb_link_node(&mpos->node, parent, node); + rb_insert_color(&mpos->node, pos_root); + ret = 0; +out: + return ret; +} + +static struct merge_pos *first_mpos(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + if (node) + return container_of(node, struct merge_pos, node); + return NULL; +} + +/* + * Merge items from a number of read-only input roots into a writable + * destination root. The order of the input roots doesn't matter; the + * items are merged in sorted key order. + * + * The merge_cmp callback determines the order that the input items are + * merged in. The merge_is_del callback determines if a merging item + * should be removed from the destination. + * + * subtree indicates that the destination root is in fact one of many + * parent blocks and shouldn't be split or allowed to fall below the + * join low water mark. + * + * drop_val indicates the initial length of the value that should be + * dropped when merging items into destination items. + * + * -ERANGE is returned if the merge doesn't fully exhaust the range, due + * to allocators running low or needing to join/split the parent. + * *next_ret is set to the next key which hasn't been merged so that the + * caller can retry with a new allocator and subtree. 
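+ * + * A hedged retry sketch, illustrative only -- writing the dirty blocks + * and refilling the allocator between passes is the caller's job and + * is elided: + * + * struct scoutfs_key next; + * ret = scoutfs_btree_merge(sb, alloc, wri, &start, &end, &next, + * &root, &inputs, merge_cmp, merge_is_del, + * true, drop_val, dirty_limit, alloc_low); + * if (ret == -ERANGE) + * start = next; + * + * After -ERANGE the caller commits what was dirtied and calls again + * starting from next.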
+ */ +int scoutfs_btree_merge(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_key *start, + struct scoutfs_key *end, + struct scoutfs_key *next_ret, + struct scoutfs_btree_root *root, + struct list_head *inputs, + scoutfs_btree_merge_cmp_t merge_cmp, + scoutfs_btree_merge_is_del_t merge_is_del, bool subtree, + int drop_val, int dirty_limit, int alloc_low) +{ + struct scoutfs_btree_root_head *rhead; + struct rb_root pos_root = RB_ROOT; + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_block *bl = NULL; + struct btree_walk_key_range kr; + struct scoutfs_avl_node *par; + struct merge_pos *mpos; + struct merge_pos *tmp; + int walk_val_len; + int walk_flags; + bool is_del; + int cmp; + int ret; + + trace_scoutfs_btree_merge(sb, root, start, end); + scoutfs_inc_counter(sb, btree_merge); + + list_for_each_entry(rhead, inputs, head) { + mpos = kmalloc(sizeof(*mpos), GFP_NOFS); + if (!mpos) { + ret = -ENOMEM; + goto out; + } + + RB_CLEAR_NODE(&mpos->node); + mpos->key = *start; + mpos->root = &rhead->root; + + ret = reset_mpos(sb, &pos_root, mpos, end, merge_cmp); + if (ret < 0) + goto out; + } + + walk_flags = BTW_DIRTY; + if (subtree) + walk_flags |= BTW_SUBTREE; + walk_val_len = 0; + + while ((mpos = first_mpos(&pos_root))) { + + if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) { + scoutfs_inc_counter(sb, btree_merge_dirty_limit); + ret = -ERANGE; + *next_ret = mpos->key; + goto out; + } + + if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) { + scoutfs_inc_counter(sb, btree_merge_alloc_low); + ret = -ERANGE; + *next_ret = mpos->key; + goto out; + } + + scoutfs_block_put(sb, bl); + bl = NULL; + ret = btree_walk(sb, alloc, wri, root, walk_flags, + &mpos->key, walk_val_len, &bl, &kr, NULL); + if (ret < 0) { + if (ret == -ERANGE) + *next_ret = mpos->key; + goto out; + } + bt = bl->data; + scoutfs_inc_counter(sb, btree_merge_walk); + + for (; mpos; mpos = first_mpos(&pos_root)) { + + /* val must have at least what we need to drop */ + if (mpos->val_len < drop_val) { + ret = -EIO; + goto out; + } + + /* walk to new leaf if we exceed parent ref key */ + if (scoutfs_key_compare(&mpos->key, &kr.end) > 0) + break; + + /* see if there's an existing item */ + item = leaf_item_hash_search(sb, bt, &mpos->key); + is_del = merge_is_del(mpos->val, mpos->val_len); + + trace_scoutfs_btree_merge_items(sb, mpos->root, + &mpos->key, mpos->val_len, + item ? root : NULL, + item ? item_key(item) : NULL, + item ? 
item_val_len(item) : 0, is_del); + + /* rewalk and split if ins/update needs room */ + if (!is_del && !mid_free_item_room(bt, mpos->val_len)) { + walk_flags |= BTW_INSERT; + walk_val_len = mpos->val_len; + break; + } + + /* insert missing non-deletion merge items */ + if (!item && !is_del) { + scoutfs_avl_search(&bt->item_root, + cmp_key_item, &mpos->key, + &cmp, &par, NULL, NULL); + create_item(bt, &mpos->key, + mpos->val + drop_val, + mpos->val_len - drop_val, par, cmp); + scoutfs_inc_counter(sb, btree_merge_insert); + } + + /* update existing items */ + if (item && !is_del) { + update_item_value(bt, item, + mpos->val + drop_val, + mpos->val_len - drop_val); + scoutfs_inc_counter(sb, btree_merge_update); + } + + /* delete if merge item was deletion */ + if (item && is_del) { + /* rewalk and join if non-root falls under low water mark */ + if (root->ref.blkno != bt->hdr.blkno && + !total_above_join_low_water(bt)) { + walk_flags |= BTW_DELETE; + break; + } + delete_item(bt, item, NULL); + scoutfs_inc_counter(sb, btree_merge_delete); + } + + /* reset walk args now that we didn't need to split/join */ + walk_flags &= ~(BTW_INSERT | BTW_DELETE); + walk_val_len = 0; + + /* finished with this merge item */ + scoutfs_key_inc(&mpos->key); + ret = reset_mpos(sb, &pos_root, mpos, end, merge_cmp); + if (ret < 0) + goto out; + mpos = NULL; + } + } + + ret = 0; +out: + scoutfs_block_put(sb, bl); + rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) { + kfree(mpos); + } + + return ret; +} + +/* + * Free all the blocks referenced by a btree. The btree is only read; + * this does not update the blocks as it frees. The caller ensures that + * these btrees aren't being modified. + * + * The caller's key tracks which blocks have been freed. It must be + * initialized to zeros before the first call to start freeing blocks. + * Once a block is freed the key is updated such that the freed block + * will not be read again. + * + * Returns 0 when progress has been made successfully, which includes + * partial progress. The key is set to all ones once we've freed all + * the blocks. + * + * This works by descending to the last parent block and freeing all its + * leaf blocks without reading them. As it descends it remembers the + * number of parent blocks which were traversed through their final + * child ref. If we free all the leaf blocks then all these parent + * blocks are no longer needed and can be freed. The caller's key is + * updated past the subtree that we just freed, and we retry the + * descent from the root through the next set of parents to the next set + * of leaf blocks to free. 
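+ * + * A hedged caller loop, illustrative only -- committing dirty blocks + * and refilling the allocator between calls is elided: + * + * struct scoutfs_key key; + * scoutfs_key_set_zeros(&key); + * while (!scoutfs_key_is_ones(&key)) { + * ret = scoutfs_btree_free_blocks(sb, alloc, wri, &key, + * &root, alloc_low); + * if (ret < 0) + * break; + * }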
+ */ +int scoutfs_btree_free_blocks(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_key *key, + struct scoutfs_btree_root *root, int alloc_low) +{ + u64 blknos[SCOUTFS_BTREE_MAX_HEIGHT]; + struct scoutfs_block *bl = NULL; + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_block_ref ref; + struct scoutfs_avl_node *node; + struct scoutfs_avl_node *next; + struct scoutfs_key par_next; + int nr_par; + int level; + int ret; + int i; + + if (WARN_ON_ONCE(root->height > ARRAY_SIZE(blknos))) + return -EIO; /* XXX corruption */ + + if (root->height == 0) { + scoutfs_key_set_ones(key); + return 0; + } + + if (scoutfs_key_is_ones(key)) + return 0; + + /* just free a single leaf block */ + if (root->height == 1) { + ret = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(root->ref.blkno)); + if (ret == 0) { + trace_scoutfs_btree_free_blocks_single(sb, root, + le64_to_cpu(root->ref.blkno)); + scoutfs_key_set_ones(key); + } + goto out; + } + + for (;;) { + /* start the walk at the root block */ + level = root->height - 1; + ref = root->ref; + scoutfs_key_set_ones(&par_next); + nr_par = 0; + + /* read blocks until we read the last parent */ + for (;;) { + scoutfs_block_put(sb, bl); + bl = NULL; + ret = get_ref_block(sb, alloc, wri, 0, &ref, &bl); + if (ret < 0) + goto out; + bt = bl->data; + + node = scoutfs_avl_search(&bt->item_root, cmp_key_item, + key, NULL, NULL, &next, NULL); + if (node == NULL) + node = next; + + /* should never descend into parent with no more refs */ + if (WARN_ON_ONCE(node == NULL)) { + ret = -EIO; + goto out; + } + + /* we'll free refs in the last parent */ + if (level == 1) + break; + + item = node_item(node); + next = scoutfs_avl_next(&bt->item_root, node); + if (next) { + /* didn't take last ref, still need parents */ + nr_par = 0; + par_next = *item_key(item); + scoutfs_key_inc(&par_next); + } else { + /* final ref, could free after all leaves */ + blknos[nr_par++] = le64_to_cpu(bt->hdr.blkno); + } + + memcpy(&ref, item_val(bt, item), sizeof(ref)); + level--; + } + + /* free all leaf block refs in last parent */ + while (node) { + + /* make sure we can always free parents after leaves */ + if (scoutfs_alloc_meta_low(sb, alloc, + alloc_low + nr_par + 1)) { + ret = 0; + goto out; + } + + item = node_item(node); + memcpy(&ref, item_val(bt, item), sizeof(ref)); + + trace_scoutfs_btree_free_blocks_leaf(sb, root, + le64_to_cpu(ref.blkno)); + ret = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(ref.blkno)); + if (ret < 0) + goto out; + + node = scoutfs_avl_next(&bt->item_root, node); + if (node) { + /* done with keys in child we just freed */ + *key = *item_key(item); + scoutfs_key_inc(key); + } + } + + /* now that leaves are freed, free any empty parents */ + for (i = 0; i < nr_par; i++) { + trace_scoutfs_btree_free_blocks_parent(sb, root, + blknos[i]); + ret = scoutfs_free_meta(sb, alloc, wri, blknos[i]); + BUG_ON(ret); /* checked meta low, freed should fit */ + } + + /* restart walk past the subtree we just freed */ + *key = par_next; + + /* but done if we just freed all parents down right spine */ + if (scoutfs_key_is_ones(&par_next)) { + ret = 0; + goto out; + } + } + +out: + scoutfs_block_put(sb, bl); + return ret; +} diff --git a/kmod/src/btree.h b/kmod/src/btree.h index 79d4de58..3d27fec2 100644 --- a/kmod/src/btree.h +++ b/kmod/src/btree.h @@ -82,6 +82,58 @@ int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_btree_root *root, struct scoutfs_btree_item_list 
*lst); +int scoutfs_btree_parent_range(struct super_block *sb, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_key *start, + struct scoutfs_key *end); +int scoutfs_btree_get_parent(struct super_block *sb, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_btree_root *par_root); +int scoutfs_btree_set_parent(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_btree_root *par_root); +int scoutfs_btree_rebalance(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_btree_root *root, + struct scoutfs_key *key); + +/* merge input is a list of roots */ +struct scoutfs_btree_root_head { + struct list_head head; + struct scoutfs_btree_root root; +}; +/* + * Compare the values of merge input items whose keys are equal to + * determine their merge order. + */ +typedef int (*scoutfs_btree_merge_cmp_t)(void *a_val, int a_val_len, + void *b_val, int b_val_len); +/* whether merging item should be removed from destination */ +typedef bool (*scoutfs_btree_merge_is_del_t)(void *val, int val_len); +int scoutfs_btree_merge(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_key *start, + struct scoutfs_key *end, + struct scoutfs_key *next_ret, + struct scoutfs_btree_root *root, + struct list_head *input_list, + scoutfs_btree_merge_cmp_t merge_cmp, + scoutfs_btree_merge_is_del_t merge_is_del, bool subtree, + int drop_val, int dirty_limit, int alloc_low); + +int scoutfs_btree_free_blocks(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_key *key, + struct scoutfs_btree_root *root, int alloc_low); + void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref); #endif diff --git a/kmod/src/client.c b/kmod/src/client.c index 7a4b4322..68fe4736 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -217,6 +217,26 @@ int scoutfs_client_srch_commit_compact(struct super_block *sb, res, sizeof(*res), NULL, 0); } +int scoutfs_client_get_log_merge(struct super_block *sb, + struct scoutfs_log_merge_request *req) +{ + struct client_info *client = SCOUTFS_SB(sb)->client_info; + + return scoutfs_net_sync_request(sb, client->conn, + SCOUTFS_NET_CMD_GET_LOG_MERGE, + NULL, 0, req, sizeof(*req)); +} + +int scoutfs_client_commit_log_merge(struct super_block *sb, + struct scoutfs_log_merge_complete *comp) +{ + struct client_info *client = SCOUTFS_SB(sb)->client_info; + + return scoutfs_net_sync_request(sb, client->conn, + SCOUTFS_NET_CMD_COMMIT_LOG_MERGE, + comp, sizeof(*comp), NULL, 0); +} + int scoutfs_client_send_omap_response(struct super_block *sb, u64 id, struct scoutfs_open_ino_map *map) { diff --git a/kmod/src/client.h b/kmod/src/client.h index f8866abd..1cbcbc1d 100644 --- a/kmod/src/client.h +++ b/kmod/src/client.h @@ -22,6 +22,10 @@ int scoutfs_client_srch_get_compact(struct super_block *sb, struct scoutfs_srch_compact *sc); int scoutfs_client_srch_commit_compact(struct super_block *sb, struct scoutfs_srch_compact *res); +int scoutfs_client_get_log_merge(struct super_block *sb, + struct scoutfs_log_merge_request *req); +int scoutfs_client_commit_log_merge(struct super_block *sb, + struct scoutfs_log_merge_complete *comp); int scoutfs_client_send_omap_response(struct super_block *sb, u64 id, struct scoutfs_open_ino_map *map); int scoutfs_client_open_ino_map(struct 
super_block *sb, u64 group_nr, diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 7cb5a331..9e9e9f5e 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -44,6 +44,14 @@ EXPAND_COUNTER(btree_insert) \ EXPAND_COUNTER(btree_leaf_item_hash_search) \ EXPAND_COUNTER(btree_lookup) \ + EXPAND_COUNTER(btree_merge) \ + EXPAND_COUNTER(btree_merge_alloc_low) \ + EXPAND_COUNTER(btree_merge_delete) \ + EXPAND_COUNTER(btree_merge_dirty_limit) \ + EXPAND_COUNTER(btree_merge_drop_old) \ + EXPAND_COUNTER(btree_merge_insert) \ + EXPAND_COUNTER(btree_merge_update) \ + EXPAND_COUNTER(btree_merge_walk) \ EXPAND_COUNTER(btree_next) \ EXPAND_COUNTER(btree_prev) \ EXPAND_COUNTER(btree_split) \ diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 9047c223..37be80a0 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -37,9 +37,9 @@ * * The log btrees are modified by multiple transactions over time so * there is no consistent ordering relationship between the items in - * different btrees. Each item in a log btree stores a version number - * for the item. Readers check log btrees for the most recent version - * that it should use. + * different btrees. Each item in a log btree stores a seq for the + * item. Readers check log btrees for the most recent seq that they + * should use. * * The item cache reads items in bulk from stable btrees, and writes a * transaction's worth of dirty items into the item log btree. @@ -52,6 +52,8 @@ */ struct forest_info { + struct super_block *sb; + struct mutex mutex; struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; @@ -60,6 +62,9 @@ struct forest_info { struct mutex srch_mutex; struct scoutfs_srch_file srch_file; struct scoutfs_block *srch_bl; + + struct workqueue_struct *workq; + struct delayed_work log_merge_dwork; }; #define DECLARE_FOREST_INFO(sb, name) \ @@ -249,7 +254,7 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, * If we hit stale blocks and retry we can call the callback for * duplicate items. This is harmless because the items are stable while * the caller holds their cluster lock and the caller has to filter out - * item versions anyway. + * item seqs anyway. */ int scoutfs_forest_read_items(struct super_block *sb, struct scoutfs_lock *lock, @@ -426,29 +431,29 @@ out: /* * The caller is commiting items in the transaction and has found the - * greatest item version amongst them. We store it in the log_trees root + * greatest item seq amongst them. We store it in the log_trees root * to send to the server. */ -void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers) +void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq) { DECLARE_FOREST_INFO(sb, finf); - finf->our_log.max_item_vers = cpu_to_le64(max_vers); + finf->our_log.max_item_seq = cpu_to_le64(max_seq); } /* - * The server is calling during setup to find the greatest item version + * The server is calling during setup to find the greatest item seq * amongst all the log tree roots. They have the authoritative current * super. * - * Item versions are only used to compare items in log trees, not in the - * main fs tree. All we have to do is find the greatest version amongst - * the log_trees so that new locks will have a write_version greater - * than all the items in the log_trees. + * Item seqs are only used to compare items in log trees, not in the + * main fs tree. 
All we have to do is find the greatest seq amongst the + * log_trees so that the core seq will be greater than all the + * item seqs in the log_trees. */ -int scoutfs_forest_get_max_vers(struct super_block *sb, - struct scoutfs_super_block *super, - u64 *vers) +int scoutfs_forest_get_max_seq(struct super_block *sb, + struct scoutfs_super_block *super, + u64 *seq) { struct scoutfs_log_trees *lt; SCOUTFS_BTREE_ITEM_REF(iref); @@ -456,7 +461,7 @@ int scoutfs_forest_get_max_vers(struct super_block *sb, int ret; scoutfs_key_init_log_trees(&ltk, 0, 0); - *vers = 0; + *seq = 0; for (;; scoutfs_key_inc(&ltk)) { ret = scoutfs_btree_next(sb, &super->logs_root, &ltk, &iref); @@ -464,8 +469,7 @@ int scoutfs_forest_get_max_vers(struct super_block *sb, if (iref.val_len == sizeof(struct scoutfs_log_trees)) { ltk = *iref.key; lt = iref.val; - *vers = max(*vers, - le64_to_cpu(lt->max_item_vers)); + *seq = max(*seq, le64_to_cpu(lt->max_item_seq)); } else { ret = -EIO; } @@ -534,7 +538,7 @@ void scoutfs_forest_init_btrees(struct super_block *sb, memset(&finf->our_log, 0, sizeof(finf->our_log)); finf->our_log.item_root = lt->item_root; finf->our_log.bloom_ref = lt->bloom_ref; - finf->our_log.max_item_vers = lt->max_item_vers; + finf->our_log.max_item_seq = lt->max_item_seq; finf->our_log.rid = lt->rid; finf->our_log.nr = lt->nr; finf->srch_file = lt->srch_file; @@ -564,7 +568,7 @@ void scoutfs_forest_get_btrees(struct super_block *sb, lt->item_root = finf->our_log.item_root; lt->bloom_ref = finf->our_log.bloom_ref; lt->srch_file = finf->srch_file; - lt->max_item_vers = finf->our_log.max_item_vers; + lt->max_item_seq = finf->our_log.max_item_seq; scoutfs_block_put(sb, finf->srch_bl); finf->srch_bl = NULL; @@ -573,6 +577,149 @@ void scoutfs_forest_get_btrees(struct super_block *sb, &lt->bloom_ref); } +/* + * Compare input items to merge by their log item value seq when their + * keys match. + */ +static int merge_cmp(void *a_val, int a_val_len, void *b_val, int b_val_len) +{ + struct scoutfs_log_item_value *a = a_val; + struct scoutfs_log_item_value *b = b_val; + + /* sort merge items by seq */ + return scoutfs_cmp(le64_to_cpu(a->seq), le64_to_cpu(b->seq)); +} + +static bool merge_is_del(void *val, int val_len) +{ + struct scoutfs_log_item_value *liv = val; + + return !!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION); +} + +#define LOG_MERGE_DELAY_MS (5 * MSEC_PER_SEC) + +/* + * Regularly try to get a log merge request from the server. If we get + * a request we walk the log_trees items to find input trees and pass + * them to btree_merge. All of our work is done in dirty blocks + * allocated from available free blocks that the server gave us. If we + * hit an error then we drop our dirty blocks without writing them and + * send an error flag to the server so it can reclaim our allocators + * and ignore the rest of our work. 
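+ * + * The round trip pairs the two client RPCs added in this patch; a + * rough sketch with the merge itself and error handling elided: + * + * struct scoutfs_log_merge_request req; + * struct scoutfs_log_merge_complete comp; + * ret = scoutfs_client_get_log_merge(sb, &req); + * (merge using the allocators in req.meta_avail and req.meta_freed) + * ret = scoutfs_client_commit_log_merge(sb, &comp); + * + * On error the completion is still sent with + * SCOUTFS_LOG_MERGE_COMP_ERROR set so the server can reclaim the + * request's resources.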
+ */ +static void scoutfs_forest_log_merge_worker(struct work_struct *work) +{ + struct forest_info *finf = container_of(work, struct forest_info, + log_merge_dwork.work); + struct super_block *sb = finf->sb; + struct scoutfs_btree_root_head *rhead = NULL; + struct scoutfs_btree_root_head *tmp; + struct scoutfs_log_merge_complete comp; + struct scoutfs_log_merge_request req; + struct scoutfs_log_trees *lt; + struct scoutfs_block_writer wri; + struct scoutfs_alloc alloc; + SCOUTFS_BTREE_ITEM_REF(iref); + struct scoutfs_key next; + struct scoutfs_key key; + unsigned long delay; + LIST_HEAD(inputs); + int ret; + + ret = scoutfs_client_get_log_merge(sb, &req); + if (ret < 0) + goto resched; + + comp.root = req.root; + comp.start = req.start; + comp.end = req.end; + comp.remain = req.end; + comp.rid = req.rid; + comp.seq = req.seq; + comp.flags = 0; + + scoutfs_alloc_init(&alloc, &req.meta_avail, &req.meta_freed); + scoutfs_block_writer_init(sb, &wri); + + /* find finalized input log trees up to last_seq */ + for (scoutfs_key_init_log_trees(&key, 0, 0); ; scoutfs_key_inc(&key)) { + + if (!rhead) { + rhead = kmalloc(sizeof(*rhead), GFP_NOFS); + if (!rhead) { + ret = -ENOMEM; + goto out; + } + } + + ret = scoutfs_btree_next(sb, &req.logs_root, &key, &iref); + if (ret == 0) { + if (iref.val_len == sizeof(*lt)) { + key = *iref.key; + lt = iref.val; + if ((le64_to_cpu(lt->flags) & + SCOUTFS_LOG_TREES_FINALIZED) && + (le64_to_cpu(lt->max_item_seq) <= + le64_to_cpu(req.last_seq))) { + rhead->root = lt->item_root; + list_add_tail(&rhead->head, &inputs); + rhead = NULL; + } + } else { + ret = -EIO; + } + scoutfs_btree_put_iref(&iref); + } + if (ret < 0) { + if (ret == -ENOENT) { + ret = 0; + break; + } + goto out; + } + } + + /* shouldn't be possible, but it's harmless */ + if (list_empty(&inputs)) { + ret = 0; + goto out; + } + + ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end, + &next, &comp.root, &inputs, merge_cmp, + merge_is_del, + !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)), + sizeof(struct scoutfs_log_item_value), + SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10); + if (ret == -ERANGE) { + comp.remain = next; + le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN); + ret = 0; + } + +out: + scoutfs_alloc_prepare_commit(sb, &alloc, &wri); + if (ret == 0) + ret = scoutfs_block_writer_write(sb, &wri); + scoutfs_block_writer_forget_all(sb, &wri); + + comp.meta_avail = alloc.avail; + comp.meta_freed = alloc.freed; + if (ret < 0) + le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_ERROR); + + ret = scoutfs_client_commit_log_merge(sb, &comp); + + kfree(rhead); + list_for_each_entry_safe(rhead, tmp, &inputs, head) + kfree(rhead); + +resched: + delay = ret == 0 ? 
0 : msecs_to_jiffies(LOG_MERGE_DELAY_MS); + queue_delayed_work(finf->workq, &finf->log_merge_dwork, delay); +} + int scoutfs_forest_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); @@ -586,10 +733,23 @@ int scoutfs_forest_setup(struct super_block *sb) } /* the finf fields will be setup as we open a transaction */ + finf->sb = sb; mutex_init(&finf->mutex); mutex_init(&finf->srch_mutex); - + INIT_DELAYED_WORK(&finf->log_merge_dwork, + scoutfs_forest_log_merge_worker); sbi->forest_info = finf; + + finf->workq = alloc_workqueue("scoutfs_log_merge", WQ_NON_REENTRANT | + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!finf->workq) { + ret = -ENOMEM; + goto out; + } + + queue_delayed_work(finf->workq, &finf->log_merge_dwork, + msecs_to_jiffies(LOG_MERGE_DELAY_MS)); + ret = 0; out: if (ret) @@ -605,6 +765,12 @@ void scoutfs_forest_destroy(struct super_block *sb) if (finf) { scoutfs_block_put(sb, finf->srch_bl); + + if (finf->workq) { + cancel_delayed_work_sync(&finf->log_merge_dwork); + destroy_workqueue(finf->workq); + } + kfree(finf); sbi->forest_info = NULL; } diff --git a/kmod/src/forest.h b/kmod/src/forest.h index b73ea7a4..3ca50670 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -23,10 +23,10 @@ int scoutfs_forest_read_items(struct super_block *sb, scoutfs_forest_item_cb cb, void *arg); int scoutfs_forest_set_bloom_bits(struct super_block *sb, struct scoutfs_lock *lock); -void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers); -int scoutfs_forest_get_max_vers(struct super_block *sb, - struct scoutfs_super_block *super, - u64 *vers); +void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq); +int scoutfs_forest_get_max_seq(struct super_block *sb, + struct scoutfs_super_block *super, + u64 *seq); int scoutfs_forest_insert_list(struct super_block *sb, struct scoutfs_btree_item_list *lst); int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id); diff --git a/kmod/src/format.h b/kmod/src/format.h index 924a1842..af2358a0 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -325,6 +325,7 @@ struct scoutfs_alloc_root { #define SCOUTFS_ALLOC_OWNER_SERVER 1 #define SCOUTFS_ALLOC_OWNER_MOUNT 2 #define SCOUTFS_ALLOC_OWNER_SRCH 3 +#define SCOUTFS_ALLOC_OWNER_LOG_MERGE 4 struct scoutfs_mounted_client_btree_val { union scoutfs_inet_addr addr; @@ -449,13 +450,16 @@ struct scoutfs_log_trees { struct scoutfs_srch_file srch_file; __le64 data_alloc_zone_blocks; __le64 data_alloc_zones[SCOUTFS_DATA_ALLOC_ZONE_LE64S]; - __le64 max_item_vers; + __le64 max_item_seq; __le64 rid; __le64 nr; + __le64 flags; }; +#define SCOUTFS_LOG_TREES_FINALIZED (1ULL << 0) + struct scoutfs_log_item_value { - __le64 vers; + __le64 seq; __u8 flags; __u8 __pad[7]; __u8 data[]; @@ -490,6 +494,78 @@ struct scoutfs_bloom_block { member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8) #define SCOUTFS_FOREST_BLOOM_FUNC_BITS (SCOUTFS_BLOCK_LG_SHIFT + 3) +/* + * A private server btree item which records the status of a log merge + * operation that is in progress. + */ +struct scoutfs_log_merge_status { + struct scoutfs_key next_range_key; + __le64 nr_requests; + __le64 nr_complete; + __le64 last_seq; + __le64 seq; +}; + +/* + * A request is sent to the client and stored in a server btree item to + * record resources that would be reclaimed if the client failed. It + * has all the inputs needed for the client to perform its portion of a + * merge. 
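+ * + * Requests are stored as items in the server's log_merge btree under + * SCOUTFS_LOG_MERGE_REQUEST_ZONE. A hedged sketch of reading one back, + * mirroring the walk in scoutfs_alloc_foreach(): + * + * struct scoutfs_log_merge_request *lmreq = iref.val; + * u64 rid = le64_to_cpu(lmreq->rid); + * u64 avail = le64_to_cpu(lmreq->meta_avail.total_nr);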
+ */ +struct scoutfs_log_merge_request { + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; + struct scoutfs_btree_root logs_root; + struct scoutfs_btree_root root; + struct scoutfs_key start; + struct scoutfs_key end; + __le64 last_seq; + __le64 rid; + __le64 seq; + __le64 flags; +}; + +/* request root is a subtree of the fs root at a parent, which restricts merging modifications */ +#define SCOUTFS_LOG_MERGE_REQUEST_SUBTREE (1ULL << 0) + +/* + * The output of a client's merge of log btree items into a subtree + * rooted at a parent in the fs_root. The client sends it to the + * server, who stores it in a btree item for later splicing/rebalancing. + */ +struct scoutfs_log_merge_complete { + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; + struct scoutfs_btree_root root; + struct scoutfs_key start; + struct scoutfs_key end; + struct scoutfs_key remain; + __le64 rid; + __le64 seq; + __le64 flags; +}; + +/* merge failed, ignore completion and reclaim stored request */ +#define SCOUTFS_LOG_MERGE_COMP_ERROR (1ULL << 0) +/* merge didn't complete range, restart from remain */ +#define SCOUTFS_LOG_MERGE_COMP_REMAIN (1ULL << 1) + +/* + * Range items record the ranges of the fs keyspace that still need to + * be merged. They're added as a merge starts, removed as requests are + * sent and added back if the request didn't consume its entire range. + */ +struct scoutfs_log_merge_range { + struct scoutfs_key start; + struct scoutfs_key end; +}; + +struct scoutfs_log_merge_freeing { + struct scoutfs_btree_root root; + struct scoutfs_key key; + __le64 seq; +}; + /* * Keys are first sorted by major key zones. */ @@ -504,6 +580,12 @@ struct scoutfs_bloom_block { #define SCOUTFS_SRCH_ZONE 9 #define SCOUTFS_FREE_EXTENT_BLKNO_ZONE 10 #define SCOUTFS_FREE_EXTENT_ORDER_ZONE 11 +/* Items only stored in log merge server btrees */ +#define SCOUTFS_LOG_MERGE_STATUS_ZONE 12 +#define SCOUTFS_LOG_MERGE_RANGE_ZONE 13 +#define SCOUTFS_LOG_MERGE_REQUEST_ZONE 14 +#define SCOUTFS_LOG_MERGE_COMPLETE_ZONE 15 +#define SCOUTFS_LOG_MERGE_FREEING_ZONE 16 /* inode index zone */ #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1 @@ -688,8 +770,8 @@ struct scoutfs_super_block { __le64 version; __le64 flags; __u8 uuid[SCOUTFS_UUID_BYTES]; + __le64 seq; __le64 next_ino; - __le64 next_trans_seq; __le64 total_meta_blocks; /* both static and dynamic */ __le64 first_meta_blkno; /* first dynamically allocated */ __le64 last_meta_blkno; @@ -703,6 +785,7 @@ struct scoutfs_super_block { struct scoutfs_alloc_list_head server_meta_freed[2]; struct scoutfs_btree_root fs_root; struct scoutfs_btree_root logs_root; + struct scoutfs_btree_root log_merge; struct scoutfs_btree_root trans_seqs; struct scoutfs_btree_root mounted_clients; struct scoutfs_btree_root srch_root; @@ -895,6 +978,8 @@ enum scoutfs_net_cmd { SCOUTFS_NET_CMD_LOCK_RECOVER, SCOUTFS_NET_CMD_SRCH_GET_COMPACT, SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT, + SCOUTFS_NET_CMD_GET_LOG_MERGE, + SCOUTFS_NET_CMD_COMMIT_LOG_MERGE, SCOUTFS_NET_CMD_OPEN_INO_MAP, SCOUTFS_NET_CMD_GET_VOLOPT, SCOUTFS_NET_CMD_SET_VOLOPT, @@ -943,7 +1028,7 @@ struct scoutfs_net_roots { struct scoutfs_net_lock { struct scoutfs_key key; - __le64 write_version; + __le64 write_seq; __u8 old_mode; __u8 new_mode; __u8 __pad[6]; diff --git a/kmod/src/item.c b/kmod/src/item.c index 2b03c39f..d9cc2b2f 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -149,7 +149,8 @@ struct cached_item { static int item_val_bytes(int val_len) { - return round_up(offsetof(struct 
cached_item, val[val_len]), CACHED_ITEM_ALIGN); + return round_up(offsetof(struct cached_item, val[val_len]), + CACHED_ITEM_ALIGN); } /* @@ -345,7 +346,8 @@ static struct cached_page *alloc_pg(struct super_block *sb, gfp_t gfp) page = alloc_page(GFP_NOFS | gfp); if (!page || !pg) { kfree(pg); - __free_page(page); + if (page) + __free_page(page); return NULL; } @@ -420,8 +422,7 @@ static struct cached_item *alloc_item(struct cached_page *pg, static void erase_item(struct cached_page *pg, struct cached_item *item) { rbtree_erase(&item->node, &pg->item_root); - pg->erased_bytes += round_up(item_val_bytes(item->val_len), - CACHED_ITEM_ALIGN); + pg->erased_bytes += item_val_bytes(item->val_len); } static void lru_add(struct super_block *sb, struct item_cache_info *cinf, @@ -852,8 +853,7 @@ static void compact_page_items(struct super_block *sb, for (from = first_item(&pg->item_root); from; from = next_item(from)) { to = page_address(empty->page) + page_off; - page_off += round_up(item_val_bytes(from->val_len), - CACHED_ITEM_ALIGN); + page_off += item_val_bytes(from->val_len); /* copy the entire item, struct members and all */ memcpy(to, from, item_val_bytes(from->val_len)); @@ -1308,10 +1308,10 @@ static struct active_reader *active_rbtree_walk(struct rb_root *root, * on our root and aren't in dirty or lru lists. * * We need to store deletion items here as we read items from all the - * btrees so that they can override older versions of the items. The - * deletion items will be deleted before we insert the pages into the - * cache. We don't insert old versions of items into the tree here so - * that the trees don't have to compare versions. + * btrees so that they can override older items. The deletion items + * will be deleted before we insert the pages into the cache. We don't + * insert old versions of items into the tree here so that the trees + * don't have to compare seqs. */ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_log_item_value *liv, void *val, @@ -1331,7 +1331,7 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, pg = page_rbtree_walk(sb, root, key, key, NULL, NULL, &p_par, &p_pnode); found = item_rbtree_walk(&pg->item_root, key, NULL, &par, &pnode); - if (found && (le64_to_cpu(found->liv.vers) >= le64_to_cpu(liv->vers))) + if (found && (le64_to_cpu(found->liv.seq) >= le64_to_cpu(liv->seq))) return 0; if (!page_has_room(pg, val_len)) { @@ -1783,6 +1783,21 @@ out: return ret; } +/* + * An item's seq is the greater of the client transaction's seq and the + * lock's write_seq. This ensures that multiple commits in one lock + * grant will have increasing seqs, and new locks in open commits will + * also increase the seqs. It lets us limit the inputs of item merging + * to the last stable seq and ensure that all the items in open + * transactions and granted locks will have greater seqs. + */ +static __le64 item_seq(struct super_block *sb, struct scoutfs_lock *lock) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + + return cpu_to_le64(max(sbi->trans_seq, lock->write_seq)); +} + /* * Mark the item dirty. 
Dirtying while holding a transaction pins the * page holding the item and guarantees that the item can be deleted or @@ -1816,7 +1831,7 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key, ret = -ENOENT; } else { mark_item_dirty(sb, cinf, pg, NULL, item); - item->liv.vers = cpu_to_le64(lock->write_version); + item->liv.seq = item_seq(sb, lock); ret = 0; } @@ -1836,7 +1851,7 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_version), + .seq = item_seq(sb, lock), }; struct cached_item *found; struct cached_item *item; @@ -1911,7 +1926,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_version), + .seq = item_seq(sb, lock), }; struct cached_item *item; struct cached_item *found; @@ -1944,9 +1959,10 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, if (val_len) memcpy(found->val, val, val_len); if (val_len < found->val_len) - pg->erased_bytes += found->val_len - val_len; + pg->erased_bytes += item_val_bytes(found->val_len) - + item_val_bytes(val_len); found->val_len = val_len; - found->liv.vers = liv.vers; + found->liv.seq = liv.seq; mark_item_dirty(sb, cinf, pg, NULL, found); } else { item = alloc_item(pg, key, &liv, val, val_len); @@ -1978,7 +1994,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_version), + .seq = item_seq(sb, lock), }; struct cached_item *item; struct cached_page *pg; @@ -2020,10 +2036,11 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, erase_item(pg, item); } else { /* must emit deletion to clobber old persistent item */ - item->liv.vers = cpu_to_le64(lock->write_version); + item->liv.seq = liv.seq; item->liv.flags |= SCOUTFS_LOG_ITEM_FLAG_DELETION; item->deletion = 1; - pg->erased_bytes += item->val_len; + pg->erased_bytes += item_val_bytes(item->val_len) - + item_val_bytes(0); item->val_len = 0; mark_item_dirty(sb, cinf, pg, NULL, item); } @@ -2106,7 +2123,7 @@ int scoutfs_item_write_dirty(struct super_block *sb) struct page *page; LIST_HEAD(pages); LIST_HEAD(pos); - u64 max_vers = 0; + u64 max_seq = 0; int val_len; int bytes; int off; @@ -2171,7 +2188,7 @@ int scoutfs_item_write_dirty(struct super_block *sb) val_len = sizeof(item->liv) + item->val_len; bytes = offsetof(struct scoutfs_btree_item_list, val[val_len]); - max_vers = max(max_vers, le64_to_cpu(item->liv.vers)); + max_seq = max(max_seq, le64_to_cpu(item->liv.seq)); if (off + bytes > PAGE_SIZE) { page = second; @@ -2201,8 +2218,8 @@ int scoutfs_item_write_dirty(struct super_block *sb) read_unlock(&pg->rwlock); } - /* store max item vers in forest's log_trees */ - scoutfs_forest_set_max_vers(sb, max_vers); + /* store max item seq in forest's log_trees */ + scoutfs_forest_set_max_seq(sb, max_seq); /* write all the dirty items into log btree blocks */ ret = scoutfs_forest_insert_list(sb, first); diff --git a/kmod/src/key.h b/kmod/src/key.h index 5ea4dd4c..66a4c84a 100644 --- a/kmod/src/key.h +++ b/kmod/src/key.h @@ -108,6 +108,16 @@ static inline void scoutfs_key_set_ones(struct scoutfs_key *key) memset(key->__pad, 0, sizeof(key->__pad)); } +static inline bool scoutfs_key_is_ones(struct scoutfs_key *key) +{ + return key->sk_zone == U8_MAX && + 
key->_sk_first == cpu_to_le64(U64_MAX) && + key->sk_type == U8_MAX && + key->_sk_second == cpu_to_le64(U64_MAX) && + key->_sk_third == cpu_to_le64(U64_MAX) && + key->_sk_fourth == U8_MAX; +} + /* * Return a -1/0/1 comparison of keys. * diff --git a/kmod/src/lock.c b/kmod/src/lock.c index 50a33d26..36227eae 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -730,7 +730,7 @@ static void lock_grant_worker(struct work_struct *work) lock->request_pending = 0; lock->mode = nl->new_mode; - lock->write_version = le64_to_cpu(nl->write_version); + lock->write_seq = le64_to_cpu(nl->write_seq); if (lock_count_match_exists(nl->new_mode, lock->waiters)) extend_grace(sb, lock); @@ -988,7 +988,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id, for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) { nlr->locks[i].key = lock->start; - nlr->locks[i].write_version = cpu_to_le64(lock->write_version); + nlr->locks[i].write_seq = cpu_to_le64(lock->write_seq); nlr->locks[i].old_mode = lock->mode; nlr->locks[i].new_mode = lock->mode; diff --git a/kmod/src/lock.h b/kmod/src/lock.h index 40f8f5b9..d043f9fc 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -13,7 +13,7 @@ struct scoutfs_omap_lock; /* - * A few fields (start, end, refresh_gen, write_version, granted_mode) + * A few fields (start, end, refresh_gen, write_seq, granted_mode) * are referenced by code outside lock.c. */ struct scoutfs_lock { @@ -23,7 +23,7 @@ struct scoutfs_lock { struct rb_node node; struct rb_node range_node; u64 refresh_gen; - u64 write_version; + u64 write_seq; u64 dirty_trans_seq; struct list_head lru_head; wait_queue_head_t waitq; diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c index 09ce48d7..5a3a0cd7 100644 --- a/kmod/src/lock_server.c +++ b/kmod/src/lock_server.c @@ -81,8 +81,6 @@ struct lock_server_info { struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; - - atomic64_t write_version; }; #define DECLARE_LOCK_SERVER_INFO(sb, name) \ @@ -479,7 +477,7 @@ static int process_waiting_requests(struct super_block *sb, struct client_lock_entry *req_tmp; struct client_lock_entry *gr; struct client_lock_entry *gr_tmp; - u64 wv; + u64 seq; int ret; BUG_ON(!mutex_is_locked(&snode->mutex)); @@ -520,6 +518,7 @@ static int process_waiting_requests(struct super_block *sb, nl.key = snode->key; nl.new_mode = req->mode; + nl.write_seq = 0; /* see if there's an existing compatible grant to replace */ gr = find_entry(snode, &snode->granted, req->rid); @@ -532,8 +531,9 @@ static int process_waiting_requests(struct super_block *sb, if (nl.new_mode == SCOUTFS_LOCK_WRITE || nl.new_mode == SCOUTFS_LOCK_WRITE_ONLY) { - wv = atomic64_inc_return(&inf->write_version); - nl.write_version = cpu_to_le64(wv); + /* doesn't commit seq update, recovered with locks */ + seq = scoutfs_server_next_seq(sb); + nl.write_seq = cpu_to_le64(seq); } ret = scoutfs_server_lock_response(sb, req->rid, @@ -609,14 +609,6 @@ int scoutfs_lock_server_finished_recovery(struct super_block *sb) return ret; } -static void set_max_write_version(struct lock_server_info *inf, u64 new) -{ - u64 old; - - while (new > (old = atomic64_read(&inf->write_version)) && - (atomic64_cmpxchg(&inf->write_version, old, new) != old)); -} - /* * We sent a lock recover request to the client when we received its * greeting while in recovery. 
Here we instantiate all the locks it @@ -680,9 +672,9 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid, put_server_lock(inf, snode); - /* make sure next write lock is greater than all recovered */ - set_max_write_version(inf, - le64_to_cpu(nlr->locks[i].write_version)); + /* make sure next core seq is greater than all lock write seq */ + scoutfs_server_set_seq_if_greater(sb, + le64_to_cpu(nlr->locks[i].write_seq)); } /* send request for next batch of keys */ @@ -800,7 +792,7 @@ static void lock_server_tseq_show(struct seq_file *m, */ int scoutfs_lock_server_setup(struct super_block *sb, struct scoutfs_alloc *alloc, - struct scoutfs_block_writer *wri, u64 max_vers) + struct scoutfs_block_writer *wri) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct lock_server_info *inf; @@ -815,7 +807,6 @@ int scoutfs_lock_server_setup(struct super_block *sb, scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show); inf->alloc = alloc; inf->wri = wri; - atomic64_set(&inf->write_version, max_vers); /* inc_return gives +1 */ inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root, &inf->tseq_tree); diff --git a/kmod/src/lock_server.h b/kmod/src/lock_server.h index e77f116f..60ce31ce 100644 --- a/kmod/src/lock_server.h +++ b/kmod/src/lock_server.h @@ -13,7 +13,7 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid); int scoutfs_lock_server_setup(struct super_block *sb, struct scoutfs_alloc *alloc, - struct scoutfs_block_writer *wri, u64 max_vers); + struct scoutfs_block_writer *wri); void scoutfs_lock_server_destroy(struct super_block *sb); #endif diff --git a/kmod/src/omap.c b/kmod/src/omap.c index 3dfcbea8..bbe80976 100644 --- a/kmod/src/omap.c +++ b/kmod/src/omap.c @@ -137,11 +137,10 @@ struct omap_request { /* * In each inode group cluster lock we store data to track the open ino * map which tracks all the inodes that the cluster lock covers. When - * the version shows that the map is stale we send a request to update - * it. + * the seq shows that the map is stale we send a request to update it. */ struct scoutfs_omap_lock_data { - u64 version; + u64 seq; bool req_in_flight; wait_queue_head_t waitq; struct scoutfs_open_ino_map map; @@ -833,8 +832,7 @@ static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lo /* * Make sure the map covered by the cluster lock is current. The caller * holds the cluster lock so once we store lock_data on the cluster lock - * it won't be freed and the write_version in the cluster lock won't - * change. + * it won't be freed and the write_seq in the cluster lock won't change. * * The omap_spinlock protects the omap_data in the cluster lock. 
We * have to drop it if we have to block to allocate lock_data, send a @@ -861,7 +859,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo } if (lock->omap_data == NULL) { - ldata->version = lock->write_version - 1; /* ensure refresh */ + ldata->seq = lock->write_seq - 1; /* ensure refresh */ init_waitqueue_head(&ldata->waitq); lock->omap_data = ldata; @@ -871,7 +869,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo } } - while (ldata->version != lock->write_version) { + while (ldata->seq != lock->write_seq) { /* only one waiter sends a request at a time */ if (!ldata->req_in_flight) { ldata->req_in_flight = true; @@ -891,7 +889,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo if (send_req) { ldata->req_in_flight = false; if (ret == 0) - ldata->version = lock->write_version; + ldata->seq = lock->write_seq; wake_up(&ldata->waitq); if (ret < 0) goto out; diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 7dce85f0..fb5ea548 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -1644,6 +1644,164 @@ TRACE_EVENT(scoutfs_btree_walk, __entry->level, __entry->ref_blkno, __entry->ref_seq) ); +TRACE_EVENT(scoutfs_btree_set_parent, + TP_PROTO(struct super_block *sb, + struct scoutfs_btree_root *root, struct scoutfs_key *key, + struct scoutfs_btree_root *par_root), + + TP_ARGS(sb, root, key, par_root), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, root_blkno) + __field(__u64, root_seq) + __field(__u8, root_height) + sk_trace_define(key) + __field(__u64, par_root_blkno) + __field(__u64, par_root_seq) + __field(__u8, par_root_height) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->root_blkno = le64_to_cpu(root->ref.blkno); + __entry->root_seq = le64_to_cpu(root->ref.seq); + __entry->root_height = root->height; + sk_trace_assign(key, key); + __entry->par_root_blkno = le64_to_cpu(par_root->ref.blkno); + __entry->par_root_seq = le64_to_cpu(par_root->ref.seq); + __entry->par_root_height = par_root->height; + ), + + TP_printk(SCSBF" root blkno %llu seq %llu height %u, key "SK_FMT", par_root blkno %llu seq %llu height %u", + SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq, + __entry->root_height, sk_trace_args(key), + __entry->par_root_blkno, __entry->par_root_seq, + __entry->par_root_height) +); + +TRACE_EVENT(scoutfs_btree_merge, + TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root, + struct scoutfs_key *start, struct scoutfs_key *end), + + TP_ARGS(sb, root, start, end), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, root_blkno) + __field(__u64, root_seq) + __field(__u8, root_height) + sk_trace_define(start) + sk_trace_define(end) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->root_blkno = le64_to_cpu(root->ref.blkno); + __entry->root_seq = le64_to_cpu(root->ref.seq); + __entry->root_height = root->height; + sk_trace_assign(start, start); + sk_trace_assign(end, end); + ), + + TP_printk(SCSBF" root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT, + SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq, + __entry->root_height, sk_trace_args(start), + sk_trace_args(end)) +); + +TRACE_EVENT(scoutfs_btree_merge_items, + TP_PROTO(struct super_block *sb, + struct scoutfs_btree_root *m_root, + struct scoutfs_key *m_key, int m_val_len, + struct scoutfs_btree_root *f_root, + struct scoutfs_key *f_key, int f_val_len, + int is_del), + + TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, 
f_val_len, is_del), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, m_root_blkno) + __field(__u64, m_root_seq) + __field(__u8, m_root_height) + sk_trace_define(m_key) + __field(int, m_val_len) + __field(__u64, f_root_blkno) + __field(__u64, f_root_seq) + __field(__u8, f_root_height) + sk_trace_define(f_key) + __field(int, f_val_len) + __field(int, is_del) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->m_root_blkno = m_root ? + le64_to_cpu(m_root->ref.blkno) : 0; + __entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0; + __entry->m_root_height = m_root ? m_root->height : 0; + sk_trace_assign(m_key, m_key); + __entry->m_val_len = m_val_len; + __entry->f_root_blkno = f_root ? + le64_to_cpu(f_root->ref.blkno) : 0; + __entry->f_root_seq = f_root ? le64_to_cpu(f_root->ref.seq) : 0; + __entry->f_root_height = f_root ? f_root->height : 0; + sk_trace_assign(f_key, f_key); + __entry->f_val_len = f_val_len; + __entry->is_del = !!is_del; + ), + + TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d", + SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq, + __entry->m_root_height, sk_trace_args(m_key), + __entry->m_val_len, __entry->f_root_blkno, + __entry->f_root_seq, __entry->f_root_height, + sk_trace_args(f_key), __entry->f_val_len, __entry->is_del) +); + +DECLARE_EVENT_CLASS(scoutfs_btree_free_blocks, + TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root, + u64 blkno), + + TP_ARGS(sb, root, blkno), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, root_blkno) + __field(__u64, root_seq) + __field(__u8, root_height) + __field(__u64, blkno) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->root_blkno = le64_to_cpu(root->ref.blkno); + __entry->root_seq = le64_to_cpu(root->ref.seq); + __entry->root_height = root->height; + __entry->blkno = blkno; + ), + + TP_printk(SCSBF" root blkno %llu seq %llu height %u, free blkno %llu", + SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq, + __entry->root_height, __entry->blkno) +); +DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_single, + TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root, + u64 blkno), + TP_ARGS(sb, root, blkno) +); +DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_leaf, + TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root, + u64 blkno), + TP_ARGS(sb, root, blkno) +); +DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_parent, + TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root, + u64 blkno), + TP_ARGS(sb, root, blkno) +); + TRACE_EVENT(scoutfs_online_offline_blocks, TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta, u64 on_now, u64 off_now), @@ -1900,6 +2058,116 @@ TRACE_EVENT(scoutfs_trans_seq_last, SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq) ); +TRACE_EVENT(scoutfs_get_log_merge_status, + TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key, + u64 nr_requests, u64 nr_complete, u64 last_seq, u64 seq), + + TP_ARGS(sb, rid, next_range_key, nr_requests, nr_complete, last_seq, seq), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, s_rid) + sk_trace_define(next_range_key) + __field(__u64, nr_requests) + __field(__u64, nr_complete) + __field(__u64, last_seq) + __field(__u64, seq) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->s_rid = rid; + sk_trace_assign(next_range_key, next_range_key); + 
__entry->nr_requests = nr_requests; + __entry->nr_complete = nr_complete; + __entry->last_seq = last_seq; + __entry->seq = seq; + ), + + TP_printk(SCSBF" rid %016llx next_range_key "SK_FMT" nr_requests %llu nr_complete %llu last_seq %llu seq %llu", + SCSB_TRACE_ARGS, __entry->s_rid, sk_trace_args(next_range_key), + __entry->nr_requests, __entry->nr_complete, __entry->last_seq, __entry->seq) +); + +TRACE_EVENT(scoutfs_get_log_merge_request, + TP_PROTO(struct super_block *sb, u64 rid, + struct scoutfs_btree_root *root, struct scoutfs_key *start, + struct scoutfs_key *end, u64 last_seq, u64 seq), + + TP_ARGS(sb, rid, root, start, end, last_seq, seq), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, s_rid) + __field(__u64, root_blkno) + __field(__u64, root_seq) + __field(__u8, root_height) + sk_trace_define(start) + sk_trace_define(end) + __field(__u64, last_seq) + __field(__u64, seq) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->s_rid = rid; + __entry->root_blkno = le64_to_cpu(root->ref.blkno); + __entry->root_seq = le64_to_cpu(root->ref.seq); + __entry->root_height = root->height; + sk_trace_assign(start, start); + sk_trace_assign(end, end); + __entry->last_seq = last_seq; + __entry->seq = seq; + ), + + TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" last_seq %llu seq %llu", + SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno, + __entry->root_seq, __entry->root_height, + sk_trace_args(start), sk_trace_args(end), __entry->last_seq, + __entry->seq) +); + +TRACE_EVENT(scoutfs_get_log_merge_complete, + TP_PROTO(struct super_block *sb, u64 rid, + struct scoutfs_btree_root *root, struct scoutfs_key *start, + struct scoutfs_key *end, struct scoutfs_key *remain, + u64 seq, u64 flags), + + TP_ARGS(sb, rid, root, start, end, remain, seq, flags), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, s_rid) + __field(__u64, root_blkno) + __field(__u64, root_seq) + __field(__u8, root_height) + sk_trace_define(start) + sk_trace_define(end) + sk_trace_define(remain) + __field(__u64, seq) + __field(__u64, flags) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->s_rid = rid; + __entry->root_blkno = le64_to_cpu(root->ref.blkno); + __entry->root_seq = le64_to_cpu(root->ref.seq); + __entry->root_height = root->height; + sk_trace_assign(start, start); + sk_trace_assign(end, end); + sk_trace_assign(remain, remain); + __entry->seq = seq; + __entry->flags = flags; + ), + + TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" remain "SK_FMT" seq %llu flags 0x%llx", + SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno, + __entry->root_seq, __entry->root_height, + sk_trace_args(start), sk_trace_args(end), + sk_trace_args(remain), __entry->seq, __entry->flags) +); + DECLARE_EVENT_CLASS(scoutfs_forest_bloom_class, TP_PROTO(struct super_block *sb, struct scoutfs_key *key, u64 rid, u64 nr, u64 blkno, u64 seq, unsigned int count), diff --git a/kmod/src/server.c b/kmod/src/server.c index 4eeefccd..9e8307b8 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -65,6 +65,9 @@ struct server_info { u64 term; struct scoutfs_net_connection *conn; + /* synced with superblock seq on commits */ + atomic64_t seq_atomic; + /* request processing coordinates shared commits */ struct rw_semaphore commit_rwsem; struct llist_head commit_waiters; @@ -93,6 +96,8 @@ struct server_info { struct scoutfs_block_writer wri; struct mutex logs_mutex; + struct work_struct log_merge_free_work; + struct mutex srch_mutex; 
 	struct mutex mounted_clients_mutex;
@@ -187,15 +192,13 @@ static void stop_server(struct server_info *server)
  * (lock_server) and which are not called directly by the server core
  * (async timeout work).
  */
-int scoutfs_server_hold_commit(struct super_block *sb)
+void scoutfs_server_hold_commit(struct super_block *sb)
 {
 	DECLARE_SERVER_INFO(sb, server);
 
 	scoutfs_inc_counter(sb, server_commit_hold);
 
 	down_read(&server->commit_rwsem);
-
-	return 0;
 }
 
 /*
@@ -250,6 +253,35 @@ static void get_roots(struct super_block *sb,
 	} while (read_seqcount_retry(&server->roots_seqcount, seq));
 }
 
+u64 scoutfs_server_seq(struct super_block *sb)
+{
+	DECLARE_SERVER_INFO(sb, server);
+
+	return atomic64_read(&server->seq_atomic);
+}
+
+u64 scoutfs_server_next_seq(struct super_block *sb)
+{
+	DECLARE_SERVER_INFO(sb, server);
+
+	return atomic64_inc_return(&server->seq_atomic);
+}
+
+void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	u64 expect;
+	u64 was;
+
+	expect = atomic64_read(&server->seq_atomic);
+	while (seq > expect) {
+		was = atomic64_cmpxchg(&server->seq_atomic, expect, seq);
+		if (was == expect)
+			break;
+		expect = was;
+	}
+}
+
 static void set_roots(struct server_info *server,
 		      struct scoutfs_btree_root *fs_root,
 		      struct scoutfs_btree_root *logs_root,
@@ -335,6 +367,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 		goto out;
 	}
 
+	super->seq = cpu_to_le64(atomic64_read(&server->seq_atomic));
 	super->server_meta_avail[server->other_ind ^ 1] = server->alloc.avail;
 	super->server_meta_freed[server->other_ind ^ 1] = server->alloc.freed;
 
@@ -394,9 +427,7 @@ static int server_alloc_inodes(struct super_block *sb,
 
 	memcpy(&lecount, arg, arg_len);
 
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto out;
+	scoutfs_server_hold_commit(sb);
 
 	spin_lock(&sbi->next_ino_lock);
 	ino = le64_to_cpu(super->next_ino);
@@ -404,7 +435,7 @@ static int server_alloc_inodes(struct super_block *sb,
 	le64_add_cpu(&super->next_ino, nr);
 	spin_unlock(&sbi->next_ino_lock);
 
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = scoutfs_server_apply_commit(sb, 0);
 	if (ret == 0) {
 		ial.ino = cpu_to_le64(ino);
 		ial.nr = cpu_to_le64(nr);
@@ -575,6 +606,35 @@ static void set_extent_zone_bits(struct super_block *sb, void *cb_arg, struct sc
 	mod_extent_bits(cba->zones, cba->zone_blocks, ext->start, ext->len, true);
 }
 
+static int find_log_trees_item(struct super_block *sb,
+			       struct scoutfs_btree_root *logs_root,
+			       bool call_next, u64 rid, u64 nr,
+			       struct scoutfs_log_trees *lt_ret)
+{
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	int ret;
+
+	scoutfs_key_init_log_trees(&key, rid, nr);
+	if (call_next)
+		ret = scoutfs_btree_next(sb, logs_root, &key, &iref);
+	else
+		ret = scoutfs_btree_prev(sb, logs_root, &key, &iref);
+	if (ret == 0) {
+		if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
+			if (le64_to_cpu(iref.key->sklt_rid) != rid)
+				ret = -ENOENT;
+			else
+				memcpy(lt_ret, iref.val, iref.val_len);
+		} else {
+			ret = -EIO;
+		}
+		scoutfs_btree_put_iref(&iref);
+	}
+
+	return ret;
+}
+
 /*
  * Give the client roots to all the trees that they'll use to build
  * their transaction.
@@ -584,6 +644,9 @@ static void set_extent_zone_bits(struct super_block *sb, void *cb_arg, struct sc
  * trees back into the core allocators.  They were committed with the
  * previous transaction so they're stable and can now be reused, even by
  * the server in this commit.
+ *
+ * If the committed log trees are large enough we finalize them and make
+ * them available to log merging.
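+ *
+ * As a rough sketch (a hypothetical helper, not part of this patch),
+ * the nr a client writes to next falls out of the last item's state:
+ *
+ *	u64 next_open_nr(struct scoutfs_log_trees *last)
+ *	{
+ *		if (!last)
+ *			return 1;
+ *		if (le64_to_cpu(last->flags) & SCOUTFS_LOG_TREES_FINALIZED)
+ *			return le64_to_cpu(last->nr) + 1;
+ *		return le64_to_cpu(last->nr);
+ *	}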
 */
static int server_get_log_trees(struct super_block *sb,
				struct scoutfs_net_connection *conn,
@@ -595,10 +658,12 @@ static int server_get_log_trees(struct super_block *sb,
 	__le64 exclusive[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
 	__le64 vacant[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
 	struct alloc_extent_cb_args cba;
-	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_log_trees fin;
 	struct scoutfs_log_trees lt;
 	struct scoutfs_key key;
+	bool have_fin = false;
 	u64 data_zone_blocks;
+	u64 nr;
 	int ret;
 
 	if (arg_len != 0) {
@@ -606,38 +671,59 @@ static int server_get_log_trees(struct super_block *sb,
 		goto out;
 	}
 
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto out;
+	scoutfs_server_hold_commit(sb);
 
 	mutex_lock(&server->logs_mutex);
 
-	scoutfs_key_init_log_trees(&key, rid, U64_MAX);
-
-	ret = scoutfs_btree_prev(sb, &super->logs_root, &key, &iref);
+	/* see if we already have a finalized root from the rid */
+	ret = find_log_trees_item(sb, &super->logs_root, true, rid, 0, &lt);
 	if (ret < 0 && ret != -ENOENT)
 		goto unlock;
-	if (ret == 0) {
-		if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
-			key = *iref.key;
-			memcpy(&lt, iref.val, iref.val_len);
-			if (le64_to_cpu(key.sklt_rid) != rid)
-				ret = -ENOENT;
-		} else {
-			ret = -EIO;
-		}
-		scoutfs_btree_put_iref(&iref);
-		if (ret == -EIO)
-			goto unlock;
+	if (ret == 0 && le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)
		have_fin = true;
+
+	/* use the last non-finalized root, or start a new one */
+	ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX,
+				  &lt);
+	if (ret < 0 && ret != -ENOENT)
+		goto unlock;
+	if (ret == 0 && le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) {
+		ret = -ENOENT;
+		nr = le64_to_cpu(lt.nr) + 1;
+	} else if (ret == -ENOENT) {
+		nr = 1;
 	}
 
-	/* initialize new roots if we don't have any */
+	/* initialize a new root if we don't have a non-finalized one */
 	if (ret == -ENOENT) {
-		key.sklt_rid = cpu_to_le64(rid);
-		key.sklt_nr = cpu_to_le64(1);
 		memset(&lt, 0, sizeof(lt));
-		lt.rid = key.sklt_rid;
-		lt.nr = key.sklt_nr;
+		lt.rid = cpu_to_le64(rid);
+		lt.nr = cpu_to_le64(nr);
+	}
+
+	/* finalize an existing root when large enough and don't have one */
+	if (lt.item_root.height > 2 && !have_fin) {
+		fin = lt;
+		memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
+		memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
+		memset(&fin.data_avail, 0, sizeof(fin.data_avail));
+		memset(&fin.data_freed, 0, sizeof(fin.data_freed));
+		memset(&fin.srch_file, 0, sizeof(fin.srch_file));
+		le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED);
+
+		scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid),
+					   le64_to_cpu(fin.nr));
+		ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->logs_root, &key, &fin,
+					   sizeof(fin));
+		if (ret < 0)
+			goto unlock;
+
+		memset(&lt.item_root, 0, sizeof(lt.item_root));
+		memset(&lt.bloom_ref, 0, sizeof(lt.bloom_ref));
+		lt.max_item_seq = 0;
+		le64_add_cpu(&lt.nr, 1);
+		lt.flags = 0;
 	}
 
 	if (get_volopt_val(server, SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR,
			   &data_zone_blocks)) {
@@ -681,6 +767,8 @@ static int server_get_log_trees(struct super_block *sb,
 	}
 
 	/* update client's log tree's item */
+	scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid),
+				   le64_to_cpu(lt.nr));
 	ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
 				  &super->logs_root, &key, &lt, sizeof(lt));
 unlock:
@@ -717,11 +805,7 @@ static int server_commit_log_trees(struct super_block *sb,
 	/* don't modify the caller's log_trees */
 	memcpy(&lt, arg, sizeof(struct scoutfs_log_trees));
 
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret < 0) {
-		scoutfs_err(sb, "server error preparing commit: %d", ret);
-		goto out;
-	}
+	scoutfs_server_hold_commit(sb);
 
 	mutex_lock(&server->logs_mutex);
 
@@ -739,7 +823,7 @@ static int server_commit_log_trees(struct super_block *sb,
 	/* try to rotate the srch log when big enough */
 	mutex_lock(&server->srch_mutex);
 	ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
-				      &super->srch_root, &lt.srch_file);
+				      &super->srch_root, &lt.srch_file, false);
 	mutex_unlock(&server->srch_mutex);
 	if (ret < 0) {
 		scoutfs_err(sb, "server error, rotating srch log: %d", ret);
@@ -788,8 +872,9 @@ static int server_get_roots(struct super_block *sb,
 
 /*
  * A client is being evicted so we want to reclaim resources from their
- * log tree items.  The item trees and bloom refs stay around to be read
- * and eventually merged and we reclaim all the allocator items.
+ * open log tree item.  The item tree and bloom ref stay around to be
+ * read and we finalize the tree so that it will be merged.  We reclaim
+ * all the allocator items.
 *
 * The caller holds the commit rwsem which means we do all this work in
 * one server commit.  We'll need to keep the total amount of blocks in
@@ -803,7 +888,7 @@ static int server_get_roots(struct super_block *sb,
 * We can return an error without fully reclaiming all the log item's
 * referenced data.
 */
-static int reclaim_log_trees(struct super_block *sb, u64 rid)
+static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 {
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	DECLARE_SERVER_INFO(sb, server);
@@ -815,14 +900,16 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid)
 
 	mutex_lock(&server->logs_mutex);
 
-	/* find the client's existing item */
-	scoutfs_key_init_log_trees(&key, rid, 0);
-	ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
+	/* find the client's last open log_trees item */
+	scoutfs_key_init_log_trees(&key, rid, U64_MAX);
+	ret = scoutfs_btree_prev(sb, &super->logs_root, &key, &iref);
 	if (ret == 0) {
 		if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
 			key = *iref.key;
 			memcpy(&lt, iref.val, iref.val_len);
-			if (le64_to_cpu(key.sklt_rid) != rid)
+			if ((le64_to_cpu(key.sklt_rid) != rid) ||
+			    (le64_to_cpu(lt.flags) &
+			     SCOUTFS_LOG_TREES_FINALIZED))
 				ret = -ENOENT;
 		} else {
 			ret = -EIO;
@@ -835,6 +922,16 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid)
 		goto out;
 	}
 
+	/* force srch log file rotation if it's populated */
+	mutex_lock(&server->srch_mutex);
+	ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
+				      &super->srch_root, &lt.srch_file, true);
+	mutex_unlock(&server->srch_mutex);
+	if (ret < 0) {
+		scoutfs_err(sb, "server error, reclaim rotating srch log: %d", ret);
+		goto out;
+	}
+
 	/*
 	 * All of these can return errors after having modified the
 	 * allocator trees.  We have to try and update the roots in the
@@ -853,10 +950,11 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid)
 
 	/* the mount is no longer writing to the zones */
 	zero_data_alloc_zone_bits(&lt);
+	le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED);
 
 	err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
 				   &super->logs_root, &key, &lt, sizeof(lt));
-	BUG_ON(err != 0); /* alloc and log item roots out of sync */
+	BUG_ON(err != 0); /* alloc, log, srch items out of sync */
 
 out:
 	mutex_unlock(&server->logs_mutex);
@@ -952,9 +1050,7 @@ static int server_advance_seq(struct super_block *sb,
 		goto out;
 	}
 
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto out;
+	scoutfs_server_hold_commit(sb);
 
 	down_write(&server->seq_rwsem);
 
@@ -962,8 +1058,7 @@ static int server_advance_seq(struct super_block *sb,
 	if (ret < 0)
 		goto unlock;
 
-	seq = le64_to_cpu(super->next_trans_seq);
-	le64_add_cpu(&super->next_trans_seq, 1);
+	seq = scoutfs_server_next_seq(sb);
 
 	trace_scoutfs_trans_seq_advance(sb, rid, seq);
 
@@ -1001,6 +1096,43 @@ static int remove_trans_seq(struct super_block *sb, u64 rid)
 	return ret;
 }
 
+/*
+ * Give the caller the last seq before outstanding client commits.  All
+ * seqs up to and including this are stable; new client transactions can
+ * only have greater seqs.
+ */
+static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	u64 last_seq = 0;
+	int ret;
+
+	down_read(&server->seq_rwsem);
+
+	init_trans_seq_key(&key, 0, 0);
+	ret = scoutfs_btree_next(sb, &super->trans_seqs, &key, &iref);
+	if (ret == 0) {
+		last_seq = le64_to_cpu(iref.key->skts_trans_seq) - 1;
+		scoutfs_btree_put_iref(&iref);
+
+	} else if (ret == -ENOENT) {
+		last_seq = scoutfs_server_seq(sb) - 1;
+		ret = 0;
+	}
+
+	up_read(&server->seq_rwsem);
+
+	if (ret < 0)
+		last_seq = 0;
+
+	*last_seq_ret = last_seq;
+	return ret;
+}
+
 /*
 * Give the calling client the last valid trans_seq that it can return
 * in results from the indices of trans seqs to inodes.  These indices
These indices @@ -1013,13 +1145,9 @@ static int server_get_last_seq(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) { - DECLARE_SERVER_INFO(sb, server); - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - SCOUTFS_BTREE_ITEM_REF(iref); u64 rid = scoutfs_net_client_rid(conn); - struct scoutfs_key key; - __le64 last_seq = 0; + u64 last_seq = 0; + __le64 leseq; int ret; if (arg_len != 0) { @@ -1027,27 +1155,12 @@ static int server_get_last_seq(struct super_block *sb, goto out; } - down_read(&server->seq_rwsem); - - init_trans_seq_key(&key, 0, 0); - ret = scoutfs_btree_next(sb, &super->trans_seqs, &key, &iref); - if (ret == 0) { - key = *iref.key; - scoutfs_btree_put_iref(&iref); - last_seq = key.skts_trans_seq; - - } else if (ret == -ENOENT) { - last_seq = super->next_trans_seq; - ret = 0; - } - - le64_add_cpu(&last_seq, -1ULL); - trace_scoutfs_trans_seq_last(sb, rid, le64_to_cpu(last_seq)); - - up_read(&server->seq_rwsem); + ret = get_stable_trans_seq(sb, &last_seq); out: + trace_scoutfs_trans_seq_last(sb, rid, last_seq); + leseq = cpu_to_le64(last_seq); return scoutfs_net_response(sb, conn, cmd, id, ret, - &last_seq, sizeof(last_seq)); + &leseq, sizeof(leseq)); } static int server_lock(struct super_block *sb, @@ -1151,9 +1264,7 @@ static int server_srch_get_compact(struct super_block *sb, goto out; } - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; + scoutfs_server_hold_commit(sb); mutex_lock(&server->srch_mutex); ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri, @@ -1215,9 +1326,7 @@ static int server_srch_commit_compact(struct super_block *sb, } sc = arg; - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; + scoutfs_server_hold_commit(sb); mutex_lock(&server->srch_mutex); ret = scoutfs_srch_commit_compact(sb, &server->alloc, &server->wri, @@ -1241,6 +1350,910 @@ out: return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0); } +/* + * Log merge range items are stored at the starting fs key of the range. + * The only fs key field that doesn't hold information is the zone, so + * we use the zone to differentiate all types that we store in the log + * merge tree. + */ +static void init_log_merge_key(struct scoutfs_key *key, u8 zone, u64 first, + u64 second) +{ + *key = (struct scoutfs_key) { + .sk_zone = zone, + ._sk_first = cpu_to_le64(first), + ._sk_second = cpu_to_le64(second), + }; +} + +static int next_log_merge_item_key(struct super_block *sb, struct scoutfs_btree_root *root, + u8 zone, struct scoutfs_key *key, void *val, size_t val_len) +{ + SCOUTFS_BTREE_ITEM_REF(iref); + int ret; + + ret = scoutfs_btree_next(sb, root, key, &iref); + if (ret == 0) { + if (iref.key->sk_zone != zone) + ret = -ENOENT; + else if (iref.val_len != val_len) + ret = -EIO; + else + memcpy(val, iref.val, val_len); + scoutfs_btree_put_iref(&iref); + } + + return ret; +} + +static int next_log_merge_item(struct super_block *sb, + struct scoutfs_btree_root *root, + u8 zone, u64 first, u64 second, + void *val, size_t val_len) +{ + struct scoutfs_key key; + + init_log_merge_key(&key, zone, first, second); + return next_log_merge_item_key(sb, root, zone, &key, val, val_len); +} + +/* + * We start a log merge operation if there are any finalized log trees + * whose greatest seq is within the last stable seq. This is called by + * every client's get_log_merge handler at a relatively low frequency + * until a merge starts. 
+ */
+static int start_log_merge(struct super_block *sb,
+			   struct scoutfs_super_block *super,
+			   struct scoutfs_log_merge_status *stat_ret)
+{
+	struct server_info *server = SCOUTFS_SB(sb)->server_info;
+	struct scoutfs_log_merge_status stat;
+	struct scoutfs_log_merge_range rng;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_log_trees *lt;
+	struct scoutfs_key key;
+	u64 last_seq;
+	bool start;
+	int ret;
+	int err;
+
+	ret = get_stable_trans_seq(sb, &last_seq);
+	if (ret < 0)
+		goto out;
+
+	scoutfs_key_init_log_trees(&key, 0, 0);
+	for (start = false; !start; scoutfs_key_inc(&key)) {
+		ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(*lt)) {
+				key = *iref.key;
+				lt = iref.val;
+				if ((le64_to_cpu(lt->flags) &
+				     SCOUTFS_LOG_TREES_FINALIZED) &&
+				    (le64_to_cpu(lt->max_item_seq) <=
+				     last_seq)) {
+					start = true;
+				}
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0)
+			goto out;
+	}
+
+	if (!start) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	/* add an initial full range */
+	scoutfs_key_set_zeros(&rng.start);
+	scoutfs_key_set_ones(&rng.end);
+	key = rng.start;
+	key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key, &rng, sizeof(rng));
+	if (ret < 0)
+		goto out;
+
+	/* and add the merge status item */
+	scoutfs_key_set_zeros(&stat.next_range_key);
+	stat.nr_requests = 0;
+	stat.nr_complete = 0;
+	stat.last_seq = cpu_to_le64(last_seq);
+	stat.seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key,
+				   &stat, sizeof(stat));
+	if (ret < 0) {
+		key = rng.start;
+		key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+		err = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key);
+		BUG_ON(err); /* inconsistent */
+	}
+
+	/* queue free work to see if there are lingering items to process */
+	if (ret == 0)
+		queue_work(server->wq, &server->log_merge_free_work);
+out:
+	if (ret == 0)
+		*stat_ret = stat;
+	return ret;
+}
+
+/* Requests drain once we get this many completions to splice */
+#define LOG_MERGE_SPLICE_BATCH 8
+
+/*
+ * Splice the completed subtrees from the clients back into the fs
+ * tree as parents.  Once they're spliced in, try and rebalance a path
+ * through them in case they need to be split or joined before the rest
+ * of their range can be processed.
+ *
+ * It's only safe to splice in merged parents when all the requests have
+ * drained and no requests are relying on stable key ranges of parents
+ * in the fs root.
+ *
+ * It doesn't matter that the fs tree produced by these subtree splices
+ * itself contains inconsistent items because the subtrees can contain
+ * fragments of transactions.  The read-only finalized log btrees that
+ * are the source of the spliced items are still preferred by readers.
+ * It's only once all the finalized items have been merged, and all
+ * transactions are consistent, that we remove the finalized log trees
+ * and the fs tree items are used.
+ *
+ * As we splice in the subtrees we're implicitly allocating all the
+ * blocks referenced by the new subtree, and freeing all the blocks
+ * referenced by the old subtree that's overwritten.  These allocs and
+ * frees were performed by the client as it did cow updates and were
+ * stored in the allocators that were sent with the completion.  We
+ * merge in those allocators as we splice in the subtree.
+ *
+ * We can add back any remaining ranges for any partial completions and
+ * reset the next range key if there's still work to do.  If the
+ * operation is complete then we tear down the input log_trees items and
+ * delete the status.
+ */
+static int splice_log_merge_completions(struct super_block *sb,
+					struct scoutfs_log_merge_status *stat,
+					bool no_ranges)
+{
+	struct server_info *server = SCOUTFS_SB(sb)->server_info;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	struct scoutfs_log_merge_complete comp;
+	struct scoutfs_log_merge_freeing fr;
+	struct scoutfs_log_merge_range rng;
+	struct scoutfs_log_trees lt = {{{0,}}};
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	u64 seq;
+	int ret;
+
+	/* mustn't rebalance fs tree parents while reqs rely on their key bounds */
+	if (WARN_ON_ONCE(le64_to_cpu(stat->nr_requests) > 0))
+		return -EIO;
+
+	/*
+	 * Splice in all the completed subtrees at the initial parent
+	 * blocks in the main fs_tree before rebalancing any of them.
+	 */
+	for (seq = 0; ; seq++) {
+
+		ret = next_log_merge_item(sb, &super->log_merge,
+					  SCOUTFS_LOG_MERGE_COMPLETE_ZONE, seq,
+					  0, &comp, sizeof(comp));
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
+			goto out;
+		}
+
+		seq = le64_to_cpu(comp.seq);
+
+		ret = scoutfs_btree_set_parent(sb, &server->alloc, &server->wri,
+					       &super->fs_root, &comp.start,
+					       &comp.root);
+		if (ret < 0)
+			goto out;
+
+		mutex_lock(&server->alloc_mutex);
+		ret = scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&comp.meta_avail) ?:
+		      scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&comp.meta_freed);
+		mutex_unlock(&server->alloc_mutex);
+		if (ret < 0)
+			goto out;
+
+		/* clear allocators */
+		memset(&comp.meta_avail, 0, sizeof(comp.meta_avail));
+		memset(&comp.meta_freed, 0, sizeof(comp.meta_freed));
+
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_COMPLETE_ZONE,
+				   seq, 0);
+		ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &comp, sizeof(comp));
+		if (ret < 0)
+			goto out;
+	}
+
+	/*
+	 * Now with all the parent blocks spliced in, rebalance items
+	 * amongst parents that needed to split/join and delete the
+	 * completion items, possibly returning ranges to process.
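+	 *
+	 * A partial completion is handled roughly as follows (field
+	 * names from the structures above, control flow simplified):
+	 *
+	 *	if (flags & SCOUTFS_LOG_MERGE_COMP_REMAIN) {
+	 *		rng.start = comp.remain;	// unprocessed tail
+	 *		rng.end = comp.end;
+	 *		reinsert rng as a range item for a later request;
+	 *	}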
+	 */
+	for (seq = 0; ; seq++) {
+		ret = next_log_merge_item(sb, &super->log_merge,
+					  SCOUTFS_LOG_MERGE_COMPLETE_ZONE, seq,
+					  0, &comp, sizeof(comp));
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
+			goto out;
+		}
+
+		seq = le64_to_cpu(comp.seq);
+
+		/* balance when there was a remaining key range */
+		if (le64_to_cpu(comp.flags) & SCOUTFS_LOG_MERGE_COMP_REMAIN) {
+			ret = scoutfs_btree_rebalance(sb, &server->alloc,
+						      &server->wri,
+						      &super->fs_root,
+						      &comp.start);
+			if (ret < 0)
+				goto out;
+
+			rng.start = comp.remain;
+			rng.end = comp.end;
+
+			key = rng.start;
+			key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+			ret = scoutfs_btree_insert(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key,
+						   &rng, sizeof(rng));
+			if (ret < 0)
+				goto out;
+			no_ranges = false;
+		}
+
+		/* delete the completion item */
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_COMPLETE_ZONE,
+				   seq, 0);
+		ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+					   &super->log_merge,
+					   &key);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* update the status once all completions are processed */
+	scoutfs_key_set_zeros(&stat->next_range_key);
+	stat->nr_complete = 0;
+
+	/* update counts and we're done if there are still ranges to process */
+	if (!no_ranges) {
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+		ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   stat, sizeof(*stat));
+		goto out;
+	}
+
+	/* no more ranges, free blooms and add freeing items for free work */
+	lt.rid = 0;
+	lt.nr = 0;
+	for (;;) {
+		scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid),
+					   le64_to_cpu(lt.nr) + 1);
+		ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(lt)) {
+				key = *iref.key;
+				memcpy(&lt, iref.val, sizeof(lt));
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
+			goto out;
+		}
+
+		/* only free the inputs to the log merge that just finished */
+		if (!(le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
+		    (le64_to_cpu(lt.max_item_seq) >
+		     le64_to_cpu(stat->last_seq)))
+			continue;
+
+		fr.root = lt.item_root;
+		scoutfs_key_set_zeros(&fr.key);
+		fr.seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_FREEING_ZONE,
+				   le64_to_cpu(fr.seq), 0);
+		ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &fr, sizeof(fr));
+		if (ret < 0)
+			goto out;
+
+		if (lt.bloom_ref.blkno) {
+			ret = scoutfs_free_meta(sb, &server->alloc,
+						&server->wri,
+						le64_to_cpu(lt.bloom_ref.blkno));
+			if (ret < 0)
+				goto out;
+		}
+
+		scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid),
+					   le64_to_cpu(lt.nr));
+		ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+					   &super->logs_root, &key);
+		if (ret < 0)
+			goto out;
+	}
+
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+	ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key);
+	if (ret == 0)
+		queue_work(server->wq, &server->log_merge_free_work);
+out:
+	BUG_ON(ret); /* inconsistent */
+
+	return ret;
+}
+
+/*
+ * Search amongst the finalized log roots within the caller's merge seq looking
+ * for the earliest item within the caller's range.  The caller has taken
+ * care of locking.
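+ *
+ * Conceptually it's a min() over the first in-range key of each
+ * eligible finalized root; as a sketch (the iterator is hypothetical):
+ *
+ *	scoutfs_key_set_ones(&least);
+ *	for each finalized root with max_item_seq <= seq:
+ *		if (first key of item_root in [start, end]) < least:
+ *			least = that key;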
+ */
+static int next_least_log_item(struct super_block *sb,
+			       struct scoutfs_btree_root *logs_root,
+			       u64 seq, struct scoutfs_key *start,
+			       struct scoutfs_key *end,
+			       struct scoutfs_key *next_ret)
+{
+	struct scoutfs_btree_root item_root;
+	struct scoutfs_log_trees *lt;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	int ret;
+
+	scoutfs_key_set_ones(next_ret);
+
+	for (scoutfs_key_init_log_trees(&key, 0, 0); ; scoutfs_key_inc(&key)) {
+
+		/* find the next finalized log root within the merge last_seq */
+		ret = scoutfs_btree_next(sb, logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(*lt)) {
+				key = *iref.key;
+				lt = iref.val;
+				if ((le64_to_cpu(lt->flags) &
+				     SCOUTFS_LOG_TREES_FINALIZED) &&
+				    (le64_to_cpu(lt->max_item_seq) <= seq))
+					item_root = lt->item_root;
+				else
+					item_root.ref.blkno = 0;
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			goto out;
+		}
+		if (item_root.ref.blkno == 0)
+			continue;
+
+		/* see if populated roots have item keys less than next */
+		ret = scoutfs_btree_next(sb, &item_root, start, &iref);
+		if (ret == 0) {
+			if (scoutfs_key_compare(iref.key, end) <= 0 &&
+			    scoutfs_key_compare(iref.key, next_ret) < 0)
+				*next_ret = *iref.key;
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			else
+				goto out;
+		}
+	}
+
+out:
+	if (ret == 0 && scoutfs_key_is_ones(next_ret))
+		ret = -ENOENT;
+
+	return ret;
+}
+
+/*
+ * Once a merge is fully completed all of the finalized input log btrees
+ * are redundant and can be freed.
+ *
+ * As merging finishes and the status item is deleted, we also move all
+ * the finalized roots from log_trees items over into freeing items.
+ * This work is then kicked off which iterates over all the freeing
+ * items calling into the btree to free all its referenced blocks, with
+ * the key tracking partial progress.
+ *
+ * The freeing work is reasonably light.  We only read the btree blocks
+ * and add freed blocks to merge back into the core allocators.  The
+ * server can handle this load and we avoid the io overhead and
+ * complexity of farming it out to clients.
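+ *
+ * In outline, each pass of the work below is:
+ *
+ *	find the first freeing item -> fr;
+ *	free a bounded batch of fr.root's blocks, advancing fr.key;
+ *	fr.key all ones ? delete the freeing item : update it;
+ *	apply the commit and loop until -ENOENT or shutdown;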
+ */
+static void server_log_merge_free_work(struct work_struct *work)
+{
+	struct server_info *server = container_of(work, struct server_info,
+						  log_merge_free_work);
+	struct super_block *sb = server->sb;
+	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	struct scoutfs_log_merge_freeing fr;
+	struct scoutfs_key key;
+	bool commit = false;
+	int ret = 0;
+
+	/* shutdown waits for us, we'll eventually see shutting_down set */
+	while (!server->shutting_down) {
+		scoutfs_server_hold_commit(sb);
+		mutex_lock(&server->logs_mutex);
+		commit = true;
+
+		ret = next_log_merge_item(sb, &super->log_merge,
+					  SCOUTFS_LOG_MERGE_FREEING_ZONE,
+					  0, 0, &fr, sizeof(fr));
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			break;
+		}
+
+		ret = scoutfs_btree_free_blocks(sb, &server->alloc,
+						&server->wri, &fr.key,
+						&fr.root, 10);
+		if (ret < 0)
+			break;
+
+		/* freed blocks are in allocator, we *have* to update key */
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_FREEING_ZONE,
+				   le64_to_cpu(fr.seq), 0);
+		if (scoutfs_key_is_ones(&fr.key))
+			ret = scoutfs_btree_delete(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key);
+		else
+			ret = scoutfs_btree_update(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key,
+						   &fr, sizeof(fr));
+		/* freed blocks are in allocator, we *have* to update fr */
+		BUG_ON(ret < 0);
+
+		mutex_unlock(&server->logs_mutex);
+		ret = scoutfs_server_apply_commit(sb, ret);
+		commit = false;
+		if (ret < 0)
+			break;
+	}
+
+	if (commit) {
+		mutex_unlock(&server->logs_mutex);
+		ret = scoutfs_server_apply_commit(sb, ret);
+	}
+
+	if (ret < 0) {
+		scoutfs_err(sb, "server error freeing merged btree blocks: %d",
+			    ret);
+		stop_server(server);
+	}
+
+	/* not re-arming, regularly queued by the server during merging */
+}
+
+/*
+ * This will return ENOENT to the client if there is no work to do.
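+ *
+ * From the client side the exchange looks roughly like this
+ * (hypothetical client loop, not part of this patch):
+ *
+ *	send GET_LOG_MERGE;
+ *	-ENOENT ? back off and retry later;
+ *	0 ? merge items in [req.start, req.end] from req.logs_root into
+ *	    req.root using req.meta_avail, then send COMMIT_LOG_MERGE;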
+ */
+static int server_get_log_merge(struct super_block *sb,
+				struct scoutfs_net_connection *conn,
+				u8 cmd, u64 id, void *arg, u16 arg_len)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	u64 rid = scoutfs_net_client_rid(conn);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	struct scoutfs_log_merge_status stat;
+	struct scoutfs_log_merge_range rng;
+	struct scoutfs_log_merge_range remain;
+	struct scoutfs_log_merge_request req;
+	struct scoutfs_key par_start;
+	struct scoutfs_key par_end;
+	struct scoutfs_key next_key;
+	struct scoutfs_key key;
+	bool ins_rng;
+	bool del_remain;
+	bool del_req;
+	bool upd_stat;
+	bool no_ranges;
+	bool no_next;
+	int ret;
+	int err;
+
+	if (arg_len != 0)
+		return -EINVAL;
+
+	scoutfs_server_hold_commit(sb);
+	mutex_lock(&server->logs_mutex);
+
+restart:
+	memset(&req, 0, sizeof(req));
+	ins_rng = false;
+	del_remain = false;
+	del_req = false;
+	upd_stat = false;
+
+	/* get the status item, maybe creating a new one */
+	ret = next_log_merge_item(sb, &super->log_merge,
+				  SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0,
+				  &stat, sizeof(stat));
+	if (ret == -ENOENT)
+		ret = start_log_merge(sb, super, &stat);
+	if (ret < 0)
+		goto out;
+
+	trace_scoutfs_get_log_merge_status(sb, rid, &stat.next_range_key,
+					   le64_to_cpu(stat.nr_requests),
+					   le64_to_cpu(stat.nr_complete),
+					   le64_to_cpu(stat.last_seq),
+					   le64_to_cpu(stat.seq));
+
+	/* find the next range, always checking for splicing */
+	for (;;) {
+		key = stat.next_range_key;
+		key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+		ret = next_log_merge_item_key(sb, &super->log_merge, SCOUTFS_LOG_MERGE_RANGE_ZONE,
+					      &key, &rng, sizeof(rng));
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+
+		/* maybe splice now that we know if there are ranges */
+		no_next = ret == -ENOENT;
+		no_ranges = scoutfs_key_is_zeros(&stat.next_range_key) && ret == -ENOENT;
+		if (le64_to_cpu(stat.nr_requests) == 0 &&
+		    (no_next || le64_to_cpu(stat.nr_complete) >= LOG_MERGE_SPLICE_BATCH)) {
+			ret = splice_log_merge_completions(sb, &stat, no_ranges);
+			if (ret < 0)
+				goto out;
+			/* splicing resets key and adds ranges, could finish status */
+			goto restart;
+		}
+
+		/* no next range for requests; future attempts will create or splice */
+		if (no_next) {
+			ret = -ENOENT;
+			goto out;
+		}
+
+		/* see if we should back off after splicing might have deleted completions */
+		if ((le64_to_cpu(stat.nr_requests) +
+		     le64_to_cpu(stat.nr_complete)) >= LOG_MERGE_SPLICE_BATCH) {
+			ret = -ENOENT;
+			goto out;
+		}
+
+		/* find the next logged item in the next range */
+		ret = next_least_log_item(sb, &super->logs_root,
+					  le64_to_cpu(stat.last_seq),
+					  &rng.start, &rng.end, &next_key);
+		if (ret == 0)
+			break;
+		/* drop the range if it contained no logged items */
+		if (ret == -ENOENT) {
+			key = rng.start;
+			key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+			ret = scoutfs_btree_delete(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key);
+		}
+		if (ret < 0)
+			goto out;
+	}
+
+	/* start to build the request that's saved and sent to the client */
+	req.logs_root = super->logs_root;
+	req.last_seq = stat.last_seq;
+	req.rid = cpu_to_le64(rid);
+	req.seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+	req.flags = 0;
+	if (super->fs_root.height > 2)
+		req.flags |= cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE);
+
+	/* find the fs_root parent block and its key range */
+	ret = scoutfs_btree_get_parent(sb, &super->fs_root, &next_key,
+				       &req.root) ?:
+	      scoutfs_btree_parent_range(sb, &super->fs_root, &next_key,
+					 &par_start, &par_end);
+	if (ret < 0)
+		goto out;
+
+	/* start from next item, don't exceed parent key range */
+	req.start = next_key;
+	req.end = rng.end;
+	if (scoutfs_key_compare(&par_end, &req.end) < 0)
+		req.end = par_end;
+
+	/* delete the old range */
+	key = rng.start;
+	key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+	ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key);
+	if (ret < 0)
+		goto out;
+	ins_rng = true;
+
+	/* add remaining range if we have to */
+	if (scoutfs_key_compare(&rng.end, &req.end) > 0) {
+		remain.start = req.end;
+		scoutfs_key_inc(&remain.start);
+		remain.end = rng.end;
+
+		key = remain.start;
+		key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+		ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &remain, sizeof(remain));
+		if (ret < 0)
+			goto out;
+		del_remain = true;
+	}
+
+	/* give the client an allocation pool to work with */
+	mutex_lock(&server->alloc_mutex);
+	ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
+				      &req.meta_avail, server->meta_avail,
+				      SCOUTFS_SERVER_MERGE_FILL_LO,
+				      SCOUTFS_SERVER_MERGE_FILL_TARGET);
+	mutex_unlock(&server->alloc_mutex);
+	if (ret < 0)
+		goto out;
+
+	/* save the request that will be sent to the client */
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_REQUEST_ZONE, rid,
+			   le64_to_cpu(req.seq));
+	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key,
+				   &req, sizeof(req));
+	if (ret < 0)
+		goto out;
+	del_req = true;
+
+	trace_scoutfs_get_log_merge_request(sb, rid, &req.root,
+					    &req.start, &req.end,
+					    le64_to_cpu(req.last_seq),
+					    le64_to_cpu(req.seq));
+
+	/* make sure next range avoids ranges for parent in use */
+	stat.next_range_key = par_end;
+	if (!scoutfs_key_is_ones(&stat.next_range_key))
+		scoutfs_key_inc(&stat.next_range_key);
+
+	/* update the status requests count */
+	le64_add_cpu(&stat.nr_requests, 1);
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+	ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key,
+				   &stat, sizeof(stat));
+	if (ret < 0)
+		goto out;
+	upd_stat = true;
+
+out:
+	if (ret < 0) {
+		/* undo any of our partial item changes */
+		if (upd_stat) {
+			le64_add_cpu(&stat.nr_requests, -1ULL);
+			init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE,
+					   0, 0);
+			err = scoutfs_btree_update(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key,
+						   &stat, sizeof(stat));
+			BUG_ON(err); /* inconsistent */
+		}
+
+		if (del_req) {
+			init_log_merge_key(&key, SCOUTFS_LOG_MERGE_REQUEST_ZONE,
+					   rid, le64_to_cpu(req.seq));
+			err = scoutfs_btree_delete(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key);
+			BUG_ON(err); /* inconsistent */
+		}
+
+		if (del_remain) {
+			key = remain.start;
+			key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+			err = scoutfs_btree_delete(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key);
+			BUG_ON(err); /* inconsistent */
+		}
+
+		if (ins_rng) {
+			key = rng.start;
+			key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+			err = scoutfs_btree_insert(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key,
+						   &rng, sizeof(rng));
+			BUG_ON(err); /* inconsistent */
+		}
+
+		/* reclaim allocation if we failed */
+		mutex_lock(&server->alloc_mutex);
+		err = scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&req.meta_avail);
+		mutex_unlock(&server->alloc_mutex);
+		BUG_ON(err); /* inconsistent */
+	}
+
+	mutex_unlock(&server->logs_mutex);
+	ret = scoutfs_server_apply_commit(sb, ret);
+
+	return scoutfs_net_response(sb, conn, cmd, id, ret, &req, sizeof(req));
+}
+
+/*
+ * Commit the client's log merge work.  Typically we store the
+ * completion so that we can later splice it back into the fs root and
+ * reclaim its allocators in a batch.  If it failed we reclaim it
+ * immediately.
+ */
+static int server_commit_log_merge(struct super_block *sb,
+				   struct scoutfs_net_connection *conn,
+				   u8 cmd, u64 id, void *arg, u16 arg_len)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	u64 rid = scoutfs_net_client_rid(conn);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	struct scoutfs_log_merge_request orig_req;
+	struct scoutfs_log_merge_complete *comp;
+	struct scoutfs_log_merge_status stat;
+	struct scoutfs_log_merge_range rng;
+	struct scoutfs_key key;
+	int ret;
+
+	scoutfs_key_set_zeros(&rng.end);
+
+	if (arg_len != sizeof(struct scoutfs_log_merge_complete))
+		return -EINVAL;
+	comp = arg;
+
+	trace_scoutfs_get_log_merge_complete(sb, rid, &comp->root,
+					     &comp->start, &comp->end,
+					     &comp->remain,
+					     le64_to_cpu(comp->seq),
+					     le64_to_cpu(comp->flags));
+
+	scoutfs_server_hold_commit(sb);
+	mutex_lock(&server->logs_mutex);
+
+	/* find the status of the current log merge */
+	ret = next_log_merge_item(sb, &super->log_merge,
+				  SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0,
+				  &stat, sizeof(stat));
+	if (ret < 0) {
+		WARN_ON_ONCE(ret == -ENOENT); /* inconsistent */
+		goto out;
+	}
+
+	/* find the completion's original saved request */
+	ret = next_log_merge_item(sb, &super->log_merge,
+				  SCOUTFS_LOG_MERGE_REQUEST_ZONE,
+				  rid, le64_to_cpu(comp->seq),
+				  &orig_req, sizeof(orig_req));
+	if (WARN_ON_ONCE(ret == 0 && (comp->rid != orig_req.rid ||
+				      comp->seq != orig_req.seq)))
+		ret = -ENOENT; /* inconsistency */
+	if (ret < 0) {
+		WARN_ON_ONCE(ret == -ENOENT); /* inconsistency */
+		goto out;
+	}
+
+	/* delete the original request item */
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_REQUEST_ZONE, rid,
+			   le64_to_cpu(orig_req.seq));
+	ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key);
+	if (ret < 0)
+		goto out;
+
+	if (le64_to_cpu(comp->flags) & SCOUTFS_LOG_MERGE_COMP_ERROR) {
+		/* restore the range and reclaim the allocator if it failed */
+		rng.start = orig_req.start;
+		rng.end = orig_req.end;
+
+		key = rng.start;
+		key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+		ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &rng, sizeof(rng));
+		if (ret < 0)
+			goto out;
+
+		mutex_lock(&server->alloc_mutex);
+		ret = scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&orig_req.meta_avail) ?:
+		      scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&orig_req.meta_freed);
+		mutex_unlock(&server->alloc_mutex);
+		if (ret < 0)
+			goto out;
+
+	} else {
+		/* otherwise store the completion for later splicing */
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_COMPLETE_ZONE,
+				   le64_to_cpu(comp->seq), 0);
+		ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   comp, sizeof(*comp));
+		if (ret < 0)
+			goto out;
+
+		le64_add_cpu(&stat.nr_complete, 1ULL);
+	}
+
+	/* and update the status counts */
+	le64_add_cpu(&stat.nr_requests, -1ULL);
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+	ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key,
+				   &stat, sizeof(stat));
+	if (ret < 0)
+		goto out;
+
+out:
+	mutex_unlock(&server->logs_mutex);
+	ret = scoutfs_server_apply_commit(sb, ret);
+	BUG_ON(ret < 0); /* inconsistent */
+
+	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
+}
+
 /* The server is receiving an omap response from the client */
 static int open_ino_map_response(struct super_block *sb, struct scoutfs_net_connection *conn,
 				 void *resp, unsigned int resp_len, int error, void *data)
@@ -1347,9 +2360,7 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
 
 	mutex_lock(&server->volopt_mutex);
 
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto unlock;
+	scoutfs_server_hold_commit(sb);
 
 	if (le64_to_cpu(volopt->set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) {
 		opt = le64_to_cpu(volopt->data_alloc_zone_blocks);
@@ -1389,7 +2400,6 @@ apply:
 	super->volopt = server->volopt;
 	write_seqcount_end(&server->volopt_seqcount);
 
-unlock:
 	mutex_unlock(&server->volopt_mutex);
 out:
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
@@ -1419,9 +2429,7 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
 
 	mutex_lock(&server->volopt_mutex);
 
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto unlock;
+	scoutfs_server_hold_commit(sb);
 
 	for (i = 0, bit = 1, opt = first_valopt(&super->volopt); i < 64; i++, bit <<= 1, opt++) {
 		if (le64_to_cpu(volopt->set_bits) & bit) {
@@ -1439,7 +2447,6 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
 	super->volopt = server->volopt;
 	write_seqcount_end(&server->volopt_seqcount);
 
-unlock:
 	mutex_unlock(&server->volopt_mutex);
 out:
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
@@ -1585,6 +2592,113 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid)
 	return ret;
 }
 
+/*
+ * Clean up any log merge requests which have now been abandoned because
+ * their client was evicted.  This is always called on eviction and
+ * there may have been no merge in progress or our client had no
+ * outstanding requests.  For each pending request, we reclaim its
+ * allocators, delete its item, and update the status.
+ *
+ * The request we cancel might have been the last request which
+ * prevented batch processing, but we don't check that here.  This is in
+ * the client eviction path and we want that to be as light and
+ * responsive as possible so we can get back up and running.  The next
+ * client get_log_merge request will see that no more requests are
+ * outstanding.
+ *
+ * The caller holds a commit, but we're responsible for locking.
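+ *
+ * Per abandoned request, the cleanup below amounts to (field names
+ * from the saved request item):
+ *
+ *	delete the request item at (rid, seq);
+ *	reinsert [req.start, req.end] as a range item;
+ *	splice req.meta_avail and req.meta_freed back into the server;
+ *	nr_requests--;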
+ */ +static int cancel_log_merge(struct super_block *sb, u64 rid) +{ + DECLARE_SERVER_INFO(sb, server); + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_log_merge_status stat; + struct scoutfs_log_merge_request req; + struct scoutfs_log_merge_range rng; + struct scoutfs_key key; + bool update = false; + u64 seq; + int ret; + + mutex_lock(&server->logs_mutex); + + ret = next_log_merge_item(sb, &super->log_merge, + SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0, + &stat, sizeof(stat)); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + for (seq = 0; ; seq++) { + ret = next_log_merge_item(sb, &super->log_merge, + SCOUTFS_LOG_MERGE_REQUEST_ZONE, rid, + seq, &req, sizeof(req)); + if (ret == 0 && le64_to_cpu(req.rid) != rid) + ret = -ENOENT; + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + break; + } + + seq = le64_to_cpu(req.seq); + + /* remove request item */ + init_log_merge_key(&key, SCOUTFS_LOG_MERGE_REQUEST_ZONE, rid, + le64_to_cpu(req.seq)); + ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri, + &super->log_merge, &key); + if (ret < 0) + goto out; + + /* restore range */ + rng.start = req.start; + rng.end = req.end; + + key = rng.start; + key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; + ret = scoutfs_btree_insert(sb, &server->alloc, + &server->wri, + &super->log_merge, &key, + &rng, sizeof(rng)); + if (ret < 0) + goto out; + + /* reclaim allocator */ + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, + server->other_freed, + &req.meta_avail) ?: + scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, + server->other_freed, + &req.meta_freed); + mutex_unlock(&server->alloc_mutex); + if (ret < 0) + goto out; + + /* update count */ + le64_add_cpu(&stat.nr_requests, -1ULL); + update = true; + } + + if (update) { + /* and update the status counts */ + init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0); + ret = scoutfs_btree_update(sb, &server->alloc, &server->wri, + &super->log_merge, &key, + &stat, sizeof(stat)); + } +out: + mutex_unlock(&server->logs_mutex); + + BUG_ON(ret < 0); /* XXX inconsistent */ + return ret; +} + /* * Farewell processing is async to the request processing work. Shutdown * waits for request processing to finish and then tears down the connection. 
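A note on the error sequencing above: like reclaim_rid() further down, the two
scoutfs_alloc_splice_list() calls are chained with the GNU C "a ?: b"
conditional, which evaluates b only when a yielded zero, so a run of
int-returning steps stops at the first failure.  A minimal standalone sketch
of the idiom, with hypothetical step functions standing in for the btree and
allocator calls:

	/* build with gcc; "?:" without a middle operand is a GNU C extension */
	#include <stdio.h>

	static int step_one(void)   { return 0; }	/* succeeds */
	static int step_two(void)   { return -5; }	/* fails, say -EIO */
	static int step_three(void) { return 0; }	/* never evaluated below */

	int main(void)
	{
		/* each step runs only if everything before it returned 0 */
		int ret = step_one() ?:
			  step_two() ?:
			  step_three();

		printf("ret = %d\n", ret);	/* prints -5; step_three was skipped */
		return 0;
	}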
@@ -1652,9 +2766,7 @@ static int server_greeting(struct super_block *sb,
 	}
 
 	if (gr->server_term == 0) {
-		ret = scoutfs_server_hold_commit(sb);
-		if (ret < 0)
-			goto send_err;
+		scoutfs_server_hold_commit(sb);
 
 		ret = insert_mounted_client(sb, le64_to_cpu(gr->rid),
					    le64_to_cpu(gr->flags), &conn->peername);
@@ -1727,15 +2839,14 @@ static int reclaim_rid(struct super_block *sb, u64 rid)
 {
 	int ret;
 
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret < 0)
-		return ret;
+	scoutfs_server_hold_commit(sb);
 
 	/* delete mounted client last, recovery looks for it */
 	ret = scoutfs_lock_server_farewell(sb, rid) ?:
 	      remove_trans_seq(sb, rid) ?:
-	      reclaim_log_trees(sb, rid) ?:
+	      reclaim_open_log_tree(sb, rid) ?:
 	      cancel_srch_compact(sb, rid) ?:
+	      cancel_log_merge(sb, rid) ?:
 	      scoutfs_omap_remove_rid(sb, rid) ?:
 	      delete_mounted_client(sb, rid);
 
@@ -1971,6 +3082,8 @@ static scoutfs_net_request_t server_req_funcs[] = {
 	[SCOUTFS_NET_CMD_LOCK] = server_lock,
 	[SCOUTFS_NET_CMD_SRCH_GET_COMPACT] = server_srch_get_compact,
 	[SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT] = server_srch_commit_compact,
+	[SCOUTFS_NET_CMD_GET_LOG_MERGE] = server_get_log_merge,
+	[SCOUTFS_NET_CMD_COMMIT_LOG_MERGE] = server_commit_log_merge,
 	[SCOUTFS_NET_CMD_OPEN_INO_MAP] = server_open_ino_map,
 	[SCOUTFS_NET_CMD_GET_VOLOPT] = server_get_volopt,
 	[SCOUTFS_NET_CMD_SET_VOLOPT] = server_set_volopt,
@@ -2244,7 +3357,7 @@ static void scoutfs_server_worker(struct work_struct *work)
 	struct scoutfs_net_connection *conn = NULL;
 	DECLARE_WAIT_QUEUE_HEAD(waitq);
 	struct sockaddr_in sin;
-	u64 max_vers;
+	u64 max_seq;
 	int ret;
 
 	trace_scoutfs_server_work_enter(sb, 0, 0);
@@ -2284,6 +3397,7 @@ static void scoutfs_server_worker(struct work_struct *work)
 	server->volopt = super->volopt;
 	write_seqcount_end(&server->volopt_seqcount);
 
+	atomic64_set(&server->seq_atomic, le64_to_cpu(super->seq));
 	set_roots(server, &super->fs_root, &super->logs_root, &super->srch_root);
 	scoutfs_block_writer_init(sb, &server->wri);
 
@@ -2307,13 +3421,14 @@ static void scoutfs_server_worker(struct work_struct *work)
 	    le64_to_cpu(server->meta_avail->total_len))
 		swap(server->meta_avail, server->meta_freed);
 
-	ret = scoutfs_forest_get_max_vers(sb, super, &max_vers);
+	ret = scoutfs_forest_get_max_seq(sb, super, &max_seq);
 	if (ret) {
-		scoutfs_err(sb, "server couldn't find max item vers: %d", ret);
+		scoutfs_err(sb, "server couldn't find max item seq: %d", ret);
 		goto shutdown;
 	}
+	scoutfs_server_set_seq_if_greater(sb, max_seq);
 
-	ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri, max_vers) ?:
+	ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri) ?:
 	      start_recovery(sb);
 	if (ret)
 		goto shutdown;
@@ -2341,6 +3456,8 @@ shutdown:
 	scoutfs_net_shutdown(sb, conn);
 	server->conn = NULL;
 
+	flush_work(&server->log_merge_free_work);
+
 	/* stop tracking recovery, cancel timer, flush any fencing */
 	scoutfs_recov_shutdown(sb);
 	flush_work(&server->fence_pending_recov_work);
@@ -2408,6 +3525,7 @@ void scoutfs_server_stop(struct super_block *sb)
 	cancel_work_sync(&server->work);
 	cancel_work_sync(&server->farewell_work);
 	cancel_work_sync(&server->commit_work);
+	cancel_work_sync(&server->log_merge_free_work);
 }
 
 int scoutfs_server_setup(struct super_block *sb)
@@ -2433,6 +3551,7 @@ int scoutfs_server_setup(struct super_block *sb)
 	INIT_WORK(&server->farewell_work, farewell_worker);
 	mutex_init(&server->alloc_mutex);
 	mutex_init(&server->logs_mutex);
+	INIT_WORK(&server->log_merge_free_work, server_log_merge_free_work);
 	mutex_init(&server->srch_mutex);
 	mutex_init(&server->mounted_clients_mutex);
 	seqcount_init(&server->roots_seqcount);
diff --git a/kmod/src/server.h b/kmod/src/server.h
index 8d31a271..79fcb443 100644
--- a/kmod/src/server.h
+++ b/kmod/src/server.h
@@ -62,7 +62,7 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
 				 struct scoutfs_net_lock *nl);
 int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
 					struct scoutfs_key *key);
-int scoutfs_server_hold_commit(struct super_block *sb);
+void scoutfs_server_hold_commit(struct super_block *sb);
 int scoutfs_server_apply_commit(struct super_block *sb, int err);
 void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which);
 
@@ -71,6 +71,10 @@ int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
 int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
 				      struct scoutfs_open_ino_map *map, int err);
 
+u64 scoutfs_server_seq(struct super_block *sb);
+u64 scoutfs_server_next_seq(struct super_block *sb);
+void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);
+
 struct sockaddr_in;
 struct scoutfs_quorum_elected_info;
 int scoutfs_server_start(struct super_block *sb, u64 term);
diff --git a/kmod/src/srch.c b/kmod/src/srch.c
index 372be7fe..9fbaaeb7 100644
--- a/kmod/src/srch.c
+++ b/kmod/src/srch.c
@@ -989,12 +989,13 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
 			    struct scoutfs_block_writer *wri,
 			    struct scoutfs_btree_root *root,
-			    struct scoutfs_srch_file *sfl)
+			    struct scoutfs_srch_file *sfl, bool force)
 {
 	struct scoutfs_key key;
 	int ret;
 
-	if (le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT)
+	if (sfl->ref.blkno == 0 ||
+	    (!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
 		return 0;
 
 	init_srch_key(&key, SCOUTFS_SRCH_LOG_TYPE,
diff --git a/kmod/src/srch.h b/kmod/src/srch.h
index 69448ab3..7f30f04c 100644
--- a/kmod/src/srch.h
+++ b/kmod/src/srch.h
@@ -37,7 +37,7 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
 			    struct scoutfs_block_writer *wri,
 			    struct scoutfs_btree_root *root,
-			    struct scoutfs_srch_file *sfl);
+			    struct scoutfs_srch_file *sfl, bool force);
 int scoutfs_srch_get_compact(struct super_block *sb,
 			     struct scoutfs_alloc *alloc,
 			     struct scoutfs_block_writer *wri,
diff --git a/utils/src/btree.c b/utils/src/btree.c
index 9224f9de..201c47a5 100644
--- a/utils/src/btree.c
+++ b/utils/src/btree.c
@@ -40,7 +40,7 @@ static void *alloc_val(struct scoutfs_btree_block *bt, int len)
 {
 	le16_add_cpu(&bt->mid_free_len, -len);
 	le16_add_cpu(&bt->total_item_bytes, len);
-	return (void *)bt + le16_to_cpu(bt->mid_free_len);
+	return (void *)&bt->items[le16_to_cpu(bt->nr_items)] + le16_to_cpu(bt->mid_free_len);
 }
 
 /*
diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c
index 92dc0b50..bcf07357 100644
--- a/utils/src/mkfs.c
+++ b/utils/src/mkfs.c
@@ -236,7 +236,7 @@ static int do_mkfs(struct mkfs_args *args)
 	super->version = cpu_to_le64(SCOUTFS_INTEROP_VERSION);
 	uuid_generate(super->uuid);
 	super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
-	super->next_trans_seq = cpu_to_le64(1);
+	super->seq = cpu_to_le64(1);
 	super->total_meta_blocks = cpu_to_le64(last_meta + 1);
 	super->first_meta_blkno = cpu_to_le64(next_meta);
 	super->last_meta_blkno = cpu_to_le64(last_meta);
diff --git a/utils/src/print.c b/utils/src/print.c
index 5fa57bdb..c6ea1fe0 100644
--- a/utils/src/print.c
+++ b/utils/src/print.c
@@ -210,8 +210,8 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
 	/* only items in leaf blocks have values */
 	if (val) {
 		liv = val;
-		printf("    log_item_value: vers %llu flags %x\n",
-		       le64_to_cpu(liv->vers), liv->flags);
+		printf("    log_item_value: seq %llu flags %x\n",
+		       le64_to_cpu(liv->seq), liv->flags);
 
 		/* deletion items don't have values */
 		if (!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION)) {
@@ -289,9 +289,10 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 	       "    data_avail: "ALCROOT_F"\n"
 	       "    data_freed: "ALCROOT_F"\n"
 	       "    srch_file: "SRF_FMT"\n"
-	       "    max_item_vers: %llu\n"
+	       "    max_item_seq: %llu\n"
 	       "    rid: %016llx\n"
 	       "    nr: %llu\n"
+	       "    flags: %llx\n"
 	       "    data_alloc_zone_blocks: %llu\n"
 	       "    data_alloc_zones: ",
 	       AL_HEAD_A(&lt->meta_avail),
@@ -304,9 +305,10 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 	       ALCROOT_A(&lt->data_avail),
 	       ALCROOT_A(&lt->data_freed),
 	       SRF_A(&lt->srch_file),
-	       le64_to_cpu(lt->max_item_vers),
+	       le64_to_cpu(lt->max_item_seq),
 	       le64_to_cpu(lt->rid),
 	       le64_to_cpu(lt->nr),
+	       le64_to_cpu(lt->flags),
 	       le64_to_cpu(lt->data_alloc_zone_blocks));
 
 	for (i = 0; i < SCOUTFS_DATA_ALLOC_ZONE_LE64S; i++) {
@@ -383,6 +385,72 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val,
 	return 0;
 }
 
+static int print_log_merge_item(struct scoutfs_key *key, void *val,
+				unsigned val_len, void *arg)
+{
+	struct scoutfs_log_merge_status *stat;
+	struct scoutfs_log_merge_range *rng;
+	struct scoutfs_log_merge_request *req;
+	struct scoutfs_log_merge_complete *comp;
+	struct scoutfs_log_merge_freeing *fr;
+
+	switch (key->sk_zone) {
+	case SCOUTFS_LOG_MERGE_STATUS_ZONE:
+		stat = val;
+		printf("  status: next_range_key "SK_FMT" nr_req %llu nr_comp %llu"
+		       " last_seq %llu seq %llu\n",
+		       SK_ARG(&stat->next_range_key),
+		       le64_to_cpu(stat->nr_requests),
+		       le64_to_cpu(stat->nr_complete),
+		       le64_to_cpu(stat->last_seq),
+		       le64_to_cpu(stat->seq));
+		break;
+	case SCOUTFS_LOG_MERGE_RANGE_ZONE:
+		rng = val;
+		printf("  range: start "SK_FMT" end "SK_FMT"\n",
+		       SK_ARG(&rng->start),
+		       SK_ARG(&rng->end));
+		break;
+	case SCOUTFS_LOG_MERGE_REQUEST_ZONE:
+		req = val;
+		printf("  request: logs_root "BTROOT_F" root "BTROOT_F" start "SK_FMT
+		       " end "SK_FMT" last_seq %llu rid %016llx seq %llu flags 0x%llx\n",
+		       BTROOT_A(&req->logs_root),
+		       BTROOT_A(&req->root),
+		       SK_ARG(&req->start),
+		       SK_ARG(&req->end),
+		       le64_to_cpu(req->last_seq),
+		       le64_to_cpu(req->rid),
+		       le64_to_cpu(req->seq),
+		       le64_to_cpu(req->flags));
+		break;
+	case SCOUTFS_LOG_MERGE_COMPLETE_ZONE:
+		comp = val;
+		printf("  complete: root "BTROOT_F" start "SK_FMT" end "SK_FMT
+		       " remain "SK_FMT" rid %016llx seq %llu flags %llx\n",
+		       BTROOT_A(&comp->root),
+		       SK_ARG(&comp->start),
+		       SK_ARG(&comp->end),
+		       SK_ARG(&comp->remain),
+		       le64_to_cpu(comp->rid),
+		       le64_to_cpu(comp->seq),
+		       le64_to_cpu(comp->flags));
+		break;
+	case SCOUTFS_LOG_MERGE_FREEING_ZONE:
+		fr = val;
+		printf("  freeing: root "BTROOT_F" key "SK_FMT" seq %llu\n",
+		       BTROOT_A(&fr->root),
+		       SK_ARG(&fr->key),
+		       le64_to_cpu(fr->seq));
+		break;
+	default:
+		printf("  (unknown log merge key zone %u)\n", key->sk_zone);
+		break;
+	}
+
+	return 0;
+}
+
 static int print_alloc_item(struct scoutfs_key *key, void *val,
 			    unsigned val_len, void *arg)
 {
@@ -859,6 +927,10 @@ out:
 	return ret;
 }
 
+#define BTR_FMT "blkno %llu seq %016llx height %u"
+#define BTR_ARG(rt) \
+	le64_to_cpu((rt)->ref.blkno), le64_to_cpu((rt)->ref.seq), (rt)->height
+
 static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 {
 	char uuid_str[37];
@@ -878,7 +950,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	printf("  flags: 0x%016llx\n", le64_to_cpu(super->flags));
 
 	/* XXX these are all in a crazy order */
-	printf("  next_ino %llu next_trans_seq %llu\n"
+	printf("  next_ino %llu seq %llu\n"
 	       "  total_meta_blocks %llu first_meta_blkno %llu last_meta_blkno %llu\n"
 	       "  total_data_blocks %llu first_data_blkno %llu last_data_blkno %llu\n"
 	       "  meta_alloc[0]: "ALCROOT_F"\n"
@@ -888,12 +960,14 @@
 	       "  server_meta_avail[1]: "AL_HEAD_F"\n"
 	       "  server_meta_freed[0]: "AL_HEAD_F"\n"
 	       "  server_meta_freed[1]: "AL_HEAD_F"\n"
-	       "  mounted_clients root: height %u blkno %llu seq %llu\n"
-	       "  srch_root root: height %u blkno %llu seq %llu\n"
-	       "  trans_seqs root: height %u blkno %llu seq %llu\n"
-	       "  fs_root btree root: height %u blkno %llu seq %llu\n",
+	       "  fs_root: "BTR_FMT"\n"
+	       "  logs_root: "BTR_FMT"\n"
+	       "  log_merge: "BTR_FMT"\n"
+	       "  trans_seqs: "BTR_FMT"\n"
+	       "  mounted_clients: "BTR_FMT"\n"
+	       "  srch_root: "BTR_FMT"\n",
 	       le64_to_cpu(super->next_ino),
-	       le64_to_cpu(super->next_trans_seq),
+	       le64_to_cpu(super->seq),
 	       le64_to_cpu(super->total_meta_blocks),
 	       le64_to_cpu(super->first_meta_blkno),
 	       le64_to_cpu(super->last_meta_blkno),
@@ -907,18 +981,12 @@
 	       AL_HEAD_A(&super->server_meta_avail[1]),
 	       AL_HEAD_A(&super->server_meta_freed[0]),
 	       AL_HEAD_A(&super->server_meta_freed[1]),
-	       super->mounted_clients.height,
-	       le64_to_cpu(super->mounted_clients.ref.blkno),
-	       le64_to_cpu(super->mounted_clients.ref.seq),
-	       super->srch_root.height,
-	       le64_to_cpu(super->srch_root.ref.blkno),
-	       le64_to_cpu(super->srch_root.ref.seq),
-	       super->trans_seqs.height,
-	       le64_to_cpu(super->trans_seqs.ref.blkno),
-	       le64_to_cpu(super->trans_seqs.ref.seq),
-	       super->fs_root.height,
-	       le64_to_cpu(super->fs_root.ref.blkno),
-	       le64_to_cpu(super->fs_root.ref.seq));
+	       BTR_ARG(&super->fs_root),
+	       BTR_ARG(&super->logs_root),
+	       BTR_ARG(&super->log_merge),
+	       BTR_ARG(&super->trans_seqs),
+	       BTR_ARG(&super->mounted_clients),
+	       BTR_ARG(&super->srch_root));
 
 	printf("  volume options:\n"
 	       "    set_bits: %016llx\n",
@@ -973,6 +1041,11 @@ static int print_volume(int fd)
 	if (err && !ret)
 		ret = err;
 
+	err = print_btree(fd, super, "log_merge", &super->log_merge,
+			  print_log_merge_item, NULL);
+	if (err && !ret)
+		ret = err;
+
 	for (i = 0; i < array_size(super->server_meta_avail); i++) {
 		snprintf(str, sizeof(str), "server_meta_avail[%u]", i);
 		err = print_alloc_list_block(fd, str,