diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h index 167725b0..7b053756 100644 --- a/kmod/src/alloc.h +++ b/kmod/src/alloc.h @@ -53,9 +53,10 @@ /* * Each of the server meta_alloc roots will try to keep a minimum amount - * of free blocks. The server will use the next root once its current - * root gets this low. It must have room for all the largest allocation - * attempted in a transaction on the server. + * of free blocks. The server will swap roots when its current avail + * falls below the threshold while the freed root is still above it. It + * must have room for all the largest allocation attempted in a + * transaction on the server. */ #define SCOUTFS_SERVER_META_ALLOC_MIN \ (SCOUTFS_SERVER_META_FILL_TARGET * 2) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 5c97e2e0..1d249a3a 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -26,7 +26,7 @@ #include "options.h" #include "msg.h" #include "block.h" -#include "radix.h" +#include "alloc.h" #include "avl.h" #include "hash.h" @@ -674,7 +674,7 @@ static void move_items(struct scoutfs_btree_block *dst, * error. 
*/ static int get_ref_block(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, int flags, struct scoutfs_btree_ref *ref, struct scoutfs_block **bl_ret) @@ -737,7 +737,7 @@ retry: goto out; } - ret = scoutfs_radix_alloc(sb, alloc, wri, &blkno); + ret = scoutfs_alloc_meta(sb, alloc, wri, &blkno); if (ret < 0) goto out; @@ -745,8 +745,8 @@ retry: new_bl = scoutfs_block_create(sb, blkno); if (IS_ERR(new_bl)) { - ret = scoutfs_radix_free(sb, alloc, wri, blkno); - BUG_ON(ret); /* radix should have been dirty */ + ret = scoutfs_free_meta(sb, alloc, wri, blkno); + BUG_ON(ret); ret = PTR_ERR(new_bl); goto out; } @@ -754,11 +754,11 @@ retry: /* free old stable blkno we're about to overwrite */ if (ref && ref->blkno) { - ret = scoutfs_radix_free(sb, alloc, wri, - le64_to_cpu(ref->blkno)); + ret = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(ref->blkno)); if (ret) { - ret = scoutfs_radix_free(sb, alloc, wri, blkno); - BUG_ON(ret); /* radix should have been dirty */ + ret = scoutfs_free_meta(sb, alloc, wri, blkno); + BUG_ON(ret); scoutfs_block_put(sb, new_bl); new_bl = NULL; goto out; @@ -861,7 +861,7 @@ static void init_btree_block(struct scoutfs_btree_block *bt, int level) * Returns -errno, 0 if nothing done, or 1 if we split. 
*/ static int try_split(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, unsigned val_len, @@ -901,8 +901,8 @@ static int try_split(struct super_block *sb, if (!parent) { ret = get_ref_block(sb, alloc, wri, BTW_ALLOC, NULL, &par_bl); if (ret) { - err = scoutfs_radix_free(sb, alloc, wri, - le64_to_cpu(left->hdr.blkno)); + err = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(left->hdr.blkno)); BUG_ON(err); /* radix should have been dirty */ scoutfs_block_put(sb, left_bl); return ret; @@ -937,7 +937,7 @@ static int try_split(struct super_block *sb, * block. */ static int try_join(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_btree_block *parent, @@ -990,9 +990,9 @@ static int try_join(struct super_block *sb, /* update or delete sibling's parent item */ if (le16_to_cpu(sib->nr_items) == 0) { delete_item(parent, sib_par_item, NULL); - ret = scoutfs_radix_free(sb, alloc, wri, - le64_to_cpu(sib->hdr.blkno)); - BUG_ON(ret); /* could have dirtied alloc to avoid error */ + ret = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(sib->hdr.blkno)); + BUG_ON(ret); } else if (move_right) { update_parent_item(parent, sib_par_item, sib); @@ -1003,9 +1003,9 @@ static int try_join(struct super_block *sb, root->height--; root->ref.blkno = bt->hdr.blkno; root->ref.seq = bt->hdr.seq; - ret = scoutfs_radix_free(sb, alloc, wri, - le64_to_cpu(parent->hdr.blkno)); - BUG_ON(ret); /* could have dirtied alloc to avoid error */ + ret = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(parent->hdr.blkno)); + BUG_ON(ret); } scoutfs_block_put(sb, sib_bl); @@ -1219,7 +1219,7 @@ struct btree_walk_key_range { * blocks themselves. 
*/ static int btree_walk(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, int flags, struct scoutfs_key *key, @@ -1464,7 +1464,7 @@ static bool invalid_item(unsigned val_len) * length value. */ int scoutfs_btree_insert(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, @@ -1531,7 +1531,7 @@ static void update_item_value(struct scoutfs_btree_block *bt, * which doesn't fit. */ int scoutfs_btree_update(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, @@ -1571,7 +1571,7 @@ int scoutfs_btree_update(struct super_block *sb, * which will insert instead of returning -ENOENT. */ int scoutfs_btree_force(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, @@ -1615,7 +1615,7 @@ int scoutfs_btree_force(struct super_block *sb, * found. */ int scoutfs_btree_delete(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key) @@ -1636,8 +1636,8 @@ int scoutfs_btree_delete(struct super_block *sb, if (item) { if (le16_to_cpu(bt->nr_items) == 1) { /* remove final empty block */ - ret = scoutfs_radix_free(sb, alloc, wri, - bl->blkno); + ret = scoutfs_free_meta(sb, alloc, wri, + bl->blkno); if (ret == 0) { root->height = 0; root->ref.blkno = 0; @@ -1753,7 +1753,7 @@ int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, * <0 is returned on error, including -ENOENT if the key isn't present. 
*/ int scoutfs_btree_dirty(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key) @@ -1841,7 +1841,7 @@ out: * the caller to resolve this. */ int scoutfs_btree_insert_list(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_btree_item_list *lst) diff --git a/kmod/src/btree.h b/kmod/src/btree.h index c9bd6478..79d4de58 100644 --- a/kmod/src/btree.h +++ b/kmod/src/btree.h @@ -3,7 +3,7 @@ #include -struct scoutfs_radix_allocator; +struct scoutfs_alloc; struct scoutfs_block_writer; struct scoutfs_block; @@ -36,25 +36,25 @@ int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_btree_item_ref *iref); int scoutfs_btree_insert(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, void *val, unsigned val_len); int scoutfs_btree_update(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, void *val, unsigned val_len); int scoutfs_btree_force(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, void *val, unsigned val_len); int scoutfs_btree_delete(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key); @@ -65,7 +65,7 @@ int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, struct scoutfs_key *key, struct scoutfs_btree_item_ref *iref); 
int scoutfs_btree_dirty(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key); @@ -77,7 +77,7 @@ int scoutfs_btree_read_items(struct super_block *sb, struct scoutfs_key *end, scoutfs_btree_item_cb cb, void *arg); int scoutfs_btree_insert_list(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_btree_item_list *lst); diff --git a/kmod/src/counters.h b/kmod/src/counters.h index b8686bc9..e3c2e8ae 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -166,7 +166,6 @@ EXPAND_COUNTER(radix_undo_ref) \ EXPAND_COUNTER(radix_walk) \ EXPAND_COUNTER(server_commit_hold) \ - EXPAND_COUNTER(server_commit_prepare) \ EXPAND_COUNTER(server_commit_queue) \ EXPAND_COUNTER(server_commit_worker) \ EXPAND_COUNTER(srch_add_entry) \ @@ -188,8 +187,9 @@ EXPAND_COUNTER(srch_search_xattrs) \ EXPAND_COUNTER(srch_read_stale) \ EXPAND_COUNTER(trans_commit_data_alloc_low) \ + EXPAND_COUNTER(trans_commit_dirty_meta_full) \ EXPAND_COUNTER(trans_commit_fsync) \ - EXPAND_COUNTER(trans_commit_full) \ + EXPAND_COUNTER(trans_commit_meta_alloc_low) \ EXPAND_COUNTER(trans_commit_sync_fs) \ EXPAND_COUNTER(trans_commit_timer) \ EXPAND_COUNTER(trans_commit_written) diff --git a/kmod/src/data.c b/kmod/src/data.c index 9360b481..a8bca721 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -26,6 +26,7 @@ #include "super.h" #include "inode.h" #include "key.h" +#include "alloc.h" #include "data.h" #include "trans.h" #include "counters.h" @@ -37,657 +38,135 @@ #include "file.h" #include "msg.h" #include "count.h" -#include "radix.h" +#include "ext.h" /* - * Logical file blocks are mapped to device blocks with extents stored - * in items. Each extent item maps a fixed size logical region and can - * contain multiple extent records. 
Each extent record is packed to - * minimize the space it uses. The logical starting block is implicit - * so sparse extents are stored to skip unmapped blocks, and the mapped - * blkno is encoded as the difference from the previous extent and only - * its set bytes are stored. - * - * To operate on the extents we load their item and unpack them into an - * rbtree of full extent records in memory. Once the memory extents are - * modified they can be packed back into the item. Typically there are - * very few extents that cover the region. - * - * The client is given a radix allocator with trees for allocating - * blocks and recording frees at the start of each transaction. + * We want to amortize work done after dirtying the shared transaction + * accounting, but we don't want to blow out dirty allocator btree + * blocks. Each allocation can dirty quite a few allocator btree blocks + * so we check in pretty often. */ +#define EXTENTS_PER_HOLD 8 struct data_info { struct super_block *sb; - struct rw_semaphore alloc_rwsem; - struct scoutfs_radix_allocator *alloc; + struct mutex mutex; + struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; - struct scoutfs_radix_root data_avail; - struct scoutfs_radix_root data_freed; + struct scoutfs_alloc_root data_avail; + struct scoutfs_alloc_root data_freed; + struct scoutfs_extent cached_ext; }; #define DECLARE_DATA_INFO(sb, name) \ struct data_info *name = SCOUTFS_SB(sb)->data_info -static void init_packed_extent_key(struct scoutfs_key *key, u64 ino, - u64 iblock, u8 part) +struct data_ext_args { + u64 ino; + struct inode *inode; + struct scoutfs_lock *lock; +}; + +static void item_from_extent(struct scoutfs_key *key, + struct scoutfs_data_extent_val *dv, u64 ino, + u64 start, u64 len, u64 map, u8 flags) { *key = (struct scoutfs_key) { .sk_zone = SCOUTFS_FS_ZONE, - .skpe_ino = cpu_to_le64(ino), - .sk_type = SCOUTFS_PACKED_EXTENT_TYPE, - .skpe_base = cpu_to_le64(iblock >> SCOUTFS_PACKEXT_BASE_SHIFT), - .skpe_part = 
part, + .skdx_ino = cpu_to_le64(ino), + .sk_type = SCOUTFS_DATA_EXTENT_TYPE, + .skdx_end = cpu_to_le64(start + len - 1), + .skdx_len = cpu_to_le64(len), }; + dv->blkno = cpu_to_le64(map); + dv->flags = flags; } -/* - * Packed extents are read from items and unpacked into this structure - * in memory so they can be easily manipulated before being packed and - * stored in items. - */ -struct unpacked_extents { - u64 iblock; - struct rb_root extents; - __u8 existing_parts; - bool changed; -}; - -struct unpacked_extent { - struct rb_node node; - u64 iblock; - u64 count; - u64 blkno; - u8 flags; -}; - -static void init_traced_extent(struct scoutfs_traced_extent *te, - u64 iblock, u64 count, u64 blkno, u8 flags) +static void ext_from_item(struct scoutfs_extent *ext, + struct scoutfs_key *key, + struct scoutfs_data_extent_val *dv) { - te->iblock = iblock; - te->count = count; - te->blkno = blkno; - te->flags = flags; + ext->start = le64_to_cpu(key->skdx_end) - + le64_to_cpu(key->skdx_len) + 1; + ext->len = le64_to_cpu(key->skdx_len); + ext->map = le64_to_cpu(dv->blkno); + ext->flags = dv->flags; } -static void copy_traced_extent(struct scoutfs_traced_extent *te, - struct unpacked_extent *ext) +static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len, + struct scoutfs_extent *ext) { - te->iblock = ext->iblock; - te->count = ext->count; - te->blkno = ext->blkno; - te->flags = ext->flags; -} - -static u64 ext_last(struct unpacked_extent *ext) -{ - return ext->iblock + ext->count - 1; -} - -/* The first possible iblock in an item that contains the given iblock */ -static u64 first_iblock(u64 iblock) -{ - return iblock & SCOUTFS_PACKEXT_BASE_MASK; -} - -/* The last possible iblock in an item that contains the given iblock */ -static u64 last_iblock(u64 iblock) -{ - return iblock | ~SCOUTFS_PACKEXT_BASE_MASK; -} - -/* - * Extents can merge if they're logically contiguous, have block - * mappings or not which also must be contiguous, and have matching - * 
flags. - * - * We also require that a given extent's allocation be from only one - * radix bitmap leaf block because the radix freeing functions only - * operate on one leaf block. - */ -static bool extents_merge(struct unpacked_extent *left, - struct unpacked_extent *right) -{ - return (left->iblock + left->count == right->iblock) && - ((!left->blkno && !right->blkno) || - (left->blkno + left->count == right->blkno)) && - (left->flags == right->flags) && - (scoutfs_radix_bit_leaf_nr(left->blkno) == - scoutfs_radix_bit_leaf_nr(right->blkno + right->count - 1)); -} - -static struct unpacked_extent *first_extent(struct unpacked_extents *unpe) -{ - return rb_entry_safe(rb_first(&unpe->extents), - struct unpacked_extent, node); -} - -static struct unpacked_extent *last_extent(struct unpacked_extents *unpe) -{ - return rb_entry_safe(rb_last(&unpe->extents), - struct unpacked_extent, node); -} - -static struct unpacked_extent *next_extent(struct unpacked_extent *ext) -{ - return rb_entry_safe(rb_next(&ext->node), - struct unpacked_extent, node); -} - -static struct unpacked_extent *prev_extent(struct unpacked_extent *ext) -{ - return rb_entry_safe(rb_prev(&ext->node), - struct unpacked_extent, node); -} - -/* - * Find the first extent that intersects the requested range. NULL is - * returned if no extents intersect. 
- */ -static struct unpacked_extent *find_extent(struct unpacked_extents *unpe, - u64 iblock, u64 last) -{ - - struct rb_node *node = unpe->extents.rb_node; - struct unpacked_extent *ret = NULL; - struct unpacked_extent *ext; - - if (iblock > last) - return NULL; - - while (node) { - ext = rb_entry(node, struct unpacked_extent, node); - - if (last < ext->iblock) { - node = node->rb_left; - } else if (iblock > ext_last(ext)) { - node = node->rb_right; - } else { - ret = ext; - node = node->rb_left; - } - } - - return ret; -} - -static void track_blocks(struct unpacked_extent *ext, s64 delta, - s64 *on, s64 *off) -{ - if (ext->blkno && !(ext->flags & SEF_UNWRITTEN)) - *on += delta; - else if (ext->flags & SEF_OFFLINE) - *off += delta; -} - -static void modify_and_track_count(struct unpacked_extent *ext, u64 count, - s64 *on, s64 *off) -{ - track_blocks(ext, count - ext->count, on, off); - ext->count = count; -} - -/* - * Callers can temporarily insert extents with equal starting iblocks. - * We're careful to insert those to the left so that caller's can find - * these existing overlapping extents by iterating with next. 
- */ -static void insert_extent(struct unpacked_extents *unpe, - struct unpacked_extent *ins, s64 *on, s64 *off) -{ - struct rb_node **node = &unpe->extents.rb_node; - struct rb_node *parent = NULL; - struct unpacked_extent *ext; - int cmp; - - while (*node) { - parent = *node; - ext = rb_entry(*node, struct unpacked_extent, node); - - cmp = scoutfs_cmp_u64s(ins->iblock, ext->iblock); - if (cmp <= 0) - node = &(*node)->rb_left; - else - node = &(*node)->rb_right; - } - - rb_link_node(&ins->node, parent, node); - rb_insert_color(&ins->node, &unpe->extents); - - track_blocks(ins, ins->count, on, off); -} - -static void remove_extent(struct unpacked_extents *unpe, - struct unpacked_extent *ext, s64 *on, s64 *off) -{ - rb_erase(&ext->node, &unpe->extents); - track_blocks(ext, -ext->count, on, off); - kfree(ext); -} - -static void free_unpacked_extents(struct unpacked_extents *unpe) -{ - struct unpacked_extent *ext; - struct unpacked_extent *tmp; - - if (unpe) { - rbtree_postorder_for_each_entry_safe(ext, tmp, &unpe->extents, - node) { - kfree(ext); - } - kfree(unpe); - } -} - -static int unpack_extent(struct unpacked_extent *ext, u64 iblock, - struct scoutfs_packed_extent *pe, int size, - u64 prev_blkno) -{ - __le64 lediff; - u64 blkno; - u64 diff; - - if (size < sizeof(struct scoutfs_packed_extent) || - size < (sizeof(struct scoutfs_packed_extent) + pe->diff_bytes)) - return 0; - - if (pe->diff_bytes) { - lediff = 0; - memcpy(&lediff, pe->le_blkno_diff, pe->diff_bytes); - diff = le64_to_cpu(lediff); - diff = (diff >> 1) ^ (-(diff & 1)); - blkno = prev_blkno + diff; - } else { - blkno = 0; - } - - ext->iblock = iblock; - ext->blkno = blkno; - ext->count = le16_to_cpu(pe->count); - ext->flags = pe->flags; - - return sizeof(struct scoutfs_packed_extent) + pe->diff_bytes; -} - -static int load_unpacked_extents(struct super_block *sb, u64 ino, - u64 iblock, u64 last, bool empty_enoent, - struct unpacked_extents **unpe_ret, - struct scoutfs_lock *lock) -{ - struct 
unpacked_extents *unpe = NULL; - struct scoutfs_packed_extent *pe; - struct unpacked_extent *ext; + struct data_ext_args *args = arg; + struct scoutfs_data_extent_val dv; struct scoutfs_key key; - struct scoutfs_key end; - struct rb_node *parent; - struct rb_node **node; - void *buf = NULL; - u64 prev_blkno; - bool saw_final; - int size; + struct scoutfs_key last; int ret; - int p; - *unpe_ret = NULL; + item_from_extent(&last, &dv, args->ino, U64_MAX, 1, 0, 0); + item_from_extent(&key, &dv, args->ino, start, len, 0, 0); - unpe = kzalloc(sizeof(struct unpacked_extents), GFP_NOFS); - if (!unpe) { - ret = -ENOMEM; - goto out; + ret = scoutfs_item_next(sb, &key, &last, &dv, sizeof(dv), args->lock); + if (ret == sizeof(dv)) { + ext_from_item(ext, &key, &dv); + ret = 0; + } else if (ret >= 0) { + ret = -EIO; } - unpe->extents = RB_ROOT; - unpe->changed = true; - /* updated later if _next gives us a greater key */ - unpe->iblock = first_iblock(iblock); - - buf = kmalloc(SCOUTFS_PACKEXT_MAX_BYTES, GFP_NOFS); - if (!buf) { - ret = -ENOMEM; - goto out; - } - - if (last > iblock) - init_packed_extent_key(&end, ino, last, 0); - - parent = NULL; - node = &unpe->extents.rb_node; - prev_blkno = 0; - saw_final = false; - - for (p = 0; !saw_final; p++) { - init_packed_extent_key(&key, ino, iblock, p); - - /* maybe search for next initial item, lookup more parts */ - if (p == 0 && last > iblock) - ret = scoutfs_item_next(sb, &key, &end, buf, - SCOUTFS_PACKEXT_MAX_BYTES, - lock); - else - ret = scoutfs_item_lookup(sb, &key, buf, - SCOUTFS_PACKEXT_MAX_BYTES, - lock); - if (ret < 0) { - if (p == 0 && ret == -ENOENT && empty_enoent) - ret = 0; - goto out; - } - - if (key.skpe_part != p) { - ret = -EIO; /* corruption */ - goto out; - } - - if (p == 0) { - iblock = le64_to_cpu(key.skpe_base) << - SCOUTFS_PACKEXT_BASE_SHIFT; - unpe->iblock = iblock; - } - pe = buf; - size = ret; - - while (size > 0) { - ext = kmalloc(sizeof(struct unpacked_extent), GFP_NOFS); - if (!ext) { - ret = -ENOMEM; 
- goto out; - } - - ret = unpack_extent(ext, iblock, pe, size, prev_blkno); - if (ret == 0) { /* XXX corruption? */ - kfree(ext); - ret = -EIO; - goto out; - } - - saw_final = pe->final; - pe = (void *)pe + ret; - size -= ret; - - /* sparse packed extents advance iblock */ - if (ext->flags == 0 && ext->blkno == 0) { - iblock += ext->count; - kfree(ext); - ext = NULL; - continue; - } - - iblock += ext->count; - prev_blkno = ext->blkno + ext->count - 1; - - /* building the rbtree from sorted nodes */ - rb_link_node(&ext->node, parent, node); - rb_insert_color(&ext->node, &unpe->extents); - parent = &ext->node; - node = &ext->node.rb_right; - - if (saw_final) - unpe->existing_parts = p + 1; - } - } - - ret = 0; -out: - kfree(buf); if (ret < 0) - free_unpacked_extents(unpe); - else - *unpe_ret = unpe; - + memset(ext, 0, sizeof(struct scoutfs_extent)); return ret; } -static int pack_extent(struct scoutfs_packed_extent *pe, int size, - struct unpacked_extent *ext, - u64 prev_blkno, bool final) +static void add_onoff(struct inode *inode, u64 map, u8 flags, s64 len) { - int diff_bytes; - __le64 lediff; - u64 diff; - int bytes; - int last; - - diff = ext->blkno - prev_blkno; - diff = (diff << 1) ^ ((s64)diff >> 63); /* shift sign extend */ - lediff = cpu_to_le64(diff); - last = fls64(diff); - diff_bytes = (last + 7) >> 3; - - bytes = offsetof(struct scoutfs_packed_extent, - le_blkno_diff[diff_bytes]); - if (size < bytes) - return 0; - - pe->count = cpu_to_le16(ext->count); - pe->diff_bytes = diff_bytes; - pe->flags = ext->flags; - pe->final = !!final; - if (diff_bytes) - memcpy(pe->le_blkno_diff, &lediff, diff_bytes); - - return bytes; -} - -static int store_packed_extents(struct super_block *sb, u64 ino, - struct unpacked_extents *unpe, - struct scoutfs_lock *lock) -{ - struct scoutfs_packed_extent *pe; - struct unpacked_extent *final; - struct unpacked_extent *ext; - struct scoutfs_key key; - void *buf = NULL; - u64 prev_blkno; - u64 iblock; - int space; - int size; - int 
ret; - int p; - int i; - - if (!unpe->changed) - return 0; - - if (RB_EMPTY_ROOT(&unpe->extents)) { - for (p = 0; p < unpe->existing_parts; p++) { - init_packed_extent_key(&key, ino, unpe->iblock, p); - ret = scoutfs_item_delete(sb, &key, lock); - BUG_ON(ret); /* XXX inconsistent between parts */ - } - unpe->existing_parts = 0; - unpe->changed = false; - return 0; - } - - buf = kmalloc(SCOUTFS_PACKEXT_MAX_BYTES, GFP_NOFS); - if (!buf) { - ret = -ENOMEM; - goto out; - } - - final = last_extent(unpe); - prev_blkno = 0; - - pe = buf; - space = SCOUTFS_PACKEXT_MAX_BYTES; - size = 0; - p = 0; - iblock = unpe->iblock; - - ext = first_extent(unpe); - while (ext) { - /* encode sparse extent to advance iblock */ - if (ext->iblock > iblock && space >= sizeof(*pe)) { - pe->count = cpu_to_le16(ext->iblock - iblock); - pe->diff_bytes = 0; - pe->flags = 0; - pe->final = 0; - pe++; - space -= sizeof(*pe); - size += sizeof(*pe); - iblock = ext->iblock; - } - - /* encode actual extent */ - if (ext->iblock == iblock && - (ret = pack_extent(pe, space, ext, prev_blkno, - ext == final)) > 0) { - pe = (void *)pe + ret; - space -= ret; - size += ret; - iblock += ext->count; - prev_blkno = ext->blkno + ext->count - 1; - ext = next_extent(ext); - if (ext) - continue; - } - - /* store full item or after packing final extent */ - init_packed_extent_key(&key, ino, unpe->iblock, p); - if (p < unpe->existing_parts) - ret = scoutfs_item_update(sb, &key, buf, size, lock); - else - ret = scoutfs_item_create(sb, &key, buf, size, lock); - BUG_ON(ret); /* XXX inconsistent between parts */ - - pe = buf; - space = SCOUTFS_PACKEXT_MAX_BYTES; - size = 0; - p++; - } - - /* delete any remaining previous part items */ - for (i = p; i < unpe->existing_parts; i++) { - init_packed_extent_key(&key, ino, unpe->iblock, i); - ret = scoutfs_item_delete(sb, &key, lock); - BUG_ON(ret); /* XXX inconsistent between parts */ - } - - /* the next store has to know our stored parts */ - unpe->existing_parts = p; - 
unpe->changed = false; - ret = 0; -out: - kfree(buf); - - return ret; -} - -/* - * Set a logical extent mapping in the unpacked extents for a region of - * a file. The caller's extent is authoritative, any existing - * overlapping extents are trimmed or removed. The new extent can be - * merged with remaining adjacent and compatible extents. - * - * If the caller provides an inode struct then we'll keep the inode - * block counts in sync with flagged extents because updating the inode - * counts won't fail. The caller is expected to keep all other state - * consistent with the extents (i_size, i_blocks, allocator bitmaps). - */ -static int set_extent(struct super_block *sb, struct inode *inode, - u64 ino, struct unpacked_extents *unpe, - u64 iblock, u64 blkno, u64 count, u8 flags) -{ - struct unpacked_extent *split; - struct unpacked_extent *next; - struct unpacked_extent *prev; - struct unpacked_extent *ext; - u64 offset; s64 on = 0; s64 off = 0; - /* make sure the given extent fits entirely within one item */ - if (WARN_ON_ONCE(first_iblock(iblock) != - first_iblock(iblock + count - 1))) - return -EINVAL; + if (map && !(flags & SEF_UNWRITTEN)) + on += len; + else if (flags & SEF_OFFLINE) + off += len; - ext = kmalloc(sizeof(struct unpacked_extent), GFP_NOFS); - split = kmalloc(sizeof(struct unpacked_extent), GFP_NOFS); - if (!ext || !split) { - kfree(ext); - kfree(split); - return -ENOMEM; - } - - unpe->changed = true; - - ext->iblock = iblock; - ext->blkno = blkno; - ext->count = count; - ext->flags = flags; - - insert_extent(unpe, ext, &on, &off); - - prev = prev_extent(ext); - - /* splitting an existing extent? */ - if (prev && ext_last(prev) > ext_last(ext)) { - split->iblock = ext_last(ext) + 1; - split->count = ext_last(prev) - split->iblock + 1; - split->blkno = prev->blkno ? 
- prev->blkno + prev->count - split->count : 0; - split->flags = prev->flags; - - modify_and_track_count(prev, ext->iblock - prev->iblock, - &on, &off); - - insert_extent(unpe, split, &on, &off); - next = split; - split = NULL; - } else { - next = NULL; - } - - /* trimming a prev extent? */ - if (prev && ext_last(prev) >= ext->iblock) { - modify_and_track_count(prev, ext->iblock - prev->iblock, - &on, &off); - } - - /* merging with a prev extent? */ - if (prev && extents_merge(prev, ext)) { - ext->iblock = prev->iblock; - ext->blkno = prev->blkno; - modify_and_track_count(ext, ext->count + prev->count, - &on, &off); - remove_extent(unpe, prev, &on, &off); - } - - /* if didn't split find next, removing any totally within ours */ - if (!next) { - while ((next = next_extent(ext)) && - ext_last(next) <= ext_last(ext)) { - remove_extent(unpe, next, &on, &off); - } - } - - /* trimming a next extent? */ - if (next && next->iblock <= ext_last(ext)) { - offset = (ext_last(ext) + 1) - next->iblock; - next->iblock += offset; - next->blkno = next->blkno ? next->blkno + offset : 0; - modify_and_track_count(next, next->count - offset, - &on, &off); - } - - /* merging with a next extent? 
*/ - if (next && extents_merge(ext, next)) { - modify_and_track_count(ext, ext->count + next->count, - &on, &off); - remove_extent(unpe, next, &on, &off); - } - - /* and finally remove our extent if it was only removing others */ - if (ext->blkno == 0 && ext->flags == 0) - remove_extent(unpe, ext, &on, &off); - - if (inode) - scoutfs_inode_add_onoff(inode, on, off); - - kfree(split); - return 0; + scoutfs_inode_add_onoff(inode, on, off); } +static int data_ext_insert(struct super_block *sb, void *arg, u64 start, + u64 len, u64 map, u8 flags) +{ + struct data_ext_args *args = arg; + struct scoutfs_data_extent_val dv; + struct scoutfs_key key; + int ret; + + item_from_extent(&key, &dv, args->ino, start, len, map, flags); + ret = scoutfs_item_create(sb, &key, &dv, sizeof(dv), args->lock); + if (ret == 0 && args->inode) + add_onoff(args->inode, map, flags, len); + return ret; +} + +static int data_ext_remove(struct super_block *sb, void *arg, u64 start, + u64 len, u64 map, u8 flags) +{ + struct data_ext_args *args = arg; + struct scoutfs_data_extent_val dv; + struct scoutfs_key key; + int ret; + + item_from_extent(&key, &dv, args->ino, start, len, map, flags); + ret = scoutfs_item_delete(sb, &key, args->lock); + if (ret == 0 && args->inode) + add_onoff(args->inode, map, flags, -len); + return ret; +} + +static struct scoutfs_ext_ops data_ext_ops = { + .next = data_ext_next, + .insert = data_ext_insert, + .remove = data_ext_remove, +}; + /* * Find and remove or mark offline the block mappings that intersect * with the caller's range. 
The caller is responsible for transactions @@ -703,74 +182,75 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode, struct scoutfs_lock *lock) { DECLARE_DATA_INFO(sb, datinf); - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext; - struct scoutfs_traced_extent te; + struct data_ext_args args = { + .ino = ino, + .inode = inode, + .lock = lock, + }; + struct scoutfs_extent ext; + struct scoutfs_extent tr; u64 offset; - u64 blkno; - u64 count; - u8 flags; s64 ret; - int err; - - ret = load_unpacked_extents(sb, ino, iblock, last, false, &unpe, lock); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - goto out; - } + u8 flags; + int i; flags = offline ? SEF_OFFLINE : 0; - ret = 0; - ext = find_extent(unpe, iblock, last); - while (ext && ext->iblock <= last) { + + for (i = 0; iblock <= last; i++) { + if (i == EXTENTS_PER_HOLD) { + ret = iblock; + break; + } + + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + break; + } + + /* done if we went past the region */ + if (ext.start > last) { + ret = 0; + break; + } /* nothing to do when already offline and unmapped */ - if ((offline && (ext->flags & SEF_OFFLINE)) && !ext->blkno) { - ext = next_extent(ext); + if ((offline && (ext.flags & SEF_OFFLINE)) && !ext.map) { + iblock = ext.start + ext.len; continue; } - iblock = max(ext->iblock, iblock); - offset = iblock - ext->iblock; - blkno = ext->blkno + offset; - count = min(ext->count - offset, last - iblock + 1); + iblock = max(ext.start, iblock); + offset = iblock - ext.start; - if (ext->blkno) { - down_write(&datinf->alloc_rwsem); - err = scoutfs_radix_free_data(sb, datinf->alloc, - datinf->wri, - &datinf->data_freed, - blkno, count); - up_write(&datinf->alloc_rwsem); - if (err < 0) { - ret = err; + tr.start = iblock; + tr.map = ext.map ? 
ext.map + offset : 0; + tr.len = min(ext.len - offset, last - iblock + 1); + tr.flags = ext.flags; + + if (tr.map) { + mutex_lock(&datinf->mutex); + ret = scoutfs_free_data(sb, datinf->alloc, + datinf->wri, + &datinf->data_freed, + tr.map, tr.len); + mutex_unlock(&datinf->mutex); + if (ret < 0) break; - } } - init_traced_extent(&te, iblock, count, 0, flags); - trace_scoutfs_data_extent_truncated(sb, ino, &te); + trace_scoutfs_data_extent_truncated(sb, ino, &tr); - err = set_extent(sb, inode, ino, unpe, iblock, 0, count, flags); - BUG_ON(err); /* inconsistent alloc and extents */ + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, + tr.start, tr.len, 0, flags); + BUG_ON(ret); /* inconsistent, could prealloc items */ - /* modifying could have merged and deleted ext, search again */ - iblock += count; - if (iblock > last) - break; - ext = find_extent(unpe, iblock, last); + iblock += tr.len; } - err = store_packed_extents(sb, ino, unpe, lock); - BUG_ON(err); /* inconsistent alloc and extents */ - - /* continue after the packed extent item if we exhausted extents */ - if (ret == 0) - ret = unpe->iblock + SCOUTFS_PACKEXT_BLOCKS; -out: - free_unpacked_extents(unpe); return ret; } @@ -844,6 +324,11 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, return ret; } +static inline u64 ext_last(struct scoutfs_extent *ext) +{ + return ext->start + ext->len - 1; +} + /* * The caller is writing to a logical iblock that doesn't have an * allocated extent. @@ -861,141 +346,111 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, * block. It doesn't work for concurrent stages, releasing behind * staging, sparse files, multi-node writes, etc. fallocate() is always * a better tool to use. - * - * We can mangle the extents so the caller is going to search for the - * intersecting extent again if we succeed. 
*/ static int alloc_block(struct super_block *sb, struct inode *inode, - struct unpacked_extents *unpe, - struct unpacked_extent *ext, u64 iblock, + struct scoutfs_extent *ext, u64 iblock, struct scoutfs_lock *lock) { DECLARE_DATA_INFO(sb, datinf); const u64 ino = scoutfs_ino(inode); - struct scoutfs_traced_extent te; + struct data_ext_args args = { + .ino = ino, + .inode = inode, + .lock = lock, + }; + struct scoutfs_extent found; + struct scoutfs_extent pre; u64 blkno = 0; u64 online; u64 offline; - u64 last; u8 flags; - int count; + u64 count; int ret; int err; + trace_scoutfs_data_alloc_block_enter(sb, ino, iblock, ext); + /* can only allocate over existing unallocated offline extent */ - if (WARN_ON_ONCE(ext && - !(iblock >= ext->iblock && iblock <= ext_last(ext) && - ext->blkno == 0 && (ext->flags & SEF_OFFLINE)))) + if (WARN_ON_ONCE(ext->len && + !(iblock >= ext->start && iblock <= ext_last(ext) && + ext->map == 0 && (ext->flags & SEF_OFFLINE)))) return -EINVAL; - down_write(&datinf->alloc_rwsem); + mutex_lock(&datinf->mutex); scoutfs_inode_get_onoff(inode, &online, &offline); - if (ext) { + if (ext->len) { /* limit preallocation to remaining existing (offline) extent */ - count = ext->count - (iblock - ext->iblock); + count = ext->len - (iblock - ext->start); flags = ext->flags; } else { - /* otherwise alloc to next extent or end of packed item */ - last = last_iblock(iblock); - ext = find_extent(unpe, iblock, last); - if (ext) - count = ext->iblock - iblock; + /* otherwise alloc to next extent */ + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &found); + if (ret < 0 && ret != -ENOENT) + goto out; + if (found.len && found.start > iblock) + count = found.start - iblock; else - count = last - iblock + 1; + count = SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT; flags = 0; } + /* overall prealloc limit */ + count = min_t(u64, count, SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT); + /* only strictly contiguous extending writes will try to preallocate */ if (iblock > 1 
&& iblock == online) - count = min_t(u64, iblock, count); + count = min(iblock, count); else count = 1; - ret = scoutfs_radix_alloc_data(sb, datinf->alloc, datinf->wri, - &datinf->data_avail, count, &blkno, - &count); + ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri, + &datinf->data_avail, &datinf->cached_ext, + count, &blkno, &count); if (ret < 0) goto out; - ret = set_extent(sb, inode, ino, unpe, iblock, blkno, 1, 0); + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1, blkno, 0); if (ret < 0) goto out; - init_traced_extent(&te, iblock, blkno, 1, 0); - trace_scoutfs_data_alloc_block(sb, ino, &te); - if (count > 1) { - ret = set_extent(sb, inode, ino, unpe, iblock + 1, - blkno + 1, count - 1, flags | SEF_UNWRITTEN); + pre.start = iblock + 1; + pre.len = count - 1; + pre.map = blkno + 1; + pre.flags = flags | SEF_UNWRITTEN; + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, pre.start, + pre.len, pre.map, pre.flags); if (ret < 0) { - err = set_extent(sb, inode, ino, unpe, iblock, 0, 1, - flags); + err = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, + 1, 0, flags); BUG_ON(err); /* couldn't restore original */ + goto out; } - - init_traced_extent(&te, iblock + 1, blkno + 1, count - 1, - flags | SEF_UNWRITTEN); - trace_scoutfs_data_prealloc_unwritten(sb, ino, &te); } - ret = store_packed_extents(sb, ino, unpe, lock); - BUG_ON(ret); /* inconsistent previous extent state */ - + /* tell the caller we have a single block, could check next? */ + ext->start = iblock; + ext->len = 1; + ext->map = blkno; + ext->flags = 0; + ret = 0; out: if (ret < 0 && blkno > 0) { - err = scoutfs_radix_free_data(sb, datinf->alloc, datinf->wri, - &datinf->data_freed, - blkno, count); + err = scoutfs_free_data(sb, datinf->alloc, datinf->wri, + &datinf->data_freed, blkno, count); BUG_ON(err); /* leaked free blocks */ } - up_write(&datinf->alloc_rwsem); - - return ret; -} - -/* - * A caller is writing into an unwritten block. 
This can also be called - * for staging writes so we clear both the unwritten and offline flags. - * - * We don't have to wait for dirty block IO to complete before clearing - * the unwritten flag in metadata because we have strict synchronization - * between data and metadata. All dirty data in the current transaction - * is written before the metadata in the transaction that references it - * is committed. - */ -static int convert_unwritten(struct super_block *sb, struct inode *inode, - struct unpacked_extents *unpe, - struct unpacked_extent *ext, u64 iblock, - struct scoutfs_lock *lock) -{ - struct scoutfs_traced_extent te; - u64 blkno; - u8 ext_fl; - int err; - int ret; - - blkno = ext->blkno + (iblock - ext->iblock); - ext_fl = ext->flags; - - init_traced_extent(&te, iblock, 1, blkno, ext_fl); - trace_scoutfs_data_convert_unwritten(sb, scoutfs_ino(inode), &te); - - ret = set_extent(sb, inode, scoutfs_ino(inode), unpe, iblock, - blkno, 1, ext_fl & ~(SEF_OFFLINE|SEF_UNWRITTEN)); - if (ret < 0) - goto out; - - ret = store_packed_extents(sb, scoutfs_ino(inode), unpe, lock); - if (ret < 0) { - err = set_extent(sb, inode, scoutfs_ino(inode), unpe, iblock, - blkno, 1, ext_fl); - BUG_ON(err); /* packed and unpacked inconsistent */ + if (ret == 0) { + trace_scoutfs_data_alloc(sb, ino, ext); + trace_scoutfs_data_prealloc(sb, ino, &pre); } -out: + mutex_unlock(&datinf->mutex); + return ret; } @@ -1005,10 +460,10 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, struct scoutfs_inode_info *si = SCOUTFS_I(inode); const u64 ino = scoutfs_ino(inode); struct super_block *sb = inode->i_sb; + struct data_ext_args args; struct scoutfs_lock *lock = NULL; - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext = NULL; - DECLARE_TRACED_EXTENT(te); + struct scoutfs_extent ext = {0,}; + struct scoutfs_extent un; u64 offset; int ret; @@ -1021,53 +476,60 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, goto out; } - ret = 
load_unpacked_extents(sb, ino, iblock, iblock, true, &unpe, lock); - if (ret < 0) + args.ino = ino; + args.inode = inode; + args.lock = lock; + + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext); + if (ret == -ENOENT || (ret == 0 && ext.start > iblock)) + memset(&ext, 0, sizeof(ext)); + else if (ret < 0) goto out; - ext = find_extent(unpe, iblock, iblock); + if (ext.len) + trace_scoutfs_data_get_block_found(sb, ino, &ext); /* non-staging callers should have waited on offline blocks */ - if (WARN_ON_ONCE(ext && (ext->flags & SEF_OFFLINE) && !si->staging)) { + if (WARN_ON_ONCE(ext.map && (ext.flags & SEF_OFFLINE) && !si->staging)){ ret = -EIO; goto out; } - /* convert unwritten to written */ - if (create && ext && (ext->flags & SEF_UNWRITTEN)) { - ret = convert_unwritten(sb, inode, unpe, ext, iblock, lock); + /* convert unwritten to written, could be staging */ + if (create && ext.map && (ext.flags & SEF_UNWRITTEN)) { + un.start = iblock; + un.len = 1; + un.map = ext.map + (iblock - ext.start); + un.flags = ext.flags & ~(SEF_OFFLINE|SEF_UNWRITTEN); + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, + un.start, un.len, un.map, un.flags); if (ret == 0) { + ext = un; set_buffer_new(bh); - ext = find_extent(unpe, iblock, iblock); } goto out; } /* allocate and map blocks containing our logical block */ - if (create && (!ext || !ext->blkno)) { - ret = alloc_block(sb, inode, unpe, ext, iblock, lock); - if (ret == 0) { + if (create && !ext.map) { + ret = alloc_block(sb, inode, &ext, iblock, lock); + if (ret == 0) set_buffer_new(bh); - ext = find_extent(unpe, iblock, iblock); - } } else { ret = 0; } out: /* map usable extent, else leave bh unmapped for sparse reads */ - if (ret == 0 && ext && ext->blkno && !(ext->flags & SEF_UNWRITTEN)) { - offset = iblock - ext->iblock; - map_bh(bh, inode->i_sb, ext->blkno + offset); + if (ret == 0 && ext.map && !(ext.flags & SEF_UNWRITTEN)) { + offset = iblock - ext.start; + map_bh(bh, inode->i_sb, ext.map + offset); 
bh->b_size = min_t(u64, bh->b_size, - (ext->count - offset) << SCOUTFS_BLOCK_SM_SHIFT); + (ext.len - offset) << SCOUTFS_BLOCK_SM_SHIFT); + trace_scoutfs_data_get_block_mapped(sb, ino, &ext); } - if (ext) - copy_traced_extent(&te, ext); - trace_scoutfs_get_block(sb, scoutfs_ino(inode), iblock, create, - &te, ret, bh->b_blocknr, bh->b_size); - free_unpacked_extents(unpe); + &ext, ret, bh->b_blocknr, bh->b_size); return ret; } @@ -1330,74 +792,82 @@ static int scoutfs_write_end(struct file *file, struct address_space *mapping, /* * Try to allocate unwritten extents for any unallocated regions of the - * logical block extent from the caller. We work one packed extent item - * at a time. + * logical block extent from the caller. The caller manages locks and + * transactions. We limit ourselves to a reasonable number of extents + * before returning to open another transaction. * - * We return an error or the numbet of contiguous blocks starting at - * iblock that were successfully processed. + * We return an error or the number of blocks starting at iblock that + * were successfully processed. The caller will continue after those + * blocks until they reach last. 
*/ -static int fallocate_extents(struct super_block *sb, struct inode *inode, +static s64 fallocate_extents(struct super_block *sb, struct inode *inode, u64 iblock, u64 last, struct scoutfs_lock *lock) { DECLARE_DATA_INFO(sb, datinf); - const u64 ino = scoutfs_ino(inode); - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext; + struct data_ext_args args = { + .ino = scoutfs_ino(inode), + .inode = inode, + .lock = lock, + }; + struct scoutfs_extent ext; u8 ext_fl; u64 blkno; - int count; - int done; - int ret; + u64 count; + s64 done = 0; + int ret = 0; int err; + int i; - /* work with the extents in one item at a time */ - last = min(last, last_iblock(iblock)); - done = 0; + for (i = 0; iblock <= last && i < EXTENTS_PER_HOLD; i++) { - ret = load_unpacked_extents(sb, ino, iblock, iblock, true, &unpe, lock); - if (ret < 0) - goto out; - - ext = find_extent(unpe, iblock, last); - while (iblock <= last) { + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); + if (ret == -ENOENT) + ret = 0; + else if (ret < 0) + break; /* default to allocate to end of region */ count = last - iblock + 1; ext_fl = 0; - if (!ext) { + if (!ext.len) { /* no extent, default alloc from above */ - } else if (ext->iblock <= iblock && ext->blkno) { + } else if (ext.start <= iblock && ext.map) { /* skip portion of allocated extent */ count = min_t(u64, count, - ext->count - (iblock - ext->iblock)); + ext.len - (iblock - ext.start)); iblock += count; done += count; - ext = next_extent(ext); continue; - } else if (ext->iblock <= iblock && !ext->blkno) { + } else if (ext.start <= iblock && !ext.map) { /* alloc portion of unallocated extent */ count = min_t(u64, count, - ext->count - (iblock - ext->iblock)); - ext_fl = ext->flags; + ext.len - (iblock - ext.start)); + ext_fl = ext.flags; - } else if (iblock < ext->iblock) { + } else if (iblock < ext.start) { /* alloc hole until next extent */ - count = min_t(u64, count, ext->iblock - iblock); + count = min_t(u64, 
count, ext.start - iblock); } - down_write(&datinf->alloc_rwsem); + /* limit allocation attempts */ + count = min_t(u64, count, SCOUTFS_FALLOCATE_ALLOC_LIMIT); - ret = scoutfs_radix_alloc_data(sb, datinf->alloc, datinf->wri, - &datinf->data_avail, count, - &blkno, &count); + mutex_lock(&datinf->mutex); + + ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri, + &datinf->data_avail, + &datinf->cached_ext, + count, &blkno, &count); if (ret == 0) { - ret = set_extent(sb, inode, ino, unpe, iblock, blkno, - count, ext_fl | SEF_UNWRITTEN); + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, + count, blkno, + ext_fl | SEF_UNWRITTEN); if (ret < 0) { - err = scoutfs_radix_free_data(sb, datinf->alloc, + err = scoutfs_free_data(sb, datinf->alloc, datinf->wri, &datinf->data_avail, blkno, count); @@ -1405,25 +875,18 @@ static int fallocate_extents(struct super_block *sb, struct inode *inode, } } - up_write(&datinf->alloc_rwsem); + mutex_unlock(&datinf->mutex); if (ret < 0) break; iblock += count; done += count; - ext = find_extent(unpe, iblock, last); } - ret = store_packed_extents(sb, ino, unpe, lock); - BUG_ON(ret); /* inconsistent with unpacked and alloc */ - if (ret == 0) ret = done; -out: - free_unpacked_extents(unpe); - return ret; } @@ -1447,7 +910,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) loff_t end; u64 iblock; u64 last; - int ret; + s64 ret; mutex_lock(&inode->i_mutex); @@ -1527,79 +990,56 @@ out: * on regular files with no data extents. It's used to restore a file * with an offline extent which can then trigger staging. * - * The caller has taken care of locking. We're creating many packed - * extent items which may have to be written in multiple transactions. - * We create exetnts from the front of the file and use the offline - * block count to figure out where to continue from. + * The caller has taken care of locking the inode. 
We're updating the + * inode offline count as we create the offline extent so we take care + * of the index locking, updating, and transaction. */ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size, struct scoutfs_lock *lock) { struct super_block *sb = inode->i_sb; - struct unpacked_extents *unpe = NULL; - u64 ino = scoutfs_ino(inode); + struct data_ext_args args = { + .ino = scoutfs_ino(inode), + .inode = inode, + .lock = lock, + }; + const u64 count = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SM_SIZE); LIST_HEAD(ind_locks); - bool held = false; - u64 blocks; - u64 iblock; - u64 count; u64 on; u64 off; int ret; - blocks = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SM_SIZE); - scoutfs_inode_get_onoff(inode, &on, &off); - iblock = off; - while (iblock < blocks) { - /* we're updating meta_seq with offline block count */ - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, - SIC_SETATTR_MORE()); - if (ret < 0) - goto out; - held = true; - - ret = scoutfs_dirty_inode_item(inode, lock); - if (ret < 0) - goto out; - - ret = load_unpacked_extents(sb, ino, iblock, iblock, true, - &unpe, lock); - if (ret < 0) - goto out; - - count = min(blocks - iblock, last_iblock(iblock) - iblock + 1); - - ret = set_extent(sb, inode, ino, unpe, iblock, 0, count, - SEF_OFFLINE); - if (ret < 0) - goto out; - - ret = store_packed_extents(sb, ino, unpe, lock); - if (ret < 0) - goto out; - - free_unpacked_extents(unpe); - unpe = NULL; - - scoutfs_update_inode_item(inode, lock, &ind_locks); - - scoutfs_release_trans(sb); - scoutfs_inode_index_unlock(sb, &ind_locks); - held = false; - - iblock += count; + /* caller should have checked */ + if (on > 0 || off > 0) { + ret = -EINVAL; + goto out; } + /* we're updating meta_seq with offline block count */ + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, + SIC_SETATTR_MORE()); + if (ret < 0) + goto out; + + ret = scoutfs_dirty_inode_item(inode, lock); + if (ret < 0) + goto unlock; + + ret = scoutfs_ext_insert(sb, &data_ext_ops, 
&args, + 0, count, 0, SEF_OFFLINE); + if (ret < 0) + goto unlock; + + scoutfs_update_inode_item(inode, lock, &ind_locks); + +unlock: + scoutfs_release_trans(sb); + scoutfs_inode_index_unlock(sb, &ind_locks); ret = 0; out: - if (held) { - scoutfs_release_trans(sb); - scoutfs_inode_index_unlock(sb, &ind_locks); - } - free_unpacked_extents(unpe); return ret; } @@ -1607,11 +1047,11 @@ out: * This copies to userspace :/ */ static int fill_extent(struct fiemap_extent_info *fieinfo, - struct unpacked_extent *ext, u32 fiemap_flags) + struct scoutfs_extent *ext, u32 fiemap_flags) { u32 flags; - if (ext->count == 0) + if (ext->len == 0) return 0; flags = fiemap_flags; @@ -1621,9 +1061,9 @@ static int fill_extent(struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_UNWRITTEN; return fiemap_fill_next_extent(fieinfo, - ext->iblock << SCOUTFS_BLOCK_SM_SHIFT, - ext->blkno << SCOUTFS_BLOCK_SM_SHIFT, - ext->count << SCOUTFS_BLOCK_SM_SHIFT, + ext->start << SCOUTFS_BLOCK_SM_SHIFT, + ext->map << SCOUTFS_BLOCK_SM_SHIFT, + ext->len << SCOUTFS_BLOCK_SM_SHIFT, flags); } @@ -1638,28 +1078,33 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); struct scoutfs_lock *lock = NULL; - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext; - struct unpacked_extent cur; - struct scoutfs_traced_extent te; + struct scoutfs_extent ext; + struct scoutfs_extent cur; + struct data_ext_args args; u32 last_flags; u64 iblock; u64 last; int ret; - if (len == 0) - return 0; + if (len == 0) { + ret = 0; + goto out; + } ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); if (ret) - return ret; + goto out; /* XXX overkill? 
*/ mutex_lock(&inode->i_mutex); ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock); if (ret) - goto out; + goto unlock; + + args.ino = ino; + args.inode = inode; + args.lock = lock; /* use a dummy extent to track */ memset(&cur, 0, sizeof(cur)); @@ -1668,9 +1113,9 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, iblock = start >> SCOUTFS_BLOCK_SM_SHIFT; last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT; - for (;;) { - ret = load_unpacked_extents(sb, ino, iblock, last, false, - &unpe, lock); + while (iblock <= last) { + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -1678,45 +1123,39 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, break; } - for (ext = find_extent(unpe, iblock, last); ext; - ext = next_extent(ext)) { + trace_scoutfs_data_fiemap_extent(sb, ino, &ext); - copy_traced_extent(&te, ext); - trace_scoutfs_data_fiemap_extent(sb, ino, &te); - - if (ext->iblock > last) { - /* not setting _LAST, it's for end of file */ - ret = 0; - break; - } - - if (extents_merge(&cur, ext)) { - cur.count += ext->count; - continue; - } - - ret = fill_extent(fieinfo, &cur, 0); - if (ret != 0) - goto out; - cur = *ext; + if (ext.start > last) { + /* not setting _LAST, it's for end of file */ + ret = 0; + break; } - iblock = unpe->iblock + SCOUTFS_PACKEXT_BLOCKS; - free_unpacked_extents(unpe); - unpe = NULL; + if (scoutfs_ext_can_merge(&cur, &ext)) { + /* merged extents could be greater than input len */ + cur.len += ext.len; + } else { + ret = fill_extent(fieinfo, &cur, 0); + if (ret != 0) + goto unlock; + cur = ext; + } + + iblock = ext.start + ext.len; } - if (cur.count) + if (cur.len) ret = fill_extent(fieinfo, &cur, last_flags); -out: +unlock: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); mutex_unlock(&inode->i_mutex); - free_unpacked_extents(unpe); - +out: if (ret == 1) ret = 0; + trace_scoutfs_data_fiemap(sb, start, len, 
ret); + return ret; } @@ -1803,11 +1242,14 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, { struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); + struct data_ext_args args = { + .ino = ino, + .inode = inode, + .lock = lock, + }; DECLARE_DATA_WAIT_ROOT(sb, rt); DECLARE_DATA_WAITQ(inode, wq); - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext; - DECLARE_TRACED_EXTENT(te); + struct scoutfs_extent ext = {0,}; u64 iblock; u64 last_block; u64 on; @@ -1834,50 +1276,40 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT; while(iblock <= last_block) { - - free_unpacked_extents(unpe); - ret = load_unpacked_extents(sb, ino, iblock, last_block, false, - &unpe, lock); + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); if (ret < 0) { if (ret == -ENOENT) ret = 0; - goto out; + break; } - for (ext = find_extent(unpe, iblock, last_block); ext; - ext = next_extent(ext)) { - - if (ext->iblock > last_block) { - ret = 0; - goto out; - } - - if (sef & ext->flags) { - if (dw) { - dw->chg = atomic64_read(&wq->changed); - dw->ino = ino; - dw->iblock = max(iblock, ext->iblock); - dw->op = op; - - spin_lock(&rt->lock); - insert_offline_waiting(&rt->root, dw); - spin_unlock(&rt->lock); - } - - copy_traced_extent(&te, ext); - ret = 1; - goto out; - } - + if (ext.start > last_block) { + ret = 0; + break; } - iblock = unpe->iblock + SCOUTFS_PACKEXT_BLOCKS; + if (sef & ext.flags) { + if (dw) { + dw->chg = atomic64_read(&wq->changed); + dw->ino = ino; + dw->iblock = max(iblock, ext.start); + dw->op = op; + + spin_lock(&rt->lock); + insert_offline_waiting(&rt->root, dw); + spin_unlock(&rt->lock); + } + + ret = 1; + break; + } + + iblock = ext.start + ext.len; } out: - trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &te, ret); - - free_unpacked_extents(unpe); + trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &ext, 
ret); return ret; } @@ -2019,20 +1451,20 @@ const struct file_operations scoutfs_file_fops = { }; void scoutfs_data_init_btrees(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_log_trees *lt) { DECLARE_DATA_INFO(sb, datinf); - down_write(&datinf->alloc_rwsem); + mutex_lock(&datinf->mutex); datinf->alloc = alloc; datinf->wri = wri; datinf->data_avail = lt->data_avail; datinf->data_freed = lt->data_freed; - up_write(&datinf->alloc_rwsem); + mutex_unlock(&datinf->mutex); } void scoutfs_data_get_btrees(struct super_block *sb, @@ -2040,12 +1472,38 @@ void scoutfs_data_get_btrees(struct super_block *sb, { DECLARE_DATA_INFO(sb, datinf); - down_read(&datinf->alloc_rwsem); + mutex_lock(&datinf->mutex); lt->data_avail = datinf->data_avail; lt->data_freed = datinf->data_freed; - up_read(&datinf->alloc_rwsem); + mutex_unlock(&datinf->mutex); +} + +/* + * This should be called before preparing the allocators for the commit + * because it can allocate and free btree blocks in the data allocator. 
+ */ +int scoutfs_data_prepare_commit(struct super_block *sb) +{ + DECLARE_DATA_INFO(sb, datinf); + int ret; + + mutex_lock(&datinf->mutex); + if (datinf->cached_ext.len) { + ret = scoutfs_free_data(sb, datinf->alloc, datinf->wri, + &datinf->data_avail, + datinf->cached_ext.start, + datinf->cached_ext.len); + if (ret == 0) + memset(&datinf->cached_ext, 0, + sizeof(datinf->cached_ext)); + } else { + ret = 0; + } + mutex_unlock(&datinf->mutex); + + return ret; } /* @@ -2055,8 +1513,8 @@ u64 scoutfs_data_alloc_free_bytes(struct super_block *sb) { DECLARE_DATA_INFO(sb, datinf); - return scoutfs_radix_root_free_blocks(sb, &datinf->data_avail) << - SCOUTFS_BLOCK_SM_SHIFT; + return le64_to_cpu(datinf->data_avail.total_len) << + SCOUTFS_BLOCK_SM_SHIFT; } int scoutfs_data_setup(struct super_block *sb) @@ -2069,7 +1527,7 @@ int scoutfs_data_setup(struct super_block *sb) return -ENOMEM; datinf->sb = sb; - init_rwsem(&datinf->alloc_rwsem); + mutex_init(&datinf->mutex); sbi->data_info = datinf; return 0; diff --git a/kmod/src/data.h b/kmod/src/data.h index b4ee7344..09a64fe7 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -47,7 +47,7 @@ struct scoutfs_traced_extent { extern const struct address_space_operations scoutfs_file_aops; extern const struct file_operations scoutfs_file_fops; -struct scoutfs_radix_allocator; +struct scoutfs_alloc; struct scoutfs_block_writer; int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, @@ -77,11 +77,12 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock, unsigned int nr); void scoutfs_data_init_btrees(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_log_trees *lt); void scoutfs_data_get_btrees(struct super_block *sb, struct scoutfs_log_trees *lt); +int scoutfs_data_prepare_commit(struct super_block *sb); u64 scoutfs_data_alloc_free_bytes(struct super_block *sb); int scoutfs_data_setup(struct 
super_block *sb); diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 2d53a1d9..49a255e7 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -20,7 +20,7 @@ #include "lock.h" #include "btree.h" #include "client.h" -#include "radix.h" +#include "alloc.h" #include "block.h" #include "forest.h" #include "hash.h" @@ -53,7 +53,7 @@ struct forest_info { struct mutex mutex; - struct scoutfs_radix_allocator *alloc; + struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; struct scoutfs_log_trees our_log; @@ -421,22 +421,22 @@ int scoutfs_forest_set_bloom_bits(struct super_block *sb, if (!ref->blkno || !scoutfs_block_writer_is_dirty(sb, bl)) { - ret = scoutfs_radix_alloc(sb, finf->alloc, finf->wri, &blkno); + ret = scoutfs_alloc_meta(sb, finf->alloc, finf->wri, &blkno); if (ret < 0) goto unlock; new_bl = scoutfs_block_create(sb, blkno); if (IS_ERR(new_bl)) { - err = scoutfs_radix_free(sb, finf->alloc, finf->wri, - blkno); + err = scoutfs_free_meta(sb, finf->alloc, finf->wri, + blkno); BUG_ON(err); /* could have dirtied */ ret = PTR_ERR(new_bl); goto unlock; } if (bl) { - err = scoutfs_radix_free(sb, finf->alloc, finf->wri, - le64_to_cpu(ref->blkno)); + err = scoutfs_free_meta(sb, finf->alloc, finf->wri, + le64_to_cpu(ref->blkno)); BUG_ON(err); /* could have dirtied */ memcpy(new_bl->data, bl->data, SCOUTFS_BLOCK_LG_SIZE); } else { @@ -517,7 +517,7 @@ int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id) * serialized with all writers. 
*/ void scoutfs_forest_init_btrees(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_log_trees *lt) { diff --git a/kmod/src/forest.h b/kmod/src/forest.h index 6d0c0c8c..e6e72a4a 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -1,7 +1,7 @@ #ifndef _SCOUTFS_FOREST_H_ #define _SCOUTFS_FOREST_H_ -struct scoutfs_radix_allocator; +struct scoutfs_alloc; struct scoutfs_block_writer; struct scoutfs_block; @@ -28,7 +28,7 @@ int scoutfs_forest_insert_list(struct super_block *sb, int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id); void scoutfs_forest_init_btrees(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_log_trees *lt); void scoutfs_forest_get_btrees(struct super_block *sb, diff --git a/kmod/src/format.h b/kmod/src/format.h index 15bd7d92..d5a78ade 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -144,10 +144,10 @@ struct scoutfs_key { #define sks_ino _sk_first #define sks_nr _sk_second -/* packed extents */ -#define skpe_ino _sk_first -#define skpe_base _sk_second -#define skpe_part _sk_fourth +/* data extents */ +#define skdx_ino _sk_first +#define skdx_end _sk_second +#define skdx_len _sk_third /* log trees */ #define sklt_rid _sk_first @@ -163,6 +163,13 @@ struct scoutfs_key { /* mounted clients */ #define skmc_rid _sk_first +/* free extents by blkno */ +#define skfb_end _sk_second +#define skfb_len _sk_third +/* free extents by len */ +#define skfl_neglen _sk_second +#define skfl_blkno _sk_third + struct scoutfs_radix_block { struct scoutfs_block_header hdr; union { @@ -386,8 +393,8 @@ struct scoutfs_srch_block { #define SCOUTFS_SRCH_COMPACT_NR (1 << SCOUTFS_SRCH_COMPACT_ORDER) struct scoutfs_srch_compact_input { - struct scoutfs_radix_root meta_avail; - struct scoutfs_radix_root meta_freed; + struct scoutfs_alloc_list_head 
meta_avail; + struct scoutfs_alloc_list_head meta_freed; __le64 id; __u8 nr; __u8 flags; @@ -395,8 +402,8 @@ struct scoutfs_srch_compact_input { } __packed; struct scoutfs_srch_compact_result { - struct scoutfs_radix_root meta_avail; - struct scoutfs_radix_root meta_freed; + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; __le64 id; __u8 flags; struct scoutfs_srch_file sfl; @@ -413,24 +420,24 @@ struct scoutfs_srch_compact_result { * about item logs, it's about clients making changes to trees. */ struct scoutfs_log_trees { - struct scoutfs_radix_root meta_avail; - struct scoutfs_radix_root meta_freed; + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; struct scoutfs_btree_root item_root; struct scoutfs_btree_ref bloom_ref; - struct scoutfs_radix_root data_avail; - struct scoutfs_radix_root data_freed; + struct scoutfs_alloc_root data_avail; + struct scoutfs_alloc_root data_freed; struct scoutfs_srch_file srch_file; __le64 rid; __le64 nr; } __packed; struct scoutfs_log_trees_val { - struct scoutfs_radix_root meta_avail; - struct scoutfs_radix_root meta_freed; + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; struct scoutfs_btree_root item_root; struct scoutfs_btree_ref bloom_ref; - struct scoutfs_radix_root data_avail; - struct scoutfs_radix_root data_freed; + struct scoutfs_alloc_root data_avail; + struct scoutfs_alloc_root data_freed; struct scoutfs_srch_file srch_file; } __packed; @@ -482,6 +489,7 @@ struct scoutfs_bloom_block { #define SCOUTFS_TRANS_SEQ_ZONE 8 #define SCOUTFS_MOUNTED_CLIENT_ZONE 9 #define SCOUTFS_SRCH_ZONE 10 +#define SCOUTFS_FREE_EXTENT_ZONE 11 /* inode index zone */ #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1 @@ -498,7 +506,7 @@ struct scoutfs_bloom_block { #define SCOUTFS_READDIR_TYPE 4 #define SCOUTFS_LINK_BACKREF_TYPE 5 #define SCOUTFS_SYMLINK_TYPE 6 -#define SCOUTFS_PACKED_EXTENT_TYPE 7 +#define SCOUTFS_DATA_EXTENT_TYPE 7 
/* lock zone, only ever found in lock ranges, never in persistent items */ #define SCOUTFS_RENAME_TYPE 1 @@ -508,6 +516,10 @@ struct scoutfs_bloom_block { #define SCOUTFS_SRCH_BLOCKS_TYPE 2 #define SCOUTFS_SRCH_BUSY_TYPE 3 +/* free extents in allocator btrees in client and server, by blkno or len */ +#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1 +#define SCOUTFS_FREE_EXTENT_LEN_TYPE 2 + /* * The extents that map blocks in a fixed-size logical region of a file * are packed and stored in item values. The packed extents are @@ -539,6 +551,12 @@ struct scoutfs_packed_extent { #define SCOUTFS_PACKEXT_BASE_MASK (~((__u64)SCOUTFS_PACKEXT_BLOCKS - 1)) #define SCOUTFS_PACKEXT_MAX_BYTES SCOUTFS_MAX_VAL_SIZE +/* file data extents have start and len in key */ +struct scoutfs_data_extent_val { + __le64 blkno; + __u8 flags; +} __packed; + #define SEF_OFFLINE (1 << 0) #define SEF_UNWRITTEN (1 << 1) #define SEF_UNKNOWN (U8_MAX << 2) @@ -623,10 +641,10 @@ struct scoutfs_super_block { __le64 unmount_barrier; __u8 quorum_count; struct scoutfs_inet_addr server_addr; - struct scoutfs_radix_root core_meta_avail; - struct scoutfs_radix_root core_meta_freed; - struct scoutfs_radix_root core_data_avail; - struct scoutfs_radix_root core_data_freed; + struct scoutfs_alloc_root meta_alloc[2]; + struct scoutfs_alloc_root data_alloc; + struct scoutfs_alloc_list_head server_meta_avail[2]; + struct scoutfs_alloc_list_head server_meta_freed[2]; struct scoutfs_btree_root fs_root; struct scoutfs_btree_root logs_root; struct scoutfs_btree_root lock_clients; diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c index 5ef53cdd..ca590635 100644 --- a/kmod/src/lock_server.c +++ b/kmod/src/lock_server.c @@ -20,7 +20,6 @@ #include "tseq.h" #include "spbm.h" #include "block.h" -#include "radix.h" #include "btree.h" #include "msg.h" #include "scoutfs_trace.h" @@ -87,7 +86,7 @@ struct lock_server_info { struct scoutfs_tseq_tree tseq_tree; struct dentry *tseq_dentry; - struct scoutfs_radix_allocator *alloc; 
+ struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; }; @@ -956,7 +955,7 @@ static void lock_server_tseq_show(struct seq_file *m, * we time them out. */ int scoutfs_lock_server_setup(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); diff --git a/kmod/src/lock_server.h b/kmod/src/lock_server.h index 99c82b8d..c4fe5621 100644 --- a/kmod/src/lock_server.h +++ b/kmod/src/lock_server.h @@ -12,7 +12,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid, int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid); int scoutfs_lock_server_setup(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri); void scoutfs_lock_server_destroy(struct super_block *sb); diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index a4d58bca..0465dd13 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -170,35 +170,35 @@ TRACE_EVENT(scoutfs_data_fallocate, ); TRACE_EVENT(scoutfs_data_fiemap, - TP_PROTO(struct super_block *sb, __u64 off, int i, __u64 blkno), + TP_PROTO(struct super_block *sb, __u64 start, __u64 len, int ret), - TP_ARGS(sb, off, i, blkno), + TP_ARGS(sb, start, len, ret), TP_STRUCT__entry( SCSB_TRACE_FIELDS - __field(__u64, off) - __field(int, i) - __field(__u64, blkno) + __field(__u64, start) + __field(__u64, len) + __field(int, ret) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); - __entry->off = off; - __entry->i = i; - __entry->blkno = blkno; + __entry->start = start; + __entry->len = len; + __entry->ret = ret; ), - TP_printk(SCSBF" blk_off %llu i %u blkno %llu", SCSB_TRACE_ARGS, - __entry->off, __entry->i, __entry->blkno) + TP_printk(SCSBF" start %llu len %llu ret %d", SCSB_TRACE_ARGS, + __entry->start, __entry->len, __entry->ret) ); TRACE_EVENT(scoutfs_get_block, TP_PROTO(struct super_block *sb, __u64 ino, __u64 
iblock, - int create, struct scoutfs_traced_extent *te, + int create, struct scoutfs_extent *ext, int ret, __u64 blkno, size_t size), - TP_ARGS(sb, ino, iblock, create, te, ret, blkno, size), + TP_ARGS(sb, ino, iblock, create, ext, ret, blkno, size), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -216,7 +216,7 @@ TRACE_EVENT(scoutfs_get_block, __entry->ino = ino; __entry->iblock = iblock; __entry->create = create; - STE_ASSIGN(ext, te) + STE_ASSIGN(ext, ext) __entry->ret = ret; __entry->blkno = blkno; __entry->size = size; @@ -228,11 +228,35 @@ TRACE_EVENT(scoutfs_get_block, __entry->blkno, __entry->size) ); -TRACE_EVENT(scoutfs_data_file_extent_class, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), +TRACE_EVENT(scoutfs_data_alloc_block_enter, + TP_PROTO(struct super_block *sb, __u64 ino, __u64 iblock, + struct scoutfs_extent *ext), - TP_ARGS(sb, ino, te), + TP_ARGS(sb, ino, iblock, ext), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, ino) + __field(__u64, iblock) + STE_FIELDS(ext) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->ino = ino; + __entry->iblock = iblock; + STE_ASSIGN(ext, ext) + ), + + TP_printk(SCSBF" ino %llu iblock %llu ext "STE_FMT, + SCSB_TRACE_ARGS, __entry->ino, __entry->iblock, + STE_ENTRY_ARGS(ext)) +); + +DECLARE_EVENT_CLASS(scoutfs_data_file_extent_class, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + + TP_ARGS(sb, ino, ext), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -243,36 +267,35 @@ TRACE_EVENT(scoutfs_data_file_extent_class, TP_fast_assign( SCSB_TRACE_ASSIGN(sb); __entry->ino = ino; - STE_ASSIGN(ext, te) + STE_ASSIGN(ext, ext) ), TP_printk(SCSBF" ino %llu ext "STE_FMT, SCSB_TRACE_ARGS, __entry->ino, STE_ENTRY_ARGS(ext)) ); -DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_alloc_block, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) +DEFINE_EVENT(scoutfs_data_file_extent_class, 
scoutfs_data_alloc, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); -DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_convert_unwritten, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) +DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_prealloc, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); -DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_prealloc_unwritten, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) +DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_get_block_found, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) +); +DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_get_block_mapped, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_extent_truncated, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); TRACE_EVENT(scoutfs_data_truncate_items, @@ -300,9 +323,9 @@ TRACE_EVENT(scoutfs_data_truncate_items, TRACE_EVENT(scoutfs_data_wait_check, TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u64 len, - __u8 sef, __u8 op, struct scoutfs_traced_extent *te, int ret), + __u8 sef, __u8 op, struct scoutfs_extent *ext, int ret), - TP_ARGS(sb, ino, pos, len, sef, op, te, ret), + TP_ARGS(sb, ino, pos, len, sef, op, 
ext, ret), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -322,7 +345,7 @@ TRACE_EVENT(scoutfs_data_wait_check, __entry->len = len; __entry->sef = sef; __entry->op = op; - STE_ASSIGN(ext, te) + STE_ASSIGN(ext, ext) __entry->ret = ret; ), diff --git a/kmod/src/server.c b/kmod/src/server.c index 751ea274..48e107ce 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -26,7 +26,6 @@ #include "counters.h" #include "inode.h" #include "block.h" -#include "radix.h" #include "btree.h" #include "scoutfs_trace.h" #include "msg.h" @@ -37,6 +36,7 @@ #include "quorum.h" #include "trans.h" #include "srch.h" +#include "alloc.h" /* * Every active mount can act as the server that listens on a net @@ -66,13 +66,10 @@ struct server_info { struct rw_semaphore commit_rwsem; struct llist_head commit_waiters; struct work_struct commit_work; - bool prepared_commit; /* server tracks seq use */ struct rw_semaphore seq_rwsem; - struct rw_semaphore alloc_rwsem; - struct list_head clients; unsigned long nr_clients; @@ -81,7 +78,15 @@ struct server_info { struct list_head farewell_requests; struct work_struct farewell_work; - struct scoutfs_radix_allocator alloc; + struct mutex alloc_mutex; + /* swap between two fs meta roots to increase time to reuse */ + struct scoutfs_alloc_root *meta_avail; + struct scoutfs_alloc_root *meta_freed; + /* server's meta allocators alternate between persistent heads */ + struct scoutfs_alloc alloc; + int other_ind; + struct scoutfs_alloc_list_head *other_avail; + struct scoutfs_alloc_list_head *other_freed; struct scoutfs_block_writer wri; struct mutex logs_mutex; @@ -119,15 +124,7 @@ static void stop_server(struct server_info *server) /* * Hold the shared rwsem that lets multiple holders modify blocks in the * current commit and prevents the commit worker from acquiring the - * exclusive write lock to write the commit. This can fail for the - * first holder failing to prepare a new commit. - * - * We reclaim the server's stable meta_freed blocks. 
This is run before - * anything has modified allocators in the server. We know that the - * stable meta_freed tree in the super contains all the stable free - * blocks which can be merged back into avail. We reference the stable - * freed tree in the super because the server allocator's freed tree is - * going to be added to as blocks are freed during the merge. + * exclusive write lock to write the commit. * * This is exported for server components isolated in their own files * (lock_server) and which are not called directly by the server core @@ -135,43 +132,13 @@ static void stop_server(struct server_info *server) */ int scoutfs_server_hold_commit(struct super_block *sb) { - struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; DECLARE_SERVER_INFO(sb, server); - u64 tot; - int ret = 0; scoutfs_inc_counter(sb, server_commit_hold); down_read(&server->commit_rwsem); - while (!server->prepared_commit) { - up_read(&server->commit_rwsem); - down_write(&server->commit_rwsem); - - if (!server->prepared_commit) { - scoutfs_inc_counter(sb, server_commit_prepare); - BUG_ON(scoutfs_block_writer_dirty_bytes(sb, - &server->wri)); - tot = le64_to_cpu(super->core_meta_freed.ref.sm_total); - - ret = scoutfs_radix_merge(sb, &server->alloc, - &server->wri, - &server->alloc.avail, - &server->alloc.freed, - &super->core_meta_freed, - true, tot); - if (ret == 0) - server->prepared_commit = true; - } - - up_write(&server->commit_rwsem); - if (ret < 0) - break; - - down_read(&server->commit_rwsem); - } - - return ret; + return 0; } /* @@ -214,18 +181,6 @@ int scoutfs_server_apply_commit(struct super_block *sb, int err) return err; } -/* - * The caller is about to overwrite a ref to an alloc tree. As we do - * so we update the given super free block counter with the difference - * between the old and new allocator roots. 
- */ -static void update_free_blocks(__le64 *blocks, struct scoutfs_radix_root *prev, - struct scoutfs_radix_root *next) -{ - le64_add_cpu(blocks, le64_to_cpu(next->ref.sm_total) - - le64_to_cpu(prev->ref.sm_total)); -} - void scoutfs_server_get_roots(struct super_block *sb, struct scoutfs_net_roots *roots) { @@ -286,6 +241,31 @@ static void scoutfs_server_commit_func(struct work_struct *work) down_write(&server->commit_rwsem); + /* make sure next avail has sufficient blocks */ + ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri, + server->other_avail, + server->meta_avail, + SCOUTFS_SERVER_META_FILL_LO, + SCOUTFS_SERVER_META_FILL_TARGET); + if (ret) { + scoutfs_err(sb, "server error refilling avail: %d", ret); + goto out; + } + + /* merge freed blocks into extents, might be partial */ + ret = scoutfs_alloc_empty_list(sb, &server->alloc, &server->wri, + server->meta_freed, + server->other_freed); + if (ret) { + scoutfs_err(sb, "server error emptying freed: %d", ret); + goto out; + } + + ret = scoutfs_alloc_prepare_commit(sb, &server->alloc, &server->wri); + if (ret < 0) { + scoutfs_err(sb, "server error prepare alloc commit: %d", ret); + goto out; + } ret = scoutfs_block_writer_write(sb, &server->wri); if (ret) { @@ -293,13 +273,8 @@ static void scoutfs_server_commit_func(struct work_struct *work) goto out; } - update_free_blocks(&super->free_meta_blocks, &super->core_meta_avail, - &server->alloc.avail); - update_free_blocks(&super->free_meta_blocks, &super->core_meta_freed, - &server->alloc.freed); - - super->core_meta_avail = server->alloc.avail; - super->core_meta_freed = server->alloc.freed; + super->server_meta_avail[server->other_ind ^ 1] = server->alloc.avail; + super->server_meta_freed[server->other_ind ^ 1] = server->alloc.freed; ret = scoutfs_write_super(sb, super); if (ret) { @@ -307,9 +282,23 @@ static void scoutfs_server_commit_func(struct work_struct *work) goto out; } - server->prepared_commit = false; set_roots(server, &super->fs_root, 
&super->logs_root, &super->srch_root); + + /* swizzle the active and idle server alloc/freed heads */ + server->other_ind ^= 1; + server->alloc.avail = super->server_meta_avail[server->other_ind ^ 1]; + server->alloc.freed = super->server_meta_freed[server->other_ind ^ 1]; + server->other_avail = &super->server_meta_avail[server->other_ind]; + server->other_freed = &super->server_meta_freed[server->other_ind]; + + /* swap avail/free if avail gets low and freed is high */ + if (le64_to_cpu(server->meta_avail->total_len) <= + SCOUTFS_SERVER_META_ALLOC_MIN && + le64_to_cpu(server->meta_freed->total_len) > + SCOUTFS_SERVER_META_ALLOC_MIN) + swap(server->meta_avail, server->meta_freed); + ret = 0; out: node = llist_del_all(&server->commit_waiters); @@ -362,6 +351,34 @@ out: return scoutfs_net_response(sb, conn, cmd, id, ret, &ial, sizeof(ial)); } +/* + * Refill the destination root if it's fallen below the lo threshold by + * moving from the src root to bring it up to the target. + */ +static int alloc_move_refill(struct super_block *sb, + struct scoutfs_alloc_root *dst, + struct scoutfs_alloc_root *src, u64 lo, u64 target) +{ + DECLARE_SERVER_INFO(sb, server); + + if (le64_to_cpu(dst->total_len) >= lo) + return 0; + + return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src, + min(target - le64_to_cpu(dst->total_len), + le64_to_cpu(src->total_len))); +} + +static int alloc_move_empty(struct super_block *sb, + struct scoutfs_alloc_root *dst, + struct scoutfs_alloc_root *src) +{ + DECLARE_SERVER_INFO(sb, server); + + return scoutfs_alloc_move(sb, &server->alloc, &server->wri, + dst, src, le64_to_cpu(src->total_len)); +} + /* * Give the client roots to all the trees that they'll use to build * their transaction. 
@@ -383,8 +400,6 @@ static int server_get_log_trees(struct super_block *sb, struct scoutfs_log_trees_val ltv; struct scoutfs_log_trees lt; struct scoutfs_key key; - u64 count; - u64 target; int ret; if (arg_len != 0) { @@ -422,50 +437,25 @@ static int server_get_log_trees(struct super_block *sb, key.sklt_rid = cpu_to_le64(rid); key.sklt_nr = cpu_to_le64(1); memset(&ltv, 0, sizeof(ltv)); - scoutfs_radix_root_init(sb, &ltv.meta_avail, true); - scoutfs_radix_root_init(sb, &ltv.meta_freed, true); - scoutfs_radix_root_init(sb, &ltv.data_avail, false); - scoutfs_radix_root_init(sb, &ltv.data_freed, false); } - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.avail, - &ltv.meta_freed, &ltv.meta_freed, true, - le64_to_cpu(ltv.meta_freed.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &super->core_data_avail, - &ltv.data_freed, &ltv.data_freed, false, - le64_to_cpu(ltv.data_freed.ref.sm_total)); + /* return freed to server for emptying, refill avail */ + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, + &ltv.meta_freed) ?: + alloc_move_empty(sb, &super->data_alloc, &ltv.data_freed) ?: + scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri, + &ltv.meta_avail, server->meta_avail, + SCOUTFS_SERVER_META_FILL_LO, + SCOUTFS_SERVER_META_FILL_TARGET) ?: + alloc_move_refill(sb, &ltv.data_avail, &super->data_alloc, + SCOUTFS_SERVER_DATA_FILL_LO, + SCOUTFS_SERVER_DATA_FILL_TARGET); + mutex_unlock(&server->alloc_mutex); if (ret < 0) goto unlock; - /* ensure client has enough free metadata blocks for a transaction */ - target = (64*1024*1024) / SCOUTFS_BLOCK_LG_SIZE; - if (le64_to_cpu(ltv.meta_avail.ref.sm_total) < target) { - count = target - le64_to_cpu(ltv.meta_avail.ref.sm_total); - - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &ltv.meta_avail, - &server->alloc.avail, - &server->alloc.avail, true, count); - if (ret < 0) - goto unlock; - } - - /* ensure
client has enough free data blocks for a transaction */ - target = SCOUTFS_TRANS_DATA_ALLOC_HWM / SCOUTFS_BLOCK_SM_SIZE; - if (le64_to_cpu(ltv.data_avail.ref.sm_total) < target) { - count = target - le64_to_cpu(ltv.data_avail.ref.sm_total); - - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &ltv.data_avail, - &super->core_data_avail, - &super->core_data_avail, false, - count); - if (ret < 0) - goto unlock; - } - /* update client's log tree's item */ ret = scoutfs_btree_force(sb, &server->alloc, &server->wri, &super->logs_root, &key, &ltv, sizeof(ltv)); @@ -553,21 +543,12 @@ static int server_commit_log_trees(struct super_block *sb, goto unlock; } - update_free_blocks(&super->free_meta_blocks, &ltv.meta_avail, - &lt->meta_avail); - update_free_blocks(&super->free_meta_blocks, &ltv.meta_freed, - &lt->meta_freed); - update_free_blocks(&super->free_data_blocks, &ltv.data_avail, - &lt->data_avail); - update_free_blocks(&super->free_data_blocks, &ltv.data_freed, - &lt->data_freed); - ltv.meta_avail = lt->meta_avail; ltv.meta_freed = lt->meta_freed; - ltv.item_root = lt->item_root; - ltv.bloom_ref = lt->bloom_ref; ltv.data_avail = lt->data_avail; ltv.data_freed = lt->data_freed; + ltv.item_root = lt->item_root; + ltv.bloom_ref = lt->bloom_ref; ltv.srch_file = lt->srch_file; ret = scoutfs_btree_update(sb, &server->alloc, &server->wri, @@ -638,7 +619,6 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid) int err; mutex_lock(&server->logs_mutex); - down_write(&server->alloc_rwsem); /* find the client's existing item */ scoutfs_key_init_log_trees(&key, rid, 0); @@ -662,32 +642,25 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid) /* * All of these can return errors after having modified the - * radix trees. We have to try and update the roots in the + * allocator trees. We have to try and update the roots in the * log item.
*/ - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.avail, - &ltv.meta_avail, &ltv.meta_avail, true, - le64_to_cpu(ltv.meta_avail.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.avail, - &ltv.meta_freed, &ltv.meta_freed, true, - le64_to_cpu(ltv.meta_freed.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &super->core_data_avail, - &ltv.data_avail, &ltv.data_avail, false, - le64_to_cpu(ltv.data_avail.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &super->core_data_avail, - &ltv.data_freed, &ltv.data_freed, false, - le64_to_cpu(ltv.data_freed.ref.sm_total)); + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, + &ltv.meta_freed) ?: + scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, + &ltv.meta_avail) ?: + alloc_move_empty(sb, &super->data_alloc, &ltv.data_avail) ?: + alloc_move_empty(sb, &super->data_alloc, &ltv.data_freed); + mutex_unlock(&server->alloc_mutex); err = scoutfs_btree_update(sb, &server->alloc, &server->wri, &super->logs_root, &key, &ltv, sizeof(ltv)); BUG_ON(err != 0); /* alloc and log item roots out of sync */ out: - up_write(&server->alloc_rwsem); mutex_unlock(&server->logs_mutex); return ret; @@ -892,14 +865,14 @@ static int server_statfs(struct super_block *sb, nstatfs.next_ino = super->next_ino; spin_unlock(&sbi->next_ino_lock); - down_read(&server->alloc_rwsem); + mutex_lock(&server->alloc_mutex); nstatfs.total_blocks = le64_lg_to_sm(super->total_meta_blocks); le64_add_cpu(&nstatfs.total_blocks, le64_to_cpu(super->total_data_blocks)); nstatfs.bfree = le64_lg_to_sm(super->free_meta_blocks); le64_add_cpu(&nstatfs.bfree, le64_to_cpu(super->free_data_blocks)); - up_read(&server->alloc_rwsem); + mutex_unlock(&server->alloc_mutex); ret = 0; } else { ret = -EINVAL; @@ -1002,8 +975,6 @@ static int server_srch_get_compact(struct super_block *sb, int
i; memset(&scin, 0, sizeof(scin)); - scoutfs_radix_root_init(sb, &scin.meta_avail, true); - scoutfs_radix_root_init(sb, &scin.meta_freed, true); if (arg_len != 0) { ret = -EINVAL; @@ -1028,9 +999,11 @@ static int server_srch_get_compact(struct super_block *sb, for (i = 0; i < scin.nr; i++) blocks += le64_to_cpu(scin.sfl[i].blocks); blocks *= 3; - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &scin.meta_avail, &server->alloc.avail, - &server->alloc.avail, true, blocks); + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri, + &scin.meta_avail, server->meta_avail, + blocks, blocks); + mutex_unlock(&server->alloc_mutex); if (ret < 0) goto apply; @@ -1047,6 +1020,12 @@ out: &scin, sizeof(scin)); } +/* + * Commit the client's compaction. Their freed allocator contains the + * source srch files blocks that are currently in use which can't be + * available for allocation until after the commit. We move them into + * freed so they won't satisfy allocations. 
+ */ static int server_srch_commit_compact(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) @@ -1056,8 +1035,8 @@ static int server_srch_commit_compact(struct super_block *sb, struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; struct scoutfs_srch_compact_result *scres; - struct scoutfs_radix_root av; - struct scoutfs_radix_root fr; + struct scoutfs_alloc_list_head av; + struct scoutfs_alloc_list_head fr; int ret; scres = arg; @@ -1078,15 +1057,12 @@ static int server_srch_commit_compact(struct super_block *sb, if (ret < 0) /* XXX very bad, leaks allocators */ goto apply; - /* XXX like all merges, doesn't reclaim allocator blocks themselves */ - - /* merge the client's allocators into freed, commit before reuse */ - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.freed, &av, &av, true, - le64_to_cpu(av.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.freed, &fr, &fr, true, - le64_to_cpu(fr.ref.sm_total)); + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, &av) ?: + scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, &fr); + mutex_unlock(&server->alloc_mutex); apply: ret = scoutfs_server_apply_commit(sb, ret); out: @@ -1149,14 +1125,15 @@ static int delete_mounted_client(struct super_block *sb, u64 rid) /* * Remove all the busy items for srch compactions that the mount might - * have been responsible for and reclaim all their allocators. + * have been responsible for and reclaim all their allocators. The freed + * allocator could still contain stable srch file blknos. 
*/ static int cancel_srch_compact(struct super_block *sb, u64 rid) { DECLARE_SERVER_INFO(sb, server); struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct scoutfs_radix_root av; - struct scoutfs_radix_root fr; + struct scoutfs_alloc_list_head av; + struct scoutfs_alloc_list_head fr; int ret; for (;;) { @@ -1172,12 +1149,14 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid) break; } - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.freed, &av, &av, true, - le64_to_cpu(av.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.freed, &fr, &fr, true, - le64_to_cpu(fr.ref.sm_total)); + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, + server->other_freed, &av) ?: + scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, + server->other_freed, &fr); + mutex_unlock(&server->alloc_mutex); if (WARN_ON_ONCE(ret < 0)) break; } @@ -1650,10 +1629,27 @@ static void scoutfs_server_worker(struct work_struct *work) set_roots(server, &super->fs_root, &super->logs_root, &super->srch_root); - scoutfs_radix_init_alloc(&server->alloc, &super->core_meta_avail, - &super->core_meta_freed); scoutfs_block_writer_init(sb, &server->wri); + /* prepare server alloc for this transaction, larger first */ + if (le64_to_cpu(super->server_meta_avail[0].total_nr) < + le64_to_cpu(super->server_meta_avail[1].total_nr)) + server->other_ind = 0; + else + server->other_ind = 1; + scoutfs_alloc_init(&server->alloc, + &super->server_meta_avail[server->other_ind ^ 1], + &super->server_meta_freed[server->other_ind ^ 1]); + server->other_avail = &super->server_meta_avail[server->other_ind]; + server->other_freed = &super->server_meta_freed[server->other_ind]; + + /* use largest meta_alloc to start */ + server->meta_avail = &super->meta_alloc[0]; + server->meta_freed = &super->meta_alloc[1]; + if (le64_to_cpu(server->meta_freed->total_len) > + 
le64_to_cpu(server->meta_avail->total_len)) + swap(server->meta_avail, server->meta_freed); + ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri); if (ret) goto shutdown; @@ -1783,11 +1779,11 @@ int scoutfs_server_setup(struct super_block *sb) init_llist_head(&server->commit_waiters); INIT_WORK(&server->commit_work, scoutfs_server_commit_func); init_rwsem(&server->seq_rwsem); - init_rwsem(&server->alloc_rwsem); INIT_LIST_HEAD(&server->clients); mutex_init(&server->farewell_mutex); INIT_LIST_HEAD(&server->farewell_requests); INIT_WORK(&server->farewell_work, farewell_worker); + mutex_init(&server->alloc_mutex); mutex_init(&server->logs_mutex); mutex_init(&server->srch_mutex); seqcount_init(&server->roots_seqcount); diff --git a/kmod/src/srch.c b/kmod/src/srch.c index db45fba8..2ecae4fb 100644 --- a/kmod/src/srch.c +++ b/kmod/src/srch.c @@ -23,7 +23,7 @@ #include "format.h" #include "counters.h" #include "block.h" -#include "radix.h" +#include "alloc.h" #include "srch.h" #include "btree.h" #include "spbm.h" @@ -309,7 +309,7 @@ enum { GFB_DIRTY = (1 << 1), }; static int get_file_block(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl, int gfb, u64 blk, struct scoutfs_block **bl_ret) @@ -335,7 +335,7 @@ static int get_file_block(struct super_block *sb, goto out; } - ret = scoutfs_radix_alloc(sb, alloc, wri, &blkno); + ret = scoutfs_alloc_meta(sb, alloc, wri, &blkno); if (ret < 0) goto out; @@ -383,7 +383,7 @@ static int get_file_block(struct super_block *sb, /* allocate a new block if we need it */ if (!ref->blkno || ((gfb & GFB_DIRTY) && !scoutfs_block_writer_is_dirty(sb, bl))) { - ret = scoutfs_radix_alloc(sb, alloc, wri, &blkno); + ret = scoutfs_alloc_meta(sb, alloc, wri, &blkno); if (ret < 0) goto out; @@ -395,8 +395,8 @@ static int get_file_block(struct super_block *sb, if (bl) { /* cow old block if we have one */ - ret = 
scoutfs_radix_free(sb, alloc, wri, - bl->blkno); + ret = scoutfs_free_meta(sb, alloc, wri, + bl->blkno); if (ret) goto out; @@ -442,7 +442,7 @@ out: /* return allocated blkno on error */ if (blkno > 0) { - err = scoutfs_radix_free(sb, alloc, wri, blkno); + err = scoutfs_free_meta(sb, alloc, wri, blkno); BUG_ON(err); /* radix should have been dirty */ } @@ -460,7 +460,7 @@ out: } int scoutfs_srch_add(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl, struct scoutfs_block **bl_ret, @@ -988,7 +988,7 @@ out: * it's large enough. */ int scoutfs_srch_rotate_log(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_srch_file *sfl) @@ -1018,13 +1018,13 @@ int scoutfs_srch_rotate_log(struct super_block *sb, * items. */ int scoutfs_srch_get_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_input *scin) { - struct scoutfs_srch_compact_input busy_scin = {{0,}}; + struct scoutfs_srch_compact_input busy_scin = {{{0,}}}; struct scoutfs_srch_file sfl; SCOUTFS_BTREE_ITEM_REF(iref); struct scoutfs_spbm busy; @@ -1147,7 +1147,7 @@ out: * copy. 
*/ int scoutfs_srch_update_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_input *scin) @@ -1160,7 +1160,7 @@ int scoutfs_srch_update_compact(struct super_block *sb, } static int mod_srch_items(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u8 scom_flags, bool ins, struct scoutfs_srch_file *sfls, int nr) @@ -1213,12 +1213,12 @@ static int mod_srch_items(struct super_block *sb, * We give the caller the allocator trees to merge if we return success. */ int scoutfs_srch_commit_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_result *scres, - struct scoutfs_radix_root *av, - struct scoutfs_radix_root *fr) + struct scoutfs_alloc_list_head *av, + struct scoutfs_alloc_list_head *fr) { struct scoutfs_srch_compact_input scin; SCOUTFS_BTREE_ITEM_REF(iref); @@ -1268,11 +1268,11 @@ out: * allocators. Returns -ENOENT when there are no more items. 
*/ int scoutfs_srch_cancel_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, - struct scoutfs_radix_root *av, - struct scoutfs_radix_root *fr) + struct scoutfs_alloc_list_head *av, + struct scoutfs_alloc_list_head *fr) { struct scoutfs_srch_compact_input scin; SCOUTFS_BTREE_ITEM_REF(iref); @@ -1331,7 +1331,7 @@ typedef int (*kway_next_func_t)(struct super_block *sb, struct scoutfs_srch_entry *sre_ret, void *arg); static int kway_merge(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl, kway_next_func_t kway_next, void **args, int nr) @@ -1526,7 +1526,7 @@ static void swap_page_sre(void *A, void *B, int size) * typically, ~10x worst case). */ static int compact_logs(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl_out, struct scoutfs_srch_file *sfls, int nr_sfls) @@ -1715,7 +1715,7 @@ out: * which reads blocks and decodes entries. */ static int compact_sorted(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl_out, struct scoutfs_srch_file *sfls, int nr) @@ -1760,7 +1760,7 @@ out: * up our entire operation, partial state doesn't matter. 
*/ static int free_file(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl) { @@ -1818,7 +1818,7 @@ static int free_file(struct super_block *sb, if (blkno == 0) continue; - ret = scoutfs_radix_free(sb, alloc, wri, blkno); + ret = scoutfs_free_meta(sb, alloc, wri, blkno); if (ret < 0) goto out; scoutfs_inc_counter(sb, srch_compact_free_block); @@ -1830,7 +1830,7 @@ static int free_file(struct super_block *sb, } free_root: - ret = scoutfs_radix_free(sb, alloc, wri, le64_to_cpu(sfl->ref.blkno)); + ret = scoutfs_free_meta(sb, alloc, wri, le64_to_cpu(sfl->ref.blkno)); if (ret < 0) goto out; @@ -1868,7 +1868,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work) struct srch_info *srinf = container_of(work, struct srch_info, compact_dwork.work); struct super_block *sb = srinf->sb; - struct scoutfs_radix_allocator alloc; + struct scoutfs_alloc alloc; struct scoutfs_srch_compact_result scres; struct scoutfs_srch_compact_input scin; struct scoutfs_block_writer wri; @@ -1883,7 +1883,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work) if (ret < 0 || scin.nr == 0) goto out; - scoutfs_radix_init_alloc(&alloc, &scin.meta_avail, &scin.meta_freed); + scoutfs_alloc_init(&alloc, &scin.meta_avail, &scin.meta_freed); if (scin.flags & SCOUTFS_SRCH_COMPACT_FLAG_LOG) ret = compact_logs(sb, &alloc, &wri, &scres.sfl, diff --git a/kmod/src/srch.h b/kmod/src/srch.h index 937692e9..97604bd6 100644 --- a/kmod/src/srch.h +++ b/kmod/src/srch.h @@ -22,7 +22,7 @@ struct scoutfs_srch_rb_node { node = rb_next(node)) int scoutfs_srch_add(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl, struct scoutfs_block **bl_ret, @@ -34,34 +34,34 @@ int scoutfs_srch_search_xattrs(struct super_block *sb, u64 hash, u64 ino, u64 last_ino, bool *done); int 
scoutfs_srch_rotate_log(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_srch_file *sfl); int scoutfs_srch_get_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_input *scin_ret); int scoutfs_srch_update_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_input *scin); int scoutfs_srch_commit_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_result *scres, - struct scoutfs_radix_root *av, - struct scoutfs_radix_root *fr); + struct scoutfs_alloc_list_head *av, + struct scoutfs_alloc_list_head *fr); int scoutfs_srch_cancel_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, - struct scoutfs_radix_root *av, - struct scoutfs_radix_root *fr); + struct scoutfs_alloc_list_head *av, + struct scoutfs_alloc_list_head *fr); void scoutfs_srch_destroy(struct super_block *sb); int scoutfs_srch_setup(struct super_block *sb); diff --git a/kmod/src/trans.c b/kmod/src/trans.c index af659bd9..9f36a19d 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -25,7 +25,7 @@ #include "counters.h" #include "client.h" #include "inode.h" -#include "radix.h" +#include "alloc.h" #include "block.h" #include "msg.h" #include "item.h" @@ -66,7 +66,7 @@ struct trans_info { bool writing; struct scoutfs_log_trees lt; - struct scoutfs_radix_allocator alloc; + struct scoutfs_alloc 
alloc; struct scoutfs_block_writer wri; }; @@ -112,8 +112,7 @@ int scoutfs_trans_get_log_trees(struct super_block *sb) ret = scoutfs_client_get_log_trees(sb, &lt); if (ret == 0) { tri->lt = lt; - scoutfs_radix_init_alloc(&tri->alloc, &lt.meta_avail, - &lt.meta_freed); + scoutfs_alloc_init(&tri->alloc, &lt.meta_avail, &lt.meta_freed); scoutfs_block_writer_init(sb, &tri->wri); scoutfs_forest_init_btrees(sb, &tri->alloc, &tri->wri, &lt); @@ -195,6 +194,9 @@ void scoutfs_trans_write_func(struct work_struct *work) /* XXX this all needs serious work for dealing with errors */ ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?: (s = "item dirty", scoutfs_item_write_dirty(sb)) ?: + (s = "data prepare", scoutfs_data_prepare_commit(sb)) ?: + (s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, + &tri->alloc, &tri->wri)) ?: (s = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?: (s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?: (s = "commit log trees", commit_btrees(sb)) ?: @@ -369,7 +371,13 @@ static bool acquired_hold(struct super_block *sb, /* XXX arbitrarily limit to 8 meg transactions */ if (scoutfs_item_dirty_bytes(sb) >= (8 * 1024 * 1024)) { - scoutfs_inc_counter(sb, trans_commit_full); + scoutfs_inc_counter(sb, trans_commit_dirty_meta_full); + queue_trans_work(sbi); + goto out; + } + + if (scoutfs_alloc_meta_lo_thresh(sb, &tri->alloc)) { + scoutfs_inc_counter(sb, trans_commit_meta_alloc_low); queue_trans_work(sbi); goto out; }