From e60f4e7082c6f1431c2fdc2ba879b08348687e70 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 21 Sep 2020 11:43:28 -0700 Subject: [PATCH] scoutfs: use full extents for data and alloc Previously we'd avoided full extents in file data mapping items because we were deleting items from forest btrees directly. That created deletion items for every version of file extents as they were modified. Now we have the item cache which can remove deleted items from memory when deletion items aren't necessary. By layering file data extents on an extent layer, we can also transition allocators to use extents and fix a lot of problems in the radix block allocator. Most of this change is churn from changing allocator function and struct names. File data extents no longer have to manage loading and storing from and to packed extent items at a fixed granularity. All those loops are torn out and data operations now call the extent layer with their callbacks instead of calling its packed item extent functions. This now means that fallocate and especially restoring offline extents can use larger extents. Small file block allocation now comes from a cached extent which reduces item calls for small file data streaming writes. The big change in the server is to use more root structures to manage recursive modification instead of relying on the allocator to notice and do the right thing. The radix allocator tried to notice when it was actively operating on a root that it was also using to allocate and free metadata blocks. This resulted in a lot of bugs. Instead we now double buffer the server's avail and freed roots so that the server fills and drains the stable roots from the previous transaction. We also double buffer the core fs metadata avail root so that we can increase the time to reuse freed metadata blocks. The server now only moves free extents into client allocators when they fall below a low threshold. 
This reduces the shared modification of the client's allocator roots which requires cold block reads on both the client and server. Signed-off-by: Zach Brown --- kmod/src/alloc.h | 7 +- kmod/src/btree.c | 56 +- kmod/src/btree.h | 14 +- kmod/src/counters.h | 4 +- kmod/src/data.c | 1380 ++++++++++++-------------------------- kmod/src/data.h | 5 +- kmod/src/forest.c | 16 +- kmod/src/forest.h | 4 +- kmod/src/format.h | 60 +- kmod/src/lock_server.c | 5 +- kmod/src/lock_server.h | 2 +- kmod/src/scoutfs_trace.h | 101 +-- kmod/src/server.c | 328 +++++---- kmod/src/srch.c | 54 +- kmod/src/srch.h | 20 +- kmod/src/trans.c | 18 +- 16 files changed, 789 insertions(+), 1285 deletions(-) diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h index 167725b0..7b053756 100644 --- a/kmod/src/alloc.h +++ b/kmod/src/alloc.h @@ -53,9 +53,10 @@ /* * Each of the server meta_alloc roots will try to keep a minimum amount - * of free blocks. The server will use the next root once its current - * root gets this low. It must have room for all the largest allocation - * attempted in a transaction on the server. + * of free blocks. The server will swap roots when its current avail + * falls below the threshold while the freed root is still above it. It + * must have room for all the largest allocation attempted in a + * transaction on the server. */ #define SCOUTFS_SERVER_META_ALLOC_MIN \ (SCOUTFS_SERVER_META_FILL_TARGET * 2) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 5c97e2e0..1d249a3a 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -26,7 +26,7 @@ #include "options.h" #include "msg.h" #include "block.h" -#include "radix.h" +#include "alloc.h" #include "avl.h" #include "hash.h" @@ -674,7 +674,7 @@ static void move_items(struct scoutfs_btree_block *dst, * error. 
*/ static int get_ref_block(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, int flags, struct scoutfs_btree_ref *ref, struct scoutfs_block **bl_ret) @@ -737,7 +737,7 @@ retry: goto out; } - ret = scoutfs_radix_alloc(sb, alloc, wri, &blkno); + ret = scoutfs_alloc_meta(sb, alloc, wri, &blkno); if (ret < 0) goto out; @@ -745,8 +745,8 @@ retry: new_bl = scoutfs_block_create(sb, blkno); if (IS_ERR(new_bl)) { - ret = scoutfs_radix_free(sb, alloc, wri, blkno); - BUG_ON(ret); /* radix should have been dirty */ + ret = scoutfs_free_meta(sb, alloc, wri, blkno); + BUG_ON(ret); ret = PTR_ERR(new_bl); goto out; } @@ -754,11 +754,11 @@ retry: /* free old stable blkno we're about to overwrite */ if (ref && ref->blkno) { - ret = scoutfs_radix_free(sb, alloc, wri, - le64_to_cpu(ref->blkno)); + ret = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(ref->blkno)); if (ret) { - ret = scoutfs_radix_free(sb, alloc, wri, blkno); - BUG_ON(ret); /* radix should have been dirty */ + ret = scoutfs_free_meta(sb, alloc, wri, blkno); + BUG_ON(ret); scoutfs_block_put(sb, new_bl); new_bl = NULL; goto out; @@ -861,7 +861,7 @@ static void init_btree_block(struct scoutfs_btree_block *bt, int level) * Returns -errno, 0 if nothing done, or 1 if we split. 
*/ static int try_split(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, unsigned val_len, @@ -901,8 +901,8 @@ static int try_split(struct super_block *sb, if (!parent) { ret = get_ref_block(sb, alloc, wri, BTW_ALLOC, NULL, &par_bl); if (ret) { - err = scoutfs_radix_free(sb, alloc, wri, - le64_to_cpu(left->hdr.blkno)); + err = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(left->hdr.blkno)); BUG_ON(err); /* radix should have been dirty */ scoutfs_block_put(sb, left_bl); return ret; @@ -937,7 +937,7 @@ static int try_split(struct super_block *sb, * block. */ static int try_join(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_btree_block *parent, @@ -990,9 +990,9 @@ static int try_join(struct super_block *sb, /* update or delete sibling's parent item */ if (le16_to_cpu(sib->nr_items) == 0) { delete_item(parent, sib_par_item, NULL); - ret = scoutfs_radix_free(sb, alloc, wri, - le64_to_cpu(sib->hdr.blkno)); - BUG_ON(ret); /* could have dirtied alloc to avoid error */ + ret = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(sib->hdr.blkno)); + BUG_ON(ret); } else if (move_right) { update_parent_item(parent, sib_par_item, sib); @@ -1003,9 +1003,9 @@ static int try_join(struct super_block *sb, root->height--; root->ref.blkno = bt->hdr.blkno; root->ref.seq = bt->hdr.seq; - ret = scoutfs_radix_free(sb, alloc, wri, - le64_to_cpu(parent->hdr.blkno)); - BUG_ON(ret); /* could have dirtied alloc to avoid error */ + ret = scoutfs_free_meta(sb, alloc, wri, + le64_to_cpu(parent->hdr.blkno)); + BUG_ON(ret); } scoutfs_block_put(sb, sib_bl); @@ -1219,7 +1219,7 @@ struct btree_walk_key_range { * blocks themselves. 
*/ static int btree_walk(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, int flags, struct scoutfs_key *key, @@ -1464,7 +1464,7 @@ static bool invalid_item(unsigned val_len) * length value. */ int scoutfs_btree_insert(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, @@ -1531,7 +1531,7 @@ static void update_item_value(struct scoutfs_btree_block *bt, * which doesn't fit. */ int scoutfs_btree_update(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, @@ -1571,7 +1571,7 @@ int scoutfs_btree_update(struct super_block *sb, * which will insert instead of returning -ENOENT. */ int scoutfs_btree_force(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, @@ -1615,7 +1615,7 @@ int scoutfs_btree_force(struct super_block *sb, * found. */ int scoutfs_btree_delete(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key) @@ -1636,8 +1636,8 @@ int scoutfs_btree_delete(struct super_block *sb, if (item) { if (le16_to_cpu(bt->nr_items) == 1) { /* remove final empty block */ - ret = scoutfs_radix_free(sb, alloc, wri, - bl->blkno); + ret = scoutfs_free_meta(sb, alloc, wri, + bl->blkno); if (ret == 0) { root->height = 0; root->ref.blkno = 0; @@ -1753,7 +1753,7 @@ int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, * <0 is returned on error, including -ENOENT if the key isn't present. 
*/ int scoutfs_btree_dirty(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key) @@ -1841,7 +1841,7 @@ out: * the caller to resolve this. */ int scoutfs_btree_insert_list(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_btree_item_list *lst) diff --git a/kmod/src/btree.h b/kmod/src/btree.h index c9bd6478..79d4de58 100644 --- a/kmod/src/btree.h +++ b/kmod/src/btree.h @@ -3,7 +3,7 @@ #include -struct scoutfs_radix_allocator; +struct scoutfs_alloc; struct scoutfs_block_writer; struct scoutfs_block; @@ -36,25 +36,25 @@ int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_btree_item_ref *iref); int scoutfs_btree_insert(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, void *val, unsigned val_len); int scoutfs_btree_update(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, void *val, unsigned val_len); int scoutfs_btree_force(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, void *val, unsigned val_len); int scoutfs_btree_delete(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key); @@ -65,7 +65,7 @@ int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, struct scoutfs_key *key, struct scoutfs_btree_item_ref *iref); 
int scoutfs_btree_dirty(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key); @@ -77,7 +77,7 @@ int scoutfs_btree_read_items(struct super_block *sb, struct scoutfs_key *end, scoutfs_btree_item_cb cb, void *arg); int scoutfs_btree_insert_list(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_btree_item_list *lst); diff --git a/kmod/src/counters.h b/kmod/src/counters.h index b8686bc9..e3c2e8ae 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -166,7 +166,6 @@ EXPAND_COUNTER(radix_undo_ref) \ EXPAND_COUNTER(radix_walk) \ EXPAND_COUNTER(server_commit_hold) \ - EXPAND_COUNTER(server_commit_prepare) \ EXPAND_COUNTER(server_commit_queue) \ EXPAND_COUNTER(server_commit_worker) \ EXPAND_COUNTER(srch_add_entry) \ @@ -188,8 +187,9 @@ EXPAND_COUNTER(srch_search_xattrs) \ EXPAND_COUNTER(srch_read_stale) \ EXPAND_COUNTER(trans_commit_data_alloc_low) \ + EXPAND_COUNTER(trans_commit_dirty_meta_full) \ EXPAND_COUNTER(trans_commit_fsync) \ - EXPAND_COUNTER(trans_commit_full) \ + EXPAND_COUNTER(trans_commit_meta_alloc_low) \ EXPAND_COUNTER(trans_commit_sync_fs) \ EXPAND_COUNTER(trans_commit_timer) \ EXPAND_COUNTER(trans_commit_written) diff --git a/kmod/src/data.c b/kmod/src/data.c index 9360b481..a8bca721 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -26,6 +26,7 @@ #include "super.h" #include "inode.h" #include "key.h" +#include "alloc.h" #include "data.h" #include "trans.h" #include "counters.h" @@ -37,657 +38,135 @@ #include "file.h" #include "msg.h" #include "count.h" -#include "radix.h" +#include "ext.h" /* - * Logical file blocks are mapped to device blocks with extents stored - * in items. Each extent item maps a fixed size logical region and can - * contain multiple extent records. 
Each extent record is packed to - * minimize the space it uses. The logical starting block is implicit - * so sparse extents are stored to skip unmapped blocks, and the mapped - * blkno is encoded as the difference from the previous extent and only - * its set bytes are stored. - * - * To operate on the extents we load their item and unpack them into an - * rbtree of full extent records in memory. Once the memory extents are - * modified they can be packed back into the item. Typically there are - * very few extents that cover the region. - * - * The client is given a radix allocator with trees for allocating - * blocks and recording frees at the start of each transaction. + * We want to amortize work done after dirtying the shared transaction + * accounting, but we don't want to blow out dirty allocator btree + * blocks. Each allocation can dirty quite a few allocator btree blocks + * so we check in pretty often. */ +#define EXTENTS_PER_HOLD 8 struct data_info { struct super_block *sb; - struct rw_semaphore alloc_rwsem; - struct scoutfs_radix_allocator *alloc; + struct mutex mutex; + struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; - struct scoutfs_radix_root data_avail; - struct scoutfs_radix_root data_freed; + struct scoutfs_alloc_root data_avail; + struct scoutfs_alloc_root data_freed; + struct scoutfs_extent cached_ext; }; #define DECLARE_DATA_INFO(sb, name) \ struct data_info *name = SCOUTFS_SB(sb)->data_info -static void init_packed_extent_key(struct scoutfs_key *key, u64 ino, - u64 iblock, u8 part) +struct data_ext_args { + u64 ino; + struct inode *inode; + struct scoutfs_lock *lock; +}; + +static void item_from_extent(struct scoutfs_key *key, + struct scoutfs_data_extent_val *dv, u64 ino, + u64 start, u64 len, u64 map, u8 flags) { *key = (struct scoutfs_key) { .sk_zone = SCOUTFS_FS_ZONE, - .skpe_ino = cpu_to_le64(ino), - .sk_type = SCOUTFS_PACKED_EXTENT_TYPE, - .skpe_base = cpu_to_le64(iblock >> SCOUTFS_PACKEXT_BASE_SHIFT), - .skpe_part = 
part, + .skdx_ino = cpu_to_le64(ino), + .sk_type = SCOUTFS_DATA_EXTENT_TYPE, + .skdx_end = cpu_to_le64(start + len - 1), + .skdx_len = cpu_to_le64(len), }; + dv->blkno = cpu_to_le64(map); + dv->flags = flags; } -/* - * Packed extents are read from items and unpacked into this structure - * in memory so they can be easily manipulated before being packed and - * stored in items. - */ -struct unpacked_extents { - u64 iblock; - struct rb_root extents; - __u8 existing_parts; - bool changed; -}; - -struct unpacked_extent { - struct rb_node node; - u64 iblock; - u64 count; - u64 blkno; - u8 flags; -}; - -static void init_traced_extent(struct scoutfs_traced_extent *te, - u64 iblock, u64 count, u64 blkno, u8 flags) +static void ext_from_item(struct scoutfs_extent *ext, + struct scoutfs_key *key, + struct scoutfs_data_extent_val *dv) { - te->iblock = iblock; - te->count = count; - te->blkno = blkno; - te->flags = flags; + ext->start = le64_to_cpu(key->skdx_end) - + le64_to_cpu(key->skdx_len) + 1; + ext->len = le64_to_cpu(key->skdx_len); + ext->map = le64_to_cpu(dv->blkno); + ext->flags = dv->flags; } -static void copy_traced_extent(struct scoutfs_traced_extent *te, - struct unpacked_extent *ext) +static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len, + struct scoutfs_extent *ext) { - te->iblock = ext->iblock; - te->count = ext->count; - te->blkno = ext->blkno; - te->flags = ext->flags; -} - -static u64 ext_last(struct unpacked_extent *ext) -{ - return ext->iblock + ext->count - 1; -} - -/* The first possible iblock in an item that contains the given iblock */ -static u64 first_iblock(u64 iblock) -{ - return iblock & SCOUTFS_PACKEXT_BASE_MASK; -} - -/* The last possible iblock in an item that contains the given iblock */ -static u64 last_iblock(u64 iblock) -{ - return iblock | ~SCOUTFS_PACKEXT_BASE_MASK; -} - -/* - * Extents can merge if they're logically contiguous, have block - * mappings or not which also must be contiguous, and have matching - * 
flags. - * - * We also require that a given extent's allocation be from only one - * radix bitmap leaf block because the radix freeing functions only - * operate on one leaf block. - */ -static bool extents_merge(struct unpacked_extent *left, - struct unpacked_extent *right) -{ - return (left->iblock + left->count == right->iblock) && - ((!left->blkno && !right->blkno) || - (left->blkno + left->count == right->blkno)) && - (left->flags == right->flags) && - (scoutfs_radix_bit_leaf_nr(left->blkno) == - scoutfs_radix_bit_leaf_nr(right->blkno + right->count - 1)); -} - -static struct unpacked_extent *first_extent(struct unpacked_extents *unpe) -{ - return rb_entry_safe(rb_first(&unpe->extents), - struct unpacked_extent, node); -} - -static struct unpacked_extent *last_extent(struct unpacked_extents *unpe) -{ - return rb_entry_safe(rb_last(&unpe->extents), - struct unpacked_extent, node); -} - -static struct unpacked_extent *next_extent(struct unpacked_extent *ext) -{ - return rb_entry_safe(rb_next(&ext->node), - struct unpacked_extent, node); -} - -static struct unpacked_extent *prev_extent(struct unpacked_extent *ext) -{ - return rb_entry_safe(rb_prev(&ext->node), - struct unpacked_extent, node); -} - -/* - * Find the first extent that intersects the requested range. NULL is - * returned if no extents intersect. 
- */ -static struct unpacked_extent *find_extent(struct unpacked_extents *unpe, - u64 iblock, u64 last) -{ - - struct rb_node *node = unpe->extents.rb_node; - struct unpacked_extent *ret = NULL; - struct unpacked_extent *ext; - - if (iblock > last) - return NULL; - - while (node) { - ext = rb_entry(node, struct unpacked_extent, node); - - if (last < ext->iblock) { - node = node->rb_left; - } else if (iblock > ext_last(ext)) { - node = node->rb_right; - } else { - ret = ext; - node = node->rb_left; - } - } - - return ret; -} - -static void track_blocks(struct unpacked_extent *ext, s64 delta, - s64 *on, s64 *off) -{ - if (ext->blkno && !(ext->flags & SEF_UNWRITTEN)) - *on += delta; - else if (ext->flags & SEF_OFFLINE) - *off += delta; -} - -static void modify_and_track_count(struct unpacked_extent *ext, u64 count, - s64 *on, s64 *off) -{ - track_blocks(ext, count - ext->count, on, off); - ext->count = count; -} - -/* - * Callers can temporarily insert extents with equal starting iblocks. - * We're careful to insert those to the left so that caller's can find - * these existing overlapping extents by iterating with next. 
- */ -static void insert_extent(struct unpacked_extents *unpe, - struct unpacked_extent *ins, s64 *on, s64 *off) -{ - struct rb_node **node = &unpe->extents.rb_node; - struct rb_node *parent = NULL; - struct unpacked_extent *ext; - int cmp; - - while (*node) { - parent = *node; - ext = rb_entry(*node, struct unpacked_extent, node); - - cmp = scoutfs_cmp_u64s(ins->iblock, ext->iblock); - if (cmp <= 0) - node = &(*node)->rb_left; - else - node = &(*node)->rb_right; - } - - rb_link_node(&ins->node, parent, node); - rb_insert_color(&ins->node, &unpe->extents); - - track_blocks(ins, ins->count, on, off); -} - -static void remove_extent(struct unpacked_extents *unpe, - struct unpacked_extent *ext, s64 *on, s64 *off) -{ - rb_erase(&ext->node, &unpe->extents); - track_blocks(ext, -ext->count, on, off); - kfree(ext); -} - -static void free_unpacked_extents(struct unpacked_extents *unpe) -{ - struct unpacked_extent *ext; - struct unpacked_extent *tmp; - - if (unpe) { - rbtree_postorder_for_each_entry_safe(ext, tmp, &unpe->extents, - node) { - kfree(ext); - } - kfree(unpe); - } -} - -static int unpack_extent(struct unpacked_extent *ext, u64 iblock, - struct scoutfs_packed_extent *pe, int size, - u64 prev_blkno) -{ - __le64 lediff; - u64 blkno; - u64 diff; - - if (size < sizeof(struct scoutfs_packed_extent) || - size < (sizeof(struct scoutfs_packed_extent) + pe->diff_bytes)) - return 0; - - if (pe->diff_bytes) { - lediff = 0; - memcpy(&lediff, pe->le_blkno_diff, pe->diff_bytes); - diff = le64_to_cpu(lediff); - diff = (diff >> 1) ^ (-(diff & 1)); - blkno = prev_blkno + diff; - } else { - blkno = 0; - } - - ext->iblock = iblock; - ext->blkno = blkno; - ext->count = le16_to_cpu(pe->count); - ext->flags = pe->flags; - - return sizeof(struct scoutfs_packed_extent) + pe->diff_bytes; -} - -static int load_unpacked_extents(struct super_block *sb, u64 ino, - u64 iblock, u64 last, bool empty_enoent, - struct unpacked_extents **unpe_ret, - struct scoutfs_lock *lock) -{ - struct 
unpacked_extents *unpe = NULL; - struct scoutfs_packed_extent *pe; - struct unpacked_extent *ext; + struct data_ext_args *args = arg; + struct scoutfs_data_extent_val dv; struct scoutfs_key key; - struct scoutfs_key end; - struct rb_node *parent; - struct rb_node **node; - void *buf = NULL; - u64 prev_blkno; - bool saw_final; - int size; + struct scoutfs_key last; int ret; - int p; - *unpe_ret = NULL; + item_from_extent(&last, &dv, args->ino, U64_MAX, 1, 0, 0); + item_from_extent(&key, &dv, args->ino, start, len, 0, 0); - unpe = kzalloc(sizeof(struct unpacked_extents), GFP_NOFS); - if (!unpe) { - ret = -ENOMEM; - goto out; + ret = scoutfs_item_next(sb, &key, &last, &dv, sizeof(dv), args->lock); + if (ret == sizeof(dv)) { + ext_from_item(ext, &key, &dv); + ret = 0; + } else if (ret >= 0) { + ret = -EIO; } - unpe->extents = RB_ROOT; - unpe->changed = true; - /* updated later if _next gives us a greater key */ - unpe->iblock = first_iblock(iblock); - - buf = kmalloc(SCOUTFS_PACKEXT_MAX_BYTES, GFP_NOFS); - if (!buf) { - ret = -ENOMEM; - goto out; - } - - if (last > iblock) - init_packed_extent_key(&end, ino, last, 0); - - parent = NULL; - node = &unpe->extents.rb_node; - prev_blkno = 0; - saw_final = false; - - for (p = 0; !saw_final; p++) { - init_packed_extent_key(&key, ino, iblock, p); - - /* maybe search for next initial item, lookup more parts */ - if (p == 0 && last > iblock) - ret = scoutfs_item_next(sb, &key, &end, buf, - SCOUTFS_PACKEXT_MAX_BYTES, - lock); - else - ret = scoutfs_item_lookup(sb, &key, buf, - SCOUTFS_PACKEXT_MAX_BYTES, - lock); - if (ret < 0) { - if (p == 0 && ret == -ENOENT && empty_enoent) - ret = 0; - goto out; - } - - if (key.skpe_part != p) { - ret = -EIO; /* corruption */ - goto out; - } - - if (p == 0) { - iblock = le64_to_cpu(key.skpe_base) << - SCOUTFS_PACKEXT_BASE_SHIFT; - unpe->iblock = iblock; - } - pe = buf; - size = ret; - - while (size > 0) { - ext = kmalloc(sizeof(struct unpacked_extent), GFP_NOFS); - if (!ext) { - ret = -ENOMEM; 
- goto out; - } - - ret = unpack_extent(ext, iblock, pe, size, prev_blkno); - if (ret == 0) { /* XXX corruption? */ - kfree(ext); - ret = -EIO; - goto out; - } - - saw_final = pe->final; - pe = (void *)pe + ret; - size -= ret; - - /* sparse packed extents advance iblock */ - if (ext->flags == 0 && ext->blkno == 0) { - iblock += ext->count; - kfree(ext); - ext = NULL; - continue; - } - - iblock += ext->count; - prev_blkno = ext->blkno + ext->count - 1; - - /* building the rbtree from sorted nodes */ - rb_link_node(&ext->node, parent, node); - rb_insert_color(&ext->node, &unpe->extents); - parent = &ext->node; - node = &ext->node.rb_right; - - if (saw_final) - unpe->existing_parts = p + 1; - } - } - - ret = 0; -out: - kfree(buf); if (ret < 0) - free_unpacked_extents(unpe); - else - *unpe_ret = unpe; - + memset(ext, 0, sizeof(struct scoutfs_extent)); return ret; } -static int pack_extent(struct scoutfs_packed_extent *pe, int size, - struct unpacked_extent *ext, - u64 prev_blkno, bool final) +static void add_onoff(struct inode *inode, u64 map, u8 flags, s64 len) { - int diff_bytes; - __le64 lediff; - u64 diff; - int bytes; - int last; - - diff = ext->blkno - prev_blkno; - diff = (diff << 1) ^ ((s64)diff >> 63); /* shift sign extend */ - lediff = cpu_to_le64(diff); - last = fls64(diff); - diff_bytes = (last + 7) >> 3; - - bytes = offsetof(struct scoutfs_packed_extent, - le_blkno_diff[diff_bytes]); - if (size < bytes) - return 0; - - pe->count = cpu_to_le16(ext->count); - pe->diff_bytes = diff_bytes; - pe->flags = ext->flags; - pe->final = !!final; - if (diff_bytes) - memcpy(pe->le_blkno_diff, &lediff, diff_bytes); - - return bytes; -} - -static int store_packed_extents(struct super_block *sb, u64 ino, - struct unpacked_extents *unpe, - struct scoutfs_lock *lock) -{ - struct scoutfs_packed_extent *pe; - struct unpacked_extent *final; - struct unpacked_extent *ext; - struct scoutfs_key key; - void *buf = NULL; - u64 prev_blkno; - u64 iblock; - int space; - int size; - int 
ret; - int p; - int i; - - if (!unpe->changed) - return 0; - - if (RB_EMPTY_ROOT(&unpe->extents)) { - for (p = 0; p < unpe->existing_parts; p++) { - init_packed_extent_key(&key, ino, unpe->iblock, p); - ret = scoutfs_item_delete(sb, &key, lock); - BUG_ON(ret); /* XXX inconsistent between parts */ - } - unpe->existing_parts = 0; - unpe->changed = false; - return 0; - } - - buf = kmalloc(SCOUTFS_PACKEXT_MAX_BYTES, GFP_NOFS); - if (!buf) { - ret = -ENOMEM; - goto out; - } - - final = last_extent(unpe); - prev_blkno = 0; - - pe = buf; - space = SCOUTFS_PACKEXT_MAX_BYTES; - size = 0; - p = 0; - iblock = unpe->iblock; - - ext = first_extent(unpe); - while (ext) { - /* encode sparse extent to advance iblock */ - if (ext->iblock > iblock && space >= sizeof(*pe)) { - pe->count = cpu_to_le16(ext->iblock - iblock); - pe->diff_bytes = 0; - pe->flags = 0; - pe->final = 0; - pe++; - space -= sizeof(*pe); - size += sizeof(*pe); - iblock = ext->iblock; - } - - /* encode actual extent */ - if (ext->iblock == iblock && - (ret = pack_extent(pe, space, ext, prev_blkno, - ext == final)) > 0) { - pe = (void *)pe + ret; - space -= ret; - size += ret; - iblock += ext->count; - prev_blkno = ext->blkno + ext->count - 1; - ext = next_extent(ext); - if (ext) - continue; - } - - /* store full item or after packing final extent */ - init_packed_extent_key(&key, ino, unpe->iblock, p); - if (p < unpe->existing_parts) - ret = scoutfs_item_update(sb, &key, buf, size, lock); - else - ret = scoutfs_item_create(sb, &key, buf, size, lock); - BUG_ON(ret); /* XXX inconsistent between parts */ - - pe = buf; - space = SCOUTFS_PACKEXT_MAX_BYTES; - size = 0; - p++; - } - - /* delete any remaining previous part items */ - for (i = p; i < unpe->existing_parts; i++) { - init_packed_extent_key(&key, ino, unpe->iblock, i); - ret = scoutfs_item_delete(sb, &key, lock); - BUG_ON(ret); /* XXX inconsistent between parts */ - } - - /* the next store has to know our stored parts */ - unpe->existing_parts = p; - 
unpe->changed = false; - ret = 0; -out: - kfree(buf); - - return ret; -} - -/* - * Set a logical extent mapping in the unpacked extents for a region of - * a file. The caller's extent is authoritative, any existing - * overlapping extents are trimmed or removed. The new extent can be - * merged with remaining adjacent and compatible extents. - * - * If the caller provides an inode struct then we'll keep the inode - * block counts in sync with flagged extents because updating the inode - * counts won't fail. The caller is expected to keep all other state - * consistent with the extents (i_size, i_blocks, allocator bitmaps). - */ -static int set_extent(struct super_block *sb, struct inode *inode, - u64 ino, struct unpacked_extents *unpe, - u64 iblock, u64 blkno, u64 count, u8 flags) -{ - struct unpacked_extent *split; - struct unpacked_extent *next; - struct unpacked_extent *prev; - struct unpacked_extent *ext; - u64 offset; s64 on = 0; s64 off = 0; - /* make sure the given extent fits entirely within one item */ - if (WARN_ON_ONCE(first_iblock(iblock) != - first_iblock(iblock + count - 1))) - return -EINVAL; + if (map && !(flags & SEF_UNWRITTEN)) + on += len; + else if (flags & SEF_OFFLINE) + off += len; - ext = kmalloc(sizeof(struct unpacked_extent), GFP_NOFS); - split = kmalloc(sizeof(struct unpacked_extent), GFP_NOFS); - if (!ext || !split) { - kfree(ext); - kfree(split); - return -ENOMEM; - } - - unpe->changed = true; - - ext->iblock = iblock; - ext->blkno = blkno; - ext->count = count; - ext->flags = flags; - - insert_extent(unpe, ext, &on, &off); - - prev = prev_extent(ext); - - /* splitting an existing extent? */ - if (prev && ext_last(prev) > ext_last(ext)) { - split->iblock = ext_last(ext) + 1; - split->count = ext_last(prev) - split->iblock + 1; - split->blkno = prev->blkno ? 
- prev->blkno + prev->count - split->count : 0; - split->flags = prev->flags; - - modify_and_track_count(prev, ext->iblock - prev->iblock, - &on, &off); - - insert_extent(unpe, split, &on, &off); - next = split; - split = NULL; - } else { - next = NULL; - } - - /* trimming a prev extent? */ - if (prev && ext_last(prev) >= ext->iblock) { - modify_and_track_count(prev, ext->iblock - prev->iblock, - &on, &off); - } - - /* merging with a prev extent? */ - if (prev && extents_merge(prev, ext)) { - ext->iblock = prev->iblock; - ext->blkno = prev->blkno; - modify_and_track_count(ext, ext->count + prev->count, - &on, &off); - remove_extent(unpe, prev, &on, &off); - } - - /* if didn't split find next, removing any totally within ours */ - if (!next) { - while ((next = next_extent(ext)) && - ext_last(next) <= ext_last(ext)) { - remove_extent(unpe, next, &on, &off); - } - } - - /* trimming a next extent? */ - if (next && next->iblock <= ext_last(ext)) { - offset = (ext_last(ext) + 1) - next->iblock; - next->iblock += offset; - next->blkno = next->blkno ? next->blkno + offset : 0; - modify_and_track_count(next, next->count - offset, - &on, &off); - } - - /* merging with a next extent? 
*/ - if (next && extents_merge(ext, next)) { - modify_and_track_count(ext, ext->count + next->count, - &on, &off); - remove_extent(unpe, next, &on, &off); - } - - /* and finally remove our extent if it was only removing others */ - if (ext->blkno == 0 && ext->flags == 0) - remove_extent(unpe, ext, &on, &off); - - if (inode) - scoutfs_inode_add_onoff(inode, on, off); - - kfree(split); - return 0; + scoutfs_inode_add_onoff(inode, on, off); } +static int data_ext_insert(struct super_block *sb, void *arg, u64 start, + u64 len, u64 map, u8 flags) +{ + struct data_ext_args *args = arg; + struct scoutfs_data_extent_val dv; + struct scoutfs_key key; + int ret; + + item_from_extent(&key, &dv, args->ino, start, len, map, flags); + ret = scoutfs_item_create(sb, &key, &dv, sizeof(dv), args->lock); + if (ret == 0 && args->inode) + add_onoff(args->inode, map, flags, len); + return ret; +} + +static int data_ext_remove(struct super_block *sb, void *arg, u64 start, + u64 len, u64 map, u8 flags) +{ + struct data_ext_args *args = arg; + struct scoutfs_data_extent_val dv; + struct scoutfs_key key; + int ret; + + item_from_extent(&key, &dv, args->ino, start, len, map, flags); + ret = scoutfs_item_delete(sb, &key, args->lock); + if (ret == 0 && args->inode) + add_onoff(args->inode, map, flags, -len); + return ret; +} + +static struct scoutfs_ext_ops data_ext_ops = { + .next = data_ext_next, + .insert = data_ext_insert, + .remove = data_ext_remove, +}; + /* * Find and remove or mark offline the block mappings that intersect * with the caller's range. 
The caller is responsible for transactions @@ -703,74 +182,75 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode, struct scoutfs_lock *lock) { DECLARE_DATA_INFO(sb, datinf); - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext; - struct scoutfs_traced_extent te; + struct data_ext_args args = { + .ino = ino, + .inode = inode, + .lock = lock, + }; + struct scoutfs_extent ext; + struct scoutfs_extent tr; u64 offset; - u64 blkno; - u64 count; - u8 flags; s64 ret; - int err; - - ret = load_unpacked_extents(sb, ino, iblock, last, false, &unpe, lock); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - goto out; - } + u8 flags; + int i; flags = offline ? SEF_OFFLINE : 0; - ret = 0; - ext = find_extent(unpe, iblock, last); - while (ext && ext->iblock <= last) { + + for (i = 0; iblock <= last; i++) { + if (i == EXTENTS_PER_HOLD) { + ret = iblock; + break; + } + + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + break; + } + + /* done if we went past the region */ + if (ext.start > last) { + ret = 0; + break; + } /* nothing to do when already offline and unmapped */ - if ((offline && (ext->flags & SEF_OFFLINE)) && !ext->blkno) { - ext = next_extent(ext); + if ((offline && (ext.flags & SEF_OFFLINE)) && !ext.map) { + iblock = ext.start + ext.len; continue; } - iblock = max(ext->iblock, iblock); - offset = iblock - ext->iblock; - blkno = ext->blkno + offset; - count = min(ext->count - offset, last - iblock + 1); + iblock = max(ext.start, iblock); + offset = iblock - ext.start; - if (ext->blkno) { - down_write(&datinf->alloc_rwsem); - err = scoutfs_radix_free_data(sb, datinf->alloc, - datinf->wri, - &datinf->data_freed, - blkno, count); - up_write(&datinf->alloc_rwsem); - if (err < 0) { - ret = err; + tr.start = iblock; + tr.map = ext.map ? 
ext.map + offset : 0; + tr.len = min(ext.len - offset, last - iblock + 1); + tr.flags = ext.flags; + + if (tr.map) { + mutex_lock(&datinf->mutex); + ret = scoutfs_free_data(sb, datinf->alloc, + datinf->wri, + &datinf->data_freed, + tr.map, tr.len); + mutex_unlock(&datinf->mutex); + if (ret < 0) break; - } } - init_traced_extent(&te, iblock, count, 0, flags); - trace_scoutfs_data_extent_truncated(sb, ino, &te); + trace_scoutfs_data_extent_truncated(sb, ino, &tr); - err = set_extent(sb, inode, ino, unpe, iblock, 0, count, flags); - BUG_ON(err); /* inconsistent alloc and extents */ + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, + tr.start, tr.len, 0, flags); + BUG_ON(ret); /* inconsistent, could prealloc items */ - /* modifying could have merged and deleted ext, search again */ - iblock += count; - if (iblock > last) - break; - ext = find_extent(unpe, iblock, last); + iblock += tr.len; } - err = store_packed_extents(sb, ino, unpe, lock); - BUG_ON(err); /* inconsistent alloc and extents */ - - /* continue after the packed extent item if we exhausted extents */ - if (ret == 0) - ret = unpe->iblock + SCOUTFS_PACKEXT_BLOCKS; -out: - free_unpacked_extents(unpe); return ret; } @@ -844,6 +324,11 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, return ret; } +static inline u64 ext_last(struct scoutfs_extent *ext) +{ + return ext->start + ext->len - 1; +} + /* * The caller is writing to a logical iblock that doesn't have an * allocated extent. @@ -861,141 +346,111 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, * block. It doesn't work for concurrent stages, releasing behind * staging, sparse files, multi-node writes, etc. fallocate() is always * a better tool to use. - * - * We can mangle the extents so the caller is going to search for the - * intersecting extent again if we succeed. 
*/ static int alloc_block(struct super_block *sb, struct inode *inode, - struct unpacked_extents *unpe, - struct unpacked_extent *ext, u64 iblock, + struct scoutfs_extent *ext, u64 iblock, struct scoutfs_lock *lock) { DECLARE_DATA_INFO(sb, datinf); const u64 ino = scoutfs_ino(inode); - struct scoutfs_traced_extent te; + struct data_ext_args args = { + .ino = ino, + .inode = inode, + .lock = lock, + }; + struct scoutfs_extent found; + struct scoutfs_extent pre; u64 blkno = 0; u64 online; u64 offline; - u64 last; u8 flags; - int count; + u64 count; int ret; int err; + trace_scoutfs_data_alloc_block_enter(sb, ino, iblock, ext); + /* can only allocate over existing unallocated offline extent */ - if (WARN_ON_ONCE(ext && - !(iblock >= ext->iblock && iblock <= ext_last(ext) && - ext->blkno == 0 && (ext->flags & SEF_OFFLINE)))) + if (WARN_ON_ONCE(ext->len && + !(iblock >= ext->start && iblock <= ext_last(ext) && + ext->map == 0 && (ext->flags & SEF_OFFLINE)))) return -EINVAL; - down_write(&datinf->alloc_rwsem); + mutex_lock(&datinf->mutex); scoutfs_inode_get_onoff(inode, &online, &offline); - if (ext) { + if (ext->len) { /* limit preallocation to remaining existing (offline) extent */ - count = ext->count - (iblock - ext->iblock); + count = ext->len - (iblock - ext->start); flags = ext->flags; } else { - /* otherwise alloc to next extent or end of packed item */ - last = last_iblock(iblock); - ext = find_extent(unpe, iblock, last); - if (ext) - count = ext->iblock - iblock; + /* otherwise alloc to next extent */ + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &found); + if (ret < 0 && ret != -ENOENT) + goto out; + if (found.len && found.start > iblock) + count = found.start - iblock; else - count = last - iblock + 1; + count = SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT; flags = 0; } + /* overall prealloc limit */ + count = min_t(u64, count, SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT); + /* only strictly contiguous extending writes will try to preallocate */ if (iblock > 1 
&& iblock == online) - count = min_t(u64, iblock, count); + count = min(iblock, count); else count = 1; - ret = scoutfs_radix_alloc_data(sb, datinf->alloc, datinf->wri, - &datinf->data_avail, count, &blkno, - &count); + ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri, + &datinf->data_avail, &datinf->cached_ext, + count, &blkno, &count); if (ret < 0) goto out; - ret = set_extent(sb, inode, ino, unpe, iblock, blkno, 1, 0); + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1, blkno, 0); if (ret < 0) goto out; - init_traced_extent(&te, iblock, blkno, 1, 0); - trace_scoutfs_data_alloc_block(sb, ino, &te); - if (count > 1) { - ret = set_extent(sb, inode, ino, unpe, iblock + 1, - blkno + 1, count - 1, flags | SEF_UNWRITTEN); + pre.start = iblock + 1; + pre.len = count - 1; + pre.map = blkno + 1; + pre.flags = flags | SEF_UNWRITTEN; + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, pre.start, + pre.len, pre.map, pre.flags); if (ret < 0) { - err = set_extent(sb, inode, ino, unpe, iblock, 0, 1, - flags); + err = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, + 1, 0, flags); BUG_ON(err); /* couldn't restore original */ + goto out; } - - init_traced_extent(&te, iblock + 1, blkno + 1, count - 1, - flags | SEF_UNWRITTEN); - trace_scoutfs_data_prealloc_unwritten(sb, ino, &te); } - ret = store_packed_extents(sb, ino, unpe, lock); - BUG_ON(ret); /* inconsistent previous extent state */ - + /* tell the caller we have a single block, could check next? */ + ext->start = iblock; + ext->len = 1; + ext->map = blkno; + ext->flags = 0; + ret = 0; out: if (ret < 0 && blkno > 0) { - err = scoutfs_radix_free_data(sb, datinf->alloc, datinf->wri, - &datinf->data_freed, - blkno, count); + err = scoutfs_free_data(sb, datinf->alloc, datinf->wri, + &datinf->data_freed, blkno, count); BUG_ON(err); /* leaked free blocks */ } - up_write(&datinf->alloc_rwsem); - - return ret; -} - -/* - * A caller is writing into an unwritten block. 
This can also be called - * for staging writes so we clear both the unwritten and offline flags. - * - * We don't have to wait for dirty block IO to complete before clearing - * the unwritten flag in metadata because we have strict synchronization - * between data and metadata. All dirty data in the current transaction - * is written before the metadata in the transaction that references it - * is committed. - */ -static int convert_unwritten(struct super_block *sb, struct inode *inode, - struct unpacked_extents *unpe, - struct unpacked_extent *ext, u64 iblock, - struct scoutfs_lock *lock) -{ - struct scoutfs_traced_extent te; - u64 blkno; - u8 ext_fl; - int err; - int ret; - - blkno = ext->blkno + (iblock - ext->iblock); - ext_fl = ext->flags; - - init_traced_extent(&te, iblock, 1, blkno, ext_fl); - trace_scoutfs_data_convert_unwritten(sb, scoutfs_ino(inode), &te); - - ret = set_extent(sb, inode, scoutfs_ino(inode), unpe, iblock, - blkno, 1, ext_fl & ~(SEF_OFFLINE|SEF_UNWRITTEN)); - if (ret < 0) - goto out; - - ret = store_packed_extents(sb, scoutfs_ino(inode), unpe, lock); - if (ret < 0) { - err = set_extent(sb, inode, scoutfs_ino(inode), unpe, iblock, - blkno, 1, ext_fl); - BUG_ON(err); /* packed and unpacked inconsistent */ + if (ret == 0) { + trace_scoutfs_data_alloc(sb, ino, ext); + trace_scoutfs_data_prealloc(sb, ino, &pre); } -out: + mutex_unlock(&datinf->mutex); + return ret; } @@ -1005,10 +460,10 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, struct scoutfs_inode_info *si = SCOUTFS_I(inode); const u64 ino = scoutfs_ino(inode); struct super_block *sb = inode->i_sb; + struct data_ext_args args; struct scoutfs_lock *lock = NULL; - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext = NULL; - DECLARE_TRACED_EXTENT(te); + struct scoutfs_extent ext = {0,}; + struct scoutfs_extent un; u64 offset; int ret; @@ -1021,53 +476,60 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, goto out; } - ret = 
load_unpacked_extents(sb, ino, iblock, iblock, true, &unpe, lock); - if (ret < 0) + args.ino = ino; + args.inode = inode; + args.lock = lock; + + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext); + if (ret == -ENOENT || (ret == 0 && ext.start > iblock)) + memset(&ext, 0, sizeof(ext)); + else if (ret < 0) goto out; - ext = find_extent(unpe, iblock, iblock); + if (ext.len) + trace_scoutfs_data_get_block_found(sb, ino, &ext); /* non-staging callers should have waited on offline blocks */ - if (WARN_ON_ONCE(ext && (ext->flags & SEF_OFFLINE) && !si->staging)) { + if (WARN_ON_ONCE(ext.map && (ext.flags & SEF_OFFLINE) && !si->staging)){ ret = -EIO; goto out; } - /* convert unwritten to written */ - if (create && ext && (ext->flags & SEF_UNWRITTEN)) { - ret = convert_unwritten(sb, inode, unpe, ext, iblock, lock); + /* convert unwritten to written, could be staging */ + if (create && ext.map && (ext.flags & SEF_UNWRITTEN)) { + un.start = iblock; + un.len = 1; + un.map = ext.map + (iblock - ext.start); + un.flags = ext.flags & ~(SEF_OFFLINE|SEF_UNWRITTEN); + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, + un.start, un.len, un.map, un.flags); if (ret == 0) { + ext = un; set_buffer_new(bh); - ext = find_extent(unpe, iblock, iblock); } goto out; } /* allocate and map blocks containing our logical block */ - if (create && (!ext || !ext->blkno)) { - ret = alloc_block(sb, inode, unpe, ext, iblock, lock); - if (ret == 0) { + if (create && !ext.map) { + ret = alloc_block(sb, inode, &ext, iblock, lock); + if (ret == 0) set_buffer_new(bh); - ext = find_extent(unpe, iblock, iblock); - } } else { ret = 0; } out: /* map usable extent, else leave bh unmapped for sparse reads */ - if (ret == 0 && ext && ext->blkno && !(ext->flags & SEF_UNWRITTEN)) { - offset = iblock - ext->iblock; - map_bh(bh, inode->i_sb, ext->blkno + offset); + if (ret == 0 && ext.map && !(ext.flags & SEF_UNWRITTEN)) { + offset = iblock - ext.start; + map_bh(bh, inode->i_sb, ext.map + offset); 
bh->b_size = min_t(u64, bh->b_size, - (ext->count - offset) << SCOUTFS_BLOCK_SM_SHIFT); + (ext.len - offset) << SCOUTFS_BLOCK_SM_SHIFT); + trace_scoutfs_data_get_block_mapped(sb, ino, &ext); } - if (ext) - copy_traced_extent(&te, ext); - trace_scoutfs_get_block(sb, scoutfs_ino(inode), iblock, create, - &te, ret, bh->b_blocknr, bh->b_size); - free_unpacked_extents(unpe); + &ext, ret, bh->b_blocknr, bh->b_size); return ret; } @@ -1330,74 +792,82 @@ static int scoutfs_write_end(struct file *file, struct address_space *mapping, /* * Try to allocate unwritten extents for any unallocated regions of the - * logical block extent from the caller. We work one packed extent item - * at a time. + * logical block extent from the caller. The caller manages locks and + * transactions. We limit ourselves to a reasonable number of extents + * before returning to open another transaction. * - * We return an error or the numbet of contiguous blocks starting at - * iblock that were successfully processed. + * We return an error or the number of blocks starting at iblock that + * were successfully processed. The caller will continue after those + * blocks until they reach last. 
*/ -static int fallocate_extents(struct super_block *sb, struct inode *inode, +static s64 fallocate_extents(struct super_block *sb, struct inode *inode, u64 iblock, u64 last, struct scoutfs_lock *lock) { DECLARE_DATA_INFO(sb, datinf); - const u64 ino = scoutfs_ino(inode); - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext; + struct data_ext_args args = { + .ino = scoutfs_ino(inode), + .inode = inode, + .lock = lock, + }; + struct scoutfs_extent ext; u8 ext_fl; u64 blkno; - int count; - int done; - int ret; + u64 count; + s64 done = 0; + int ret = 0; int err; + int i; - /* work with the extents in one item at a time */ - last = min(last, last_iblock(iblock)); - done = 0; + for (i = 0; iblock <= last && i < EXTENTS_PER_HOLD; i++) { - ret = load_unpacked_extents(sb, ino, iblock, iblock, true, &unpe, lock); - if (ret < 0) - goto out; - - ext = find_extent(unpe, iblock, last); - while (iblock <= last) { + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); + if (ret == -ENOENT) + ret = 0; + else if (ret < 0) + break; /* default to allocate to end of region */ count = last - iblock + 1; ext_fl = 0; - if (!ext) { + if (!ext.len) { /* no extent, default alloc from above */ - } else if (ext->iblock <= iblock && ext->blkno) { + } else if (ext.start <= iblock && ext.map) { /* skip portion of allocated extent */ count = min_t(u64, count, - ext->count - (iblock - ext->iblock)); + ext.len - (iblock - ext.start)); iblock += count; done += count; - ext = next_extent(ext); continue; - } else if (ext->iblock <= iblock && !ext->blkno) { + } else if (ext.start <= iblock && !ext.map) { /* alloc portion of unallocated extent */ count = min_t(u64, count, - ext->count - (iblock - ext->iblock)); - ext_fl = ext->flags; + ext.len - (iblock - ext.start)); + ext_fl = ext.flags; - } else if (iblock < ext->iblock) { + } else if (iblock < ext.start) { /* alloc hole until next extent */ - count = min_t(u64, count, ext->iblock - iblock); + count = min_t(u64, 
count, ext.start - iblock); } - down_write(&datinf->alloc_rwsem); + /* limit allocation attempts */ + count = min_t(u64, count, SCOUTFS_FALLOCATE_ALLOC_LIMIT); - ret = scoutfs_radix_alloc_data(sb, datinf->alloc, datinf->wri, - &datinf->data_avail, count, - &blkno, &count); + mutex_lock(&datinf->mutex); + + ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri, + &datinf->data_avail, + &datinf->cached_ext, + count, &blkno, &count); if (ret == 0) { - ret = set_extent(sb, inode, ino, unpe, iblock, blkno, - count, ext_fl | SEF_UNWRITTEN); + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, + count, blkno, + ext_fl | SEF_UNWRITTEN); if (ret < 0) { - err = scoutfs_radix_free_data(sb, datinf->alloc, + err = scoutfs_free_data(sb, datinf->alloc, datinf->wri, &datinf->data_avail, blkno, count); @@ -1405,25 +875,18 @@ static int fallocate_extents(struct super_block *sb, struct inode *inode, } } - up_write(&datinf->alloc_rwsem); + mutex_unlock(&datinf->mutex); if (ret < 0) break; iblock += count; done += count; - ext = find_extent(unpe, iblock, last); } - ret = store_packed_extents(sb, ino, unpe, lock); - BUG_ON(ret); /* inconsistent with unpacked and alloc */ - if (ret == 0) ret = done; -out: - free_unpacked_extents(unpe); - return ret; } @@ -1447,7 +910,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) loff_t end; u64 iblock; u64 last; - int ret; + s64 ret; mutex_lock(&inode->i_mutex); @@ -1527,79 +990,56 @@ out: * on regular files with no data extents. It's used to restore a file * with an offline extent which can then trigger staging. * - * The caller has taken care of locking. We're creating many packed - * extent items which may have to be written in multiple transactions. - * We create exetnts from the front of the file and use the offline - * block count to figure out where to continue from. + * The caller has taken care of locking the inode. 
We're updating the + * inode offline count as we create the offline extent so we take care + * of the index locking, updating, and transaction. */ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size, struct scoutfs_lock *lock) { struct super_block *sb = inode->i_sb; - struct unpacked_extents *unpe = NULL; - u64 ino = scoutfs_ino(inode); + struct data_ext_args args = { + .ino = scoutfs_ino(inode), + .inode = inode, + .lock = lock, + }; + const u64 count = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SM_SIZE); LIST_HEAD(ind_locks); - bool held = false; - u64 blocks; - u64 iblock; - u64 count; u64 on; u64 off; int ret; - blocks = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SM_SIZE); - scoutfs_inode_get_onoff(inode, &on, &off); - iblock = off; - while (iblock < blocks) { - /* we're updating meta_seq with offline block count */ - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, - SIC_SETATTR_MORE()); - if (ret < 0) - goto out; - held = true; - - ret = scoutfs_dirty_inode_item(inode, lock); - if (ret < 0) - goto out; - - ret = load_unpacked_extents(sb, ino, iblock, iblock, true, - &unpe, lock); - if (ret < 0) - goto out; - - count = min(blocks - iblock, last_iblock(iblock) - iblock + 1); - - ret = set_extent(sb, inode, ino, unpe, iblock, 0, count, - SEF_OFFLINE); - if (ret < 0) - goto out; - - ret = store_packed_extents(sb, ino, unpe, lock); - if (ret < 0) - goto out; - - free_unpacked_extents(unpe); - unpe = NULL; - - scoutfs_update_inode_item(inode, lock, &ind_locks); - - scoutfs_release_trans(sb); - scoutfs_inode_index_unlock(sb, &ind_locks); - held = false; - - iblock += count; + /* caller should have checked */ + if (on > 0 || off > 0) { + ret = -EINVAL; + goto out; } + /* we're updating meta_seq with offline block count */ + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, + SIC_SETATTR_MORE()); + if (ret < 0) + goto out; + + ret = scoutfs_dirty_inode_item(inode, lock); + if (ret < 0) + goto unlock; + + ret = scoutfs_ext_insert(sb, &data_ext_ops, 
&args, + 0, count, 0, SEF_OFFLINE); + if (ret < 0) + goto unlock; + + scoutfs_update_inode_item(inode, lock, &ind_locks); + +unlock: + scoutfs_release_trans(sb); + scoutfs_inode_index_unlock(sb, &ind_locks); ret = 0; out: - if (held) { - scoutfs_release_trans(sb); - scoutfs_inode_index_unlock(sb, &ind_locks); - } - free_unpacked_extents(unpe); return ret; } @@ -1607,11 +1047,11 @@ out: * This copies to userspace :/ */ static int fill_extent(struct fiemap_extent_info *fieinfo, - struct unpacked_extent *ext, u32 fiemap_flags) + struct scoutfs_extent *ext, u32 fiemap_flags) { u32 flags; - if (ext->count == 0) + if (ext->len == 0) return 0; flags = fiemap_flags; @@ -1621,9 +1061,9 @@ static int fill_extent(struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_UNWRITTEN; return fiemap_fill_next_extent(fieinfo, - ext->iblock << SCOUTFS_BLOCK_SM_SHIFT, - ext->blkno << SCOUTFS_BLOCK_SM_SHIFT, - ext->count << SCOUTFS_BLOCK_SM_SHIFT, + ext->start << SCOUTFS_BLOCK_SM_SHIFT, + ext->map << SCOUTFS_BLOCK_SM_SHIFT, + ext->len << SCOUTFS_BLOCK_SM_SHIFT, flags); } @@ -1638,28 +1078,33 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); struct scoutfs_lock *lock = NULL; - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext; - struct unpacked_extent cur; - struct scoutfs_traced_extent te; + struct scoutfs_extent ext; + struct scoutfs_extent cur; + struct data_ext_args args; u32 last_flags; u64 iblock; u64 last; int ret; - if (len == 0) - return 0; + if (len == 0) { + ret = 0; + goto out; + } ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); if (ret) - return ret; + goto out; /* XXX overkill? 
*/ mutex_lock(&inode->i_mutex); ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock); if (ret) - goto out; + goto unlock; + + args.ino = ino; + args.inode = inode; + args.lock = lock; /* use a dummy extent to track */ memset(&cur, 0, sizeof(cur)); @@ -1668,9 +1113,9 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, iblock = start >> SCOUTFS_BLOCK_SM_SHIFT; last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT; - for (;;) { - ret = load_unpacked_extents(sb, ino, iblock, last, false, - &unpe, lock); + while (iblock <= last) { + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -1678,45 +1123,39 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, break; } - for (ext = find_extent(unpe, iblock, last); ext; - ext = next_extent(ext)) { + trace_scoutfs_data_fiemap_extent(sb, ino, &ext); - copy_traced_extent(&te, ext); - trace_scoutfs_data_fiemap_extent(sb, ino, &te); - - if (ext->iblock > last) { - /* not setting _LAST, it's for end of file */ - ret = 0; - break; - } - - if (extents_merge(&cur, ext)) { - cur.count += ext->count; - continue; - } - - ret = fill_extent(fieinfo, &cur, 0); - if (ret != 0) - goto out; - cur = *ext; + if (ext.start > last) { + /* not setting _LAST, it's for end of file */ + ret = 0; + break; } - iblock = unpe->iblock + SCOUTFS_PACKEXT_BLOCKS; - free_unpacked_extents(unpe); - unpe = NULL; + if (scoutfs_ext_can_merge(&cur, &ext)) { + /* merged extents could be greater than input len */ + cur.len += ext.len; + } else { + ret = fill_extent(fieinfo, &cur, 0); + if (ret != 0) + goto unlock; + cur = ext; + } + + iblock = ext.start + ext.len; } - if (cur.count) + if (cur.len) ret = fill_extent(fieinfo, &cur, last_flags); -out: +unlock: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); mutex_unlock(&inode->i_mutex); - free_unpacked_extents(unpe); - +out: if (ret == 1) ret = 0; + trace_scoutfs_data_fiemap(sb, start, len, 
ret); + return ret; } @@ -1803,11 +1242,14 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, { struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); + struct data_ext_args args = { + .ino = ino, + .inode = inode, + .lock = lock, + }; DECLARE_DATA_WAIT_ROOT(sb, rt); DECLARE_DATA_WAITQ(inode, wq); - struct unpacked_extents *unpe = NULL; - struct unpacked_extent *ext; - DECLARE_TRACED_EXTENT(te); + struct scoutfs_extent ext = {0,}; u64 iblock; u64 last_block; u64 on; @@ -1834,50 +1276,40 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT; while(iblock <= last_block) { - - free_unpacked_extents(unpe); - ret = load_unpacked_extents(sb, ino, iblock, last_block, false, - &unpe, lock); + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); if (ret < 0) { if (ret == -ENOENT) ret = 0; - goto out; + break; } - for (ext = find_extent(unpe, iblock, last_block); ext; - ext = next_extent(ext)) { - - if (ext->iblock > last_block) { - ret = 0; - goto out; - } - - if (sef & ext->flags) { - if (dw) { - dw->chg = atomic64_read(&wq->changed); - dw->ino = ino; - dw->iblock = max(iblock, ext->iblock); - dw->op = op; - - spin_lock(&rt->lock); - insert_offline_waiting(&rt->root, dw); - spin_unlock(&rt->lock); - } - - copy_traced_extent(&te, ext); - ret = 1; - goto out; - } - + if (ext.start > last_block) { + ret = 0; + break; } - iblock = unpe->iblock + SCOUTFS_PACKEXT_BLOCKS; + if (sef & ext.flags) { + if (dw) { + dw->chg = atomic64_read(&wq->changed); + dw->ino = ino; + dw->iblock = max(iblock, ext.start); + dw->op = op; + + spin_lock(&rt->lock); + insert_offline_waiting(&rt->root, dw); + spin_unlock(&rt->lock); + } + + ret = 1; + break; + } + + iblock = ext.start + ext.len; } out: - trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &te, ret); - - free_unpacked_extents(unpe); + trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &ext, 
ret); return ret; } @@ -2019,20 +1451,20 @@ const struct file_operations scoutfs_file_fops = { }; void scoutfs_data_init_btrees(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_log_trees *lt) { DECLARE_DATA_INFO(sb, datinf); - down_write(&datinf->alloc_rwsem); + mutex_lock(&datinf->mutex); datinf->alloc = alloc; datinf->wri = wri; datinf->data_avail = lt->data_avail; datinf->data_freed = lt->data_freed; - up_write(&datinf->alloc_rwsem); + mutex_unlock(&datinf->mutex); } void scoutfs_data_get_btrees(struct super_block *sb, @@ -2040,12 +1472,38 @@ void scoutfs_data_get_btrees(struct super_block *sb, { DECLARE_DATA_INFO(sb, datinf); - down_read(&datinf->alloc_rwsem); + mutex_lock(&datinf->mutex); lt->data_avail = datinf->data_avail; lt->data_freed = datinf->data_freed; - up_read(&datinf->alloc_rwsem); + mutex_unlock(&datinf->mutex); +} + +/* + * This should be called before preparing the allocators for the commit + * because it can allocate and free btree blocks in the data allocator. 
+ */ +int scoutfs_data_prepare_commit(struct super_block *sb) +{ + DECLARE_DATA_INFO(sb, datinf); + int ret; + + mutex_lock(&datinf->mutex); + if (datinf->cached_ext.len) { + ret = scoutfs_free_data(sb, datinf->alloc, datinf->wri, + &datinf->data_avail, + datinf->cached_ext.start, + datinf->cached_ext.len); + if (ret == 0) + memset(&datinf->cached_ext, 0, + sizeof(datinf->cached_ext)); + } else { + ret = 0; + } + mutex_unlock(&datinf->mutex); + + return ret; } /* @@ -2055,8 +1513,8 @@ u64 scoutfs_data_alloc_free_bytes(struct super_block *sb) { DECLARE_DATA_INFO(sb, datinf); - return scoutfs_radix_root_free_blocks(sb, &datinf->data_avail) << - SCOUTFS_BLOCK_SM_SHIFT; + return le64_to_cpu(datinf->data_avail.total_len) << + SCOUTFS_BLOCK_SM_SHIFT; } int scoutfs_data_setup(struct super_block *sb) @@ -2069,7 +1527,7 @@ int scoutfs_data_setup(struct super_block *sb) return -ENOMEM; datinf->sb = sb; - init_rwsem(&datinf->alloc_rwsem); + mutex_init(&datinf->mutex); sbi->data_info = datinf; return 0; diff --git a/kmod/src/data.h b/kmod/src/data.h index b4ee7344..09a64fe7 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -47,7 +47,7 @@ struct scoutfs_traced_extent { extern const struct address_space_operations scoutfs_file_aops; extern const struct file_operations scoutfs_file_fops; -struct scoutfs_radix_allocator; +struct scoutfs_alloc; struct scoutfs_block_writer; int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, @@ -77,11 +77,12 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock, unsigned int nr); void scoutfs_data_init_btrees(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_log_trees *lt); void scoutfs_data_get_btrees(struct super_block *sb, struct scoutfs_log_trees *lt); +int scoutfs_data_prepare_commit(struct super_block *sb); u64 scoutfs_data_alloc_free_bytes(struct super_block *sb); int scoutfs_data_setup(struct 
super_block *sb); diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 2d53a1d9..49a255e7 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -20,7 +20,7 @@ #include "lock.h" #include "btree.h" #include "client.h" -#include "radix.h" +#include "alloc.h" #include "block.h" #include "forest.h" #include "hash.h" @@ -53,7 +53,7 @@ struct forest_info { struct mutex mutex; - struct scoutfs_radix_allocator *alloc; + struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; struct scoutfs_log_trees our_log; @@ -421,22 +421,22 @@ int scoutfs_forest_set_bloom_bits(struct super_block *sb, if (!ref->blkno || !scoutfs_block_writer_is_dirty(sb, bl)) { - ret = scoutfs_radix_alloc(sb, finf->alloc, finf->wri, &blkno); + ret = scoutfs_alloc_meta(sb, finf->alloc, finf->wri, &blkno); if (ret < 0) goto unlock; new_bl = scoutfs_block_create(sb, blkno); if (IS_ERR(new_bl)) { - err = scoutfs_radix_free(sb, finf->alloc, finf->wri, - blkno); + err = scoutfs_free_meta(sb, finf->alloc, finf->wri, + blkno); BUG_ON(err); /* could have dirtied */ ret = PTR_ERR(new_bl); goto unlock; } if (bl) { - err = scoutfs_radix_free(sb, finf->alloc, finf->wri, - le64_to_cpu(ref->blkno)); + err = scoutfs_free_meta(sb, finf->alloc, finf->wri, + le64_to_cpu(ref->blkno)); BUG_ON(err); /* could have dirtied */ memcpy(new_bl->data, bl->data, SCOUTFS_BLOCK_LG_SIZE); } else { @@ -517,7 +517,7 @@ int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id) * serialized with all writers. 
*/ void scoutfs_forest_init_btrees(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_log_trees *lt) { diff --git a/kmod/src/forest.h b/kmod/src/forest.h index 6d0c0c8c..e6e72a4a 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -1,7 +1,7 @@ #ifndef _SCOUTFS_FOREST_H_ #define _SCOUTFS_FOREST_H_ -struct scoutfs_radix_allocator; +struct scoutfs_alloc; struct scoutfs_block_writer; struct scoutfs_block; @@ -28,7 +28,7 @@ int scoutfs_forest_insert_list(struct super_block *sb, int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id); void scoutfs_forest_init_btrees(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_log_trees *lt); void scoutfs_forest_get_btrees(struct super_block *sb, diff --git a/kmod/src/format.h b/kmod/src/format.h index 15bd7d92..d5a78ade 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -144,10 +144,10 @@ struct scoutfs_key { #define sks_ino _sk_first #define sks_nr _sk_second -/* packed extents */ -#define skpe_ino _sk_first -#define skpe_base _sk_second -#define skpe_part _sk_fourth +/* data extents */ +#define skdx_ino _sk_first +#define skdx_end _sk_second +#define skdx_len _sk_third /* log trees */ #define sklt_rid _sk_first @@ -163,6 +163,13 @@ struct scoutfs_key { /* mounted clients */ #define skmc_rid _sk_first +/* free extents by blkno */ +#define skfb_end _sk_second +#define skfb_len _sk_third +/* free extents by len */ +#define skfl_neglen _sk_second +#define skfl_blkno _sk_third + struct scoutfs_radix_block { struct scoutfs_block_header hdr; union { @@ -386,8 +393,8 @@ struct scoutfs_srch_block { #define SCOUTFS_SRCH_COMPACT_NR (1 << SCOUTFS_SRCH_COMPACT_ORDER) struct scoutfs_srch_compact_input { - struct scoutfs_radix_root meta_avail; - struct scoutfs_radix_root meta_freed; + struct scoutfs_alloc_list_head 
meta_avail; + struct scoutfs_alloc_list_head meta_freed; __le64 id; __u8 nr; __u8 flags; @@ -395,8 +402,8 @@ struct scoutfs_srch_compact_input { } __packed; struct scoutfs_srch_compact_result { - struct scoutfs_radix_root meta_avail; - struct scoutfs_radix_root meta_freed; + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; __le64 id; __u8 flags; struct scoutfs_srch_file sfl; @@ -413,24 +420,24 @@ struct scoutfs_srch_compact_result { * about item logs, it's about clients making changes to trees. */ struct scoutfs_log_trees { - struct scoutfs_radix_root meta_avail; - struct scoutfs_radix_root meta_freed; + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; struct scoutfs_btree_root item_root; struct scoutfs_btree_ref bloom_ref; - struct scoutfs_radix_root data_avail; - struct scoutfs_radix_root data_freed; + struct scoutfs_alloc_root data_avail; + struct scoutfs_alloc_root data_freed; struct scoutfs_srch_file srch_file; __le64 rid; __le64 nr; } __packed; struct scoutfs_log_trees_val { - struct scoutfs_radix_root meta_avail; - struct scoutfs_radix_root meta_freed; + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; struct scoutfs_btree_root item_root; struct scoutfs_btree_ref bloom_ref; - struct scoutfs_radix_root data_avail; - struct scoutfs_radix_root data_freed; + struct scoutfs_alloc_root data_avail; + struct scoutfs_alloc_root data_freed; struct scoutfs_srch_file srch_file; } __packed; @@ -482,6 +489,7 @@ struct scoutfs_bloom_block { #define SCOUTFS_TRANS_SEQ_ZONE 8 #define SCOUTFS_MOUNTED_CLIENT_ZONE 9 #define SCOUTFS_SRCH_ZONE 10 +#define SCOUTFS_FREE_EXTENT_ZONE 11 /* inode index zone */ #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1 @@ -498,7 +506,7 @@ struct scoutfs_bloom_block { #define SCOUTFS_READDIR_TYPE 4 #define SCOUTFS_LINK_BACKREF_TYPE 5 #define SCOUTFS_SYMLINK_TYPE 6 -#define SCOUTFS_PACKED_EXTENT_TYPE 7 +#define SCOUTFS_DATA_EXTENT_TYPE 7 
/* lock zone, only ever found in lock ranges, never in persistent items */ #define SCOUTFS_RENAME_TYPE 1 @@ -508,6 +516,10 @@ struct scoutfs_bloom_block { #define SCOUTFS_SRCH_BLOCKS_TYPE 2 #define SCOUTFS_SRCH_BUSY_TYPE 3 +/* free extents in allocator btrees in client and server, by blkno or len */ +#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1 +#define SCOUTFS_FREE_EXTENT_LEN_TYPE 2 + /* * The extents that map blocks in a fixed-size logical region of a file * are packed and stored in item values. The packed extents are @@ -539,6 +551,12 @@ struct scoutfs_packed_extent { #define SCOUTFS_PACKEXT_BASE_MASK (~((__u64)SCOUTFS_PACKEXT_BLOCKS - 1)) #define SCOUTFS_PACKEXT_MAX_BYTES SCOUTFS_MAX_VAL_SIZE +/* file data extents have start and len in key */ +struct scoutfs_data_extent_val { + __le64 blkno; + __u8 flags; +} __packed; + #define SEF_OFFLINE (1 << 0) #define SEF_UNWRITTEN (1 << 1) #define SEF_UNKNOWN (U8_MAX << 2) @@ -623,10 +641,10 @@ struct scoutfs_super_block { __le64 unmount_barrier; __u8 quorum_count; struct scoutfs_inet_addr server_addr; - struct scoutfs_radix_root core_meta_avail; - struct scoutfs_radix_root core_meta_freed; - struct scoutfs_radix_root core_data_avail; - struct scoutfs_radix_root core_data_freed; + struct scoutfs_alloc_root meta_alloc[2]; + struct scoutfs_alloc_root data_alloc; + struct scoutfs_alloc_list_head server_meta_avail[2]; + struct scoutfs_alloc_list_head server_meta_freed[2]; struct scoutfs_btree_root fs_root; struct scoutfs_btree_root logs_root; struct scoutfs_btree_root lock_clients; diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c index 5ef53cdd..ca590635 100644 --- a/kmod/src/lock_server.c +++ b/kmod/src/lock_server.c @@ -20,7 +20,6 @@ #include "tseq.h" #include "spbm.h" #include "block.h" -#include "radix.h" #include "btree.h" #include "msg.h" #include "scoutfs_trace.h" @@ -87,7 +86,7 @@ struct lock_server_info { struct scoutfs_tseq_tree tseq_tree; struct dentry *tseq_dentry; - struct scoutfs_radix_allocator *alloc; 
+ struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; }; @@ -956,7 +955,7 @@ static void lock_server_tseq_show(struct seq_file *m, * we time them out. */ int scoutfs_lock_server_setup(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); diff --git a/kmod/src/lock_server.h b/kmod/src/lock_server.h index 99c82b8d..c4fe5621 100644 --- a/kmod/src/lock_server.h +++ b/kmod/src/lock_server.h @@ -12,7 +12,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid, int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid); int scoutfs_lock_server_setup(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri); void scoutfs_lock_server_destroy(struct super_block *sb); diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index a4d58bca..0465dd13 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -170,35 +170,35 @@ TRACE_EVENT(scoutfs_data_fallocate, ); TRACE_EVENT(scoutfs_data_fiemap, - TP_PROTO(struct super_block *sb, __u64 off, int i, __u64 blkno), + TP_PROTO(struct super_block *sb, __u64 start, __u64 len, int ret), - TP_ARGS(sb, off, i, blkno), + TP_ARGS(sb, start, len, ret), TP_STRUCT__entry( SCSB_TRACE_FIELDS - __field(__u64, off) - __field(int, i) - __field(__u64, blkno) + __field(__u64, start) + __field(__u64, len) + __field(int, ret) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); - __entry->off = off; - __entry->i = i; - __entry->blkno = blkno; + __entry->start = start; + __entry->len = len; + __entry->ret = ret; ), - TP_printk(SCSBF" blk_off %llu i %u blkno %llu", SCSB_TRACE_ARGS, - __entry->off, __entry->i, __entry->blkno) + TP_printk(SCSBF" start %llu len %llu ret %d", SCSB_TRACE_ARGS, + __entry->start, __entry->len, __entry->ret) ); TRACE_EVENT(scoutfs_get_block, TP_PROTO(struct super_block *sb, __u64 ino, __u64 
iblock, - int create, struct scoutfs_traced_extent *te, + int create, struct scoutfs_extent *ext, int ret, __u64 blkno, size_t size), - TP_ARGS(sb, ino, iblock, create, te, ret, blkno, size), + TP_ARGS(sb, ino, iblock, create, ext, ret, blkno, size), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -216,7 +216,7 @@ TRACE_EVENT(scoutfs_get_block, __entry->ino = ino; __entry->iblock = iblock; __entry->create = create; - STE_ASSIGN(ext, te) + STE_ASSIGN(ext, ext) __entry->ret = ret; __entry->blkno = blkno; __entry->size = size; @@ -228,11 +228,35 @@ TRACE_EVENT(scoutfs_get_block, __entry->blkno, __entry->size) ); -TRACE_EVENT(scoutfs_data_file_extent_class, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), +TRACE_EVENT(scoutfs_data_alloc_block_enter, + TP_PROTO(struct super_block *sb, __u64 ino, __u64 iblock, + struct scoutfs_extent *ext), - TP_ARGS(sb, ino, te), + TP_ARGS(sb, ino, iblock, ext), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, ino) + __field(__u64, iblock) + STE_FIELDS(ext) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->ino = ino; + __entry->iblock = iblock; + STE_ASSIGN(ext, ext) + ), + + TP_printk(SCSBF" ino %llu iblock %llu ext "STE_FMT, + SCSB_TRACE_ARGS, __entry->ino, __entry->iblock, + STE_ENTRY_ARGS(ext)) +); + +DECLARE_EVENT_CLASS(scoutfs_data_file_extent_class, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + + TP_ARGS(sb, ino, ext), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -243,36 +267,35 @@ TRACE_EVENT(scoutfs_data_file_extent_class, TP_fast_assign( SCSB_TRACE_ASSIGN(sb); __entry->ino = ino; - STE_ASSIGN(ext, te) + STE_ASSIGN(ext, ext) ), TP_printk(SCSBF" ino %llu ext "STE_FMT, SCSB_TRACE_ARGS, __entry->ino, STE_ENTRY_ARGS(ext)) ); -DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_alloc_block, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) +DEFINE_EVENT(scoutfs_data_file_extent_class, 
scoutfs_data_alloc, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); -DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_convert_unwritten, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) +DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_prealloc, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); -DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_prealloc_unwritten, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) +DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_get_block_found, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) +); +DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_get_block_mapped, + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_extent_truncated, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent, - TP_PROTO(struct super_block *sb, __u64 ino, - struct scoutfs_traced_extent *te), - TP_ARGS(sb, ino, te) + TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext), + TP_ARGS(sb, ino, ext) ); TRACE_EVENT(scoutfs_data_truncate_items, @@ -300,9 +323,9 @@ TRACE_EVENT(scoutfs_data_truncate_items, TRACE_EVENT(scoutfs_data_wait_check, TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u64 len, - __u8 sef, __u8 op, struct scoutfs_traced_extent *te, int ret), + __u8 sef, __u8 op, struct scoutfs_extent *ext, int ret), - TP_ARGS(sb, ino, pos, len, sef, op, te, ret), + TP_ARGS(sb, ino, pos, len, sef, op, 
ext, ret), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -322,7 +345,7 @@ TRACE_EVENT(scoutfs_data_wait_check, __entry->len = len; __entry->sef = sef; __entry->op = op; - STE_ASSIGN(ext, te) + STE_ASSIGN(ext, ext) __entry->ret = ret; ), diff --git a/kmod/src/server.c b/kmod/src/server.c index 751ea274..48e107ce 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -26,7 +26,6 @@ #include "counters.h" #include "inode.h" #include "block.h" -#include "radix.h" #include "btree.h" #include "scoutfs_trace.h" #include "msg.h" @@ -37,6 +36,7 @@ #include "quorum.h" #include "trans.h" #include "srch.h" +#include "alloc.h" /* * Every active mount can act as the server that listens on a net @@ -66,13 +66,10 @@ struct server_info { struct rw_semaphore commit_rwsem; struct llist_head commit_waiters; struct work_struct commit_work; - bool prepared_commit; /* server tracks seq use */ struct rw_semaphore seq_rwsem; - struct rw_semaphore alloc_rwsem; - struct list_head clients; unsigned long nr_clients; @@ -81,7 +78,15 @@ struct server_info { struct list_head farewell_requests; struct work_struct farewell_work; - struct scoutfs_radix_allocator alloc; + struct mutex alloc_mutex; + /* swap between two fs meta roots to increase time to reuse */ + struct scoutfs_alloc_root *meta_avail; + struct scoutfs_alloc_root *meta_freed; + /* server's meta allocators alternate between persistent heads */ + struct scoutfs_alloc alloc; + int other_ind; + struct scoutfs_alloc_list_head *other_avail; + struct scoutfs_alloc_list_head *other_freed; struct scoutfs_block_writer wri; struct mutex logs_mutex; @@ -119,15 +124,7 @@ static void stop_server(struct server_info *server) /* * Hold the shared rwsem that lets multiple holders modify blocks in the * current commit and prevents the commit worker from acquiring the - * exclusive write lock to write the commit. This can fail for the - * first holder failing to prepare a new commit. - * - * We reclaim the server's stable meta_freed blocks. 
This is run before - * anything has modified allocators in the server. We know that the - * stable meta_freed tree in the super contains all the stable free - * blocks which can be merged back into avail. We reference the stable - * freed tree in the super because the server allocator's freed tree is - * going to be added to as blocks are freed during the merge. + * exclusive write lock to write the commit. * * This is exported for server components isolated in their own files * (lock_server) and which are not called directly by the server core @@ -135,43 +132,13 @@ static void stop_server(struct server_info *server) */ int scoutfs_server_hold_commit(struct super_block *sb) { - struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; DECLARE_SERVER_INFO(sb, server); - u64 tot; - int ret = 0; scoutfs_inc_counter(sb, server_commit_hold); down_read(&server->commit_rwsem); - while (!server->prepared_commit) { - up_read(&server->commit_rwsem); - down_write(&server->commit_rwsem); - - if (!server->prepared_commit) { - scoutfs_inc_counter(sb, server_commit_prepare); - BUG_ON(scoutfs_block_writer_dirty_bytes(sb, - &server->wri)); - tot = le64_to_cpu(super->core_meta_freed.ref.sm_total); - - ret = scoutfs_radix_merge(sb, &server->alloc, - &server->wri, - &server->alloc.avail, - &server->alloc.freed, - &super->core_meta_freed, - true, tot); - if (ret == 0) - server->prepared_commit = true; - } - - up_write(&server->commit_rwsem); - if (ret < 0) - break; - - down_read(&server->commit_rwsem); - } - - return ret; + return 0; } /* @@ -214,18 +181,6 @@ int scoutfs_server_apply_commit(struct super_block *sb, int err) return err; } -/* - * The caller is about to overwrite a ref to an alloc tree. As we do - * so we update the given super free block counter with the difference - * between the old and new allocator roots. 
- */ -static void update_free_blocks(__le64 *blocks, struct scoutfs_radix_root *prev, - struct scoutfs_radix_root *next) -{ - le64_add_cpu(blocks, le64_to_cpu(next->ref.sm_total) - - le64_to_cpu(prev->ref.sm_total)); -} - void scoutfs_server_get_roots(struct super_block *sb, struct scoutfs_net_roots *roots) { @@ -286,6 +241,31 @@ static void scoutfs_server_commit_func(struct work_struct *work) down_write(&server->commit_rwsem); + /* make sure next avail has sufficient blocks */ + ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri, + server->other_avail, + server->meta_avail, + SCOUTFS_SERVER_META_FILL_LO, + SCOUTFS_SERVER_META_FILL_TARGET); + if (ret) { + scoutfs_err(sb, "server error refilling avail: %d", ret); + goto out; + } + + /* merge freed blocks into extents, might be partial */ + ret = scoutfs_alloc_empty_list(sb, &server->alloc, &server->wri, + server->meta_freed, + server->other_freed); + if (ret) { + scoutfs_err(sb, "server error emptying freed: %d", ret); + goto out; + } + + ret = scoutfs_alloc_prepare_commit(sb, &server->alloc, &server->wri); + if (ret < 0) { + scoutfs_err(sb, "server error prepare alloc commit: %d", ret); + goto out; + } ret = scoutfs_block_writer_write(sb, &server->wri); if (ret) { @@ -293,13 +273,8 @@ static void scoutfs_server_commit_func(struct work_struct *work) goto out; } - update_free_blocks(&super->free_meta_blocks, &super->core_meta_avail, - &server->alloc.avail); - update_free_blocks(&super->free_meta_blocks, &super->core_meta_freed, - &server->alloc.freed); - - super->core_meta_avail = server->alloc.avail; - super->core_meta_freed = server->alloc.freed; + super->server_meta_avail[server->other_ind ^ 1] = server->alloc.avail; + super->server_meta_freed[server->other_ind ^ 1] = server->alloc.freed; ret = scoutfs_write_super(sb, super); if (ret) { @@ -307,9 +282,23 @@ static void scoutfs_server_commit_func(struct work_struct *work) goto out; } - server->prepared_commit = false; set_roots(server, &super->fs_root, 
&super->logs_root, &super->srch_root); + + /* swizzle the active and idle server alloc/freed heads */ + server->other_ind ^= 1; + server->alloc.avail = super->server_meta_avail[server->other_ind ^ 1]; + server->alloc.freed = super->server_meta_freed[server->other_ind ^ 1]; + server->other_avail = &super->server_meta_avail[server->other_ind]; + server->other_freed = &super->server_meta_freed[server->other_ind]; + + /* swap avail/free if avail gets low and freed is high */ + if (le64_to_cpu(server->meta_avail->total_len) <= + SCOUTFS_SERVER_META_ALLOC_MIN && + le64_to_cpu(server->meta_freed->total_len) > + SCOUTFS_SERVER_META_ALLOC_MIN) + swap(server->meta_avail, server->meta_freed); + ret = 0; out: node = llist_del_all(&server->commit_waiters); @@ -362,6 +351,34 @@ out: return scoutfs_net_response(sb, conn, cmd, id, ret, &ial, sizeof(ial)); } +/* + * Refill the destination root if it's fallen below the lo threshold by + * moving from the src root to bring it up to the target. + */ +static int alloc_move_refill(struct super_block *sb, + struct scoutfs_alloc_root *dst, + struct scoutfs_alloc_root *src, u64 lo, u64 target) +{ + DECLARE_SERVER_INFO(sb, server); + + if (le64_to_cpu(dst->total_len) >= lo) + return 0; + + return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src, + min(target - le64_to_cpu(dst->total_len), + le64_to_cpu(src->total_len))); +} + +static int alloc_move_empty(struct super_block *sb, + struct scoutfs_alloc_root *dst, + struct scoutfs_alloc_root *src) +{ + DECLARE_SERVER_INFO(sb, server); + + return scoutfs_alloc_move(sb, &server->alloc, &server->wri, + dst, src, le64_to_cpu(src->total_len)); +} + /* * Give the client roots to all the trees that they'll use to build * their transaction. 
@@ -383,8 +400,6 @@ static int server_get_log_trees(struct super_block *sb, struct scoutfs_log_trees_val ltv; struct scoutfs_log_trees lt; struct scoutfs_key key; - u64 count; - u64 target; int ret; if (arg_len != 0) { @@ -422,50 +437,25 @@ static int server_get_log_trees(struct super_block *sb, key.sklt_rid = cpu_to_le64(rid); key.sklt_nr = cpu_to_le64(1); memset(&ltv, 0, sizeof(ltv)); - scoutfs_radix_root_init(sb, &ltv.meta_avail, true); - scoutfs_radix_root_init(sb, &ltv.meta_freed, true); - scoutfs_radix_root_init(sb, &ltv.data_avail, false); - scoutfs_radix_root_init(sb, &ltv.data_freed, false); } - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.avail, - &ltv.meta_freed, &ltv.meta_freed, true, - le64_to_cpu(ltv.meta_freed.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &super->core_data_avail, - &ltv.data_freed, &ltv.data_freed, false, - le64_to_cpu(ltv.data_freed.ref.sm_total)); + /* return freed to server for emptying, refill avail */ + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, + &ltv.meta_freed) ?: + alloc_move_empty(sb, &super->data_alloc, &ltv.data_freed) ?: + scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri, + &ltv.meta_avail, server->meta_avail, + SCOUTFS_SERVER_META_FILL_LO, + SCOUTFS_SERVER_META_FILL_TARGET) ?: + alloc_move_refill(sb, &ltv.data_avail, &super->data_alloc, + SCOUTFS_SERVER_DATA_FILL_LO, + SCOUTFS_SERVER_DATA_FILL_TARGET); + mutex_unlock(&server->alloc_mutex); if (ret < 0) goto unlock; - /* ensure client has enough free metadata blocks for a transaction */ - target = (64*1024*1024) / SCOUTFS_BLOCK_LG_SIZE; - if (le64_to_cpu(ltv.meta_avail.ref.sm_total) < target) { - count = target - le64_to_cpu(ltv.meta_avail.ref.sm_total); - - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &ltv.meta_avail, - &server->alloc.avail, - &server->alloc.avail, true, count); - if (ret < 0) - goto unlock; - } - - /* ensure
client has enough free data blocks for a transaction */ - target = SCOUTFS_TRANS_DATA_ALLOC_HWM / SCOUTFS_BLOCK_SM_SIZE; - if (le64_to_cpu(ltv.data_avail.ref.sm_total) < target) { - count = target - le64_to_cpu(ltv.data_avail.ref.sm_total); - - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &ltv.data_avail, - &super->core_data_avail, - &super->core_data_avail, false, - count); - if (ret < 0) - goto unlock; - } - /* update client's log tree's item */ ret = scoutfs_btree_force(sb, &server->alloc, &server->wri, &super->logs_root, &key, &ltv, sizeof(ltv)); @@ -553,21 +543,12 @@ static int server_commit_log_trees(struct super_block *sb, goto unlock; } - update_free_blocks(&super->free_meta_blocks, &ltv.meta_avail, - &lt->meta_avail); - update_free_blocks(&super->free_meta_blocks, &ltv.meta_freed, - &lt->meta_freed); - update_free_blocks(&super->free_data_blocks, &ltv.data_avail, - &lt->data_avail); - update_free_blocks(&super->free_data_blocks, &ltv.data_freed, - &lt->data_freed); - ltv.meta_avail = lt->meta_avail; ltv.meta_freed = lt->meta_freed; - ltv.item_root = lt->item_root; - ltv.bloom_ref = lt->bloom_ref; ltv.data_avail = lt->data_avail; ltv.data_freed = lt->data_freed; + ltv.item_root = lt->item_root; + ltv.bloom_ref = lt->bloom_ref; ltv.srch_file = lt->srch_file; ret = scoutfs_btree_update(sb, &server->alloc, &server->wri, @@ -638,7 +619,6 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid) int err; mutex_lock(&server->logs_mutex); - down_write(&server->alloc_rwsem); /* find the client's existing item */ scoutfs_key_init_log_trees(&key, rid, 0); @@ -662,32 +642,25 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid) /* * All of these can return errors after having modified the - * radix trees. We have to try and update the roots in the + * allocator trees. We have to try and update the roots in the * log item.
*/ - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.avail, - &ltv.meta_avail, &ltv.meta_avail, true, - le64_to_cpu(ltv.meta_avail.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.avail, - &ltv.meta_freed, &ltv.meta_freed, true, - le64_to_cpu(ltv.meta_freed.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &super->core_data_avail, - &ltv.data_avail, &ltv.data_avail, false, - le64_to_cpu(ltv.data_avail.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &super->core_data_avail, - &ltv.data_freed, &ltv.data_freed, false, - le64_to_cpu(ltv.data_freed.ref.sm_total)); + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, + &ltv.meta_freed) ?: + scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, + &ltv.meta_avail) ?: + alloc_move_empty(sb, &super->data_alloc, &ltv.data_avail) ?: + alloc_move_empty(sb, &super->data_alloc, &ltv.data_freed); + mutex_unlock(&server->alloc_mutex); err = scoutfs_btree_update(sb, &server->alloc, &server->wri, &super->logs_root, &key, &ltv, sizeof(ltv)); BUG_ON(err != 0); /* alloc and log item roots out of sync */ out: - up_write(&server->alloc_rwsem); mutex_unlock(&server->logs_mutex); return ret; @@ -892,14 +865,14 @@ static int server_statfs(struct super_block *sb, nstatfs.next_ino = super->next_ino; spin_unlock(&sbi->next_ino_lock); - down_read(&server->alloc_rwsem); + mutex_lock(&server->alloc_mutex); nstatfs.total_blocks = le64_lg_to_sm(super->total_meta_blocks); le64_add_cpu(&nstatfs.total_blocks, le64_to_cpu(super->total_data_blocks)); nstatfs.bfree = le64_lg_to_sm(super->free_meta_blocks); le64_add_cpu(&nstatfs.bfree, le64_to_cpu(super->free_data_blocks)); - up_read(&server->alloc_rwsem); + mutex_unlock(&server->alloc_mutex); ret = 0; } else { ret = -EINVAL; @@ -1002,8 +975,6 @@ static int server_srch_get_compact(struct super_block *sb, int
i; memset(&scin, 0, sizeof(scin)); - scoutfs_radix_root_init(sb, &scin.meta_avail, true); - scoutfs_radix_root_init(sb, &scin.meta_freed, true); if (arg_len != 0) { ret = -EINVAL; @@ -1028,9 +999,11 @@ static int server_srch_get_compact(struct super_block *sb, for (i = 0; i < scin.nr; i++) blocks += le64_to_cpu(scin.sfl[i].blocks); blocks *= 3; - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &scin.meta_avail, &server->alloc.avail, - &server->alloc.avail, true, blocks); + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri, + &scin.meta_avail, server->meta_avail, + blocks, blocks); + mutex_unlock(&server->alloc_mutex); if (ret < 0) goto apply; @@ -1047,6 +1020,12 @@ out: &scin, sizeof(scin)); } +/* + * Commit the client's compaction. Their freed allocator contains the + * source srch files blocks that are currently in use which can't be + * available for allocation until after the commit. We move them into + * freed so they won't satisfy allocations. 
+ */ static int server_srch_commit_compact(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) @@ -1056,8 +1035,8 @@ static int server_srch_commit_compact(struct super_block *sb, struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; struct scoutfs_srch_compact_result *scres; - struct scoutfs_radix_root av; - struct scoutfs_radix_root fr; + struct scoutfs_alloc_list_head av; + struct scoutfs_alloc_list_head fr; int ret; scres = arg; @@ -1078,15 +1057,12 @@ static int server_srch_commit_compact(struct super_block *sb, if (ret < 0) /* XXX very bad, leaks allocators */ goto apply; - /* XXX like all merges, doesn't reclaim allocator blocks themselves */ - - /* merge the client's allocators into freed, commit before reuse */ - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.freed, &av, &av, true, - le64_to_cpu(av.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.freed, &fr, &fr, true, - le64_to_cpu(fr.ref.sm_total)); + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, &av) ?: + scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, + server->other_freed, &fr); + mutex_unlock(&server->alloc_mutex); apply: ret = scoutfs_server_apply_commit(sb, ret); out: @@ -1149,14 +1125,15 @@ static int delete_mounted_client(struct super_block *sb, u64 rid) /* * Remove all the busy items for srch compactions that the mount might - * have been responsible for and reclaim all their allocators. + * have been responsible for and reclaim all their allocators. The freed + * allocator could still contain stable srch file blknos. 
*/ static int cancel_srch_compact(struct super_block *sb, u64 rid) { DECLARE_SERVER_INFO(sb, server); struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct scoutfs_radix_root av; - struct scoutfs_radix_root fr; + struct scoutfs_alloc_list_head av; + struct scoutfs_alloc_list_head fr; int ret; for (;;) { @@ -1172,12 +1149,14 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid) break; } - ret = scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.freed, &av, &av, true, - le64_to_cpu(av.ref.sm_total)) ?: - scoutfs_radix_merge(sb, &server->alloc, &server->wri, - &server->alloc.freed, &fr, &fr, true, - le64_to_cpu(fr.ref.sm_total)); + mutex_lock(&server->alloc_mutex); + ret = scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, + server->other_freed, &av) ?: + scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, + server->other_freed, &fr); + mutex_unlock(&server->alloc_mutex); if (WARN_ON_ONCE(ret < 0)) break; } @@ -1650,10 +1629,27 @@ static void scoutfs_server_worker(struct work_struct *work) set_roots(server, &super->fs_root, &super->logs_root, &super->srch_root); - scoutfs_radix_init_alloc(&server->alloc, &super->core_meta_avail, - &super->core_meta_freed); scoutfs_block_writer_init(sb, &server->wri); + /* prepare server alloc for this transaction, larger first */ + if (le64_to_cpu(super->server_meta_avail[0].total_nr) < + le64_to_cpu(super->server_meta_avail[1].total_nr)) + server->other_ind = 0; + else + server->other_ind = 1; + scoutfs_alloc_init(&server->alloc, + &super->server_meta_avail[server->other_ind ^ 1], + &super->server_meta_freed[server->other_ind ^ 1]); + server->other_avail = &super->server_meta_avail[server->other_ind]; + server->other_freed = &super->server_meta_freed[server->other_ind]; + + /* use largest meta_alloc to start */ + server->meta_avail = &super->meta_alloc[0]; + server->meta_freed = &super->meta_alloc[1]; + if (le64_to_cpu(server->meta_freed->total_len) > + 
le64_to_cpu(server->meta_avail->total_len)) + swap(server->meta_avail, server->meta_freed); + ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri); if (ret) goto shutdown; @@ -1783,11 +1779,11 @@ int scoutfs_server_setup(struct super_block *sb) init_llist_head(&server->commit_waiters); INIT_WORK(&server->commit_work, scoutfs_server_commit_func); init_rwsem(&server->seq_rwsem); - init_rwsem(&server->alloc_rwsem); INIT_LIST_HEAD(&server->clients); mutex_init(&server->farewell_mutex); INIT_LIST_HEAD(&server->farewell_requests); INIT_WORK(&server->farewell_work, farewell_worker); + mutex_init(&server->alloc_mutex); mutex_init(&server->logs_mutex); mutex_init(&server->srch_mutex); seqcount_init(&server->roots_seqcount); diff --git a/kmod/src/srch.c b/kmod/src/srch.c index db45fba8..2ecae4fb 100644 --- a/kmod/src/srch.c +++ b/kmod/src/srch.c @@ -23,7 +23,7 @@ #include "format.h" #include "counters.h" #include "block.h" -#include "radix.h" +#include "alloc.h" #include "srch.h" #include "btree.h" #include "spbm.h" @@ -309,7 +309,7 @@ enum { GFB_DIRTY = (1 << 1), }; static int get_file_block(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl, int gfb, u64 blk, struct scoutfs_block **bl_ret) @@ -335,7 +335,7 @@ static int get_file_block(struct super_block *sb, goto out; } - ret = scoutfs_radix_alloc(sb, alloc, wri, &blkno); + ret = scoutfs_alloc_meta(sb, alloc, wri, &blkno); if (ret < 0) goto out; @@ -383,7 +383,7 @@ static int get_file_block(struct super_block *sb, /* allocate a new block if we need it */ if (!ref->blkno || ((gfb & GFB_DIRTY) && !scoutfs_block_writer_is_dirty(sb, bl))) { - ret = scoutfs_radix_alloc(sb, alloc, wri, &blkno); + ret = scoutfs_alloc_meta(sb, alloc, wri, &blkno); if (ret < 0) goto out; @@ -395,8 +395,8 @@ static int get_file_block(struct super_block *sb, if (bl) { /* cow old block if we have one */ - ret = 
scoutfs_radix_free(sb, alloc, wri, - bl->blkno); + ret = scoutfs_free_meta(sb, alloc, wri, + bl->blkno); if (ret) goto out; @@ -442,7 +442,7 @@ out: /* return allocated blkno on error */ if (blkno > 0) { - err = scoutfs_radix_free(sb, alloc, wri, blkno); + err = scoutfs_free_meta(sb, alloc, wri, blkno); BUG_ON(err); /* radix should have been dirty */ } @@ -460,7 +460,7 @@ out: } int scoutfs_srch_add(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl, struct scoutfs_block **bl_ret, @@ -988,7 +988,7 @@ out: * it's large enough. */ int scoutfs_srch_rotate_log(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_srch_file *sfl) @@ -1018,13 +1018,13 @@ int scoutfs_srch_rotate_log(struct super_block *sb, * items. */ int scoutfs_srch_get_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_input *scin) { - struct scoutfs_srch_compact_input busy_scin = {{0,}}; + struct scoutfs_srch_compact_input busy_scin = {{{0,}}}; struct scoutfs_srch_file sfl; SCOUTFS_BTREE_ITEM_REF(iref); struct scoutfs_spbm busy; @@ -1147,7 +1147,7 @@ out: * copy. 
*/ int scoutfs_srch_update_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_input *scin) @@ -1160,7 +1160,7 @@ int scoutfs_srch_update_compact(struct super_block *sb, } static int mod_srch_items(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u8 scom_flags, bool ins, struct scoutfs_srch_file *sfls, int nr) @@ -1213,12 +1213,12 @@ static int mod_srch_items(struct super_block *sb, * We give the caller the allocator trees to merge if we return success. */ int scoutfs_srch_commit_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_result *scres, - struct scoutfs_radix_root *av, - struct scoutfs_radix_root *fr) + struct scoutfs_alloc_list_head *av, + struct scoutfs_alloc_list_head *fr) { struct scoutfs_srch_compact_input scin; SCOUTFS_BTREE_ITEM_REF(iref); @@ -1268,11 +1268,11 @@ out: * allocators. Returns -ENOENT when there are no more items. 
*/ int scoutfs_srch_cancel_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, - struct scoutfs_radix_root *av, - struct scoutfs_radix_root *fr) + struct scoutfs_alloc_list_head *av, + struct scoutfs_alloc_list_head *fr) { struct scoutfs_srch_compact_input scin; SCOUTFS_BTREE_ITEM_REF(iref); @@ -1331,7 +1331,7 @@ typedef int (*kway_next_func_t)(struct super_block *sb, struct scoutfs_srch_entry *sre_ret, void *arg); static int kway_merge(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl, kway_next_func_t kway_next, void **args, int nr) @@ -1526,7 +1526,7 @@ static void swap_page_sre(void *A, void *B, int size) * typically, ~10x worst case). */ static int compact_logs(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl_out, struct scoutfs_srch_file *sfls, int nr_sfls) @@ -1715,7 +1715,7 @@ out: * which reads blocks and decodes entries. */ static int compact_sorted(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl_out, struct scoutfs_srch_file *sfls, int nr) @@ -1760,7 +1760,7 @@ out: * up our entire operation, partial state doesn't matter. 
*/ static int free_file(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl) { @@ -1818,7 +1818,7 @@ static int free_file(struct super_block *sb, if (blkno == 0) continue; - ret = scoutfs_radix_free(sb, alloc, wri, blkno); + ret = scoutfs_free_meta(sb, alloc, wri, blkno); if (ret < 0) goto out; scoutfs_inc_counter(sb, srch_compact_free_block); @@ -1830,7 +1830,7 @@ static int free_file(struct super_block *sb, } free_root: - ret = scoutfs_radix_free(sb, alloc, wri, le64_to_cpu(sfl->ref.blkno)); + ret = scoutfs_free_meta(sb, alloc, wri, le64_to_cpu(sfl->ref.blkno)); if (ret < 0) goto out; @@ -1868,7 +1868,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work) struct srch_info *srinf = container_of(work, struct srch_info, compact_dwork.work); struct super_block *sb = srinf->sb; - struct scoutfs_radix_allocator alloc; + struct scoutfs_alloc alloc; struct scoutfs_srch_compact_result scres; struct scoutfs_srch_compact_input scin; struct scoutfs_block_writer wri; @@ -1883,7 +1883,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work) if (ret < 0 || scin.nr == 0) goto out; - scoutfs_radix_init_alloc(&alloc, &scin.meta_avail, &scin.meta_freed); + scoutfs_alloc_init(&alloc, &scin.meta_avail, &scin.meta_freed); if (scin.flags & SCOUTFS_SRCH_COMPACT_FLAG_LOG) ret = compact_logs(sb, &alloc, &wri, &scres.sfl, diff --git a/kmod/src/srch.h b/kmod/src/srch.h index 937692e9..97604bd6 100644 --- a/kmod/src/srch.h +++ b/kmod/src/srch.h @@ -22,7 +22,7 @@ struct scoutfs_srch_rb_node { node = rb_next(node)) int scoutfs_srch_add(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_srch_file *sfl, struct scoutfs_block **bl_ret, @@ -34,34 +34,34 @@ int scoutfs_srch_search_xattrs(struct super_block *sb, u64 hash, u64 ino, u64 last_ino, bool *done); int 
scoutfs_srch_rotate_log(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_srch_file *sfl); int scoutfs_srch_get_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_input *scin_ret); int scoutfs_srch_update_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_input *scin); int scoutfs_srch_commit_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, struct scoutfs_srch_compact_result *scres, - struct scoutfs_radix_root *av, - struct scoutfs_radix_root *fr); + struct scoutfs_alloc_list_head *av, + struct scoutfs_alloc_list_head *fr); int scoutfs_srch_cancel_compact(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, + struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, u64 rid, - struct scoutfs_radix_root *av, - struct scoutfs_radix_root *fr); + struct scoutfs_alloc_list_head *av, + struct scoutfs_alloc_list_head *fr); void scoutfs_srch_destroy(struct super_block *sb); int scoutfs_srch_setup(struct super_block *sb); diff --git a/kmod/src/trans.c b/kmod/src/trans.c index af659bd9..9f36a19d 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -25,7 +25,7 @@ #include "counters.h" #include "client.h" #include "inode.h" -#include "radix.h" +#include "alloc.h" #include "block.h" #include "msg.h" #include "item.h" @@ -66,7 +66,7 @@ struct trans_info { bool writing; struct scoutfs_log_trees lt; - struct scoutfs_radix_allocator alloc; + struct scoutfs_alloc 
alloc; struct scoutfs_block_writer wri; }; @@ -112,8 +112,7 @@ int scoutfs_trans_get_log_trees(struct super_block *sb) ret = scoutfs_client_get_log_trees(sb, &lt); if (ret == 0) { tri->lt = lt; - scoutfs_radix_init_alloc(&tri->alloc, &lt.meta_avail, - &lt.meta_freed); + scoutfs_alloc_init(&tri->alloc, &lt.meta_avail, &lt.meta_freed); scoutfs_block_writer_init(sb, &tri->wri); scoutfs_forest_init_btrees(sb, &tri->alloc, &tri->wri, &lt); @@ -195,6 +194,9 @@ void scoutfs_trans_write_func(struct work_struct *work) /* XXX this all needs serious work for dealing with errors */ ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?: (s = "item dirty", scoutfs_item_write_dirty(sb)) ?: + (s = "data prepare", scoutfs_data_prepare_commit(sb)) ?: + (s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, + &tri->alloc, &tri->wri)) ?: (s = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?: (s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?: (s = "commit log trees", commit_btrees(sb)) ?: @@ -369,7 +371,13 @@ static bool acquired_hold(struct super_block *sb, /* XXX arbitrarily limit to 8 meg transactions */ if (scoutfs_item_dirty_bytes(sb) >= (8 * 1024 * 1024)) { - scoutfs_inc_counter(sb, trans_commit_full); + scoutfs_inc_counter(sb, trans_commit_dirty_meta_full); + queue_trans_work(sbi); + goto out; + } + + if (scoutfs_alloc_meta_lo_thresh(sb, &tri->alloc)) { + scoutfs_inc_counter(sb, trans_commit_meta_alloc_low); queue_trans_work(sbi); goto out; }