From 1fde47170ba1fe69143d1af40fe2fbf0e3159d7b Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Tue, 2 Aug 2016 13:26:52 -0700
Subject: [PATCH] scoutfs: simplify btree block format

Now that we are using fixed smaller blocks we can make the btree format
significantly simpler.  The fixed small block size limits the number of
items that can be stored in each block, so we can use a simple sorted
array of item offsets to maintain the item sort order instead of the
treap.

Getting rid of the treap not only removes a bunch of code, it makes
tasks like verifying or repairing a btree block a lot simpler.

The main impact on the code is that an item no longer records its own
position in the sort order.  Callers that work with items in sorted
order now need to track an item's sorted position instead of just a
pointer to the item.

Signed-off-by: Zach Brown
---
 kmod/src/Makefile |   2 +-
 kmod/src/btree.c  | 625 ++++++++++++++++++++++++++--------------------
 kmod/src/btree.h  |   4 +-
 kmod/src/format.h |  29 +--
 kmod/src/treap.c  | 386 ----------------------------
 kmod/src/treap.h  |  38 ---
 6 files changed, 375 insertions(+), 709 deletions(-)
 delete mode 100644 kmod/src/treap.c
 delete mode 100644 kmod/src/treap.h

diff --git a/kmod/src/Makefile b/kmod/src/Makefile
index 2e484055..143929b7 100644
--- a/kmod/src/Makefile
+++ b/kmod/src/Makefile
@@ -4,4 +4,4 @@ CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
 
 scoutfs-y += block.o btree.o buddy.o counters.o crc.o dir.o filerw.o \
 	     inode.o ioctl.o msg.o name.o scoutfs_trace.o super.o trans.o \
-	     treap.o xattr.o
+	     xattr.o
diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index df0b75e3..57e9ffe9 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -13,12 +13,12 @@
 #include
 #include
 #include
+#include <linux/sort.h>
 
 #include "super.h"
 #include "format.h"
 #include "block.h"
 #include "key.h"
-#include "treap.h"
 #include "btree.h"
 
 /*
@@ -26,19 +26,14 @@
  *	sized keys and variable length values.
  *
  * Items are stored as a small header with the key followed by the
- * value.  New items are appended to the end of the block.  Free space
- * is not indexed.  Deleted items can be reclaimed by walking all the
- * items from the front of the block and moving later live items onto
- * earlier deleted items.
+ * value.  New items are allocated from the back of the block towards
+ * the front.  Deleted items can be reclaimed by packing items towards
+ * the back of the block by walking them in reverse offset order.
  *
- * The items are kept in a treap sorted by their keys.  Using a dynamic
- * structure keeps the modification costs low.  Modifying persistent
- * structures avoids translation to and from run-time structures around
- * read and write.  The treap was chosen because it's very simple to
- * implement and has some cool merging and splitting functions that we
- * could make use of.  The treap has parent pointers so that we can
- * perform operations relative to a node without having to keep a record
- * of the path down the tree.
+ * A dense array of item offsets after the btree block header
+ * maintains the sorted order of the items by their keys.  The array
+ * is small enough that the memmoves to keep it dense involve a few
+ * cache lines at most.
 *
 * Parent blocks in the btree have the same format as leaf blocks.
 * There's one key for every child reference instead of having separator
 *
@@ -61,142 +56,161 @@
 * XXX
 *  - do we want a level in the btree header?  seems like we would?
 *  - validate structures on read?
+ *  - internal bh/pos/cmp interface is clumsy.. could use cursor
 */
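
To make the new layout concrete, here is a small userspace sketch of the
bookkeeping the comment above describes.  The struct and sizes are
illustrative stand-ins (a 4KB block and a pretend header), not the
format.h definitions further down:

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>

  #define BLOCK_SIZE 4096

  /* stand-in for scoutfs_btree_block; the sizes here are made up */
  struct blk {
  	uint8_t hdr[40];	/* pretend block header */
  	uint16_t free_end;	/* offset of the lowest allocated item */
  	uint16_t free_reclaim;	/* deleted bytes mixed in with the items */
  	uint8_t nr_items;
  	uint16_t item_offs[];	/* kept sorted by the items' keys */
  } __attribute__((packed));

  /* contiguous free bytes between the offset array and the items */
  static unsigned int blk_contig_free(struct blk *b)
  {
  	return b->free_end -
  	       (offsetof(struct blk, item_offs) + b->nr_items * 2);
  }

  int main(void)
  {
  	uint8_t buf[BLOCK_SIZE] = {0};
  	struct blk *b = (void *)buf;
  	unsigned int before;

  	b->free_end = BLOCK_SIZE;	/* empty: items grow down from the end */
  	before = blk_contig_free(b);

  	/* an insertion costs its item bytes plus one offset slot */
  	b->free_end -= 100;
  	b->item_offs[b->nr_items++] = b->free_end;

  	assert(blk_contig_free(b) == before - 100 - 2);
  	return 0;
  }

The offset array grows up from the header while items grow down from
the end, so both allocations come out of the same contiguous gap in the
middle of the block; that is what contig_free() below measures.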
 
-/* size of the item with a value of the given length */
+/* number of contiguous bytes used by the item header and val of given len */
 static inline unsigned int val_bytes(unsigned int val_len)
 {
 	return sizeof(struct scoutfs_btree_item) + val_len;
 }
 
+/* number of contiguous bytes used by the item header and its current value */
 static inline unsigned int item_bytes(struct scoutfs_btree_item *item)
 {
 	return val_bytes(le16_to_cpu(item->val_len));
 }
 
+/* total bytes consumed by an item with given val len: offset, header, value */
+static inline unsigned int all_val_bytes(unsigned int val_len)
+{
+	return sizeof(((struct scoutfs_btree_block *)NULL)->item_offs[0]) +
+	       val_bytes(val_len);
+}
+
+/* total bytes consumed by an item with its current value */
+static inline unsigned int all_item_bytes(struct scoutfs_btree_item *item)
+{
+	return all_val_bytes(le16_to_cpu(item->val_len));
+}
+
+/* number of contig free bytes between the item offsets and the first item */
+static inline unsigned int contig_free(struct scoutfs_btree_block *bt)
+{
+	return le16_to_cpu(bt->free_end) -
+	       offsetof(struct scoutfs_btree_block, item_offs[bt->nr_items]);
+}
+
+/* number of contig bytes free after reclaiming free amongst items */
+static inline unsigned int reclaimable_free(struct scoutfs_btree_block *bt)
+{
+	return contig_free(bt) + le16_to_cpu(bt->free_reclaim);
+}
+
+/* all bytes used by item offsets, headers, and values */
 static inline unsigned int used_total(struct scoutfs_btree_block *bt)
 {
 	return SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block) -
-	       le16_to_cpu(bt->total_free);
+	       reclaimable_free(bt);
 }
 
-static int cmp_tnode_items(struct scoutfs_treap_node *A,
-			   struct scoutfs_treap_node *B)
+static inline struct scoutfs_btree_item *
+off_item(struct scoutfs_btree_block *bt, __le16 off)
 {
-	struct scoutfs_btree_item *a;
-	struct scoutfs_btree_item *b;
-
-	a = container_of(A, struct scoutfs_btree_item, tnode);
-	b = container_of(B, struct scoutfs_btree_item, tnode);
-
-	return scoutfs_key_cmp(&a->key, &b->key);
+	return (void *)bt + le16_to_cpu(off);
 }
 
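All of the ordering lives in that offset array; the items themselves
sit wherever they were allocated.  A key lookup is therefore a plain
binary search through one level of indirection, roughly like this
userspace sketch with simplified integer keys (illustrative types, not
the scoutfs structures):

  #include <stdint.h>

  struct item {
  	uint64_t key;
  	/* seq, val_len and the value follow in the real format */
  };

  /* binary search the key-sorted offsets; items are in allocation order */
  static int find_item(uint8_t *block, uint16_t *item_offs, int nr,
  		     uint64_t key)
  {
  	int lo = 0, hi = nr;

  	while (lo < hi) {
  		int mid = lo + (hi - lo) / 2;
  		struct item *it = (struct item *)(block + item_offs[mid]);

  		if (key < it->key)
  			hi = mid;
  		else if (key > it->key)
  			lo = mid + 1;
  		else
  			return mid;	/* the item's sorted position */
  	}
  	return -1;
  }

find_pos() below is the kernel version of the same search, except that
it always returns a position and reports the final comparison so that
callers can also use it to find an insertion point.
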
-/* A bunch of wrappers for navigating items through treap nodes. */
-
-#define BT_TREAP_KEY_WRAPPER(which)					      \
-static struct scoutfs_btree_item *bt_##which(struct scoutfs_btree_block *bt, \
-					     struct scoutfs_key *key)	      \
-{									      \
-	struct scoutfs_btree_item dummy = { .key = *key };		      \
-	struct scoutfs_treap_node *node;				      \
-									      \
-	node = scoutfs_treap_##which(&bt->treap, cmp_tnode_items,	      \
-				     &dummy.tnode);			      \
-	if (!node)							      \
-		return NULL;						      \
-									      \
-	return container_of(node, struct scoutfs_btree_item, tnode);	      \
-}
-
-BT_TREAP_KEY_WRAPPER(lookup)
-/* BT_TREAP_KEY_WRAPPER(before) */
-BT_TREAP_KEY_WRAPPER(after)
-
-#define BT_TREAP_ROOT_WRAPPER(which)					      \
-static struct scoutfs_btree_item *bt_##which(struct scoutfs_btree_block *bt) \
-{									      \
-	struct scoutfs_treap_node *node;				      \
-									      \
-	node = scoutfs_treap_##which(&bt->treap);			      \
-	if (!node)							      \
-		return NULL;						      \
-									      \
-	return container_of(node, struct scoutfs_btree_item, tnode);	      \
-}
-
-BT_TREAP_ROOT_WRAPPER(first)
-BT_TREAP_ROOT_WRAPPER(last)
-
-#define BT_TREAP_NODE_WRAPPER(which)					      \
-static struct scoutfs_btree_item *bt_##which(struct scoutfs_btree_block *bt, \
-					     struct scoutfs_btree_item *item)\
-{									      \
-	struct scoutfs_treap_node *node;				      \
-									      \
-	node = scoutfs_treap_##which(&bt->treap, &item->tnode);		      \
-	if (!node)							      \
-		return NULL;						      \
-									      \
-	return container_of(node, struct scoutfs_btree_item, tnode);	      \
-}
-
-BT_TREAP_NODE_WRAPPER(next)
-BT_TREAP_NODE_WRAPPER(prev)
-
-static inline struct scoutfs_key *least_key(struct scoutfs_btree_block *bt)
+static inline struct scoutfs_btree_item *
+pos_item(struct scoutfs_btree_block *bt, unsigned int pos)
 {
-	return &bt_first(bt)->key;
+	return off_item(bt, bt->item_offs[pos]);
 }
 
 static inline struct scoutfs_key *greatest_key(struct scoutfs_btree_block *bt)
 {
-	return &bt_last(bt)->key;
+	return &pos_item(bt, bt->nr_items - 1)->key;
 }
 
 /*
- * Allocate and insert a new item into the block.
+ * Returns the sorted item position that an item with the given key
+ * should occupy.
  *
- * The caller has made sure that there's room for everything.
+ * It sets *cmp to the final comparison of the given key and the
+ * position's item key.
  *
- * The caller is responsible for initializing the value.
+ * If the given key is greater than all the items' keys then the number
+ * of items can be returned.  Callers need to be careful to test for
+ * this invalid position.
+ */
+static int find_pos(struct scoutfs_btree_block *bt, struct scoutfs_key *key,
+		    int *cmp)
+{
+	unsigned int start = 0;
+	unsigned int end = bt->nr_items;
+	unsigned int pos = 0;
+
+	*cmp = -1;
+
+	while (start < end) {
+		pos = start + (end - start) / 2;
+
+		*cmp = scoutfs_key_cmp(key, &pos_item(bt, pos)->key);
+		if (*cmp < 0) {
+			end = pos;
+		} else if (*cmp > 0) {
+			start = ++pos;
+			*cmp = -1;
+		} else {
+			break;
+		}
+	}
+
+	return pos;
+}
+
+/* move a number of contiguous elements from the src index to the dst index */
+#define memmove_arr(arr, dst, src, nr) \
+	memmove(&(arr)[dst], &(arr)[src], (nr) * sizeof(*(arr)))
+
+/*
+ * Allocate and insert a new item into the block.  The caller has made
+ * sure that there's room for everything.  The caller is responsible for
+ * initializing the value.
*/ static struct scoutfs_btree_item *create_item(struct scoutfs_btree_block *bt, + unsigned int pos, struct scoutfs_key *key, unsigned int val_len) { - unsigned int bytes = val_bytes(val_len); struct scoutfs_btree_item *item; - item = (void *)((char *)bt + SCOUTFS_BLOCK_SIZE - - le16_to_cpu(bt->tail_free)); - le16_add_cpu(&bt->tail_free, -bytes); - le16_add_cpu(&bt->total_free, -bytes); - le16_add_cpu(&bt->nr_items, 1); + if (pos < bt->nr_items) + memmove_arr(bt->item_offs, pos + 1, pos, bt->nr_items - pos); + le16_add_cpu(&bt->free_end, -val_bytes(val_len)); + bt->item_offs[pos] = bt->free_end; + bt->nr_items++; + + item = pos_item(bt, pos); item->key = *key; item->seq = bt->hdr.seq; item->val_len = cpu_to_le16(val_len); - scoutfs_treap_insert(&bt->treap, cmp_tnode_items, &item->tnode); + trace_printk("pos %u off %u\n", pos, le16_to_cpu(bt->item_offs[pos])); return item; } -#define MAGIC_DELETED_PARENT cpu_to_le16(1) - /* - * Delete an item from a btree block. We set the deleted item's parent - * treap offset to a magic value for compaction. + * Delete an item from a btree block. We record the amount of space it + * frees to later decide if we can satisfy an insertion by compaction + * instead of splitting. */ -static void delete_item(struct scoutfs_btree_block *bt, - struct scoutfs_btree_item *item) +static void delete_item(struct scoutfs_btree_block *bt, unsigned int pos) { - scoutfs_treap_delete(&bt->treap, &item->tnode); - item->tnode.parent = MAGIC_DELETED_PARENT; + struct scoutfs_btree_item *item = pos_item(bt, pos); - le16_add_cpu(&bt->total_free, item_bytes(item)); - le16_add_cpu(&bt->nr_items, -1); + trace_printk("pos %u off %u\n", pos, le16_to_cpu(bt->item_offs[pos])); + + if (pos < (bt->nr_items - 1)) + memmove_arr(bt->item_offs, pos, pos + 1, + bt->nr_items - 1 - pos); + + le16_add_cpu(&bt->free_reclaim, item_bytes(item)); + bt->nr_items--; + + /* wipe deleted items to avoid leaking data */ + memset(item, 0, item_bytes(item)); } /* @@ -204,40 +218,71 @@ static void delete_item(struct scoutfs_btree_block *bt, * tells us if we're moving from the tail of the source block right to * the head of the destination block, or vice versa. We stop moving * once we've moved enough bytes of items. - * - * XXX This could use fancy treap splitting and merging. We don't need - * to go there yet. 
 */
 static void move_items(struct scoutfs_btree_block *dst,
 		       struct scoutfs_btree_block *src, bool move_right,
 		       int to_move)
 {
 	struct scoutfs_btree_item *from;
-	struct scoutfs_btree_item *del;
 	struct scoutfs_btree_item *to;
-	unsigned int val_len;
+	unsigned int t;
+	unsigned int f;
 
-	if (move_right)
-		from = bt_last(src);
-	else
-		from = bt_first(src);
-
-	while (from && to_move > 0) {
-		val_len = le16_to_cpu(from->val_len);
-
-		to = create_item(dst, &from->key, val_len);
-		memcpy(to->val, from->val, val_len);
-		to->seq = from->seq;
-
-		del = from;
-		if (move_right)
-			from = bt_prev(src, from);
-		else
-			from = bt_next(src, from);
-
-		delete_item(src, del);
-		to_move -= item_bytes(to);
+	if (move_right) {
+		f = src->nr_items - 1;
+		t = 0;
+	} else {
+		f = 0;
+		t = dst->nr_items;
 	}
+
+	while (f < src->nr_items && to_move > 0) {
+		from = pos_item(src, f);
+
+		to = create_item(dst, t, &from->key,
+				 le16_to_cpu(from->val_len));
+
+		memcpy(to, from, item_bytes(from));
+		to_move -= all_item_bytes(from);
+
+		delete_item(src, f);
+		if (move_right)
+			f--;
+		else
+			t++;
+	}
+}
+
+static struct scoutfs_btree_block *aligned_bt(const void *ptr)
+{
+	unsigned long addr = (unsigned long)ptr;
+
+	return (void *)(addr & ~((unsigned long)SCOUTFS_BLOCK_MASK));
+}
+
+static int sort_key_cmp(const void *A, const void *B)
+{
+	struct scoutfs_btree_block *bt = aligned_bt(A);
+	const __le16 * __packed a = A;
+	const __le16 * __packed b = B;
+
+	return scoutfs_key_cmp(&off_item(bt, *a)->key, &off_item(bt, *b)->key);
+}
+
+static int sort_off_cmp(const void *A, const void *B)
+{
+	const __le16 * __packed a = A;
+	const __le16 * __packed b = B;
+
+	return (int)le16_to_cpu(*a) - (int)le16_to_cpu(*b);
+}
+
+static void sort_off_swap(void *A, void *B, int size)
+{
+	__le16 * __packed a = A;
+	__le16 * __packed b = B;
+
+	swap(*a, *b);
 }
 
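The sort callbacks above only ever see pointers to individual __le16
elements, which is why sort_key_cmp() masks its element pointer with
aligned_bt() to recover the containing block.  The compaction sequence
they support can be sketched in userspace, with qsort() standing in for
the kernel's sort() and a toy item format rather than scoutfs's:

  #include <stdint.h>
  #include <stdlib.h>
  #include <string.h>

  #define BLOCK_SIZE 4096

  struct item {			/* toy item: header then len value bytes */
  	uint32_t key;
  	uint16_t len;
  } __attribute__((packed));

  static uint8_t blk[BLOCK_SIZE];
  static uint16_t item_offs[64];	/* key sorted, as in the btree block */
  static int nr_items;

  static struct item *at(uint16_t off)
  {
  	return (struct item *)(blk + off);
  }

  static int off_cmp(const void *a, const void *b)
  {
  	return *(const uint16_t *)a - *(const uint16_t *)b;
  }

  static int key_cmp(const void *a, const void *b)
  {
  	uint32_t ka = at(*(const uint16_t *)a)->key;
  	uint32_t kb = at(*(const uint16_t *)b)->key;

  	return ka < kb ? -1 : ka > kb;
  }

  static void compact(void)
  {
  	uint16_t end = BLOCK_SIZE;
  	int i;

  	/* 1) offset order lets us walk items from the highest down */
  	qsort(item_offs, nr_items, sizeof(item_offs[0]), off_cmp);

  	/* 2) repack each item against the end of the block */
  	for (i = nr_items - 1; i >= 0; i--) {
  		uint16_t bytes = sizeof(struct item) + at(item_offs[i])->len;

  		end -= bytes;
  		memmove(blk + end, blk + item_offs[i], bytes);
  		item_offs[i] = end;
  	}

  	/* 3) restore the key order that lookups binary search */
  	qsort(item_offs, nr_items, sizeof(item_offs[0]), key_cmp);
  }

  int main(void)
  {
  	/* two live items with a deleted hole between them */
  	at(3000)->key = 1; at(3000)->len = 20;
  	at(4000)->key = 2; at(4000)->len = 10;
  	item_offs[0] = 3000;
  	item_offs[1] = 4000;
  	nr_items = 2;

  	compact();
  	return 0;
  }

Walking from the highest offset down means each memmove() destination
is at or above its source, so an item is never overwritten before it
has been moved.
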
 /*
@@ -248,14 +293,20 @@ static void move_items(struct scoutfs_btree_block *dst,
  * items.
  *
  * We don't bother implementing free space indexing and addressing that
- * corner case.  Instead we track the number of total free bytes in the
- * block.  If free space needed is available in the block but is not
- * available at the end of the block then we reclaim the fragmented free
- * space by compacting the items.
+ * corner case.  Instead we track the number of bytes that could be
+ * reclaimed if we compacted the items after the free_end offset.  If
+ * this additional free space would satisfy an insertion then we
+ * compact the items instead of splitting the block.
  *
- * We move the free space to the tail of the block by walk forward
- * through the items in allocated order moving live items back in to
- * free space.
+ * We move the free space to the center of the block by walking
+ * backwards through the items in offset order, moving items into free
+ * space between items towards the end of the block.
+ *
+ * We don't have specific metadata to either walk the items in offset
+ * order or to update the item offsets as we move items.  We sort the
+ * item offset array to achieve both ends.  First we sort it by offset
+ * so we can walk in reverse order.  As we move items we update their
+ * offsets and then sort by keys once we're done.
 *
 * Compaction is only attempted during descent as we find a block that
 * needs more or less free space.  The caller has the parent locked for
@@ -264,30 +315,49 @@ static void move_items(struct scoutfs_btree_block *dst,
 */
 static void compact_items(struct scoutfs_btree_block *bt)
 {
-	struct scoutfs_btree_item *from = (void *)(bt + 1);
-	struct scoutfs_btree_item *to = from;
+	struct scoutfs_btree_item *from;
+	struct scoutfs_btree_item *to;
 	unsigned int bytes;
-	unsigned int i;
+	__le16 end;
+	int i;
+
+	trace_printk("free_reclaim %u\n", le16_to_cpu(bt->free_reclaim));
+
+	sort(bt->item_offs, bt->nr_items, sizeof(bt->item_offs[0]),
+	     sort_off_cmp, sort_off_swap);
+
+	end = cpu_to_le16(SCOUTFS_BLOCK_SIZE);
+
+	for (i = bt->nr_items - 1; i >= 0; i--) {
+		from = pos_item(bt, i);
 
-	for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
 		bytes = item_bytes(from);
+		le16_add_cpu(&end, -bytes);
+		to = off_item(bt, end);
+		bt->item_offs[i] = end;
 
-		if (from->tnode.parent != MAGIC_DELETED_PARENT) {
-			if (from != to) {
-				memmove(to, from, bytes);
-				scoutfs_treap_move(&bt->treap, &from->tnode,
-						   &to->tnode);
-			}
-			to = (void *)to + bytes;
-		} else {
-			i--;
-		}
-
-		from = (void *)from + bytes;
+		if (from != to)
+			memmove(to, from, bytes);
 	}
 
-	bytes = SCOUTFS_BLOCK_SIZE - ((char *)to - (char *)bt);
-	bt->tail_free = cpu_to_le16(bytes);
+	bt->free_end = end;
+	bt->free_reclaim = 0;
+
+	sort(bt->item_offs, bt->nr_items, sizeof(bt->item_offs[0]),
+	     sort_key_cmp, sort_off_swap);
+}
+
+/* sorting relies on masking pointers to find the containing block */
+static inline struct buffer_head *check_bh_alignment(struct buffer_head *bh)
+{
+	struct scoutfs_btree_block *bt;
+
+	if (!IS_ERR_OR_NULL(bh)) {
+		bt = bh_data(bh);
+		if (WARN_ON_ONCE(aligned_bt(bt) != bt)) {
+			scoutfs_block_put(bh);
+			return ERR_PTR(-EIO);
+		}
+	}
+
+	return bh;
 }
 
 /*
@@ -303,14 +373,12 @@ static struct buffer_head *alloc_tree_block(struct super_block *sb)
 
 	if (!IS_ERR(bh)) {
 		bt = bh_data(bh);
 
-		bt->treap.off = 0;
-		bt->total_free = cpu_to_le16(SCOUTFS_BLOCK_SIZE -
-					sizeof(struct scoutfs_btree_block));
-		bt->tail_free = bt->total_free;
+		bt->free_end = cpu_to_le16(SCOUTFS_BLOCK_SIZE);
+		bt->free_reclaim = 0;
 		bt->nr_items = 0;
 	}
 
-	return bh;
+	return check_bh_alignment(bh);
 }
 
 /* the caller has ensured that the free must succeed */
@@ -342,11 +410,26 @@ static struct buffer_head *grow_tree(struct super_block *sb,
 	return bh;
 }
 
+static struct buffer_head *get_block_ref(struct super_block *sb,
+					 struct scoutfs_block_ref *ref,
+					 bool dirty)
+{
+	struct buffer_head *bh;
+
+	if (dirty)
+		bh = scoutfs_block_dirty_ref(sb, ref);
+	else
+		bh = scoutfs_block_read_ref(sb, ref);
+
+	return check_bh_alignment(bh);
+}
+
 /*
 * Create a new item in the parent which references the child.  The caller
 * specifies the key in the item that describes the items in the child.
*/ static void create_parent_item(struct scoutfs_btree_block *parent, + unsigned int pos, struct scoutfs_btree_block *child, struct scoutfs_key *key) { @@ -356,7 +439,7 @@ static void create_parent_item(struct scoutfs_btree_block *parent, .seq = child->hdr.seq, }; - item = create_item(parent, key, sizeof(ref)); + item = create_item(parent, pos, key, sizeof(ref)); memcpy(&item->val, &ref, sizeof(ref)); } @@ -385,23 +468,24 @@ static struct buffer_head *try_split(struct super_block *sb, int level, struct scoutfs_key *key, unsigned int val_len, struct scoutfs_btree_block *parent, - struct scoutfs_btree_item *par_item, + unsigned int parent_pos, struct buffer_head *right_bh) { struct scoutfs_btree_block *right = bh_data(right_bh); struct scoutfs_btree_block *left; struct buffer_head *left_bh; struct buffer_head *par_bh = NULL; - unsigned int bytes; + struct scoutfs_key maximal; + unsigned int all_bytes; if (level) val_len = sizeof(struct scoutfs_block_ref); - bytes = val_bytes(val_len); + all_bytes = all_val_bytes(val_len); - if (le16_to_cpu(right->tail_free) >= bytes) + if (contig_free(right) >= all_bytes) return right_bh; - if (le16_to_cpu(right->total_free) >= bytes) { + if (reclaimable_free(right) >= all_bytes) { compact_items(right); return right_bh; } @@ -424,27 +508,25 @@ static struct buffer_head *try_split(struct super_block *sb, } parent = bh_data(par_bh); - } + parent_pos = 0; - /* only grow the tree once we have the split neighbour */ - if (par_bh) { - struct scoutfs_key maximal; scoutfs_set_max_key(&maximal); - create_parent_item(parent, right, &maximal); + create_parent_item(parent, parent_pos, right, &maximal); } move_items(left, right, false, used_total(right) / 2); - create_parent_item(parent, left, greatest_key(left)); + create_parent_item(parent, parent_pos, left, greatest_key(left)); + parent_pos++; /* not that anything uses it again :P */ if (scoutfs_key_cmp(key, greatest_key(left)) <= 0) { /* insertion will go to the new left block */ scoutfs_block_put(right_bh); right_bh = left_bh; } else { - /* insertion will still go through us, might need to compact */ scoutfs_block_put(left_bh); - if (le16_to_cpu(right->tail_free) < bytes) + /* insertion will still go through us, might need to compact */ + if (contig_free(right) < all_bytes) compact_items(right); } @@ -477,29 +559,31 @@ static struct buffer_head *try_split(struct super_block *sb, static struct buffer_head *try_merge(struct super_block *sb, struct scoutfs_btree_root *root, struct scoutfs_btree_block *parent, - struct scoutfs_btree_item *par_item, + unsigned int pos, struct buffer_head *bh) { struct scoutfs_btree_block *bt = bh_data(bh); + struct scoutfs_btree_item *sib_item; struct scoutfs_btree_block *sib_bt; struct buffer_head *sib_bh; - struct scoutfs_btree_item *sib_item; - int to_move; + unsigned int sib_pos; bool move_right; + int to_move; - if (le16_to_cpu(bt->total_free) <= SCOUTFS_BTREE_FREE_LIMIT) + if (reclaimable_free(bt) <= SCOUTFS_BTREE_FREE_LIMIT) return bh; /* move items right into our block if we have a left sibling */ - sib_item = bt_prev(parent, par_item); - if (sib_item) { - move_right = false; - } else { - sib_item = bt_next(parent, par_item); + if (pos) { + sib_pos = pos - 1; move_right = true; + } else { + sib_pos = pos + 1; + move_right = false; } + sib_item = pos_item(parent, sib_pos); - sib_bh = scoutfs_block_dirty_ref(sb, (void *)sib_item->val); + sib_bh = get_block_ref(sb, (void *)sib_item->val, true); if (IS_ERR(sib_bh)) { /* XXX do we need to unlock this? 
don't think so */ scoutfs_block_put(bh); @@ -507,31 +591,33 @@ static struct buffer_head *try_merge(struct super_block *sb, } sib_bt = bh_data(sib_bh); - if (used_total(sib_bt) <= le16_to_cpu(bt->total_free)) + if (used_total(sib_bt) <= reclaimable_free(bt)) to_move = used_total(sib_bt); else - to_move = le16_to_cpu(bt->total_free) - - SCOUTFS_BTREE_FREE_LIMIT; + to_move = reclaimable_free(bt) - SCOUTFS_BTREE_FREE_LIMIT; - if (le16_to_cpu(bt->tail_free) < to_move) + if (contig_free(bt) < to_move) compact_items(bt); + trace_printk("sib_pos %d move_right %u to_move %u\n", + sib_pos, move_right, to_move); + move_items(bt, sib_bt, move_right, to_move); /* update our parent's ref if we changed our greatest key */ if (!move_right) - par_item->key = *greatest_key(bt); + pos_item(parent, pos)->key = *greatest_key(bt); /* delete an empty sib or update if we changed its greatest key */ if (sib_bt->nr_items == 0) { - delete_item(parent, sib_item); + delete_item(parent, sib_pos); free_tree_block(sb, sib_bt->hdr.blkno); } else if (move_right) { sib_item->key = *greatest_key(sib_bt); } /* and finally shrink the tree if our parent is the root with 1 */ - if (le16_to_cpu(parent->nr_items) == 1) { + if (parent->nr_items == 1) { root->height--; root->ref.blkno = bt->hdr.blkno; root->ref.seq = bt->hdr.seq; @@ -599,44 +685,50 @@ static u64 item_block_ref_seq(struct scoutfs_btree_item *item) * number. If it's a parent then we test the block ref's seq, if it's a * leaf item then we check the item's seq. */ -static int item_skip_seq(struct scoutfs_btree_item *item, +static bool skip_pos_seq(struct scoutfs_btree_block *bt, unsigned int pos, int level, u64 seq, int op) { - return op == WALK_NEXT_SEQ && item && - ((level > 0 && item_block_ref_seq(item) < seq) || + struct scoutfs_btree_item *item; + + if (op != WALK_NEXT_SEQ || pos >= bt->nr_items) + return false; + + item = pos_item(bt, pos); + + return ((level > 0 && item_block_ref_seq(item) < seq) || (level == 0 && le64_to_cpu(item->seq) < seq)); } /* - * Return the next item, possibly skipping those with sequence numbers - * less than the desired sequence number. + * Return the next sorted item position, possibly skipping those with + * sequence numbers less than the desired sequence number. */ -static struct scoutfs_btree_item * -item_next_seq(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item, - int level, u64 seq, int op) +static unsigned int next_pos_seq(struct scoutfs_btree_block *bt, + unsigned int pos, int level, u64 seq, int op) { do { - item = bt_next(bt, item); - } while (item_skip_seq(item, level, seq, op)); + pos++; + } while (skip_pos_seq(bt, pos, level, seq, op)); - return item; + return pos; } /* * Return the first item after the given key, possibly skipping those * with sequence numbers less than the desired sequence number. 
 */
-static struct scoutfs_btree_item *
-item_after_seq(struct scoutfs_btree_block *bt, struct scoutfs_key *key,
-	       int level, u64 seq, int op)
+static unsigned int find_pos_after_seq(struct scoutfs_btree_block *bt,
+				       struct scoutfs_key *key, int level,
+				       u64 seq, int op)
 {
-	struct scoutfs_btree_item *item;
+	unsigned int pos;
+	int cmp;
 
-	item = bt_after(bt, key);
-	if (item_skip_seq(item, level, seq, op))
-		item = item_next_seq(bt, item, level, seq, op);
+	pos = find_pos(bt, key, &cmp);
+	if (skip_pos_seq(bt, pos, level, seq, op))
+		pos = next_pos_seq(bt, pos, level, seq, op);
 
-	return item;
+	return pos;
 }
 
 /*
@@ -662,6 +754,7 @@ static struct buffer_head *btree_walk(struct super_block *sb,
 	struct scoutfs_btree_item *item = NULL;
 	struct scoutfs_block_ref *ref;
 	unsigned int level;
+	unsigned int pos = 0;
 	const bool dirty = op == WALK_INSERT || op == WALK_DELETE ||
 			   op == WALK_DIRTY;
 
@@ -697,29 +790,15 @@ static struct buffer_head *btree_walk(struct super_block *sb,
 
 	while (level--) {
 		/* XXX hmm, need to think about retry */
-		if (dirty) {
-			bh = scoutfs_block_dirty_ref(sb, ref);
-		} else {
-			bh = scoutfs_block_read_ref(sb, ref);
-		}
+		bh = get_block_ref(sb, ref, dirty);
 		if (IS_ERR(bh))
 			break;
 
-		/*
-		 * Update the next key an iterator should read from.
-		 * Keep in mind that iteration is read only so the
-		 * parent item won't be changed splitting or merging.
-		 */
-		if (parent && next_key) {
-			*next_key = item->key;
-			scoutfs_inc_key(next_key);
-		}
-
 		if (op == WALK_INSERT)
 			bh = try_split(sb, root, level, key, val_len, parent,
-				       item, bh);
+				       pos, bh);
 		if ((op == WALK_DELETE) && parent)
-			bh = try_merge(sb, root, parent, item, bh);
+			bh = try_merge(sb, root, parent, pos, bh);
 		if (IS_ERR(bh))
 			break;
 
@@ -740,18 +819,29 @@ static struct buffer_head *btree_walk(struct super_block *sb,
 		 * seqs then we might not have any child items to
 		 * search.
 		 */
-		item = item_after_seq(parent, key, level, seq, op);
-		if (!item) {
+		pos = find_pos_after_seq(parent, key, level, seq, op);
+		if (pos >= parent->nr_items) {
 			/* current block dropped as parent below */
-			if (op == WALK_NEXT_SEQ) {
+			if (op == WALK_NEXT_SEQ)
 				bh = ERR_PTR(-ENOENT);
-			} else {
+			else
 				bh = ERR_PTR(-EIO);
-			}
 			break;
 		}
 
 		/* XXX verify sane length */
+		item = pos_item(parent, pos);
 		ref = (void *)item->val;
+
+		/*
+		 * Update the next key an iterator should read from.
+		 * Keep in mind that iteration is read only so the
+		 * parent item won't be changed splitting or merging.
+ */ + if (next_key) { + *next_key = item->key; + scoutfs_inc_key(next_key); + } } unlock_block(sbi, par_bh, dirty); @@ -761,16 +851,19 @@ static struct buffer_head *btree_walk(struct super_block *sb, } static void set_cursor(struct scoutfs_btree_cursor *curs, - struct buffer_head *bh, - struct scoutfs_btree_item *item, bool write) + struct buffer_head *bh, unsigned int pos, bool write) { + struct scoutfs_btree_block *bt = bh_data(bh); + struct scoutfs_btree_item *item = pos_item(bt, pos); + curs->bh = bh; - curs->item = item; + curs->pos = pos; + curs->write = write; + curs->key = &item->key; curs->seq = le64_to_cpu(item->seq); curs->val = item->val; curs->val_len = le16_to_cpu(item->val_len); - curs->write = !!write; } /* @@ -780,9 +873,10 @@ static void set_cursor(struct scoutfs_btree_cursor *curs, int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_btree_cursor *curs) { - struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct buffer_head *bh; + unsigned int pos; + int cmp; int ret; BUG_ON(curs->bh); @@ -790,11 +884,11 @@ int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_key *key, bh = btree_walk(sb, key, NULL, 0, 0, 0); if (IS_ERR(bh)) return PTR_ERR(bh); - bt = bh_data(bh); - item = bt_lookup(bt, key); - if (item) { - set_cursor(curs, bh, item, false); + + pos = find_pos(bt, key, &cmp); + if (cmp == 0) { + set_cursor(curs, bh, pos, false); ret = 0; } else { unlock_block(NULL, bh, false); @@ -817,9 +911,10 @@ int scoutfs_btree_insert(struct super_block *sb, struct scoutfs_key *key, unsigned int val_len, struct scoutfs_btree_cursor *curs) { - struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct buffer_head *bh; + int pos; + int cmp; int ret; BUG_ON(curs->bh); @@ -829,16 +924,15 @@ int scoutfs_btree_insert(struct super_block *sb, struct scoutfs_key *key, return PTR_ERR(bh); bt = bh_data(bh); - /* XXX should this return -eexist? */ - item = bt_lookup(bt, key); - if (!item) { - item = create_item(bt, key, val_len); - set_cursor(curs, bh, item, true); + pos = find_pos(bt, key, &cmp); + if (cmp) { + create_item(bt, pos, key, val_len); + set_cursor(curs, bh, pos, true); ret = 0; } else { unlock_block(NULL, bh, true); scoutfs_block_put(bh); - ret = -ENOENT; + ret = -EEXIST; } return ret; @@ -852,9 +946,10 @@ int scoutfs_btree_delete(struct super_block *sb, struct scoutfs_key *key) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_btree_root *root; - struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct buffer_head *bh; + int pos; + int cmp; int ret; bh = btree_walk(sb, key, NULL, 0, 0, WALK_DELETE); @@ -862,9 +957,9 @@ int scoutfs_btree_delete(struct super_block *sb, struct scoutfs_key *key) return PTR_ERR(bh); bt = bh_data(bh); - item = bt_lookup(bt, key); - if (item) { - delete_item(bt, item); + pos = find_pos(bt, key, &cmp); + if (cmp == 0) { + delete_item(bt, pos); ret = 0; /* XXX this locking is broken.. hold root rwsem? 
*/ @@ -920,12 +1015,12 @@ static int btree_next(struct super_block *sb, struct scoutfs_key *first, /* find the next item after the cursor, releasing if we're done */ if (curs->bh) { bt = bh_data(curs->bh); - key = curs->item->key; + key = *curs->key; scoutfs_inc_key(&key); - curs->item = item_next_seq(bt, curs->item, 0, seq, op); - if (curs->item) - set_cursor(curs, curs->bh, curs->item, curs->write); + curs->pos = next_pos_seq(bt, curs->pos, 0, seq, op); + if (curs->pos < bt->nr_items) + set_cursor(curs, curs->bh, curs->pos, curs->write); else scoutfs_btree_release(curs); } @@ -949,25 +1044,20 @@ static int btree_next(struct super_block *sb, struct scoutfs_key *first, bt = bh_data(bh); /* keep trying leaves until next_key passes last */ - curs->item = item_after_seq(bt, &key, 0, seq, op); - if (!curs->item) { + curs->pos = find_pos_after_seq(bt, &key, 0, seq, op); + if (curs->pos >= bt->nr_items) { key = next_key; unlock_block(NULL, bh, false); scoutfs_block_put(bh); continue; } - if (curs->item) { - set_cursor(curs, bh, curs->item, false); - } else { - unlock_block(NULL, bh, false); - scoutfs_block_put(bh); - } + set_cursor(curs, bh, curs->pos, false); break; } /* only return the next item if it's within last */ - if (curs->item && scoutfs_key_cmp(curs->key, last) <= 0) { + if (curs->bh && scoutfs_key_cmp(curs->key, last) <= 0) { ret = 1; } else { scoutfs_btree_release(curs); @@ -1000,18 +1090,18 @@ int scoutfs_btree_since(struct super_block *sb, struct scoutfs_key *first, */ int scoutfs_btree_dirty(struct super_block *sb, struct scoutfs_key *key) { - struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct buffer_head *bh; + int cmp; int ret; bh = btree_walk(sb, key, NULL, 0, 0, WALK_DIRTY); if (IS_ERR(bh)) return PTR_ERR(bh); - bt = bh_data(bh); - item = bt_lookup(bt, key); - if (item) { + + find_pos(bt, key, &cmp); + if (cmp == 0) { ret = 0; } else { ret = -ENOENT; @@ -1033,6 +1123,8 @@ void scoutfs_btree_update(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct buffer_head *bh; + int pos; + int cmp; BUG_ON(curs->bh); @@ -1040,11 +1132,12 @@ void scoutfs_btree_update(struct super_block *sb, struct scoutfs_key *key, BUG_ON(IS_ERR(bh)); bt = bh_data(bh); - item = bt_lookup(bt, key); - BUG_ON(!item); + pos = find_pos(bt, key, &cmp); + BUG_ON(cmp); + item = pos_item(bt, pos); item->seq = bt->hdr.seq; - set_cursor(curs, bh, item, true); + set_cursor(curs, bh, pos, true); } void scoutfs_btree_release(struct scoutfs_btree_cursor *curs) diff --git a/kmod/src/btree.h b/kmod/src/btree.h index 2dc05bb5..a7faca87 100644 --- a/kmod/src/btree.h +++ b/kmod/src/btree.h @@ -4,14 +4,14 @@ struct scoutfs_btree_cursor { /* for btree.c */ struct buffer_head *bh; - struct scoutfs_btree_item *item; + unsigned int pos; + bool write; /* for callers */ struct scoutfs_key *key; u64 seq; void *val; u16 val_len; - u16 write:1; }; #define DECLARE_SCOUTFS_BTREE_CURSOR(name) \ diff --git a/kmod/src/format.h b/kmod/src/format.h index 4e9eeaad..2a3605c4 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -104,28 +104,26 @@ struct scoutfs_key { #define SCOUTFS_MAX_ITEM_LEN 512 -struct scoutfs_treap_root { - __le16 off; -} __packed; - -struct scoutfs_treap_node { - __le16 parent; - __le16 left; - __le16 right; - __le32 prio; -} __packed; - struct scoutfs_btree_root { u8 height; struct scoutfs_block_ref ref; } __packed; +/* + * @free_end: records the byte offset of the first byte after the free + * space in the block between the 
header and the first item. New items + * are allocated by subtracting the space they need. + * + * @free_reclaim: records the number of bytes of free space amongst the + * items after free_end. If a block is compacted then this much new + * free space would be reclaimed. + */ struct scoutfs_btree_block { struct scoutfs_block_header hdr; - struct scoutfs_treap_root treap; - __le16 total_free; - __le16 tail_free; - __le16 nr_items; + __le16 free_end; + __le16 free_reclaim; + __u8 nr_items; + __le16 item_offs[0]; } __packed; /* @@ -134,7 +132,6 @@ struct scoutfs_btree_block { */ struct scoutfs_btree_item { struct scoutfs_key key; - struct scoutfs_treap_node tnode; __le64 seq; __le16 val_len; char val[0]; diff --git a/kmod/src/treap.c b/kmod/src/treap.c deleted file mode 100644 index 39ab4a3b..00000000 --- a/kmod/src/treap.c +++ /dev/null @@ -1,386 +0,0 @@ -/* - * Copyright (C) 2016 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include - -#include "format.h" -#include "treap.h" - -/* - * Implement a simple treap in memory. The caller is responsible for - * allocating and freeing roots and nodes. This only performs the tree - * operations on them. - * - * Node references are stored as byte offsets from the root to the node. - * As long as we have the root the byte offsets or node pointers are - * interchangeable. The code tries to prefer to use pointers to be - * slightly easier to read. - * - * The caller is responsible for locking access to the tree. - */ - -/* - * treap nodes are embedded in btree items. Their offset is relative to - * the treap root which is embedded in the btree block header. Their - * offset can't have the item overlap the btree block header, nor can - * the item fall off the end of the block. - */ -static void bug_on_bad_node_off(u16 off) -{ - BUG_ON(off < (sizeof(struct scoutfs_btree_block) - - offsetof(struct scoutfs_btree_block, treap) + - offsetof(struct scoutfs_btree_item, tnode))); - BUG_ON(off > (SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_item) + - offsetof(struct scoutfs_btree_item, tnode))); -} - -static struct scoutfs_treap_node *off_node(struct scoutfs_treap_root *root, - __le16 off) -{ - if (!off) - return NULL; - - bug_on_bad_node_off(le16_to_cpu(off)); - - return (void *)root + le16_to_cpu(off); -} - -static __le16 node_off(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *node) -{ - u16 off; - - if (!node) - return 0; - - off = (char *)node - (char *)root; - bug_on_bad_node_off(off); - - return cpu_to_le16(off); -} - -/* - * Walk the tree looking for a node that matches a node in the tree. - * Return the found node or the last node traversed. Set the caller's - * cmp to the comparison between the key and the returned node. The - * caller can ask that we set their pointers to the most recently - * traversed node before or after the returned node. 
- */ -static struct scoutfs_treap_node *descend(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *key, - int *cmp, - struct scoutfs_treap_node **before, - struct scoutfs_treap_node **after) -{ - struct scoutfs_treap_node *node = NULL; - __le16 off = root->off; - - *cmp = -1; - if (before) - *before = NULL; - if (after) - *after = NULL; - - while (off) { - node = off_node(root, off); - *cmp = cmp_func(key, node); - if (*cmp < 0) { - if (after) - *after = node; - off = node->left; - } else if (*cmp > 0) { - if (before) - *before = node; - off = node->right; - } else { - break; - } - } - - return node; -} - -/* - * Link the two nodes together by setting their child and parent pointers - * as needed. Both parent and child can be null. - */ -static void set_links(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *parent, bool left, - struct scoutfs_treap_node *child) -{ - if (!parent) - root->off = node_off(root, child); - else if (left) - parent->left = node_off(root, child); - else - parent->right = node_off(root, child); - - if (child) - child->parent = node_off(root, parent); -} - -/* - * Perform a tree rotation. The node pointer names describe their - * relationships before the rotation. We use the relationship between - * the node and its child to determine the direction of the rotation. - * After the rotation the child will be higher than the node. Only the - * node and child must exist. - * - * Here's a right rotation: - * - * parent parent - * | | - * node child - * / \ / \ - * child a b node - * / \ / \ - * b gr_chi gr_chi a - * - */ -static void rotation(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *node, - struct scoutfs_treap_node *child) -{ - struct scoutfs_treap_node *parent = off_node(root, node->parent); - struct scoutfs_treap_node *grand_child; - bool right; - - if (node->left == node_off(root, child)) { - right = true; - grand_child = off_node(root, child->right); - } else { - right = false; - grand_child = off_node(root, child->left); - } - - set_links(root, parent, - parent && (parent->left == node_off(root, node)), child); - set_links(root, node, right, grand_child); - set_links(root, child, !right, node); -} - -/* - * Insertion links a node in at a leaf and then rotates it up the - * tree until its parent has a higher priority. - */ -int scoutfs_treap_insert(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *ins) -{ - struct scoutfs_treap_node *parent; - int cmp; - - ins->prio = cpu_to_le32(get_random_int()); - ins->parent = 0; - ins->left = 0; - ins->right = 0; - - parent = descend(root, cmp_func, ins, &cmp, NULL, NULL); - if (cmp == 0) - return -EEXIST; - - set_links(root, parent, cmp < 0, ins); - - while (ins->parent) { - parent = off_node(root, ins->parent); - if (le32_to_cpu(ins->prio) < le32_to_cpu(parent->prio)) - break; - - rotation(root, parent, ins); - } - - return 0; -} - -/* - * Deletion rotates the node down the tree until it doesn't have two - * children so that it can be unlinked by pointing its parent at its - * child, if it has one. 
- */ -void scoutfs_treap_delete(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *node) -{ - struct scoutfs_treap_node *left; - struct scoutfs_treap_node *right; - struct scoutfs_treap_node *child; - struct scoutfs_treap_node *parent; - - while (node->left && node->right) { - left = off_node(root, node->left); - right = off_node(root, node->right); - - if (le32_to_cpu(left->prio) > le32_to_cpu(right->prio)) - rotation(root, node, left); - else - rotation(root, node, right); - } - - parent = off_node(root, node->parent); - - if (node->left) - child = off_node(root, node->left); - else - child = off_node(root, node->right); - - set_links(root, parent, - parent && parent->left == node_off(root, node), child); -} - -struct scoutfs_treap_node *scoutfs_treap_lookup(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *key) -{ - struct scoutfs_treap_node *node; - int cmp; - - node = descend(root, cmp_func, key, &cmp, NULL, NULL); - if (cmp != 0) - return NULL; - - return node; -} - -/* return the first node in the tree */ -struct scoutfs_treap_node *scoutfs_treap_first(struct scoutfs_treap_root *root) -{ - struct scoutfs_treap_node *node = off_node(root, root->off); - - while (node && node->left) - node = off_node(root, node->left); - - return node; -} - -/* return the last node in the tree */ -struct scoutfs_treap_node *scoutfs_treap_last(struct scoutfs_treap_root *root) -{ - struct scoutfs_treap_node *node = off_node(root, root->off); - - while (node && node->right) - node = off_node(root, node->right); - - return node; -} - -/* return the last node whose key is less than or equal to the key */ -struct scoutfs_treap_node *scoutfs_treap_before(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *key) -{ - struct scoutfs_treap_node *before; - struct scoutfs_treap_node *node; - int cmp; - - node = descend(root, cmp_func, key, &cmp, &before, NULL); - if (cmp == 0) - return node; - - return before; -} - -/* return the first node whose key is greater than or equal to the key */ -struct scoutfs_treap_node *scoutfs_treap_after(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *key) -{ - struct scoutfs_treap_node *after; - struct scoutfs_treap_node *node; - int cmp; - - node = descend(root, cmp_func, key, &cmp, NULL, &after); - if (cmp == 0) - return node; - - return after; -} - -/* - * The usual BST iteration: either the least descendant or the first - * ancestor in the direction of the iteration. 
- */ -struct scoutfs_treap_node *scoutfs_treap_next(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *node) -{ - struct scoutfs_treap_node *parent; - - if (node->right) { - node = off_node(root, node->right); - while (node->left) - node = off_node(root, node->left); - return node; - } - - while ((parent = off_node(root, node->parent)) && - parent->right == node_off(root, node)) { - node = parent; - } - - return parent; -} - -struct scoutfs_treap_node *scoutfs_treap_prev(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *node) -{ - struct scoutfs_treap_node *parent; - - if (node->left) { - node = off_node(root, node->left); - while (node->right) - node = off_node(root, node->right); - return node; - } - - while ((parent = off_node(root, node->parent)) && - parent->left == node_off(root, node)) { - node = parent; - } - - return parent; -} - -static void update_relative(struct scoutfs_treap_root *root, __le16 node_off, - __le16 from_off, __le16 to_off) -{ - struct scoutfs_treap_node *node = off_node(root, node_off); - - if (node) { - if (node->parent == from_off) - node->parent = to_off; - else if (node->left == from_off) - node->left = to_off; - else if (node->right == from_off) - node->right = to_off; - } -} - -/* - * A node has moved from one storage location to another. Update the - * nodes that refer to it. The from pointer can only be used to - * determine the old offset. Its contents are undefined. - */ -void scoutfs_treap_move(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *from, - struct scoutfs_treap_node *to) -{ - __le16 from_off = node_off(root, from); - __le16 to_off = node_off(root, to); - - if (root->off == from_off) - root->off = to_off; - else - update_relative(root, to->parent, from_off, to_off); - - update_relative(root, to->left, from_off, to_off); - update_relative(root, to->right, from_off, to_off); -} diff --git a/kmod/src/treap.h b/kmod/src/treap.h deleted file mode 100644 index d02f406c..00000000 --- a/kmod/src/treap.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _SCOUTFS_TREAP_H_ -#define _SCOUTFS_TREAP_H_ - -#include "format.h" - -typedef int (*scoutfs_treap_cmp_t)(struct scoutfs_treap_node *a, - struct scoutfs_treap_node *b); - -static inline void scoutfs_treap_init(struct scoutfs_treap_root *root) -{ - root->off = 0; -} - -int scoutfs_treap_insert(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *ins); -void scoutfs_treap_delete(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *node); -struct scoutfs_treap_node *scoutfs_treap_lookup(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *key); -struct scoutfs_treap_node *scoutfs_treap_first(struct scoutfs_treap_root *root); -struct scoutfs_treap_node *scoutfs_treap_last(struct scoutfs_treap_root *root); -struct scoutfs_treap_node *scoutfs_treap_before(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *key); -struct scoutfs_treap_node *scoutfs_treap_after(struct scoutfs_treap_root *root, - scoutfs_treap_cmp_t cmp_func, - struct scoutfs_treap_node *key); -struct scoutfs_treap_node *scoutfs_treap_next(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *node); -struct scoutfs_treap_node *scoutfs_treap_prev(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *node); -void scoutfs_treap_move(struct scoutfs_treap_root *root, - struct scoutfs_treap_node *from, - struct scoutfs_treap_node *to); - -#endif
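
For callers the visible change is that a cursor now records a sorted
position (curs->pos) in its leaf block instead of an item pointer,
while the caller-facing fields keep their shape.  A sketch of the
resulting call pattern, using this patch's declarations but a
hypothetical helper:

  /* sketch: copy an item's value into buf, returns bytes copied or -errno */
  static int example_read_item(struct super_block *sb, struct scoutfs_key *key,
  			     void *buf, unsigned int len)
  {
  	DECLARE_SCOUTFS_BTREE_CURSOR(curs);
  	int ret;

  	ret = scoutfs_btree_lookup(sb, key, &curs);
  	if (ret)
  		return ret;

  	/* curs.key/seq/val/val_len are set from the item at curs.pos */
  	len = min_t(unsigned int, len, curs.val_len);
  	memcpy(buf, curs.val, len);

  	scoutfs_btree_release(&curs);
  	return len;
  }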