diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 0aa2edd6..a62535f5 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -28,48 +28,51 @@ #include "msg.h" #include "block.h" #include "radix.h" +#include "avl.h" #include "scoutfs_trace.h" /* * scoutfs uses a cow btree to index fs metadata. * - * Using a cow btree lets nodes determine the validity of cached blocks - * based on a single root ref (blkno, seq) that is communicated through - * locking and messaging. As long as their cached blocks aren't - * overwritten in the ring they can continue to use those cached blocks - * as the newer cowed blocks continue to reference them. - * * Today callers provide all the locking. They serialize readers and * writers and writers and committing all the dirty blocks. * - * Btree items are stored in each block as a small header with the key - * followed by the value. New items are allocated from the back of the - * block towards the front. + * Block reference have sufficient metadata to discover corrupt + * references. If a reader encounters a bad block it backs off which + * gives the caller the opportunity to resample the root in case it was + * reading through a stale btree that has been overwritten. This lets + * mounts read trees that are modified by other mounts without exclusive + * locking. * - * A dense array of item headers after the btree block header stores the - * offsets of the items and is kept sorted by the item's keys. The - * array is small enough that keeping it sorted with memmove() involves - * a few cache lines at most. + * Btree items are stored as a dense array of structs at the front of + * each block. New items are allocated at the end of the array. + * Deleted items are swapped with the last item to maintain the dense + * array. The items are indexed by a balanced binary tree with parent + * pointers so the relocated item can have references to it updated. * - * Parent blocks in the btree have the same format as leaf blocks. - * There's one key for every child reference instead of having separator - * keys between child references. The key in a child reference contains - * the largest key that may be found in the child subtree. The right - * spine of the tree has maximal keys so that they don't have to be - * updated if we insert an item with a key greater than everything in - * the tree. - */ - -/* - * XXX: - * - counters and tracing - * - could issue read-ahead around reads up to dirty blkno - * - have barrier as we cross to prevent refreshing clobbering stale reads - * - audit/comment that dirty blknos can wrap around ring - * - figure out some max transaction size so ring won't wrap in one - * - update the world of comments - * - validate structures on read? + * Values are allocated from the end of the block towards the front, + * consuming the end of free space in the center of the block. Deleted + * values can be merged with this free space, but more likely they'll + * create fragmented free space amongst other existing values. All + * values are stored with an offset at the end which contains either the + * offset of their item or the offset of the start of their free space. + * This lets an infrequent compaction process move items towards the + * back of the block to reclaim free space. + * + * Exact item searches are only performed on leaf blocks. Leaf blocks + * have a hash table at the end of the block which is used to find items + * with a specific key. It uses linear probing and maintains a low load + * factor so any given search will most likely only need a single + * cacheline. + * + * Parent block reference items are stored as items with a block + * reference as a value. There's an item with a key for every child + * reference instead of having separator keys between child references. + * The key in a child reference contains the largest key that may be + * found in the child subtree. The right spine of the tree has maximal + * keys so that they don't have to be updated if we insert an item with + * a key greater than everything in the tree. */ /* btree walking has a bunch of behavioural bit flags */ @@ -81,88 +84,75 @@ enum { BTW_DIRTY = (1 << 4), /* cow stable blocks */ BTW_ALLOC = (1 << 5), /* allocate a new block for 0 ref */ BTW_INSERT = (1 << 6), /* walking to insert, try splitting */ - BTW_DELETE = (1 << 7), /* walking to delete, try merging */ + BTW_DELETE = (1 << 7), /* walking to delete, try joining */ }; -/* number of contiguous bytes used by the item and it's value */ -static inline unsigned int len_bytes(unsigned val_len) +/* total length of the value payload */ +static inline unsigned int val_bytes(unsigned val_len) { - return sizeof(struct scoutfs_btree_item) + val_len; + return val_len + (val_len ? SCOUTFS_BTREE_VAL_OWNER_BYTES : 0); } -/* number of contiguous bytes used an existing item */ +/* number of bytes in a block used by an item with the given value length */ +static inline unsigned int item_len_bytes(unsigned val_len) +{ + return sizeof(struct scoutfs_btree_item) + val_bytes(val_len); +} + +/* number of bytes used by an existing item */ static inline unsigned int item_bytes(struct scoutfs_btree_item *item) { - return len_bytes(le16_to_cpu(item->val_len)); -} - -/* total block bytes used by an item: header, item, key, value */ -static inline unsigned int all_len_bytes(unsigned val_len) -{ - return sizeof(struct scoutfs_btree_item_header) + len_bytes(val_len); + return item_len_bytes(le16_to_cpu(item->val_len)); } /* - * The minimum number of bytes we allow in a block. During descent to - * modify if we see a block with fewer used bytes then we'll try to - * merge items from neighbours. If the neighbour also has less than the - * min bytes then the two blocks are merged. - * - * This is carefully calculated so that if two blocks are merged the - * resulting block will have at least parent min free bytes free so - * that it's not immediately split again. - * - * new_used = min_used + min_used - hdr - * new_used <= (bs - parent_min_free) - * - * min_used + min_used - hdr <= (bs - parent_min_free) - * 2 * min_used <= (bs - parent_min_free - hdr) - * min_used <= (bs - parent_min_free - hdr) / 2 + * Join blocks when they both are 1/4 full. This puts some distance + * between the join threshold and the full threshold for splitting. + * Blocks that just split or joined need to undergo a reasonable amount + * of item modification before they'll split or join again. */ -static inline int min_used_bytes(int block_size) +static unsigned int join_low_watermark(void) { - return (block_size - sizeof(struct scoutfs_btree_block) - - SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES) / 2; + return (SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 4; } -/* total block bytes used by an existing item */ -static inline unsigned int all_item_bytes(struct scoutfs_btree_item *item) +/* + * return the integer percentages of total space the block could have + * consumed by items that is currently consumed. + */ +static unsigned int item_full_pct(struct scoutfs_btree_block *bt) { - return all_len_bytes(le16_to_cpu(item->val_len)); + return (int)le16_to_cpu(bt->total_item_bytes) * 100 / + (SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)); } -/* number of free bytes between last item header and first item */ -static inline unsigned int free_bytes(struct scoutfs_btree_block *bt) +static inline __le16 ptr_off(struct scoutfs_btree_block *bt, void *ptr) { - unsigned int nr = le32_to_cpu(bt->nr_items); - - return le32_to_cpu(bt->free_end) - - offsetof(struct scoutfs_btree_block, item_hdrs[nr]); + return cpu_to_le16(ptr - (void *)bt); } -/* all bytes used by item offsets, headers, and values */ -static inline unsigned int used_total(struct scoutfs_btree_block *bt) +static inline void *off_ptr(struct scoutfs_btree_block *bt, u16 off) { - return SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block) - - free_bytes(bt); + return (void *)bt + off; } static inline struct scoutfs_btree_item * -off_item(struct scoutfs_btree_block *bt, __le32 off) +off_item(struct scoutfs_btree_block *bt, __le16 off) { - return (void *)bt + le32_to_cpu(off); + return (void *)bt + le16_to_cpu(off); } -static inline struct scoutfs_btree_item * -pos_item(struct scoutfs_btree_block *bt, unsigned int pos) +static struct scoutfs_btree_item *last_item(struct scoutfs_btree_block *bt) { - return off_item(bt, bt->item_hdrs[pos].off); + BUG_ON(bt->nr_items == 0); + + return &bt->items[le16_to_cpu(bt->nr_items) - 1]; } -static inline struct scoutfs_btree_item * -last_item(struct scoutfs_btree_block *bt) +/* offset of the start of the free range in the middle of the block */ +static inline unsigned int mid_free_off(struct scoutfs_btree_block *bt) { - return pos_item(bt, le32_to_cpu(bt->nr_items) - 1); + return le16_to_cpu(ptr_off(bt, &bt->items[le16_to_cpu(bt->nr_items)])); } static inline struct scoutfs_key *item_key(struct scoutfs_btree_item *item) @@ -170,9 +160,10 @@ static inline struct scoutfs_key *item_key(struct scoutfs_btree_item *item) return &item->key; } -static inline void *item_val(struct scoutfs_btree_item *item) +static inline void *item_val(struct scoutfs_btree_block *bt, + struct scoutfs_btree_item *item) { - return item->val; + return off_ptr(bt, le16_to_cpu(item->val_off)); } static inline unsigned item_val_len(struct scoutfs_btree_item *item) @@ -180,173 +171,482 @@ static inline unsigned item_val_len(struct scoutfs_btree_item *item) return le16_to_cpu(item->val_len); } -/* - * Returns the sorted item position that an item with the given key - * should occupy. - * - * It sets *cmp to the final comparison of the given key and the - * position's item key. This can only be -1 or 0 because we bias - * towards returning the pos that a key should occupy. - * - * If the given key is greater then all items' keys then the number of - * items can be returned. - */ -static int find_pos(struct scoutfs_btree_block *bt, struct scoutfs_key *key, - int *cmp) +static struct scoutfs_btree_item *node_item(struct scoutfs_avl_node *node) { + if (node == NULL) + return NULL; + return container_of(node, struct scoutfs_btree_item, node); +} + +static struct scoutfs_btree_item *prev_item(struct scoutfs_btree_block *bt, + struct scoutfs_btree_item *item) +{ + if (item == NULL) + return NULL; + return node_item(scoutfs_avl_prev(&bt->item_root, &item->node)); +} + +static struct scoutfs_btree_item *next_item(struct scoutfs_btree_block *bt, + struct scoutfs_btree_item *item) +{ + if (item == NULL) + return NULL; + return node_item(scoutfs_avl_next(&bt->item_root, &item->node)); +} + +static int cmp_key_item(void *arg, struct scoutfs_avl_node *node) +{ + struct scoutfs_key *key = arg; + struct scoutfs_btree_item *item = node_item(node); + + return scoutfs_key_compare(key, item_key(item)); +} + +/* + * We have a small fixed-size linearly probed hash table at the end of + * leaf blocks which is used for direct item lookups (as opposed to + * iterators). The hash table only stores non-zero offsets to the + * items. If an item is moved then its offset is updated. The hash + * table is sized to allow a max load of 75%, but most items are larger + * and most blocks aren't full. + */ +static int leaf_item_hash_ind(struct scoutfs_key *key) +{ + return crc32c(~0, key, sizeof(struct scoutfs_key)) % + SCOUTFS_BTREE_LEAF_ITEM_HASH_NR; +} + +static __le16 *leaf_item_hash_buckets(struct scoutfs_btree_block *bt) +{ + return (void *)bt + SCOUTFS_BLOCK_SIZE - + SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES; +} + +static inline int leaf_item_hash_next_bucket(int i) +{ + if (++i >= SCOUTFS_BTREE_LEAF_ITEM_HASH_NR) + i = 0; + return i; +} + +#define foreach_leaf_item_hash_bucket(i, nr, key) \ + for (i = leaf_item_hash_ind(key), nr = SCOUTFS_BTREE_LEAF_ITEM_HASH_NR;\ + nr-- > 0; \ + i = leaf_item_hash_next_bucket(i)) + +static struct scoutfs_btree_item * +leaf_item_hash_search(struct scoutfs_btree_block *bt, struct scoutfs_key *key) +{ + __le16 *buckets = leaf_item_hash_buckets(bt); struct scoutfs_btree_item *item; - unsigned int start = 0; - unsigned int end = le32_to_cpu(bt->nr_items); - unsigned int pos = 0; + __le16 off; + int nr; + int i; - *cmp = -1; + if (WARN_ON_ONCE(bt->level > 0)) + return NULL; - while (start < end) { - pos = start + (end - start) / 2; + foreach_leaf_item_hash_bucket(i, nr, key) { + off = buckets[i]; + if (off == 0) + return NULL; - item = pos_item(bt, pos); - *cmp = scoutfs_key_compare(key, item_key(item)); - if (*cmp < 0) { - end = pos; - } else if (*cmp > 0) { - start = ++pos; - *cmp = -1; - } else { + item = off_item(bt, off); + if (scoutfs_key_compare(key, item_key(item)) == 0) + return item; + } + + return NULL; +} + +static void leaf_item_hash_insert(struct scoutfs_btree_block *bt, + struct scoutfs_key *key, __le16 off) +{ + __le16 *buckets = leaf_item_hash_buckets(bt); + int nr; + int i; + + if (bt->level > 0) + return; + + foreach_leaf_item_hash_bucket(i, nr, key) { + if (buckets[i] == 0) { + buckets[i] = off; + return; + } + } + + /* table should have been been enough for all items */ + BUG(); +} + +/* + * Deletion clears the offset in a bucket. That could create a + * discontinuity that would stop a search from seeing colliding + * insertions that were pushed into further buckets. Each time we zero + * a bucket we rehash all the populated buckets following it. There + * won't be many in our light load tables and this works reliably as the + * contiguous population wraps past the end of table. Comparing hashed + * bucket positions to find candidates to relocate after the wrap is + * tricky. + */ +static void leaf_item_hash_delete(struct scoutfs_btree_block *bt, + struct scoutfs_key *key, __le16 del_off) +{ + __le16 *buckets = leaf_item_hash_buckets(bt); + __le16 off; + int nr; + int i; + + if (bt->level > 0) + return; + + foreach_leaf_item_hash_bucket(i, nr, key) { + off = buckets[i]; + /* we must find the item we're trying to delete */ + BUG_ON(off == 0); + + if (off == del_off) { + buckets[i] = 0; break; } } - return pos; + while ((i = leaf_item_hash_next_bucket(i)), buckets[i] != 0) { + off = buckets[i]; + buckets[i] = 0; + leaf_item_hash_insert(bt, item_key(off_item(bt, off)), off); + } } -/* move a number of contigous elements from the src index to the dst index */ -#define memmove_arr(arr, dst, src, nr) \ - memmove(&(arr)[dst], &(arr)[src], (nr) * sizeof(*(arr))) +static void leaf_item_hash_change(struct scoutfs_btree_block *bt, + struct scoutfs_key *key, __le16 to, + __le16 from) +{ + __le16 *buckets = leaf_item_hash_buckets(bt); + __le16 off; + int nr; + int i; + + if (bt->level > 0) + return; + + foreach_leaf_item_hash_bucket(i, nr, key) { + off = buckets[i]; + /* we must find the item we're trying to change */ + BUG_ON(off == 0); + + if (off == from) { + buckets[i] = to; + return; + } + } +} + +/* + * Given an offset to the start of a value, return info describing the + * previous value in the block. Each value ends with an owner offset + * which points to either the value's item if it's in use or to the + * start of the value if it's been freed. Either the item is returned + * or the length of the previous value is set. + */ +static struct scoutfs_btree_item * +get_prev_val_owner(struct scoutfs_btree_block *bt, unsigned int off, + unsigned int *prev_val_bytes) +{ + __le16 *owner = off_ptr(bt, off - sizeof(*owner)); + unsigned int own = get_unaligned_le16(owner); + + if (own >= mid_free_off(bt)) { + *prev_val_bytes = off - own; + return NULL; + } else { + *prev_val_bytes = 0; + return off_ptr(bt, own); + } +} + +/* + * Set the owner offset at the end of a full value, the given length includes + * the offset. + */ +static void set_val_owner(struct scoutfs_btree_block *bt, unsigned int val_off, + unsigned int vb, __le16 item_off) +{ + __le16 *owner = off_ptr(bt, val_off + vb - sizeof(*owner)); + + put_unaligned_le16(le16_to_cpu(item_off) ?: val_off, owner); +} + +/* + * As values are freed they can leave fragmented free space amongst + * other values. This is called when we can't insert because there + * isn't enough free space but we know that there's sufficient free + * space amongst the values for the new insertion. + * + * But we only want to do this when there is enough free space to + * justify the cost of the compaction. We don't want to bother + * compacting if the block is almost full and we just be split in a few + * more operations. The split heuristic requires a generous amount of + * fragmented free space that will avoid a split. + */ +static void compact_values(struct scoutfs_btree_block *bt) +{ + struct scoutfs_btree_item *item; + unsigned int free_off; + unsigned int free_len; + unsigned int to_off; + unsigned int end; + unsigned int vb; + void *from; + void *to; + + if (bt->last_free_off == 0) + return; + + free_off = le16_to_cpu(bt->last_free_off); + free_len = le16_to_cpu(bt->last_free_len); + end = mid_free_off(bt) + le16_to_cpu(bt->mid_free_len); + + while (free_off > end) { + item = get_prev_val_owner(bt, free_off, &vb); + if (item == NULL) { + free_off -= vb; + free_len += vb; + continue; + } + + from = off_ptr(bt, le16_to_cpu(item->val_off)); + vb = val_bytes(le16_to_cpu(item->val_len)); + to_off = free_off + free_len - vb; + to = off_ptr(bt, to_off); + if (to >= from + vb) + memcpy(to, from, vb); + else + memmove(to, from, vb); + + free_off = le16_to_cpu(item->val_off); + item->val_off = cpu_to_le16(to_off); + } + + le16_add_cpu(&bt->mid_free_len, free_len); + bt->last_free_off = 0; + bt->last_free_len = 0; +} + +/* + * Insert an item's value into the block. The caller has made sure + * there's free space. We store the value at the end of free space in + * the block and point its final offset at its owning item, and copy the + * value into place. + */ +static __le16 insert_value(struct scoutfs_btree_block *bt, __le16 item_off, + void *val, unsigned val_len) +{ + unsigned int val_off; + unsigned int vb; + + if (val_len == 0) + return 0; + + BUG_ON(le16_to_cpu(bt->mid_free_len) < val_bytes(val_len)); + + vb = val_bytes(val_len); + val_off = mid_free_off(bt) + le16_to_cpu(bt->mid_free_len) - vb; + le16_add_cpu(&bt->mid_free_len, -vb); + + memcpy(off_ptr(bt, val_off), val, val_len); + set_val_owner(bt, val_off, vb, item_off); + + return cpu_to_le16(val_off); +} + +/* + * Delete an item's value from the block. The caller has updated the + * item. We leave behind a free region whose owner offset indicates + * that the value isn't in use. It might merge with the central free + * region or the final freed value, and might become the final freed + * value. + */ +static void delete_value(struct scoutfs_btree_block *bt, + unsigned int val_off, unsigned int val_len) +{ + unsigned int free_off; + unsigned int free_len; + bool is_last; + + if (val_len == 0) + return; + + free_off = val_off; + free_len = val_bytes(val_len); + is_last = false; + + /* see if we can merge with mid free region */ + if (mid_free_off(bt) + le16_to_cpu(bt->mid_free_len) == free_off) { + le16_add_cpu(&bt->mid_free_len, free_len); + return; + } + + if (free_off + free_len == le16_to_cpu(bt->last_free_off)) { + /* merge with front of last free */ + free_len += le16_to_cpu(bt->last_free_len); + is_last = true; + + } else if ((le16_to_cpu(bt->last_free_off) + + le16_to_cpu(bt->last_free_len)) == free_off) { + /* merge with end of last free */ + free_off = le16_to_cpu(bt->last_free_off); + free_len += le16_to_cpu(bt->last_free_len); + is_last = true; + + } else if (free_off > le16_to_cpu(bt->last_free_off)) { + /* become new last */ + is_last = true; + } + + set_val_owner(bt, free_off, free_len, 0); + if (is_last) { + bt->last_free_off = cpu_to_le16(free_off); + bt->last_free_len = cpu_to_le16(free_len); + } +} /* * Insert a new item into the block. The caller has made sure that - * there's space for the item and its metadata. + * there is sufficient free space in block for the new item. We might + * have to compact the values to the end of the block to reclaim + * fragmented free space between values. + * + * This only consumes free space. It's safe to use references to block + * structures after this call. */ -static void create_item(struct scoutfs_btree_block *bt, unsigned int pos, - struct scoutfs_key *key, void *val, unsigned val_len) +static void create_item(struct scoutfs_btree_block *bt, + struct scoutfs_key *key, void *val, unsigned val_len, + struct scoutfs_avl_node *parent, int cmp) { - unsigned int nr = le32_to_cpu(bt->nr_items); struct scoutfs_btree_item *item; - unsigned all_bytes; - all_bytes = all_len_bytes(val_len); - BUG_ON(free_bytes(bt) < all_bytes); + BUG_ON(le16_to_cpu(bt->mid_free_len) < item_len_bytes(val_len)); - if (pos < nr) - memmove_arr(bt->item_hdrs, pos + 1, pos, nr - pos); + le16_add_cpu(&bt->mid_free_len, + -(u16)sizeof(struct scoutfs_btree_item)); + le16_add_cpu(&bt->nr_items, 1); + item = last_item(bt); - le32_add_cpu(&bt->free_end, -len_bytes(val_len)); - bt->item_hdrs[pos].off = bt->free_end; - nr++; - bt->nr_items = cpu_to_le32(nr); + item->key = *key; - BUG_ON(le32_to_cpu(bt->free_end) < - offsetof(struct scoutfs_btree_block, item_hdrs[nr])); + scoutfs_avl_insert(&bt->item_root, parent, &item->node, cmp); + leaf_item_hash_insert(bt, item_key(item), ptr_off(bt, item)); - item = pos_item(bt, pos); - *item_key(item) = *key; + item->val_off = insert_value(bt, ptr_off(bt, item), val, val_len); item->val_len = cpu_to_le16(val_len); - if (val_len) - memcpy(item_val(item), val, val_len); + le16_add_cpu(&bt->total_item_bytes, item_bytes(item)); } /* * Delete an item from a btree block. * - * This moves all the headers after the item (in sort order) towards the - * start of the header array. It moves all the items before the removed - * item towards the end of the block. The items that have to be moved - * can be anywhere in the sort order. We first move the item region - * and then walk the headers looking for offsets that need to be updated. - * - * The item motion means that callers can not hold item references - * across item deletion. + * As we delete the item we can relocate an unrelated item to maintain + * the dense array of items. The caller can use another single item + * after this call if they give us the opportunity to let them know if + * we move it. */ -static void delete_item(struct scoutfs_btree_block *bt, unsigned int pos) +static void delete_item(struct scoutfs_btree_block *bt, + struct scoutfs_btree_item *item, + struct scoutfs_btree_item **use_after) { - unsigned int nr = le32_to_cpu(bt->nr_items); - unsigned int updated; - unsigned int total; - unsigned int first; - unsigned int bytes; - unsigned int last; - unsigned int off; - int i; + struct scoutfs_btree_item *last; + unsigned int val_off; + unsigned int val_len; - /* calculate region of items to move */ - first = le32_to_cpu(bt->free_end); - last = le32_to_cpu(bt->item_hdrs[pos].off); - total = last - first; - bytes = item_bytes(pos_item(bt, pos)); + /* save some values before we delete the item */ + val_off = le16_to_cpu(item->val_off); + val_len = le16_to_cpu(item->val_len); + last = last_item(bt); - /* move items before deleted to the back of the block */ - if (total > 0) { - /* update headers before memove overwrites deleted item */ - for (i = 0, updated = 0; i < nr && updated < total; i++) { - off = le32_to_cpu(bt->item_hdrs[i].off); - if (off >= first && off < last) { - updated += item_bytes(pos_item(bt, i)); - le32_add_cpu(&bt->item_hdrs[i].off, bytes); - } - } - BUG_ON(updated != total); - - memmove(off_item(bt, cpu_to_le32(first + bytes)), - off_item(bt, cpu_to_le32(first)), total); + /* delete the item */ + scoutfs_avl_delete(&bt->item_root, &item->node); + leaf_item_hash_delete(bt, item_key(item), ptr_off(bt, item)); + le16_add_cpu(&bt->nr_items, -1); + le16_add_cpu(&bt->mid_free_len, sizeof(struct scoutfs_btree_item)); + le16_add_cpu(&bt->total_item_bytes, -item_bytes(item)); + /* move the final item into the deleted space */ + if (last != item) { + item->key = last->key; + item->val_off = last->val_off; + item->val_len = last->val_len; + if (last->val_len) + set_val_owner(bt, le16_to_cpu(last->val_off), + val_bytes(le16_to_cpu(last->val_len)), + ptr_off(bt, item)); + leaf_item_hash_change(bt, &last->key, ptr_off(bt, item), + ptr_off(bt, last)); + scoutfs_avl_relocate(&bt->item_root, &item->node,&last->node); + if (use_after && *use_after == last) + *use_after = item; } - /* wipe deleted bytes to avoid leaking data */ - memset(off_item(bt, cpu_to_le32(first)), 0, bytes); - - if (pos < (nr - 1)) - memmove_arr(bt->item_hdrs, pos, pos + 1, nr - 1 - pos); - - le32_add_cpu(&bt->free_end, bytes); - le32_add_cpu(&bt->nr_items, -1); + delete_value(bt, val_off, val_len); } /* * Move items from a source block to a destination block. The caller - * tells us if we're moving from the tail of the source block right to - * the head of the destination block, or vice versa. We stop moving - * once we've moved enough bytes of items. + * has made sure there's sufficient free space in the destination block, + * though item creation may need to compact values. The caller tells us + * if we're moving from the tail of the source block right to the head + * of the destination block, or vice versa. We're always adding the + * first or last item to the avl, so the parent is always the previous + * first or last node. */ static void move_items(struct scoutfs_btree_block *dst, struct scoutfs_btree_block *src, bool move_right, int to_move) { + struct scoutfs_avl_node *par; + struct scoutfs_avl_node *node; struct scoutfs_btree_item *from; - unsigned int t; - unsigned int f; + struct scoutfs_btree_item *next; + int cmp; if (move_right) { - f = le32_to_cpu(src->nr_items) - 1; - t = 0; + node = scoutfs_avl_last(&src->item_root); + par = scoutfs_avl_first(&dst->item_root); + cmp = -1; } else { - f = 0; - t = le32_to_cpu(dst->nr_items); + node = scoutfs_avl_first(&src->item_root); + par = scoutfs_avl_last(&dst->item_root); + cmp = 1; } + from = node_item(node); - while (f < le32_to_cpu(src->nr_items) && to_move > 0) { - from = pos_item(src, f); + while (to_move > 0 && from != NULL) { + to_move -= item_bytes(from); - create_item(dst, t, item_key(from), item_val(from), - item_val_len(from)); - - to_move -= all_item_bytes(from); - - delete_item(src, f); if (move_right) - f--; + next = prev_item(src, from); else - t++; + next = next_item(src, from); + + create_item(dst, item_key(from), item_val(src, from), + item_val_len(from), par, cmp); + + if (move_right) { + if (par) + par = scoutfs_avl_prev(&dst->item_root, par); + else + par = scoutfs_avl_first(&dst->item_root); + } else { + if (par) + par = scoutfs_avl_next(&dst->item_root, par); + else + par = scoutfs_avl_last(&dst->item_root); + } + + delete_item(src, from, &next); + from = next; } } @@ -468,7 +768,6 @@ retry: /* returning a newly allocated block */ memset(new, 0, SCOUTFS_BLOCK_SIZE); new->hdr.fsid = super->hdr.fsid; - new->free_end = cpu_to_le32(SCOUTFS_BLOCK_SIZE); } bl = new_bl; bt = new; @@ -497,35 +796,52 @@ out: * specifies the key in the item that describes the items in the child. */ static void create_parent_item(struct scoutfs_btree_block *parent, - unsigned pos, struct scoutfs_btree_block *child, + struct scoutfs_btree_block *child, struct scoutfs_key *key) { + struct scoutfs_avl_node *par; + int cmp; struct scoutfs_btree_ref ref = { .blkno = child->hdr.blkno, .seq = child->hdr.seq, }; - create_item(parent, pos, key, &ref, sizeof(ref)); + scoutfs_avl_search(&parent->item_root, cmp_key_item, key, &cmp, &par, + NULL, NULL); + create_item(parent, key, &ref, sizeof(ref), par, cmp); } /* - * Update the parent item that refers to a child by deleting and - * recreating it. Descent should have ensured that there was always - * room for a maximal key in parents. + * Update an existing parent item reference to a child who may be new or + * may have had its last item changed. */ static void update_parent_item(struct scoutfs_btree_block *parent, - unsigned pos, struct scoutfs_btree_block *child) + struct scoutfs_btree_item *par_item, + struct scoutfs_btree_block *child) { - struct scoutfs_btree_item *item = last_item(child); + struct scoutfs_btree_ref *ref = item_val(parent, par_item); - delete_item(parent, pos); - create_parent_item(parent, pos, child, item_key(item)); + par_item->key = *item_key(last_item(child)); + ref->blkno = child->hdr.blkno; + ref->seq = child->hdr.seq; +} + +static void init_btree_block(struct scoutfs_btree_block *bt, int level) +{ + int free; + + free = SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block); + if (level == 0) + free -= SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES; + + bt->level = level; + bt->mid_free_len = cpu_to_le16(free); } /* * See if we need to split this block while descending for insertion so * that we have enough space to insert. Parent blocks need enough space - * for a new item and child ref if a child block splits. Leaf blocks + * to insert a new parent item if a child block splits. Leaf blocks * need enough space to insert the new item with its value. * * We split to the left so that the greatest key in the existing block @@ -538,35 +854,36 @@ static int try_split(struct super_block *sb, struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root, struct scoutfs_key *key, unsigned val_len, - struct scoutfs_btree_block *parent, unsigned pos, + struct scoutfs_btree_block *parent, struct scoutfs_btree_block *right) { struct scoutfs_block *left_bl = NULL; struct scoutfs_block *par_bl = NULL; struct scoutfs_btree_block *left; - struct scoutfs_btree_item *item; struct scoutfs_key max_key; - unsigned int all_bytes; int ret; int err; - if (scoutfs_option_bool(sb, Opt_btree_force_tiny_blocks)) - all_bytes = SCOUTFS_BLOCK_SIZE - SCOUTFS_BTREE_TINY_BLOCK_SIZE; - else if (right->level) - all_bytes = SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES; - else - all_bytes = all_len_bytes(val_len); + /* parents need to leave room for child references */ + if (right->level) + val_len = sizeof(struct scoutfs_btree_ref); - if (free_bytes(right) >= all_bytes) + /* don't need to split if there's enough space for the item */ + if (le16_to_cpu(right->mid_free_len) >= item_len_bytes(val_len)) return 0; + if (item_full_pct(right) < 80) { + compact_values(right); + return 0; + } + /* alloc split neighbour first to avoid unwinding tree growth */ ret = get_ref_block(sb, alloc, wri, BTW_ALLOC, NULL, &left_bl); if (ret) return ret; left = left_bl->data; - left->level = right->level; + init_btree_block(left, right->level); if (!parent) { ret = get_ref_block(sb, alloc, wri, BTW_ALLOC, NULL, &par_bl); @@ -579,21 +896,19 @@ static int try_split(struct super_block *sb, } parent = par_bl->data; - parent->level = root->height; + init_btree_block(parent, root->height); root->height++; root->ref.blkno = parent->hdr.blkno; root->ref.seq = parent->hdr.seq; scoutfs_key_set_ones(&max_key); - - pos = 0; - create_parent_item(parent, pos, right, &max_key); + create_parent_item(parent, right, &max_key); } - move_items(left, right, false, used_total(right) / 2); + move_items(left, right, false, + le16_to_cpu(right->total_item_bytes) / 2); - item = last_item(left); - create_parent_item(parent, pos, left, item_key(item)); + create_parent_item(parent, left, item_key(last_item(left))); scoutfs_block_put(sb, left_bl); scoutfs_block_put(sb, par_bl); @@ -603,78 +918,73 @@ static int try_split(struct super_block *sb, /* * This is called during descent for deletion when we have a parent and - * might need to merge items from a sibling block if this block has too - * much free space. Eventually we'll be able to fit all of the + * might need to join this block with a sibling block if this block has + * too much free space. Eventually we'll be able to fit all of the * sibling's items in our free space which lets us delete the sibling * block. - * - * XXX this could more cleverly chose a merge candidate sibling */ -static int try_merge(struct super_block *sb, - struct scoutfs_radix_allocator *alloc, - struct scoutfs_block_writer *wri, - struct scoutfs_btree_root *root, - struct scoutfs_btree_block *parent, unsigned pos, - struct scoutfs_btree_block *bt) +static int try_join(struct super_block *sb, + struct scoutfs_radix_allocator *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_btree_root *root, + struct scoutfs_btree_block *parent, + struct scoutfs_btree_item *par_item, + struct scoutfs_btree_block *bt) { + struct scoutfs_btree_item *sib_par_item; struct scoutfs_btree_block *sib; struct scoutfs_block *sib_bl; struct scoutfs_btree_ref *ref; - unsigned int min_used; - unsigned int sib_pos; + unsigned int sib_tot; bool move_right; int to_move; int ret; - BUILD_BUG_ON(min_used_bytes(SCOUTFS_BTREE_TINY_BLOCK_SIZE) < 0); - - if (scoutfs_option_bool(sb, Opt_btree_force_tiny_blocks)) - min_used = min_used_bytes(SCOUTFS_BTREE_TINY_BLOCK_SIZE); - else - min_used = min_used_bytes(SCOUTFS_BLOCK_SIZE); - - if (used_total(bt) >= min_used) + if (le16_to_cpu(bt->total_item_bytes) >= join_low_watermark()) return 0; /* move items right into our block if we have a left sibling */ - if (pos) { - sib_pos = pos - 1; + sib_par_item = prev_item(parent, par_item); + if (sib_par_item) { move_right = true; } else { - sib_pos = pos + 1; + sib_par_item = next_item(parent, par_item); move_right = false; } - ref = item_val(pos_item(parent, sib_pos)); + ref = item_val(parent, sib_par_item); ret = get_ref_block(sb, alloc, wri, BTW_DIRTY, ref, &sib_bl); if (ret) return ret; sib = sib_bl->data; - if (used_total(sib) < min_used) - to_move = used_total(sib); + sib_tot = le16_to_cpu(bt->total_item_bytes); + if (sib_tot < join_low_watermark()) + to_move = sib_tot; else - to_move = min_used - used_total(bt); + to_move = sib_tot - join_low_watermark(); + if (le16_to_cpu(bt->mid_free_len) < to_move) + compact_values(bt); move_items(bt, sib, move_right, to_move); /* update our parent's item */ if (!move_right) - update_parent_item(parent, pos, bt); + update_parent_item(parent, par_item, bt); /* update or delete sibling's parent item */ - if (le32_to_cpu(sib->nr_items) == 0) { - delete_item(parent, sib_pos); + if (le16_to_cpu(sib->nr_items) == 0) { + delete_item(parent, sib_par_item, NULL); ret = scoutfs_radix_free(sb, alloc, wri, le64_to_cpu(sib->hdr.blkno)); BUG_ON(ret); /* could have dirtied alloc to avoid error */ } else if (move_right) { - update_parent_item(parent, sib_pos, sib); + update_parent_item(parent, sib_par_item, sib); } /* and finally shrink the tree if our parent is the root with 1 */ - if (le32_to_cpu(parent->nr_items) == 1) { + if (le16_to_cpu(parent->nr_items) == 1) { root->height--; root->ref.blkno = bt->hdr.blkno; root->ref.seq = bt->hdr.seq; @@ -688,77 +998,6 @@ static int try_merge(struct super_block *sb, return 1; } -/* - * A quick and dirty verification of the btree block. We could add a - * lot more checks and make it only verified on read or after - * significant events like splitting and merging. - */ -static int verify_btree_block(struct scoutfs_btree_block *bt, int level) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_item *prev = NULL; - unsigned int bytes = 0; - unsigned int after_off = sizeof(struct scoutfs_btree_block); - unsigned int first_off; - unsigned int off; - unsigned int nr; - unsigned int i = 0; - int bad = 1; - - nr = le32_to_cpu(bt->nr_items); - if (nr == 0) - goto out; - - after_off = offsetof(struct scoutfs_btree_block, item_hdrs[nr]); - first_off = SCOUTFS_BLOCK_SIZE; - - if (after_off > SCOUTFS_BLOCK_SIZE) { - nr = 0; - goto out; - } - - for (i = 0; i < nr; i++) { - off = le32_to_cpu(bt->item_hdrs[i].off); - if (off >= SCOUTFS_BLOCK_SIZE || off < after_off) - goto out; - - first_off = min(first_off, off); - - item = pos_item(bt, i); - bytes += item_bytes(item); - - if (i > 0 && scoutfs_key_compare(item_key(item), - item_key(prev)) <= 0) - goto out; - - prev = item; - } - - if (first_off < le32_to_cpu(bt->free_end)) - goto out; - - if ((le32_to_cpu(bt->free_end) + bytes) != SCOUTFS_BLOCK_SIZE) - goto out; - - bad = 0; -out: - if (bad) { - printk("bt %p blkno %llu level %d end %u nr %u (after %u bytes %u)\n", - bt, le64_to_cpu(bt->hdr.blkno), level, - le32_to_cpu(bt->free_end), le32_to_cpu(bt->nr_items), - after_off, bytes); - for (i = 0; i < nr; i++) { - item = pos_item(bt, i); - printk(" [%u] off %u val_len %u\n", - i, le32_to_cpu(bt->item_hdrs[i].off), - item_val_len(item)); - } - BUG_ON(bad); - } - - return 0; -} - /* * Return the leaf block that should contain the given key. The caller * is responsible for searching the leaf block and performing their @@ -788,12 +1027,14 @@ static int btree_walk(struct super_block *sb, struct scoutfs_block *bl = NULL; struct scoutfs_btree_block *parent = NULL; struct scoutfs_btree_block *bt; + struct scoutfs_btree_item *par_item; struct scoutfs_btree_item *item; + struct scoutfs_btree_item *prev; + struct scoutfs_avl_node *next_node; + struct scoutfs_avl_node *node; struct scoutfs_btree_ref *ref; unsigned int level; - unsigned int pos; unsigned int nr; - int cmp; int ret; if (WARN_ON_ONCE((flags & (BTW_NEXT|BTW_PREV)) && iter_key == NULL) || @@ -804,11 +1045,11 @@ restart: scoutfs_block_put(sb, par_bl); par_bl = NULL; parent = NULL; + par_item = NULL; scoutfs_block_put(sb, bl); bl = NULL; bt = NULL; level = root->height; - pos = 0; ret = 0; if (!root->height) { @@ -819,7 +1060,7 @@ restart: &root->ref, &bl); if (ret == 0) { bt = bl->data; - bt->level = 0; + init_btree_block(bt, 0); root->height = 1; } } @@ -834,11 +1075,6 @@ restart: break; bt = bl->data; - /* XXX it'd be nice to make this tunable */ - ret = 0 && verify_btree_block(bt, level); - if (ret) - break; - /* XXX more aggressive block verification, before ref updates? */ if (bt->level != level) { scoutfs_corruption(sb, SC_BTREE_BLOCK_LEVEL, @@ -855,19 +1091,19 @@ restart: } /* - * Splitting and merging can add or remove parents or - * change the pos we take through parents to reach the + * Splitting and joining can add or remove parents or + * change the parent item we use to reach the child * block with the search key. In the rare case that we - * split or merge we simply restart the walk rather than - * try and special case modifying the path to reflect - * the tree changes. + * split or join we simply restart the walk instead of + * update our state to reflect the tree changes. */ ret = 0; if (flags & (BTW_INSERT | BTW_DELETE)) ret = try_split(sb, alloc, wri, root, key, val_len, - parent, pos, bt); + parent, bt); if (ret == 0 && (flags & BTW_DELETE) && parent) - ret = try_merge(sb, alloc, wri, root, parent, pos, bt); + ret = try_join(sb, alloc, wri, root, parent, par_item, + bt); if (ret > 0) goto restart; else if (ret < 0) @@ -877,33 +1113,33 @@ restart: if (level == 0) break; - nr = le32_to_cpu(bt->nr_items); - + nr = le16_to_cpu(bt->nr_items); /* Find the next child block for the search key. */ - pos = find_pos(bt, key, &cmp); - if (pos >= nr) { + node = scoutfs_avl_search(&bt->item_root, cmp_key_item, key, + NULL, NULL, &next_node, NULL); + item = node_item(node ?: next_node); + if (item == NULL) { scoutfs_corruption(sb, SC_BTREE_NO_CHILD_REF, corrupt_btree_block_level, - "root_height %u root_blkno %llu root_seq %llu blkno %llu seq %llu level %u nr %u pos %u cmp %d", + "root_height %u root_blkno %llu root_seq %llu blkno %llu seq %llu level %u nr %u", root->height, le64_to_cpu(root->ref.blkno), le64_to_cpu(root->ref.seq), le64_to_cpu(bt->hdr.blkno), le64_to_cpu(bt->hdr.seq), bt->level, - nr, pos, cmp); + nr); ret = -EIO; break; } /* give the caller the next key to iterate towards */ - if (iter_key && (flags & BTW_NEXT) && (pos < (nr - 1))) { - item = pos_item(bt, pos); + if (iter_key && (flags & BTW_NEXT) && next_item(bt, item)) { *iter_key = *item_key(item); scoutfs_key_inc(iter_key); - } else if (iter_key && (flags & BTW_PREV) && (pos > 0)) { - item = pos_item(bt, pos - 1); - *iter_key = *item_key(item); + } else if (iter_key && (flags & BTW_PREV) && + (prev = prev_item(bt, item))) { + *iter_key = *item_key(prev); } scoutfs_block_put(sb, par_bl); @@ -912,7 +1148,8 @@ restart: bl = NULL; bt = NULL; - ref = item_val(pos_item(parent, pos)); + par_item = item; + ref = item_val(parent, par_item); } out: @@ -935,10 +1172,12 @@ static void init_item_ref(struct scoutfs_btree_item_ref *iref, struct scoutfs_block *bl, struct scoutfs_btree_item *item) { + struct scoutfs_btree_block *bt = bl->data; + iref->sb = sb; iref->bl = bl; iref->key = item_key(item); - iref->val = item_val(item); + iref->val = item_val(bt, item); iref->val_len = le16_to_cpu(item->val_len); } @@ -963,8 +1202,6 @@ int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct scoutfs_block *bl; - unsigned int pos; - int cmp; int ret; if (WARN_ON_ONCE(iref->key)) @@ -973,16 +1210,15 @@ int scoutfs_btree_lookup(struct super_block *sb, ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL); if (ret == 0) { bt = bl->data; - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - item = pos_item(bt, pos); + + item = leaf_item_hash_search(bt, key); + if (item) { init_item_ref(iref, sb, bl, item); ret = 0; } else { scoutfs_block_put(sb, bl); ret = -ENOENT; } - } return ret; @@ -1009,9 +1245,11 @@ int scoutfs_btree_insert(struct super_block *sb, struct scoutfs_key *key, void *val, unsigned val_len) { + struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; + struct scoutfs_avl_node *node; + struct scoutfs_avl_node *par; struct scoutfs_block *bl; - int pos; int cmp; int ret; @@ -1022,12 +1260,19 @@ int scoutfs_btree_insert(struct super_block *sb, val_len, &bl, NULL); if (ret == 0) { bt = bl->data; - pos = find_pos(bt, key, &cmp); - if (cmp) { - create_item(bt, pos, key, val, val_len); - ret = 0; - } else { + + item = leaf_item_hash_search(bt, key); + if (item) { ret = -EEXIST; + } else { + node = scoutfs_avl_search(&bt->item_root, cmp_key_item, + key, &cmp, &par, NULL, NULL); + if (node) { + ret = -EEXIST; + } else { + create_item(bt, key, val, val_len, par, cmp); + ret = 0; + } } scoutfs_block_put(sb, bl); @@ -1036,6 +1281,18 @@ int scoutfs_btree_insert(struct super_block *sb, return ret; } +static void update_item_value(struct scoutfs_btree_block *bt, + struct scoutfs_btree_item *item, + void *val, unsigned val_len) +{ + le16_add_cpu(&bt->total_item_bytes, val_bytes(val_len) - + val_bytes(le16_to_cpu(item->val_len))); + delete_value(bt, le16_to_cpu(item->val_off), + le16_to_cpu(item->val_len)); + item->val_off = insert_value(bt, ptr_off(bt, item), val, val_len); + item->val_len = cpu_to_le16(val_len); +} + /* * Update a btree item. -ENOENT is returned if the item didn't exist. * @@ -1053,10 +1310,9 @@ int scoutfs_btree_update(struct super_block *sb, struct scoutfs_key *key, void *val, unsigned val_len) { + struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct scoutfs_block *bl; - int pos; - int cmp; int ret; if (invalid_item(val_len)) @@ -1066,10 +1322,10 @@ int scoutfs_btree_update(struct super_block *sb, val_len, &bl, NULL); if (ret == 0) { bt = bl->data; - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - delete_item(bt, pos); - create_item(bt, pos, key, val, val_len); + + item = leaf_item_hash_search(bt, key); + if (item) { + update_item_value(bt, item, val, val_len); ret = 0; } else { ret = -ENOENT; @@ -1092,9 +1348,10 @@ int scoutfs_btree_force(struct super_block *sb, struct scoutfs_key *key, void *val, unsigned val_len) { + struct scoutfs_btree_item *item; + struct scoutfs_avl_node *par; struct scoutfs_btree_block *bt; struct scoutfs_block *bl; - int pos; int cmp; int ret; @@ -1105,10 +1362,17 @@ int scoutfs_btree_force(struct super_block *sb, val_len, &bl, NULL); if (ret == 0) { bt = bl->data; - pos = find_pos(bt, key, &cmp); - if (cmp == 0) - delete_item(bt, pos); - create_item(bt, pos, key, val, val_len); + + item = leaf_item_hash_search(bt, key); + if (item) { + update_item_value(bt, item, val, val_len); + } else { + scoutfs_avl_search(&bt->item_root, cmp_key_item, key, + &cmp, &par, NULL, NULL); + create_item(bt, key, val, val_len, par, cmp); + } + ret = 0; + scoutfs_block_put(sb, bl); } @@ -1125,19 +1389,19 @@ int scoutfs_btree_delete(struct super_block *sb, struct scoutfs_btree_root *root, struct scoutfs_key *key) { + struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct scoutfs_block *bl; - int pos; - int cmp; int ret; ret = btree_walk(sb, alloc, wri, root, BTW_DELETE | BTW_DIRTY, key, 0, &bl, NULL); if (ret == 0) { bt = bl->data; - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - if (le32_to_cpu(bt->nr_items) == 1) { + + item = leaf_item_hash_search(bt, key); + if (item) { + if (le16_to_cpu(bt->nr_items) == 1) { /* remove final empty block */ ret = scoutfs_radix_free(sb, alloc, wri, bl->blkno); @@ -1147,7 +1411,7 @@ int scoutfs_btree_delete(struct super_block *sb, root->ref.seq = 0; } } else { - delete_item(bt, pos); + delete_item(bt, item, NULL); ret = 0; } } else { @@ -1162,8 +1426,8 @@ int scoutfs_btree_delete(struct super_block *sb, /* * Iterate from a key value to the next item in the direction of - * iteration. Callers set flags to tell which way to iterate and - * whether the search key is inclusive, or not. + * iteration. Callers set flags to tell which way to iterate. The + * first key is always inclusive. * * Walking can land in a leaf that doesn't contain any items in the * direction of the iteration. Walking gives us the next key to walk @@ -1176,13 +1440,14 @@ static int btree_iter(struct super_block *sb,struct scoutfs_btree_root *root, int flags, struct scoutfs_key *key, struct scoutfs_btree_item_ref *iref) { + struct scoutfs_avl_node *node; + struct scoutfs_avl_node *next; + struct scoutfs_avl_node *prev; struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; struct scoutfs_key iter_key; struct scoutfs_key walk_key; - int pos; - int cmp; + struct scoutfs_block *bl; int ret; if (WARN_ON_ONCE(flags & BTW_DIRTY) || @@ -1199,19 +1464,15 @@ static int btree_iter(struct super_block *sb,struct scoutfs_btree_root *root, break; bt = bl->data; - pos = find_pos(bt, key, &cmp); + node = scoutfs_avl_search(&bt->item_root, cmp_key_item, key, + NULL, NULL, &next, &prev); - /* point pos towards iteration, find_pos already for _NEXT */ - if ((flags & BTW_AFTER) && cmp == 0) - pos++; - else if ((flags & BTW_PREV) && cmp < 0) - pos--; - else if ((flags & BTW_BEFORE) && cmp == 0) - pos--; - - /* found the next item in this leaf */ - if (pos >= 0 && pos < le32_to_cpu(bt->nr_items)) { - item = pos_item(bt, pos); + if (node == NULL && (flags & BTW_NEXT)) + node = next; + else if (node == NULL && (flags & BTW_PREV)) + node = prev; + item = node_item(node); + if (item) { init_item_ref(iref, sb, bl, item); ret = 0; break; @@ -1274,16 +1535,17 @@ int scoutfs_btree_dirty(struct super_block *sb, struct scoutfs_btree_root *root, struct scoutfs_key *key) { + struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct scoutfs_block *bl; - int cmp; int ret; ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl, NULL); if (ret == 0) { bt = bl->data; - find_pos(bt, key, &cmp); - if (cmp == 0) + + item = leaf_item_hash_search(bt, key); + if (item) ret = 0; else ret = -ENOENT; diff --git a/kmod/src/format.h b/kmod/src/format.h index 413774da..e8b4ec86 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -196,17 +196,10 @@ struct scoutfs_avl_node { } __packed; /* when we split we want to have multiple items on each side */ -#define SCOUTFS_BTREE_MAX_VAL_LEN (SCOUTFS_BLOCK_SIZE / 8) +#define SCOUTFS_BTREE_MAX_VAL_LEN 512 -/* - * The min number of free bytes we must leave in a parent as we descend - * to modify. This guarantees enough free bytes in a parent to insert a - * new child reference item as a child block splits. - */ -#define SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES \ - (sizeof(struct scoutfs_btree_item_header) + \ - sizeof(struct scoutfs_btree_item) + \ - sizeof(struct scoutfs_btree_ref)) +/* each value ends with an offset which lets compaction iterate over values */ +#define SCOUTFS_BTREE_VAL_OWNER_BYTES sizeof(__le16) /* * When debugging we can tune the splitting and merging thresholds to @@ -236,24 +229,37 @@ struct scoutfs_btree_root { __u8 height; } __packed; -struct scoutfs_btree_item_header { - __le32 off; -} __packed; - struct scoutfs_btree_item { + struct scoutfs_avl_node node; struct scoutfs_key key; + __le16 val_off; __le16 val_len; - __u8 val[0]; } __packed; struct scoutfs_btree_block { struct scoutfs_block_header hdr; - __le32 free_end; - __le32 nr_items; + struct scoutfs_avl_root item_root; + __le16 nr_items; + __le16 total_item_bytes; + __le16 mid_free_len; + __le16 last_free_off; + __le16 last_free_len; __u8 level; - struct scoutfs_btree_item_header item_hdrs[0]; + struct scoutfs_btree_item items[0]; + /* leaf blocks have a fixed size item offset hash table at the end */ } __packed; +/* + * Try to aim for a 75% load in a leaf full of items with no value. + * We'll almost never see this because most items have values and most + * blocks aren't full. + */ +#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR \ + ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / \ + (sizeof(struct scoutfs_btree_item) + (sizeof(__le16))) * 100 / 75) +#define SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES \ + (SCOUTFS_BTREE_LEAF_ITEM_HASH_NR * sizeof(__le16)) + struct scoutfs_mounted_client_btree_val { __u8 flags; } __packed;