scoutfs-utils: support btree avl and hash

Update the internal structure of btree blocks to use the avl item index
and hash table direct item lookup.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2020-04-30 12:00:15 -07:00
committed by Zach Brown
parent aa84f7c601
commit b86a1bebbb
7 changed files with 237 additions and 52 deletions

40
utils/src/avl.c Normal file
View File

@@ -0,0 +1,40 @@
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "avl.h"
/*
 * Translate a stored node offset into a pointer.  Offsets are byte
 * distances from the address of the avl root itself and an offset of
 * zero means "no node", which is returned as NULL.
 */
static struct scoutfs_avl_node *node_ptr(struct scoutfs_avl_root *root,
					 __le16 off)
{
	if (off == 0)
		return NULL;

	return (void *)root + le16_to_cpu(off);
}
/*
 * Return the leftmost node reachable from the root, which is the first
 * node of an in-order walk.  Returns NULL when the root references no
 * node.
 */
struct scoutfs_avl_node *avl_first(struct scoutfs_avl_root *root)
{
	struct scoutfs_avl_node *cur = node_ptr(root, root->node);

	if (cur == NULL)
		return NULL;

	/* keep descending while there is a left child to follow */
	while (cur->left)
		cur = node_ptr(root, cur->left);

	return cur;
}
/*
 * Return the in-order successor of @node, or NULL if @node is the last
 * node in the tree.  @node must not be NULL.
 */
struct scoutfs_avl_node *avl_next(struct scoutfs_avl_root *root,
				  struct scoutfs_avl_node *node)
{
	struct scoutfs_avl_node *succ;
	struct scoutfs_avl_node *up;

	if (node->right) {
		/* successor is the leftmost node of the right subtree */
		succ = node_ptr(root, node->right);
		while (succ->left)
			succ = node_ptr(root, succ->left);
		return succ;
	}

	/*
	 * Otherwise climb towards the root until we step up out of a
	 * left child; that parent is the successor.  Running out of
	 * parents means there is no successor.
	 */
	for (;;) {
		up = node_ptr(root, node->parent);
		if (up == NULL || node != node_ptr(root, up->right))
			return up;
		node = up;
	}
}

8
utils/src/avl.h Normal file
View File

@@ -0,0 +1,8 @@
#ifndef _AVL_H_
#define _AVL_H_
/*
 * Return the first node of an in-order walk of the tree (the leftmost
 * node), or NULL if the root does not reference any node.
 */
struct scoutfs_avl_node *avl_first(struct scoutfs_avl_root *root);
/*
 * Return the node that follows @node in an in-order walk of the tree,
 * or NULL if @node is the last node.  @node is dereferenced and must
 * not be NULL.
 */
struct scoutfs_avl_node *avl_next(struct scoutfs_avl_root *root,
struct scoutfs_avl_node *node);
#endif

View File

@@ -184,26 +184,22 @@ struct scoutfs_radix_root {
~(__u64)SCOUTFS_RADIX_LG_MASK)
#define SCOUTFS_RADIX_BITS_BYTES (SCOUTFS_RADIX_BITS / 8)
/*
 * Root of an avl tree whose nodes are referenced by offset rather than
 * by pointer.  Offsets are little-endian byte distances measured from
 * the address of this root struct; an offset of 0 means there is no
 * node.
 */
struct scoutfs_avl_root {
__le16 node;	/* offset of the root node, 0 when the tree is empty */
} __packed;
/*
 * A node in an offset-linked avl tree.  The parent, left and right
 * fields are little-endian byte offsets from the tree's
 * scoutfs_avl_root, with 0 meaning the link is absent.
 */
struct scoutfs_avl_node {
__le16 parent;	/* offset of the parent node, 0 for the root node */
__le16 left;	/* offset of the left child, 0 if none */
__le16 right;	/* offset of the right child, 0 if none */
__u8 height;	/* mkfs stores 1 for a childless node and 2 for its parent;
		 * presumably the subtree height used for balancing */
} __packed;
/* when we split we want to have multiple items on each side */
#define SCOUTFS_BTREE_MAX_VAL_LEN (SCOUTFS_BLOCK_SIZE / 8)
#define SCOUTFS_BTREE_MAX_VAL_LEN 512
/*
* The min number of free bytes we must leave in a parent as we descend
* to modify. This guarantees enough free bytes in a parent to insert a
* new child reference item as a child block splits.
*/
#define SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES \
(sizeof(struct scoutfs_btree_item_header) + \
sizeof(struct scoutfs_btree_item) + \
sizeof(struct scoutfs_btree_ref))
/*
* When debugging we can tune the splitting and merging thresholds to
* create much larger trees by having blocks with many fewer items. We
* implement this by pretending the blocks are tiny. They're still
* large enough for a handful of items.
*/
#define SCOUTFS_BTREE_TINY_BLOCK_SIZE 512
/* each value ends with an offset which lets compaction iterate over values */
#define SCOUTFS_BTREE_VAL_OWNER_BYTES sizeof(__le16)
/*
* A 4EB test image measured a worst case height of 17. This is plenty
@@ -225,24 +221,37 @@ struct scoutfs_btree_root {
__u8 height;
} __packed;
struct scoutfs_btree_item_header {
__le32 off;
} __packed;
struct scoutfs_btree_item {
struct scoutfs_avl_node node;
struct scoutfs_key key;
__le16 val_off;
__le16 val_len;
__u8 val[0];
} __packed;
struct scoutfs_btree_block {
struct scoutfs_block_header hdr;
__le32 free_end;
__le32 nr_items;
struct scoutfs_avl_root item_root;
__le16 nr_items;
__le16 total_item_bytes;
__le16 mid_free_len;
__le16 last_free_off;
__le16 last_free_len;
__u8 level;
struct scoutfs_btree_item_header item_hdrs[0];
struct scoutfs_btree_item items[0];
/* leaf blocks have a fixed size item offset hash table at the end */
} __packed;
/*
* Try to aim for a 75% load in a leaf full of items with no value.
* We'll almost never see this because most items have values and most
* blocks aren't full.
*/
#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR \
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / \
(sizeof(struct scoutfs_btree_item) + (sizeof(__le16))) * 100 / 75)
#define SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES \
(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR * sizeof(__le16))
struct scoutfs_mounted_client_btree_val {
__u8 flags;
} __packed;

View File

@@ -0,0 +1,39 @@
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "crc.h"
#include "leaf_item_hash.h"
/*
* A minimal extraction of the leaf item hash from the kernel's btree.
*/
/*
 * Return the index of the bucket that a key hashes to: the crc32c of
 * the raw key bytes reduced modulo the number of buckets.
 */
int leaf_item_hash_ind(struct scoutfs_key *key)
{
	int ind;

	ind = crc32c(~0, key, sizeof(*key)) % SCOUTFS_BTREE_LEAF_ITEM_HASH_NR;

	return ind;
}
/*
 * Return a pointer to the first bucket of the item hash table, which
 * occupies the final SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES of the block.
 */
__le16 *leaf_item_hash_buckets(struct scoutfs_btree_block *bt)
{
	void *block_end = (void *)bt + SCOUTFS_BLOCK_SIZE;

	return block_end - SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES;
}
/*
 * Record an item's offset in the leaf's item hash table.
 *
 * The table uses open addressing with linear probing: starting at the
 * bucket the key hashes to we take the first empty (zero) bucket.  The
 * probe wraps around the end of the table so that every bucket is
 * considered exactly once; previously the probe stopped at the last
 * bucket and silently dropped the item even when earlier buckets were
 * free.  If the table is completely full the item is not recorded.
 *
 * Only leaf blocks (level 0) carry a hash table, so this does nothing
 * for parent blocks.
 *
 * @bt:  btree block whose hash table is updated
 * @key: key of the item being inserted, used to pick the home bucket
 * @off: little-endian offset to store in the bucket; must be non-zero
 *       because zero marks an empty bucket
 *
 * NOTE(review): readers of the table must probe with the same
 * wrap-around for wrapped entries to be found -- confirm against the
 * kernel's lookup.
 */
void leaf_item_hash_insert(struct scoutfs_btree_block *bt,
			   struct scoutfs_key *key, __le16 off)
{
	__le16 *buckets = leaf_item_hash_buckets(bt);
	int remaining = SCOUTFS_BTREE_LEAF_ITEM_HASH_NR;
	int i;

	if (bt->level > 0)
		return;

	for (i = leaf_item_hash_ind(key); remaining-- > 0;
	     i = (i + 1) % (int)SCOUTFS_BTREE_LEAF_ITEM_HASH_NR) {
		if (buckets[i] == 0) {
			buckets[i] = off;
			return;
		}
	}
}

View File

@@ -0,0 +1,9 @@
#ifndef _LEAF_ITEM_HASH_H_
#define _LEAF_ITEM_HASH_H_
/* Return the index of the hash bucket that @key hashes to. */
int leaf_item_hash_ind(struct scoutfs_key *key);
/* Return the start of the item hash table stored at the end of block @bt. */
__le16 *leaf_item_hash_buckets(struct scoutfs_btree_block *bt);
/*
 * Store @off in an empty bucket found by probing forward from the bucket
 * that @key hashes to.  Does nothing when @bt's level is greater than 0.
 */
void leaf_item_hash_insert(struct scoutfs_btree_block *bt,
struct scoutfs_key *key, __le16 off);
#endif

View File

@@ -27,6 +27,7 @@
#include "key.h"
#include "bitops.h"
#include "radix.h"
#include "leaf_item_hash.h"
static int write_raw_block(int fd, u64 blkno, void *blk)
{
@@ -290,9 +291,11 @@ static int write_new_fs(char *path, int fd, u8 quorum_count)
struct scoutfs_inode *inode;
struct scoutfs_btree_block *bt;
struct scoutfs_btree_item *btitem;
struct scoutfs_avl_node *par;
struct scoutfs_key *key;
struct timeval tv;
char uuid_str[37];
__le16 *own;
void *zeros;
u64 blkno;
u64 limit;
@@ -370,28 +373,43 @@ static int write_new_fs(char *path, int fd, u8 quorum_count)
bt->hdr.fsid = super->hdr.fsid;
bt->hdr.blkno = cpu_to_le64(blkno);
bt->hdr.seq = cpu_to_le64(1);
bt->nr_items = cpu_to_le32(2);
/* btree item allocated from the back of the block */
key = (void *)bt + SCOUTFS_BLOCK_SIZE - sizeof(*key);
btitem = (void *)key - sizeof(*btitem);
/* meta seq index for the root inode */
btitem = &bt->items[le16_to_cpu(bt->nr_items)];
le16_add_cpu(&bt->nr_items, 1);
key = &btitem->key;
bt->item_hdrs[0].off = cpu_to_le32((long)btitem - (long)bt);
bt->item_root.node = cpu_to_le16((void *)&btitem->node -
(void *)&bt->item_root);
btitem->node.height = 2;
btitem->val_len = cpu_to_le16(0);
memset(key, 0, sizeof(*key));
key->sk_zone = SCOUTFS_INODE_INDEX_ZONE;
key->sk_type = SCOUTFS_INODE_INDEX_META_SEQ_TYPE;
key->skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
inode = (void *)btitem - sizeof(*inode);
key = (void *)inode - sizeof(*key);
btitem = (void *)key - sizeof(*btitem);
leaf_item_hash_insert(bt, &btitem->key,
cpu_to_le16((void *)btitem - (void *)bt));
bt->item_hdrs[1].off = cpu_to_le32((long)btitem - (long)bt);
/* root inode */
par = &btitem->node;
btitem = &bt->items[le16_to_cpu(bt->nr_items)];
le16_add_cpu(&bt->nr_items, 1);
key = &btitem->key;
own = (void *)bt + SCOUTFS_BLOCK_SIZE -
SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES -
SCOUTFS_BTREE_VAL_OWNER_BYTES;
inode = (void *)own - sizeof(*inode);
par->right = cpu_to_le16((void *)&btitem->node -
(void *)&bt->item_root);
btitem->node.height = 1;
btitem->node.parent = cpu_to_le16((void *)par - (void *)&bt->item_root);
btitem->val_off = cpu_to_le16((void *)inode - (void *)bt);
btitem->val_len = cpu_to_le16(sizeof(*inode));
le16_add_cpu(&bt->total_item_bytes, le16_to_cpu(btitem->val_len) +
SCOUTFS_BTREE_VAL_OWNER_BYTES);
memset(key, 0, sizeof(*key));
key->sk_zone = SCOUTFS_FS_ZONE;
key->ski_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
key->sk_type = SCOUTFS_INODE_TYPE;
@@ -406,8 +424,14 @@ static int write_new_fs(char *path, int fd, u8 quorum_count)
inode->mtime.sec = inode->atime.sec;
inode->mtime.nsec = inode->atime.nsec;
bt->free_end = bt->item_hdrs[le32_to_cpu(bt->nr_items) - 1].off;
leaf_item_hash_insert(bt, &btitem->key,
cpu_to_le16((void *)btitem - (void *)bt));
*own = cpu_to_le16((void *)btitem - (void *)bt);
le16_add_cpu(&bt->total_item_bytes, le16_to_cpu(bt->nr_items) *
sizeof(struct scoutfs_btree_item));
bt->mid_free_len = cpu_to_le16((void *)inode -
(void *)&bt->items[le16_to_cpu(bt->nr_items)]);
bt->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE);
bt->hdr.crc = cpu_to_le32(crc_block(&bt->hdr));

View File

@@ -21,6 +21,8 @@
#include "crc.h"
#include "key.h"
#include "radix.h"
#include "avl.h"
#include "leaf_item_hash.h"
static void *read_block(int fd, u64 blkno)
{
@@ -395,14 +397,48 @@ static int print_btree_ref(struct scoutfs_key *key, void *val,
return 0;
}
/*
 * Print the populated buckets of a leaf block's item hash table as
 * "bucket,offset" pairs, wrapping the output before it reaches 78
 * columns, followed by a summary of how many buckets are in use.
 *
 * The column is advanced by the width of every pair that is printed,
 * including the first pair on a wrapped line, so continuation lines
 * are held to the same width as the first.  The list is always
 * terminated with a newline; previously a wrap on the final pair left
 * the summary glued to the end of the last line of pairs.
 */
static void print_leaf_item_hash(struct scoutfs_btree_block *bt)
{
	unsigned int total = SCOUTFS_BTREE_LEAF_ITEM_HASH_NR;
	unsigned int off;
	unsigned int nr;
	unsigned int i;
	__le16 *b;
	int len;
	int col;

	/* print the leaf item hash */
	printf(" item hash: ");
	col = 13;

	b = leaf_item_hash_buckets(bt);
	nr = 0;
	for (i = 0; i < total; i++) {
		if (b[i] == 0)
			continue;

		nr++;
		off = le16_to_cpu(b[i]);

		/* measure the pair first so we can wrap before printing it */
		len = snprintf(NULL, 0, "%u,%u ", i, off);
		if (col + len >= 78) {
			printf("\n ");
			col = 3;
		}
		printf("%u,%u ", i, off);
		col += len;
	}

	/* the cursor is always mid-line here, even with no buckets printed */
	printf("\n");

	printf(" (%u / %u populated, %u%% load)\n",
	       nr, total, nr * 100 / total);
}
static int print_btree_block(int fd, struct scoutfs_super_block *super,
char *which, struct scoutfs_btree_ref *ref,
print_item_func func, void *arg, u8 level)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *node;
struct scoutfs_btree_block *bt;
struct scoutfs_key *key;
unsigned val_len;
unsigned int val_len;
unsigned int off;
void *val;
int ret;
int i;
@@ -414,22 +450,35 @@ static int print_btree_block(int fd, struct scoutfs_super_block *super,
if (bt->level == level) {
printf("%s btree blkno %llu\n"
" crc %08x fsid %llx seq %llu blkno %llu \n"
" level %u free_end %u nr_items %u\n",
" total_item_bytes %u mid_free_len %u last_free_off %u "
"last_free_len %u\n"
" level %u nr_items %u item_root.node %u\n",
which, le64_to_cpu(ref->blkno),
le32_to_cpu(bt->hdr.crc),
le64_to_cpu(bt->hdr.fsid),
le64_to_cpu(bt->hdr.seq),
le64_to_cpu(bt->hdr.blkno),
le16_to_cpu(bt->total_item_bytes),
le16_to_cpu(bt->mid_free_len),
le16_to_cpu(bt->last_free_off),
le16_to_cpu(bt->last_free_len),
bt->level,
le32_to_cpu(bt->free_end),
le32_to_cpu(bt->nr_items));
le16_to_cpu(bt->nr_items),
le16_to_cpu(bt->item_root.node));
if (bt->level == 0)
print_leaf_item_hash(bt);
}
for (i = 0; i < le32_to_cpu(bt->nr_items); i++) {
item = (void *)bt + le32_to_cpu(bt->item_hdrs[i].off);
for (i = 0, node = avl_first(&bt->item_root);
node;
i++, node = avl_next(&bt->item_root, node)) {
item = container_of(node, struct scoutfs_btree_item, node);
off = (void *)item - (void *)bt;
val_len = le16_to_cpu(item->val_len);
key = &item->key;
val = item->val;
val = (void *)bt + le16_to_cpu(item->val_off);
if (level < bt->level) {
ref = val;
@@ -443,8 +492,12 @@ static int print_btree_block(int fd, struct scoutfs_super_block *super,
continue;
}
printf(" item [%u] off %u val_len %u\n",
i, le32_to_cpu(bt->item_hdrs[i].off), val_len);
printf(" [%u] off %u par %u l %u r %u h %u vo %u vl %u\n",
i, off, le16_to_cpu(item->node.parent),
le16_to_cpu(item->node.left),
le16_to_cpu(item->node.right),
item->node.height, le16_to_cpu(item->val_off),
val_len);
if (level)
print_btree_ref(key, val, val_len, func, arg);
@@ -589,12 +642,12 @@ static int print_btree_leaf_items(int fd, struct scoutfs_super_block *super,
print_item_func func, void *arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *node;
struct scoutfs_btree_block *bt;
unsigned val_len;
void *key;
void *val;
int ret;
int i;
if (ref->blkno == 0)
return 0;
@@ -603,11 +656,12 @@ static int print_btree_leaf_items(int fd, struct scoutfs_super_block *super,
if (!bt)
return -ENOMEM;
for (i = 0; i < le32_to_cpu(bt->nr_items); i++) {
item = (void *)bt + le32_to_cpu(bt->item_hdrs[i].off);
node = avl_first(&bt->item_root);
while (node) {
item = container_of(node, struct scoutfs_btree_item, node);
val_len = le16_to_cpu(item->val_len);
key = (void *)(item + 1);
val = (void *)(key + 1);
key = &item->key;
val = (void *)bt + le16_to_cpu(item->val_off);
if (bt->level > 0) {
ret = print_btree_leaf_items(fd, super, val, func, arg);
@@ -617,6 +671,8 @@ static int print_btree_leaf_items(int fd, struct scoutfs_super_block *super,
} else {
func(key, val, val_len, arg);
}
node = avl_next(&bt->item_root, node);
}
free(bt);