From 3776c18c66145a820719d54da6b9bdbff13b46d0 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 30 Sep 2019 11:26:36 -0700 Subject: [PATCH] scoutfs-utils: switch to btree forest Remove all the lsm code from mkfs and print, replacing it with the forest of btrees. Signed-off-by: Zach Brown --- utils/src/crc.c | 9 - utils/src/crc.h | 1 - utils/src/format.h | 237 ++++++++--------------- utils/src/ioctl.h | 32 +-- utils/src/item-cache-keys.c | 92 --------- utils/src/mkfs.c | 298 ++++++++++------------------ utils/src/print.c | 376 ++++++++++++++++++++++-------------- 7 files changed, 414 insertions(+), 631 deletions(-) delete mode 100644 utils/src/item-cache-keys.c diff --git a/utils/src/crc.c b/utils/src/crc.c index 714afa90..38640fbc 100644 --- a/utils/src/crc.c +++ b/utils/src/crc.c @@ -37,12 +37,3 @@ u32 crc_block(struct scoutfs_block_header *hdr) return crc32c(~0, (char *)hdr + sizeof(hdr->crc), SCOUTFS_BLOCK_SIZE - sizeof(hdr->crc)); } - -u32 crc_segment(struct scoutfs_segment_block *sblk) -{ - u32 off = offsetof(struct scoutfs_segment_block, _padding) + - sizeof(sblk->_padding); - - return crc32c(~0, (char *)sblk + off, - le32_to_cpu(sblk->total_bytes) - off); -} diff --git a/utils/src/crc.h b/utils/src/crc.h index a928bf0a..6878bf2f 100644 --- a/utils/src/crc.h +++ b/utils/src/crc.h @@ -8,6 +8,5 @@ u32 crc32c(u32 crc, const void *data, unsigned int len); u64 crc32c_64(u32 crc, const void *data, unsigned int len); u32 crc_block(struct scoutfs_block_header *hdr); -u32 crc_segment(struct scoutfs_segment_block *seg); #endif diff --git a/utils/src/format.h b/utils/src/format.h index 2534fe16..ad408c39 100644 --- a/utils/src/format.h +++ b/utils/src/format.h @@ -7,6 +7,7 @@ /* block header magic values, chosen at random */ #define SCOUTFS_BLOCK_MAGIC_SUPER 0x103c428b #define SCOUTFS_BLOCK_MAGIC_BTREE 0xe597f96d +#define SCOUTFS_BLOCK_MAGIC_BLOOM 0x31995604 /* * The super block and btree blocks are fixed 4k. 
@@ -19,18 +20,6 @@ #define SCOUTFS_BLOCK_SECTORS (1 << SCOUTFS_BLOCK_SECTOR_SHIFT) #define SCOUTFS_BLOCK_MAX (U64_MAX >> SCOUTFS_BLOCK_SHIFT) -/* - * FS data is stored in segments, for now they're fixed size. They'll - * be dynamic. - */ -#define SCOUTFS_SEGMENT_SHIFT 20 -#define SCOUTFS_SEGMENT_SIZE (1 << SCOUTFS_SEGMENT_SHIFT) -#define SCOUTFS_SEGMENT_MASK (SCOUTFS_SEGMENT_SIZE - 1) -#define SCOUTFS_SEGMENT_PAGES (SCOUTFS_SEGMENT_SIZE / PAGE_SIZE) -#define SCOUTFS_SEGMENT_BLOCKS (SCOUTFS_SEGMENT_SIZE / SCOUTFS_BLOCK_SIZE) -#define SCOUTFS_SEGMENT_BLOCK_SHIFT \ - (SCOUTFS_SEGMENT_SHIFT - SCOUTFS_BLOCK_SHIFT) - #define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE) #define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT) @@ -162,7 +151,7 @@ struct scoutfs_key_be { /* chose reasonable max key and value lens that have room for some u64s */ #define SCOUTFS_BTREE_MAX_KEY_LEN 40 -#define SCOUTFS_BTREE_MAX_VAL_LEN 64 +#define SCOUTFS_BTREE_MAX_VAL_LEN 256 /* * The min number of free bytes we must leave in a parent as we descend @@ -198,19 +187,14 @@ struct scoutfs_btree_ref { /* * A height of X means that the first block read will have level X-1 and * the leaves will have level 0. - * - * The migration key is used to walk the tree finding old blocks to migrate - * into the current half of the ring. 
*/ struct scoutfs_btree_root { struct scoutfs_btree_ref ref; __u8 height; - __le16 migration_key_len; - __u8 migration_key[SCOUTFS_BTREE_MAX_KEY_LEN]; } __packed; struct scoutfs_btree_item_header { - __le16 off; + __le32 off; } __packed; struct scoutfs_btree_item { @@ -221,52 +205,32 @@ struct scoutfs_btree_item { struct scoutfs_btree_block { struct scoutfs_block_header hdr; - __le16 free_end; - __le16 free_reclaim; - __le16 nr_items; + __le32 free_end; + __le32 nr_items; __u8 level; struct scoutfs_btree_item_header item_hdrs[0]; } __packed; -struct scoutfs_btree_ring { - __le64 first_blkno; - __le64 nr_blocks; - __le64 next_block; - __le64 next_seq; -} __packed; - /* - * This is absurdly huge. If there was only ever 1 item per segment and - * 2^64 items the tree could get this deep. + * Free metadata blocks are tracked by block allocator items. */ -#define SCOUTFS_MANIFEST_MAX_LEVEL 20 - -#define SCOUTFS_MANIFEST_FANOUT 10 - -struct scoutfs_manifest { +struct scoutfs_balloc_root { struct scoutfs_btree_root root; - __le64 level_counts[SCOUTFS_MANIFEST_MAX_LEVEL]; + __le64 total_free; +} __packed; +struct scoutfs_balloc_item_key { + __be64 base; } __packed; -/* - * Manifest entries are split across btree keys and values. Putting - * some entry fields in the value keeps the key smaller and increases - * the fanout of the btree which keeps the tree smaller and reduces - * block IO. - * - * The key is made up of the level, first key, and seq. At level 0 - * segments can completely overlap and have identical key ranges but we - * avoid duplicate btree keys by including the unique seq. 
- */ -struct scoutfs_manifest_btree_key { - __u8 level; - struct scoutfs_key_be first_key; - __be64 seq; -} __packed; +#define SCOUTFS_BALLOC_ITEM_BYTES 256 +#define SCOUTFS_BALLOC_ITEM_U64S (SCOUTFS_BALLOC_ITEM_BYTES / \ + sizeof(__u64)) +#define SCOUTFS_BALLOC_ITEM_BITS (SCOUTFS_BALLOC_ITEM_BYTES * 8) +#define SCOUTFS_BALLOC_ITEM_BASE_SHIFT ilog2(SCOUTFS_BALLOC_ITEM_BITS) +#define SCOUTFS_BALLOC_ITEM_BIT_MASK (SCOUTFS_BALLOC_ITEM_BITS - 1) -struct scoutfs_manifest_btree_val { - __le64 segno; - struct scoutfs_key last_key; +struct scoutfs_balloc_item_val { + __le64 bits[SCOUTFS_BALLOC_ITEM_U64S]; } __packed; /* @@ -312,50 +276,61 @@ struct scoutfs_mounted_client_btree_val { #define SCOUTFS_MOUNTED_CLIENT_VOTER (1 << 0) -/* - * The max number of links defines the max number of entries that we can - * index in o(log n) and the static list head storage size in the - * segment block. We always pay the static storage cost, which is tiny, - * and we can look at the number of items to know the greatest number of - * links and skip most of the initial 0 links. - */ -#define SCOUTFS_MAX_SKIP_LINKS 32 +struct scoutfs_log_trees { + struct scoutfs_balloc_root alloc_root; + struct scoutfs_balloc_root free_root; + struct scoutfs_btree_root item_root; + struct scoutfs_btree_ref bloom_ref; + __le64 rid; + __le64 nr; +} __packed; -/* - * Items are packed into segments and linked together in a skip list. - * Each item's header, links, key, and value are stored contiguously. - * They're not allowed to cross a block boundary. 
- */ -struct scoutfs_segment_item { - struct scoutfs_key key; - __le16 val_len; +struct scoutfs_log_trees_key { + __be64 rid; + __be64 nr; +} __packed; + +struct scoutfs_log_trees_val { + struct scoutfs_balloc_root alloc_root; + struct scoutfs_balloc_root free_root; + struct scoutfs_btree_root item_root; + struct scoutfs_btree_ref bloom_ref; +} __packed; + +struct scoutfs_log_item_value { + __le64 vers; __u8 flags; - __u8 nr_links; - __le32 skip_links[0]; - /* __u8 val_bytes[val_len] */ + __u8 data[0]; } __packed; -#define SCOUTFS_ITEM_FLAG_DELETION (1 << 0) - /* - * Each large segment starts with a segment block that describes the - * rest of the blocks that make up the segment. - * - * The crc covers the initial total_bytes of the segment but starts - * after the padding. + * FS items are limited by the max btree value length with the log item + * value header. */ -struct scoutfs_segment_block { - __le32 crc; - __le32 _padding; - __le64 segno; - __le64 seq; - __le32 last_item_off; - __le32 total_bytes; - __le32 nr_items; - __le32 skip_links[SCOUTFS_MAX_SKIP_LINKS]; - /* packed items */ +#define SCOUTFS_MAX_VAL_SIZE \ + (SCOUTFS_BTREE_MAX_VAL_LEN - sizeof(struct scoutfs_log_item_value)) + +#define SCOUTFS_LOG_ITEM_FLAG_DELETION (1 << 0) + +struct scoutfs_bloom_block { + struct scoutfs_block_header hdr; + __le64 total_set; + __le64 bits[0]; } __packed; +/* + * Log trees include a tree of items that make up a fixed size bloom + * filter. Just a few megs worth of items lets us test for the presence + * of locks that cover billions of files with a .1% chance of false + * positives. The log trees should be finalized and merged long before + * the bloom filters fill up and start returning excessive false positives. 
+ */ +#define SCOUTFS_FOREST_BLOOM_NRS 7 +#define SCOUTFS_FOREST_BLOOM_BITS \ + (((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_bloom_block)) / \ + member_sizeof(struct scoutfs_bloom_block, bits[0])) * \ + member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8) \ + /* * Keys are first sorted by major key zones. */ @@ -475,18 +450,21 @@ struct scoutfs_super_block { __le64 next_ino; __le64 next_trans_seq; __le64 total_blocks; + __le64 next_uninit_free_block; + __le64 core_balloc_cursor; __le64 free_blocks; - __le64 alloc_cursor; - struct scoutfs_btree_ring bring; - __le64 next_seg_seq; - __le64 next_compact_id; + __le64 first_fs_blkno; + __le64 last_fs_blkno; __le64 quorum_fenced_term; __le64 quorum_server_term; __le64 unmount_barrier; __u8 quorum_count; struct scoutfs_inet_addr server_addr; + struct scoutfs_balloc_root core_balloc_alloc; + struct scoutfs_balloc_root core_balloc_free; struct scoutfs_btree_root alloc_root; - struct scoutfs_manifest manifest; + struct scoutfs_btree_root fs_root; + struct scoutfs_btree_root logs_root; struct scoutfs_btree_root lock_clients; struct scoutfs_btree_root trans_seqs; struct scoutfs_btree_root mounted_clients; @@ -594,8 +572,6 @@ enum { DIV_ROUND_UP(sizeof(struct scoutfs_xattr) + name_len + val_len, \ SCOUTFS_XATTR_MAX_PART_SIZE); -#define SCOUTFS_MAX_VAL_SIZE SCOUTFS_XATTR_MAX_PART_SIZE - #define SCOUTFS_LOCK_INODE_GROUP_NR 1024 #define SCOUTFS_LOCK_INODE_GROUP_MASK (SCOUTFS_LOCK_INODE_GROUP_NR - 1) #define SCOUTFS_LOCK_SEQ_GROUP_MASK ((1ULL << 10) - 1) @@ -678,13 +654,11 @@ enum { SCOUTFS_NET_CMD_ALLOC_INODES, SCOUTFS_NET_CMD_ALLOC_EXTENT, SCOUTFS_NET_CMD_FREE_EXTENTS, - SCOUTFS_NET_CMD_ALLOC_SEGNO, - SCOUTFS_NET_CMD_RECORD_SEGMENT, + SCOUTFS_NET_CMD_GET_LOG_TREES, + SCOUTFS_NET_CMD_COMMIT_LOG_TREES, SCOUTFS_NET_CMD_ADVANCE_SEQ, SCOUTFS_NET_CMD_GET_LAST_SEQ, - SCOUTFS_NET_CMD_GET_MANIFEST_ROOT, SCOUTFS_NET_CMD_STATFS, - SCOUTFS_NET_CMD_COMPACT, SCOUTFS_NET_CMD_LOCK, SCOUTFS_NET_CMD_LOCK_RECOVER, SCOUTFS_NET_CMD_FAREWELL, @@ 
-723,20 +697,6 @@ struct scoutfs_net_inode_alloc { __le64 nr; } __packed; -struct scoutfs_net_key_range { - __le16 start_len; - __le16 end_len; - __u8 key_bytes[0]; -} __packed; - -struct scoutfs_net_manifest_entry { - __le64 segno; - __le64 seq; - struct scoutfs_key first; - struct scoutfs_key last; - __u8 level; -} __packed; - struct scoutfs_net_statfs { __le64 total_blocks; /* total blocks in device */ __le64 next_ino; /* next unused inode number */ @@ -763,52 +723,9 @@ struct scoutfs_net_extent_list { /* arbitrarily makes a nice ~1k extent list payload */ #define SCOUTFS_NET_EXTENT_LIST_MAX_NR 64 -/* one upper segment and fanout lower segments */ -#define SCOUTFS_COMPACTION_MAX_INPUT (1 + SCOUTFS_MANIFEST_FANOUT) -/* sticky can split the input and item alignment padding can add a lower */ -#define SCOUTFS_COMPACTION_SEGNO_OVERHEAD 2 -#define SCOUTFS_COMPACTION_MAX_OUTPUT \ - (SCOUTFS_COMPACTION_MAX_INPUT + SCOUTFS_COMPACTION_SEGNO_OVERHEAD) - -/* - * A compact request is sent by the server to the client. It provides - * the input segments and enough allocated segnos to write the results. - * The id uniquely identifies this compaction request and is included in - * the response to clean up its allocated resources. - */ -struct scoutfs_net_compact_request { - __le64 id; - __u8 last_level; - __u8 flags; - __le64 segnos[SCOUTFS_COMPACTION_MAX_OUTPUT]; - struct scoutfs_net_manifest_entry ents[SCOUTFS_COMPACTION_MAX_INPUT]; -} __packed; - -/* - * A sticky compaction has more lower level segments that overlap with - * the end of the upper after the last lower level segment included in - * the compaction. Items left in the upper segment after the last lower - * need to be written to the upper level instead of the lower. The - * upper segment "sticks" in place instead of moving down to the lower - * level. - */ -#define SCOUTFS_NET_COMPACT_FLAG_STICKY (1 << 0) - -/* - * A compact response is sent by the client to the server. 
It describes - * the written output segments that need to be added to the manifest. - * The server compares the response to the request to free unused - * allocated segnos and input manifest entries. An empty response is - * valid and can happen if, say, the upper input segment completely - * deleted all the items in a single overlapping lower segment. - */ -struct scoutfs_net_compact_response { - __le64 id; - struct scoutfs_net_manifest_entry ents[SCOUTFS_COMPACTION_MAX_OUTPUT]; -} __packed; - struct scoutfs_net_lock { struct scoutfs_key key; + __le64 write_version; __u8 old_mode; __u8 new_mode; } __packed; diff --git a/utils/src/ioctl.h b/utils/src/ioctl.h index 5693668e..df0c1b54 100644 --- a/utils/src/ioctl.h +++ b/utils/src/ioctl.h @@ -238,28 +238,6 @@ struct scoutfs_ioctl_stat_more { struct scoutfs_ioctl_stat_more) -/* - * Fills the buffer with either the keys for the cached items or the - * keys for the cached ranges found starting with the given key. The - * number of keys filled in the buffer is returned. When filling range - * keys the returned number will always be a multiple of two. 
- */ -struct scoutfs_ioctl_item_cache_keys { - struct scoutfs_ioctl_key ikey; - __u64 buf_ptr; - __u16 buf_nr; - __u8 which; - __u8 _pad[21]; /* padded to align _ioctl_key total size */ -}; - -enum { - SCOUTFS_IOC_ITEM_CACHE_KEYS_ITEMS = 0, - SCOUTFS_IOC_ITEM_CACHE_KEYS_RANGES, -}; - -#define SCOUTFS_IOC_ITEM_CACHE_KEYS _IOR(SCOUTFS_IOCTL_MAGIC, 6, \ - struct scoutfs_ioctl_item_cache_keys) - struct scoutfs_ioctl_data_waiting_entry { __u64 ino; __u64 iblock; @@ -283,7 +261,7 @@ struct scoutfs_ioctl_data_waiting { #define SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN (U8_MAX << 0) -#define SCOUTFS_IOC_DATA_WAITING _IOR(SCOUTFS_IOCTL_MAGIC, 7, \ +#define SCOUTFS_IOC_DATA_WAITING _IOR(SCOUTFS_IOCTL_MAGIC, 6, \ struct scoutfs_ioctl_data_waiting) /* @@ -303,7 +281,7 @@ struct scoutfs_ioctl_setattr_more { #define SCOUTFS_IOC_SETATTR_MORE_OFFLINE (1 << 0) #define SCOUTFS_IOC_SETATTR_MORE_UNKNOWN (U8_MAX << 1) -#define SCOUTFS_IOC_SETATTR_MORE _IOW(SCOUTFS_IOCTL_MAGIC, 8, \ +#define SCOUTFS_IOC_SETATTR_MORE _IOW(SCOUTFS_IOCTL_MAGIC, 7, \ struct scoutfs_ioctl_setattr_more) struct scoutfs_ioctl_listxattr_hidden { @@ -313,7 +291,7 @@ struct scoutfs_ioctl_listxattr_hidden { __u32 hash_pos; }; -#define SCOUTFS_IOC_LISTXATTR_HIDDEN _IOR(SCOUTFS_IOCTL_MAGIC, 9, \ +#define SCOUTFS_IOC_LISTXATTR_HIDDEN _IOR(SCOUTFS_IOCTL_MAGIC, 8, \ struct scoutfs_ioctl_listxattr_hidden) /* @@ -344,7 +322,7 @@ struct scoutfs_ioctl_find_xattrs { __u8 _pad[4]; }; -#define SCOUTFS_IOC_FIND_XATTRS _IOR(SCOUTFS_IOCTL_MAGIC, 10, \ +#define SCOUTFS_IOC_FIND_XATTRS _IOR(SCOUTFS_IOCTL_MAGIC, 9, \ struct scoutfs_ioctl_find_xattrs) /* @@ -365,7 +343,7 @@ struct scoutfs_ioctl_statfs_more { __u64 rid; } __packed; -#define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 11, \ +#define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \ struct scoutfs_ioctl_statfs_more) diff --git a/utils/src/item-cache-keys.c b/utils/src/item-cache-keys.c deleted file mode 100644 index a7577cb3..00000000 --- 
a/utils/src/item-cache-keys.c +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sparse.h" -#include "util.h" -#include "format.h" -#include "ioctl.h" -#include "cmd.h" -#include "key.h" - -static int item_cache_keys(int argc, char **argv, int which) -{ - struct scoutfs_ioctl_item_cache_keys ick; - struct scoutfs_ioctl_key ikeys[32]; - struct scoutfs_key key; - int ret; - int fd; - int i; - - if (argc != 2) { - fprintf(stderr, "too many arguments, only scoutfs path needed"); - return -EINVAL; - } - - fd = open(argv[1], O_RDONLY); - if (fd < 0) { - ret = -errno; - fprintf(stderr, "failed to open '%s': %s (%d)\n", - argv[1], strerror(errno), errno); - return ret; - } - - memset(&ick, 0, sizeof(ick)); - ick.buf_ptr = (unsigned long)ikeys; - ick.buf_nr = array_size(ikeys); - ick.which = which; - - for (;;) { - ret = ioctl(fd, SCOUTFS_IOC_ITEM_CACHE_KEYS, &ick); - if (ret < 0) { - ret = -errno; - fprintf(stderr, "walk_inodes ioctl failed: %s (%d)\n", - strerror(errno), errno); - break; - } else if (ret == 0) { - break; - } - - for (i = 0; i < ret; i++) { - scoutfs_key_copy_types(&key, &ikeys[i]); - printf(SK_FMT, SK_ARG(&key)); - - if (which == SCOUTFS_IOC_ITEM_CACHE_KEYS_ITEMS || - (i & 1)) - printf("\n"); - else - printf(" - "); - } - - scoutfs_key_inc(&key); - scoutfs_key_copy_types(&ick.ikey, &key); - } - - close(fd); - return ret; -}; - -static int item_keys(int argc, char **argv) -{ - return item_cache_keys(argc, argv, SCOUTFS_IOC_ITEM_CACHE_KEYS_ITEMS); -} - -static int range_keys(int argc, char **argv) -{ - return item_cache_keys(argc, argv, SCOUTFS_IOC_ITEM_CACHE_KEYS_RANGES); -} - -static void __attribute__((constructor)) item_cache_key_ctor(void) -{ - cmd_register("item-cache-keys", "", - "print range of indexed inodes", item_keys); - cmd_register("item-cache-range-keys", "", - "print range of indexed inodes", range_keys); -} diff --git a/utils/src/mkfs.c 
b/utils/src/mkfs.c index 7a28f9ef..54d26abf 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -25,6 +25,7 @@ #include "rand.h" #include "dev.h" #include "key.h" +#include "bitops.h" static int write_raw_block(int fd, u64 blkno, void *blk) { @@ -54,80 +55,6 @@ static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super, return write_raw_block(fd, blkno, hdr); } -/* - * Calculate the greatest number of btree blocks that might be needed to - * store the given item population. At most all blocks will be half - * full. All keys will be the max size including parent items which - * determines the fanout. - * - * We will never hit this in practice. But some joker *could* fill a - * filesystem with empty files with enormous file names. - */ -static u64 calc_btree_blocks(u64 nr, u64 max_key, u64 max_val) -{ - u64 item_bytes; - u64 fanout; - u64 block_items; - u64 leaf_blocks; - u64 level_blocks; - u64 total_blocks; - - /* figure out the parent fanout for these silly huge possible items */ - item_bytes = sizeof(struct scoutfs_btree_item_header) + - sizeof(struct scoutfs_btree_item) + - max_key + sizeof(struct scoutfs_btree_ref); - fanout = ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block) - - SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES) / 2) / item_bytes; - - /* figure out how many items we have to store */ - item_bytes = sizeof(struct scoutfs_btree_item_header) + - sizeof(struct scoutfs_btree_item) + - max_key + max_val; - block_items = ((SCOUTFS_BLOCK_SIZE - - sizeof(struct scoutfs_btree_block)) / 2) / item_bytes; - leaf_blocks = DIV_ROUND_UP(nr, block_items); - - /* then calc total blocks as we grow to have enough blocks for items */ - level_blocks = 1; - total_blocks = level_blocks; - while (level_blocks < leaf_blocks) { - level_blocks *= fanout; - level_blocks = min(leaf_blocks, level_blocks); - total_blocks += level_blocks; - } - - return total_blocks; -} - -/* - * Figure out how many btree ring blocks we'll need for all the btree - * items that 
could be needed to describe this many segments. - * - * We can have either a free extent or manifest ref for every segment in - * the system. Free extent items are smaller than manifest refs, and - * they merge if they're adjacent, so the largest possible tree is a ref - * for every segment. - */ -static u64 calc_btree_ring_blocks(u64 total_segs) -{ - u64 blocks; - - /* key is smaller for wider parent fanout */ - assert(sizeof(struct scoutfs_extent_btree_key) <= - sizeof(struct scoutfs_manifest_btree_key)); - - /* 2 extent items is smaller than a manifest ref */ - assert((2 * sizeof(struct scoutfs_extent_btree_key)) <= - (sizeof(struct scoutfs_manifest_btree_key) + - sizeof(struct scoutfs_manifest_btree_val))); - - blocks = calc_btree_blocks(total_segs, - sizeof(struct scoutfs_manifest_btree_key), - sizeof(struct scoutfs_manifest_btree_val)); - - return round_up(blocks * 4, SCOUTFS_SEGMENT_BLOCKS); -} - static float size_flt(u64 nr, unsigned size) { float x = (float)nr * (float)size; @@ -166,28 +93,22 @@ static char *size_str(u64 nr, unsigned size) static int write_new_fs(char *path, int fd, u8 quorum_count) { struct scoutfs_super_block *super; - struct scoutfs_key *ino_key; - struct scoutfs_key *idx_key; + struct scoutfs_key_be *kbe; struct scoutfs_inode *inode; - struct scoutfs_segment_block *sblk; - struct scoutfs_manifest_btree_key *mkey; - struct scoutfs_manifest_btree_val *mval; struct scoutfs_extent_btree_key *ebk; struct scoutfs_btree_block *bt; struct scoutfs_btree_item *btitem; - struct scoutfs_segment_item *item; + struct scoutfs_balloc_item_key *bik; + struct scoutfs_balloc_item_val *biv; struct scoutfs_key key; - __le32 *prev_link; struct timeval tv; char uuid_str[37]; void *zeros; u64 blkno; u64 limit; u64 size; - u64 ring_blocks; - u64 total_segs; u64 total_blocks; - u64 first_segno; + u64 free_blkno; u64 free_start; u64 free_len; int ret; @@ -197,9 +118,8 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) super = calloc(1, 
SCOUTFS_BLOCK_SIZE); bt = calloc(1, SCOUTFS_BLOCK_SIZE); - sblk = calloc(1, SCOUTFS_SEGMENT_SIZE); - zeros = calloc(1, SCOUTFS_SEGMENT_SIZE); - if (!super || !bt || !sblk || !zeros) { + zeros = calloc(1, SCOUTFS_BLOCK_SIZE); + if (!super || !bt || !zeros) { ret = -errno; fprintf(stderr, "failed to allocate block mem: %s (%d)\n", strerror(errno), errno); @@ -213,15 +133,14 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) goto out; } - /* arbitrarily require space for a handful of segments */ - limit = SCOUTFS_SEGMENT_SIZE * 16; + /* arbitrarily require a reasonably large device */ + limit = 8ULL * (1024 * 1024 * 1024); if (size < limit) { fprintf(stderr, "%llu byte device too small for min %llu byte fs\n", size, limit); goto out; } - total_segs = size / SCOUTFS_SEGMENT_SIZE; total_blocks = size / SCOUTFS_BLOCK_SIZE; /* partially initialize the super so we can use it to init others */ @@ -234,25 +153,21 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1); super->next_trans_seq = cpu_to_le64(1); super->total_blocks = cpu_to_le64(total_blocks); - super->next_seg_seq = cpu_to_le64(2); - super->next_compact_id = cpu_to_le64(1); super->quorum_count = quorum_count; - /* align the btree ring to the segment after the super */ - blkno = round_up(SCOUTFS_SUPER_BLKNO + 1, SCOUTFS_SEGMENT_BLOCKS); - /* first usable segno follows manifest ring */ - ring_blocks = calc_btree_ring_blocks(total_segs); - first_segno = (blkno + ring_blocks) / SCOUTFS_SEGMENT_BLOCKS; - free_start = ((first_segno + 1) << SCOUTFS_SEGMENT_BLOCK_SHIFT); + /* metadata blocks start after the quorum blocks */ + free_blkno = SCOUTFS_QUORUM_BLKNO + SCOUTFS_QUORUM_BLOCKS; + + /* extents start after btree blocks */ + free_start = total_blocks - (total_blocks / 4); free_len = total_blocks - free_start; + /* fill out some alloc boundaries before using */ super->free_blocks = cpu_to_le64(free_len); - super->bring.first_blkno = 
cpu_to_le64(blkno); - super->bring.nr_blocks = cpu_to_le64(ring_blocks); - super->bring.next_block = cpu_to_le64(2); - super->bring.next_seq = cpu_to_le64(2); - /* allocator btree has item with space after first segno */ + /* extent allocator btree indexes free data extent */ + blkno = free_blkno++; + super->alloc_root.ref.blkno = cpu_to_le64(blkno); super->alloc_root.ref.seq = cpu_to_le64(1); super->alloc_root.height = 1; @@ -261,14 +176,13 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) bt->hdr.fsid = super->hdr.fsid; bt->hdr.blkno = cpu_to_le64(blkno); bt->hdr.seq = cpu_to_le64(1); - bt->nr_items = cpu_to_le16(2); + bt->nr_items = cpu_to_le32(2); /* btree item allocated from the back of the block */ ebk = (void *)bt + SCOUTFS_BLOCK_SIZE - sizeof(*ebk); btitem = (void *)ebk - sizeof(*btitem); - bt->item_hdrs[0].off = cpu_to_le16((long)btitem - (long)bt); - bt->free_end = bt->item_hdrs[0].off; + bt->item_hdrs[0].off = cpu_to_le32((long)btitem - (long)bt); btitem->key_len = cpu_to_le16(sizeof(*ebk)); btitem->val_len = cpu_to_le16(0); @@ -279,8 +193,7 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) ebk = (void *)btitem - sizeof(*ebk); btitem = (void *)ebk - sizeof(*btitem); - bt->item_hdrs[1].off = cpu_to_le16((long)btitem - (long)bt); - bt->free_end = bt->item_hdrs[1].off; + bt->item_hdrs[1].off = cpu_to_le32((long)btitem - (long)bt); btitem->key_len = cpu_to_le16(sizeof(*ebk)); btitem->val_len = cpu_to_le16(0); @@ -288,6 +201,8 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) ebk->major = cpu_to_be64(free_len); ebk->minor = cpu_to_be64(free_start + free_len - 1); + bt->free_end = bt->item_hdrs[le32_to_cpu(bt->nr_items) - 1].off; + bt->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE); bt->hdr.crc = cpu_to_le32(crc_block(&bt->hdr)); @@ -296,85 +211,46 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) goto out; blkno++; - /* manifest btree has a block with an item for the segment */ - 
super->manifest.root.ref.blkno = cpu_to_le64(blkno); - super->manifest.root.ref.seq = cpu_to_le64(1); - super->manifest.root.height = 1; - super->manifest.level_counts[1] = cpu_to_le64(1); + /* fs root starts with root inode and its index items */ + blkno = free_blkno++; + + super->fs_root.ref.blkno = cpu_to_le64(blkno); + super->fs_root.ref.seq = cpu_to_le64(1); + super->fs_root.height = 1; memset(bt, 0, SCOUTFS_BLOCK_SIZE); bt->hdr.fsid = super->hdr.fsid; bt->hdr.blkno = cpu_to_le64(blkno); bt->hdr.seq = cpu_to_le64(1); - bt->nr_items = cpu_to_le16(1); + bt->nr_items = cpu_to_le32(2); /* btree item allocated from the back of the block */ - mval = (void *)bt + SCOUTFS_BLOCK_SIZE - sizeof(*mval); - ino_key = &mval->last_key; - mkey = (void *)mval - sizeof(*mkey); - btitem = (void *)mkey - sizeof(*btitem); + kbe = (void *)bt + SCOUTFS_BLOCK_SIZE - sizeof(*kbe); + btitem = (void *)kbe - sizeof(*btitem); - bt->item_hdrs[0].off = cpu_to_le16((long)btitem - (long)bt); - bt->free_end = bt->item_hdrs[0].off; + bt->item_hdrs[0].off = cpu_to_le32((long)btitem - (long)bt); + btitem->key_len = cpu_to_le16(sizeof(*kbe)); + btitem->val_len = cpu_to_le16(0); - btitem->key_len = cpu_to_le16(sizeof(*mkey)); - btitem->val_len = cpu_to_le16(sizeof(*mval)); - - mkey->level = 1; - mkey->seq = cpu_to_be64(1); memset(&key, 0, sizeof(key)); key.sk_zone = SCOUTFS_INODE_INDEX_ZONE; key.sk_type = SCOUTFS_INODE_INDEX_META_SEQ_TYPE; key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO); - scoutfs_key_to_be(&mkey->first_key, &key); + scoutfs_key_to_be(kbe, &key); - mval->segno = cpu_to_le64(first_segno); - ino_key->sk_zone = SCOUTFS_FS_ZONE; - ino_key->ski_ino = cpu_to_le64(SCOUTFS_ROOT_INO); - ino_key->sk_type = SCOUTFS_INODE_TYPE; + inode = (void *)btitem - sizeof(*inode); + kbe = (void *)inode - sizeof(*kbe); + btitem = (void *)kbe - sizeof(*btitem); - bt->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE); - bt->hdr.crc = cpu_to_le32(crc_block(&bt->hdr)); + bt->item_hdrs[1].off = 
cpu_to_le32((long)btitem - (long)bt); + btitem->key_len = cpu_to_le16(sizeof(*kbe)); + btitem->val_len = cpu_to_le16(sizeof(*inode)); - ret = write_raw_block(fd, blkno, bt); - if (ret) - goto out; - blkno += ring_blocks; - - /* write seg with root inode */ - sblk->segno = cpu_to_le64(first_segno); - sblk->seq = cpu_to_le64(1); - prev_link = &sblk->skip_links[0]; - - item = (void *)(sblk + 1); - *prev_link = cpu_to_le32((long)item -(long)sblk); - prev_link = &item->skip_links[0]; - - item->val_len = 0; - item->nr_links = 1; - le32_add_cpu(&sblk->nr_items, 1); - - idx_key = &item->key; - idx_key->sk_zone = SCOUTFS_INODE_INDEX_ZONE; - idx_key->sk_type = SCOUTFS_INODE_INDEX_META_SEQ_TYPE; - idx_key->skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO); - - item = (void *)&item->skip_links[1]; - *prev_link = cpu_to_le32((long)item -(long)sblk); - prev_link = &item->skip_links[0]; - - sblk->last_item_off = cpu_to_le32((long)item - (long)sblk); - - ino_key = (void *)&item->key; - inode = (void *)&item->skip_links[1]; - - item->val_len = cpu_to_le16(sizeof(struct scoutfs_inode)); - item->nr_links = 1; - le32_add_cpu(&sblk->nr_items, 1); - - ino_key->sk_zone = SCOUTFS_FS_ZONE; - ino_key->ski_ino = cpu_to_le64(SCOUTFS_ROOT_INO); - ino_key->sk_type = SCOUTFS_INODE_TYPE; + memset(&key, 0, sizeof(key)); + key.sk_zone = SCOUTFS_FS_ZONE; + key.ski_ino = cpu_to_le64(SCOUTFS_ROOT_INO); + key.sk_type = SCOUTFS_INODE_TYPE; + scoutfs_key_to_be(kbe, &key); inode->next_readdir_pos = cpu_to_le64(2); inode->nlink = cpu_to_le32(SCOUTFS_DIRENT_FIRST_POS); @@ -386,16 +262,55 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) inode->mtime.sec = inode->atime.sec; inode->mtime.nsec = inode->atime.nsec; - item = (void *)(inode + 1); - sblk->total_bytes = cpu_to_le32((long)item - (long)sblk); - sblk->crc = cpu_to_le32(crc_segment(sblk)); + bt->free_end = bt->item_hdrs[le32_to_cpu(bt->nr_items) - 1].off; - ret = pwrite(fd, sblk, SCOUTFS_SEGMENT_SIZE, - first_segno << SCOUTFS_SEGMENT_SHIFT); - if 
(ret != SCOUTFS_SEGMENT_SIZE) { - ret = -EIO; + bt->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE); + bt->hdr.crc = cpu_to_le32(crc_block(&bt->hdr)); + + ret = write_raw_block(fd, blkno, bt); + if (ret) + goto out; + + /* metadata block allocator has single item, server continues init */ + blkno = free_blkno++; + + super->core_balloc_alloc.root.ref.blkno = cpu_to_le64(blkno); + super->core_balloc_alloc.root.ref.seq = cpu_to_le64(1); + super->core_balloc_alloc.root.height = 1; + + /* XXX magic */ + + memset(bt, 0, SCOUTFS_BLOCK_SIZE); + bt->hdr.fsid = super->hdr.fsid; + bt->hdr.blkno = cpu_to_le64(blkno); + bt->hdr.seq = cpu_to_le64(1); + bt->nr_items = cpu_to_le32(1); + + /* btree item allocated from the back of the block */ + biv = (void *)bt + SCOUTFS_BLOCK_SIZE - sizeof(*biv); + bik = (void *)biv - sizeof(*bik); + btitem = (void *)bik - sizeof(*btitem); + + bt->item_hdrs[0].off = cpu_to_le32((long)btitem - (long)bt); + btitem->key_len = cpu_to_le16(sizeof(*bik)); + btitem->val_len = cpu_to_le16(sizeof(*biv)); + + bik->base = cpu_to_be64(0); /* XXX true? 
*/ + + /* set all the bits past our final used blkno */ + super->core_balloc_free.total_free = + cpu_to_le64(SCOUTFS_BALLOC_ITEM_BITS - free_blkno); + for (i = free_blkno; i < SCOUTFS_BALLOC_ITEM_BITS; i++) + set_bit_le(i, &biv->bits); + + bt->free_end = bt->item_hdrs[le32_to_cpu(bt->nr_items) - 1].off; + + bt->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE); + bt->hdr.crc = cpu_to_le32(crc_block(&bt->hdr)); + + ret = write_raw_block(fd, blkno, bt); + if (ret) goto out; - } /* zero out quorum blocks */ for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) { @@ -407,6 +322,8 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) } } + super->next_uninit_free_block = cpu_to_le64(SCOUTFS_BALLOC_ITEM_BITS); + /* write the super block */ super->hdr.seq = cpu_to_le64(1); ret = write_block(fd, SCOUTFS_SUPER_BLKNO, NULL, &super->hdr); @@ -423,22 +340,21 @@ static int write_new_fs(char *path, int fd, u8 quorum_count) uuid_unparse(super->uuid, uuid_str); printf("Created scoutfs filesystem:\n" - " device path: %s\n" - " fsid: %llx\n" - " format hash: %llx\n" - " uuid: %s\n" - " device bytes: "SIZE_FMT"\n" - " device blocks: "SIZE_FMT"\n" - " btree ring blocks: "SIZE_FMT"\n" - " free blocks: "SIZE_FMT"\n" - " quorum count: %u\n", + " device path: %s\n" + " fsid: %llx\n" + " format hash: %llx\n" + " uuid: %s\n" + " device blocks: "SIZE_FMT"\n" + " metadata blocks: "SIZE_FMT"\n" + " file extent blocks: "SIZE_FMT"\n" + " quorum count: %u\n", path, le64_to_cpu(super->hdr.fsid), le64_to_cpu(super->format_hash), uuid_str, - SIZE_ARGS(size, 1), SIZE_ARGS(total_blocks, SCOUTFS_BLOCK_SIZE), - SIZE_ARGS(le64_to_cpu(super->bring.nr_blocks), + SIZE_ARGS(le64_to_cpu(super->total_blocks) - + le64_to_cpu(super->free_blocks), SCOUTFS_BLOCK_SIZE), SIZE_ARGS(le64_to_cpu(super->free_blocks), SCOUTFS_BLOCK_SIZE), @@ -450,8 +366,6 @@ out: free(super); if (bt) free(bt); - if (sblk) - free(sblk); if (zeros) free(zeros); return ret; diff --git a/utils/src/print.c b/utils/src/print.c index 
5e714b09..eac87402 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -41,27 +41,6 @@ static void *read_block(int fd, u64 blkno) return buf; } -static void *read_segment(int fd, u64 segno) -{ - ssize_t ret; - void *buf; - - buf = malloc(SCOUTFS_SEGMENT_SIZE); - if (!buf) - return NULL; - - ret = pread(fd, buf, SCOUTFS_SEGMENT_SIZE, - segno << SCOUTFS_SEGMENT_SHIFT); - if (ret != SCOUTFS_SEGMENT_SIZE) { - fprintf(stderr, "read segno %llu returned %zd: %s (%d)\n", - segno, ret, strerror(errno), errno); - free(buf); - buf = NULL; - } - - return buf; -} - static void print_block_header(struct scoutfs_block_header *hdr) { u32 crc = crc_block(hdr); @@ -240,93 +219,92 @@ static print_func_t find_printer(u8 zone, u8 type) return NULL; } -static void print_item(struct scoutfs_segment_block *sblk, - struct scoutfs_segment_item *item, u32 which, u32 off) +static int print_fs_item(void *key, unsigned key_len, void *val, + unsigned val_len, void *arg) { + struct scoutfs_key item_key; print_func_t printer; - void *val; - int i; - val = (char *)&item->skip_links[item->nr_links]; + scoutfs_key_from_be(&item_key, key); - printer = find_printer(item->key.sk_zone, item->key.sk_type); + printf(" "SK_FMT"\n", SK_ARG(&item_key)); - printf(" [%u]: key "SK_FMT" off %u val_len %u nr_links %u flags %x%s\n", - which, SK_ARG(&item->key), off, le16_to_cpu(item->val_len), - item->nr_links, - item->flags, printer ? 
"" : " (unrecognized zone+type)"); - printf(" links:"); - for (i = 0; i < item->nr_links; i++) - printf(" %u", le32_to_cpu(item->skip_links[i])); - printf("\n"); - - if (printer) - printer(&item->key, val, le16_to_cpu(item->val_len)); -} - -static void print_segment_block(struct scoutfs_segment_block *sblk) -{ - int i; - - printf(" sblk: segno %llu seq %llu last_item_off %u total_bytes %u " - "nr_items %u\n", - le64_to_cpu(sblk->segno), le64_to_cpu(sblk->seq), - le32_to_cpu(sblk->last_item_off), le32_to_cpu(sblk->total_bytes), - le32_to_cpu(sblk->nr_items)); - printf(" links:"); - for (i = 0; sblk->skip_links[i]; i++) - printf(" %u", le32_to_cpu(sblk->skip_links[i])); - printf("\n"); -} - -static int print_segments(int fd, unsigned long *seg_map, u64 total) -{ - struct scoutfs_segment_block *sblk; - struct scoutfs_segment_item *item; - u32 off; - u64 s; - u64 i; - - for (s = 0; (s = find_next_set_bit(seg_map, s, total)) < total; s++) { - sblk = read_segment(fd, s); - if (!sblk) - return -ENOMEM; - - printf("segment segno %llu\n", s); - print_segment_block(sblk); - - off = le32_to_cpu(sblk->skip_links[0]); - for (i = 0; i < le32_to_cpu(sblk->nr_items); i++) { - item = (void *)sblk + off; - print_item(sblk, item, i, off); - off = le32_to_cpu(item->skip_links[0]); - } - - free(sblk); + /* only items in leaf blocks have values */ + if (val) { + printer = find_printer(item_key.sk_zone, item_key.sk_type); + if (printer) + printer(&item_key, val, val_len); + else + printf(" (unknown zone %u type %u)\n", + item_key.sk_zone, item_key.sk_type); } return 0; } -static int print_manifest_entry(void *key, unsigned key_len, void *val, - unsigned val_len, void *arg) +/* same as fs item but with a small header in the value */ +static int print_logs_item(void *key, unsigned key_len, void *val, + unsigned val_len, void *arg) { - struct scoutfs_manifest_btree_key *mkey = key; - struct scoutfs_manifest_btree_val *mval = val; - struct scoutfs_key first; - unsigned long *seg_map = arg; + 
struct scoutfs_key item_key; + struct scoutfs_log_item_value *liv; + print_func_t printer; - scoutfs_key_from_be(&first, &mkey->first_key); + scoutfs_key_from_be(&item_key, key); - printf(" level %u first "SK_FMT" seq %llu\n", - mkey->level, SK_ARG(&first), be64_to_cpu(mkey->seq)); + printf(" "SK_FMT"\n", SK_ARG(&item_key)); /* only items in leaf blocks have values */ if (val) { - printf(" segno %llu last "SK_FMT"\n", - le64_to_cpu(mval->segno), SK_ARG(&mval->last_key)); + liv = val; + printf(" log_item_value: vers %llu flags %x\n", + le64_to_cpu(liv->vers), liv->flags); - set_bit(seg_map, le64_to_cpu(mval->segno)); + /* deletion items don't have values */ + if (!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION)) { + printer = find_printer(item_key.sk_zone, + item_key.sk_type); + if (printer) + printer(&item_key, val + sizeof(*liv), + val_len - sizeof(*liv)); + else + printf(" (unknown zone %u type %u)\n", + item_key.sk_zone, item_key.sk_type); + } + } + + return 0; +} + +/* same as fs item but with a small header in the value */ +static int print_log_trees_item(void *key, unsigned key_len, void *val, + unsigned val_len, void *arg) +{ + struct scoutfs_log_trees_key *ltk = key; + struct scoutfs_log_trees_val *ltv = val; + + printf(" rid %llu nr %llu\n", + be64_to_cpu(ltk->rid), be64_to_cpu(ltk->nr)); + + /* only items in leaf blocks have values */ + if (val) { + printf(" alloc_root: total_free %llu root: height %u blkno %llu seq %llu\n" + " free_root: total_free %llu root: height %u blkno %llu seq %llu\n" + " item_root: height %u blkno %llu seq %llu\n" + " bloom_ref: blkno %llu seq %llu\n", + le64_to_cpu(ltv->alloc_root.total_free), + ltv->alloc_root.root.height, + le64_to_cpu(ltv->alloc_root.root.ref.blkno), + le64_to_cpu(ltv->alloc_root.root.ref.seq), + le64_to_cpu(ltv->free_root.total_free), + ltv->free_root.root.height, + le64_to_cpu(ltv->free_root.root.ref.blkno), + le64_to_cpu(ltv->free_root.root.ref.seq), + ltv->item_root.height, + 
le64_to_cpu(ltv->item_root.ref.blkno), + le64_to_cpu(ltv->item_root.ref.seq), + le64_to_cpu(ltv->bloom_ref.blkno), + le64_to_cpu(ltv->bloom_ref.seq)); } return 0; @@ -375,7 +353,18 @@ static int print_trans_seqs_entry(void *key, unsigned key_len, void *val, return 0; } -/* XXX should make sure that the val is null terminated */ +static int print_balloc_entry(void *key, unsigned key_len, void *val, + unsigned val_len, void *arg) +{ + struct scoutfs_balloc_item_key *bik = key; +// struct scoutfs_balloc_item_val *biv = val; + + printf(" base %llu\n", + be64_to_cpu(bik->base)); + + return 0; +} + static int print_mounted_client_entry(void *key, unsigned key_len, void *val, unsigned val_len, void *arg) { @@ -423,20 +412,19 @@ static int print_btree_block(int fd, struct scoutfs_super_block *super, if (bt->level == level) { printf("%s btree blkno %llu\n" " crc %08x fsid %llx seq %llu blkno %llu \n" - " level %u free_end %u free_reclaim %u nr_items %u\n", + " level %u free_end %u nr_items %u\n", which, le64_to_cpu(ref->blkno), le32_to_cpu(bt->hdr.crc), le64_to_cpu(bt->hdr.fsid), le64_to_cpu(bt->hdr.seq), le64_to_cpu(bt->hdr.blkno), bt->level, - le16_to_cpu(bt->free_end), - le16_to_cpu(bt->free_reclaim), - le16_to_cpu(bt->nr_items)); + le32_to_cpu(bt->free_end), + le32_to_cpu(bt->nr_items)); } - for (i = 0; i < le16_to_cpu(bt->nr_items); i++) { - item = (void *)bt + le16_to_cpu(bt->item_hdrs[i].off); + for (i = 0; i < le32_to_cpu(bt->nr_items); i++) { + item = (void *)bt + le32_to_cpu(bt->item_hdrs[i].off); key_len = le16_to_cpu(item->key_len); val_len = le16_to_cpu(item->val_len); key = (void *)(item + 1); @@ -455,7 +443,7 @@ static int print_btree_block(int fd, struct scoutfs_super_block *super, } printf(" item [%u] off %u key_len %u val_len %u\n", - i, le16_to_cpu(bt->item_hdrs[i].off), key_len, val_len); + i, le32_to_cpu(bt->item_hdrs[i].off), key_len, val_len); if (level) print_btree_ref(key, key_len, val, val_len, func, arg); @@ -489,6 +477,98 @@ static int 
print_btree(int fd, struct scoutfs_super_block *super, char *which, return ret; } +struct print_recursion_args { + struct scoutfs_super_block *super; + int fd; +}; + +/* same as fs item but with a small header in the value */ +static int print_log_trees_roots(void *key, unsigned key_len, void *val, + unsigned val_len, void *arg) +{ + struct scoutfs_log_trees_key *ltk = key; + struct scoutfs_log_trees_val *ltv = val; + struct print_recursion_args *pa = arg; + struct log_trees_roots { + char *fmt; + struct scoutfs_btree_root *root; + print_item_func func; + } roots[] = { + { "log_tree_rid:%llu_nr:%llu_alloc", + &ltv->alloc_root.root, + print_balloc_entry, + }, + { "log_tree_rid:%llu_nr:%llu_free", + &ltv->free_root.root, + print_balloc_entry, + }, + { "log_tree_rid:%llu_nr:%llu_item", + &ltv->item_root, + print_logs_item, + }, + }; + char which[100]; + int ret; + int err; + int i; + + /* XXX doesn't print the bloom block */ + + ret = 0; + for (i = 0; i < array_size(roots); i++) { + snprintf(which, sizeof(which) - 1, roots[i].fmt, + be64_to_cpu(ltk->rid), be64_to_cpu(ltk->nr)); + + err = print_btree(pa->fd, pa->super, which, roots[i].root, + roots[i].func, NULL); + if (err && !ret) + ret = err; + } + + return ret; +} + +static int print_btree_leaf_items(int fd, struct scoutfs_super_block *super, + struct scoutfs_btree_ref *ref, + print_item_func func, void *arg) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + unsigned key_len; + unsigned val_len; + void *key; + void *val; + int ret; + int i; + + if (ref->blkno == 0) + return 0; + + bt = read_block(fd, le64_to_cpu(ref->blkno)); + if (!bt) + return -ENOMEM; + + for (i = 0; i < le32_to_cpu(bt->nr_items); i++) { + item = (void *)bt + le32_to_cpu(bt->item_hdrs[i].off); + key_len = le16_to_cpu(item->key_len); + val_len = le16_to_cpu(item->val_len); + key = (void *)(item + 1); + val = (void *)key + key_len; + + if (bt->level > 0) { + ret = print_btree_leaf_items(fd, super, val, func, arg); + if (ret) +
break; + continue; + } else { + func(key, key_len, val, val_len, arg); + } + } + + free(bt); + return 0; +} + static char *alloc_addr_str(struct scoutfs_inet_addr *ia) { struct in_addr addr; @@ -572,8 +652,6 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno) { char uuid_str[37]; char *server_addr; - u64 count; - int i; uuid_unparse(super->uuid, uuid_str); @@ -587,62 +665,52 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno) return; /* XXX these are all in a crazy order */ - printf(" next_ino %llu next_trans_seq %llu next_seg_seq %llu\n" - " next_compact_id %llu\n" - " total_blocks %llu free_blocks %llu alloc_cursor %llu\n" + printf(" next_ino %llu next_trans_seq %llu\n" + " total_blocks %llu free_blocks %llu\n" + " next_uninit_free_block %llu core_balloc_blocks %llu\n" " quorum_fenced_term %llu quorum_server_term %llu unmount_barrier %llu\n" " quorum_count %u server_addr %s\n" - " btree ring: first_blkno %llu nr_blocks %llu next_block %llu " - "next_seq %llu\n" - " lock_clients root: height %u blkno %llu seq %llu mig_len %u\n" - " mounted_clients root: height %u blkno %llu seq %llu mig_len %u\n" - " trans_seqs root: height %u blkno %llu seq %llu mig_len %u\n" - " alloc btree root: height %u blkno %llu seq %llu mig_len %u\n" - " manifest btree root: height %u blkno %llu seq %llu mig_len %u\n", + " core_balloc_alloc: total_free %llu root: height %u blkno %llu seq %llu\n" + " core_balloc_free: total_free %llu root: height %u blkno %llu seq %llu\n" + " lock_clients root: height %u blkno %llu seq %llu\n" + " mounted_clients root: height %u blkno %llu seq %llu\n" + " trans_seqs root: height %u blkno %llu seq %llu\n" + " alloc btree root: height %u blkno %llu seq %llu\n" + " fs_root btree root: height %u blkno %llu seq %llu\n", le64_to_cpu(super->next_ino), le64_to_cpu(super->next_trans_seq), - le64_to_cpu(super->next_seg_seq), - le64_to_cpu(super->next_compact_id), le64_to_cpu(super->total_blocks), 
le64_to_cpu(super->free_blocks), - le64_to_cpu(super->alloc_cursor), + le64_to_cpu(super->next_uninit_free_block), + le64_to_cpu(super->core_balloc_cursor), le64_to_cpu(super->quorum_fenced_term), le64_to_cpu(super->quorum_server_term), le64_to_cpu(super->unmount_barrier), super->quorum_count, server_addr, - le64_to_cpu(super->bring.first_blkno), - le64_to_cpu(super->bring.nr_blocks), - le64_to_cpu(super->bring.next_block), - le64_to_cpu(super->bring.next_seq), + le64_to_cpu(super->core_balloc_alloc.total_free), + super->core_balloc_alloc.root.height, + le64_to_cpu(super->core_balloc_alloc.root.ref.blkno), + le64_to_cpu(super->core_balloc_alloc.root.ref.seq), + le64_to_cpu(super->core_balloc_free.total_free), + super->core_balloc_free.root.height, + le64_to_cpu(super->core_balloc_free.root.ref.blkno), + le64_to_cpu(super->core_balloc_free.root.ref.seq), super->lock_clients.height, le64_to_cpu(super->lock_clients.ref.blkno), le64_to_cpu(super->lock_clients.ref.seq), - le16_to_cpu(super->lock_clients.migration_key_len), super->mounted_clients.height, le64_to_cpu(super->mounted_clients.ref.blkno), le64_to_cpu(super->mounted_clients.ref.seq), - le16_to_cpu(super->mounted_clients.migration_key_len), super->trans_seqs.height, le64_to_cpu(super->trans_seqs.ref.blkno), le64_to_cpu(super->trans_seqs.ref.seq), - le16_to_cpu(super->trans_seqs.migration_key_len), super->alloc_root.height, le64_to_cpu(super->alloc_root.ref.blkno), le64_to_cpu(super->alloc_root.ref.seq), - le16_to_cpu(super->alloc_root.migration_key_len), - super->manifest.root.height, - le64_to_cpu(super->manifest.root.ref.blkno), - le64_to_cpu(super->manifest.root.ref.seq), - le16_to_cpu(super->manifest.root.migration_key_len)); - - printf(" level_counts:"); - for (i = 0; i < SCOUTFS_MANIFEST_MAX_LEVEL; i++) { - count = le64_to_cpu(super->manifest.level_counts[i]); - if (count) - printf(" %u: %llu", i, count); - } - printf("\n"); + super->fs_root.height, + le64_to_cpu(super->fs_root.ref.blkno), + 
le64_to_cpu(super->fs_root.ref.seq)); free(server_addr); } @@ -650,8 +718,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno) static int print_volume(int fd) { struct scoutfs_super_block *super = NULL; - unsigned long *seg_map = NULL; - u64 nr_segs; + struct print_recursion_args pa; int ret = 0; int err; @@ -661,15 +728,6 @@ static int print_volume(int fd) print_super_block(super, SCOUTFS_SUPER_BLKNO); - nr_segs = le64_to_cpu(super->total_blocks) / SCOUTFS_SEGMENT_BLOCKS; - seg_map = alloc_bits(nr_segs); - if (!seg_map) { - ret = -ENOMEM; - fprintf(stderr, "failed to alloc %llu seg map: %s (%d)\n", - nr_segs, strerror(errno), errno); - goto out; - } - ret = print_quorum_blocks(fd, super); err = print_btree(fd, super, "lock_clients", &super->lock_clients, @@ -687,23 +745,41 @@ static int print_volume(int fd) if (err && !ret) ret = err; + err = print_btree(fd, super, "core_balloc_alloc", + &super->core_balloc_alloc.root, + print_balloc_entry, NULL); + if (err && !ret) + ret = err; + + err = print_btree(fd, super, "core_balloc_free", + &super->core_balloc_free.root, + print_balloc_entry, NULL); + if (err && !ret) + ret = err; + err = print_btree(fd, super, "alloc", &super->alloc_root, print_alloc_item, NULL); if (err && !ret) ret = err; - err = print_btree(fd, super, "manifest", &super->manifest.root, - print_manifest_entry, seg_map); + err = print_btree(fd, super, "logs_root", &super->logs_root, + print_log_trees_item, NULL); if (err && !ret) ret = err; - err = print_segments(fd, seg_map, nr_segs); + pa.super = super; + pa.fd = fd; + err = print_btree_leaf_items(fd, super, &super->logs_root.ref, + print_log_trees_roots, &pa); + if (err && !ret) + ret = err; + + err = print_btree(fd, super, "fs_root", &super->fs_root, + print_fs_item, NULL); if (err && !ret) ret = err; -out: free(super); - free(seg_map); return ret; }