From 56077b61a1ee15c86500f049f07ea8e029ceb756 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 12 Apr 2016 15:02:02 -0700 Subject: [PATCH] Move to btree blocks Update mkfs and printing for the btree experiment. Signed-off-by: Zach Brown --- utils/src/bloom.c | 70 -------- utils/src/bloom.h | 14 -- utils/src/format.h | 236 ++++++++----------------- utils/src/mkfs.c | 122 +++---------- utils/src/print.c | 431 ++++++++------------------------------------- 5 files changed, 165 insertions(+), 708 deletions(-) delete mode 100644 utils/src/bloom.c delete mode 100644 utils/src/bloom.h diff --git a/utils/src/bloom.c b/utils/src/bloom.c deleted file mode 100644 index f513acff..00000000 --- a/utils/src/bloom.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2016 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include "sparse.h" -#include "util.h" -#include "format.h" -#include "bloom.h" -#include "crc.h" -#include "bitops.h" - -/* XXX garbage hack until we have siphash */ -static u32 bloom_hash(struct scoutfs_key *key, __le32 salt) -{ - return crc32c(le32_to_cpu(salt), key, sizeof(struct scoutfs_key)); -} - -/* - * Find the bits in the bloom filter for the given key. The caller calculates - * these once and uses them to test all the blocks. - */ -void scoutfs_calc_bloom_bits(struct scoutfs_bloom_bits *bits, - struct scoutfs_key *key, __le32 *salts) -{ - unsigned h_bits = 0; - unsigned int b; - unsigned s = 0; - u64 h = 0; - int i; - - for (i = 0; i < SCOUTFS_BLOOM_BITS; i++) { - if (h_bits < SCOUTFS_BLOOM_BIT_WIDTH) { - h = (h << 32) | bloom_hash(key, salts[s++]); - h_bits += 32; - } - - b = h & SCOUTFS_BLOOM_BIT_MASK; - h >>= SCOUTFS_BLOOM_BIT_WIDTH; - h_bits -= SCOUTFS_BLOOM_BIT_WIDTH; - - bits->block[i] = (b / SCOUTFS_BLOOM_BITS_PER_BLOCK) % - SCOUTFS_BLOOM_BLOCKS; - bits->bit_off[i] = b % SCOUTFS_BLOOM_BITS_PER_BLOCK; - } -} - -/* - * This interface is different than in the kernel because we don't - * have a block IO interface here yet. The caller gives us each - * bloom block and we set each bit that falls in the block. - */ -void scoutfs_set_bloom_bits(struct scoutfs_bloom_block *blm, unsigned int nr, - struct scoutfs_bloom_bits *bits) -{ - int i; - - for (i = 0; i < SCOUTFS_BLOOM_BITS; i++) { - if (nr == bits->block[i]) { - set_bit_le(bits->bit_off[i], blm->bits); - } - } -} diff --git a/utils/src/bloom.h b/utils/src/bloom.h deleted file mode 100644 index fd9246cb..00000000 --- a/utils/src/bloom.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _BLOOM_H_ -#define _BLOOM_H_ - -struct scoutfs_bloom_bits { - u16 bit_off[SCOUTFS_BLOOM_BITS]; - u8 block[SCOUTFS_BLOOM_BITS]; -}; - -void scoutfs_calc_bloom_bits(struct scoutfs_bloom_bits *bits, - struct scoutfs_key *key, __le32 *salts); -void scoutfs_set_bloom_bits(struct scoutfs_bloom_block *blm, unsigned int nr, - struct scoutfs_bloom_bits *bits); - -#endif diff --git a/utils/src/format.h b/utils/src/format.h index c097d0c4..a80e95b0 100644 --- a/utils/src/format.h +++ b/utils/src/format.h @@ -6,27 +6,12 @@ /* super block id */ #define SCOUTFS_SUPER_ID 0x2e736674756f6373ULL /* "scoutfs." */ -/* - * Everything is stored in and addressed as 4k fixed size blocks. This - * avoids having to manage contiguous cpu mappings of larger blocks. - * Larger structures are read and written as multiple blocks. - */ -#define SCOUTFS_BLOCK_SHIFT 12 +#define SCOUTFS_BLOCK_SHIFT 14 #define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT) #define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1) -/* - * The allocator works on larger chunks. Smaller metadata structures - * like the super blocks and the ring are stored in chunks. - * - * A log segment is a collection of smaller blocks (bloom filter, item blocks) - * stored in a chunk. - */ -#define SCOUTFS_CHUNK_SHIFT 22 -#define SCOUTFS_CHUNK_SIZE (1 << SCOUTFS_CHUNK_SHIFT) -#define SCOUTFS_CHUNK_BLOCK_SHIFT (SCOUTFS_CHUNK_SHIFT - SCOUTFS_BLOCK_SHIFT) -#define SCOUTFS_CHUNK_BLOCK_MASK ((1 << SCOUTFS_CHUNK_BLOCK_SHIFT) - 1) -#define SCOUTFS_BLOCKS_PER_CHUNK (1 << SCOUTFS_CHUNK_BLOCK_SHIFT) +#define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE) +#define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT) /* * The super blocks leave some room at the start of the first block for @@ -35,22 +20,6 @@ #define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT) #define SCOUTFS_SUPER_NR 2 -/* - * The bloom filters are statically sized. It's a tradeoff between - * storage overhead and false positive rate. At the moment we have - * as few as 1000 and as many as 18000 items in a segment. We can - * get a ~1% false positive rate (triggering header search) rate at - * the high end with a ~20k bloom filter. - * - * n = 18,000, p = 0.01 (1 in 100) → m = 172,532 (21.06KB), k = 7 - */ -#define SCOUTFS_BLOOM_BITS 7 -#define SCOUTFS_BLOOM_BIT_WIDTH 18 /* 2^18 > m */ -#define SCOUTFS_BLOOM_BIT_MASK ((1 << SCOUTFS_BLOOM_BIT_WIDTH) - 1) -#define SCOUTFS_BLOOM_BLOCKS ((20 * 1024) / SCOUTFS_BLOCK_SIZE) -#define SCOUTFS_BLOOM_SALTS \ - DIV_ROUND_UP(SCOUTFS_BLOOM_BITS * SCOUTFS_BLOOM_BIT_WIDTH, 32) - /* * This header is found at the start of every block so that we can * verify that it's what we were looking for. The crc and padding @@ -65,6 +34,72 @@ struct scoutfs_block_header { __le64 blkno; } __packed; +/* + * We should be able to make the offset smaller if neither dirents nor + * data items use the full 64 bits. + */ +struct scoutfs_key { + __le64 inode; + u8 type; + __le64 offset; +} __packed; + +/* + * Currently we sort keys by the numeric value of the types, but that + * isn't necessary. We could have an arbitrary sort order. So we don't + * have to stress about cleverly allocating the types. + */ +#define SCOUTFS_INODE_KEY 1 +#define SCOUTFS_DIRENT_KEY 2 +#define SCOUTFS_DATA_KEY 3 + +#define SCOUTFS_MAX_ITEM_LEN 2048 + +/* + * Block references include the sequence number so that we can detect + * readers racing with writers and so that we can tell that we don't + * need to follow a reference when traversing based on seqs. + */ +struct scoutfs_block_ref { + __le64 blkno; + __le64 seq; +} __packed; + +struct scoutfs_treap_root { + __le16 off; +} __packed; + +struct scoutfs_treap_node { + __le16 parent; + __le16 left; + __le16 right; + __le32 prio; +} __packed; + +struct scoutfs_btree_root { + u8 height; + struct scoutfs_block_ref ref; +} __packed; + +struct scoutfs_btree_block { + struct scoutfs_block_header hdr; + struct scoutfs_treap_root treap; + __le16 total_free; + __le16 tail_free; + __le16 nr_items; +} __packed; + +struct scoutfs_btree_item { + struct scoutfs_key key; + struct scoutfs_treap_node tnode; + __le16 val_len; + char val[0]; +} __packed; + +/* Blocks are no more than half free. */ +#define SCOUTFS_BTREE_FREE_LIMIT \ + ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 2) + #define SCOUTFS_UUID_BYTES 16 /* @@ -81,142 +116,11 @@ struct scoutfs_super_block { struct scoutfs_block_header hdr; __le64 id; __u8 uuid[SCOUTFS_UUID_BYTES]; - __le32 bloom_salts[SCOUTFS_BLOOM_SALTS]; - __le64 total_chunks; - __le64 ring_map_blkno; - __le64 ring_map_seq; - __le64 ring_first_block; - __le64 ring_active_blocks; - __le64 ring_total_blocks; - __le64 ring_seq; -} __packed; - -/* - * We should be able to make the offset smaller if neither dirents nor - * data items use the full 64 bits. - */ -struct scoutfs_key { - __le64 inode; - u8 type; - __le64 offset; + struct scoutfs_btree_root btree_root; } __packed; #define SCOUTFS_ROOT_INO 1 -/* - * Currently we sort keys by the numeric value of the types, but that - * isn't necessary. We could have an arbitrary sort order. So we don't - * have to stress about cleverly allocating the types. - */ -#define SCOUTFS_INODE_KEY 1 -#define SCOUTFS_DIRENT_KEY 2 -#define SCOUTFS_DATA_KEY 3 - -struct scoutfs_ring_map_block { - struct scoutfs_block_header hdr; - __le32 nr_chunks; - __le64 blknos[0]; -} __packed; - -#define SCOUTFS_RING_MAP_BLOCKS \ - ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_ring_map_block)) / \ - sizeof(__le64)) - -struct scoutfs_ring_entry { - u8 type; - __le16 len; -} __packed; - -/* - * Ring blocks are stored in chunks described by the ring map blocks. - * - * The manifest entries describe the position of a given log segment in - * the manifest. They're keyed by the block number so that we can - * record movement of a log segment in the manifest with one ring entry - * and we can record deletion with just the block number. - */ -struct scoutfs_ring_block { - struct scoutfs_block_header hdr; - __le16 nr_entries; -} __packed; - -enum { - SCOUTFS_RING_ADD_MANIFEST = 0, - SCOUTFS_RING_DEL_MANIFEST, - SCOUTFS_RING_BITMAP, -}; - -/* - * Including both keys might make the manifest too large. It might be - * better to only include one key and infer a block's range from the - * neighbour's key. The downside of that is that we assume that there - * isn't unused key space between blocks in a level. We might search - * blocks when we didn't need to. - */ -struct scoutfs_manifest_entry { - __le64 blkno; - __le64 seq; - __u8 level; - struct scoutfs_key first; - struct scoutfs_key last; -} __packed; - -#define SCOUTFS_MANIFESTS_PER_LEVEL 10 - -/* 2^22 * 10^13 > 2^64 */ -#define SCOUTFS_MAX_LEVEL 13 - -struct scoutfs_ring_bitmap { - __le32 offset; - __le64 bits[2]; -} __packed; - - -struct scoutfs_bloom_block { - struct scoutfs_block_header hdr; - __le64 bits[0]; -} __packed; - -#define SCOUTFS_BLOOM_BITS_PER_BLOCK \ - (((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header)) / 8) * 64) - -/* - * Items in log segments are sorted in a skip list by their key. We - * have a rough limit of 64k items. - */ -#define SCOUTFS_SKIP_HEIGHT 16 -struct scoutfs_skip_root { - __le32 next[SCOUTFS_SKIP_HEIGHT]; -} __packed; - -/* - * An item block follows the bloom filter blocks at the start of a log - * segment. Its skip root references the item structs which then - * reference the item values in the rest of the block. The references - * are byte offsets from the start of the chunk. - */ -struct scoutfs_item_block { - struct scoutfs_block_header hdr; - struct scoutfs_key first; - struct scoutfs_key last; - struct scoutfs_skip_root skip_root; -} __packed; - -struct scoutfs_item { - struct scoutfs_key key; - __le32 offset; - __le16 len; - u8 skip_height; - __le32 skip_next[0]; -} __packed; - -/* - * Item size caps item file data item length so that they fit in checksummed - * 4k blocks with a bit of expansion room. - */ -#define SCOUTFS_MAX_ITEM_LEN \ - (SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header) - 32) - struct scoutfs_timespec { __le64 sec; __le32 nsec; diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c index 5658d3d9..ac0c196e 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -17,7 +17,6 @@ #include "crc.h" #include "rand.h" #include "dev.h" -#include "bloom.h" #include "bitops.h" /* @@ -44,21 +43,13 @@ static int write_new_fs(char *path, int fd) { struct scoutfs_super_block *super; struct scoutfs_inode *inode; - struct scoutfs_ring_map_block *map; - struct scoutfs_ring_block *ring; - struct scoutfs_ring_entry *ent; - struct scoutfs_manifest_entry *mani; - struct scoutfs_ring_bitmap *bm; - struct scoutfs_item_block *iblk; - struct scoutfs_bloom_bits bits; - struct scoutfs_bloom_block *blm; - struct scoutfs_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_btree_item *item; struct scoutfs_key root_key; struct timeval tv; char uuid_str[37]; unsigned int i; u64 size; - u64 total_chunks; u64 blkno; void *buf; int ret; @@ -81,14 +72,12 @@ static int write_new_fs(char *path, int fd) goto out; } - total_chunks = size >> SCOUTFS_CHUNK_SHIFT; - root_key.inode = cpu_to_le64(SCOUTFS_ROOT_INO); root_key.type = SCOUTFS_INODE_KEY; root_key.offset = 0; - /* first chunk has super blocks, log segment chunk is next */ - blkno = 1 << SCOUTFS_CHUNK_BLOCK_SHIFT; + /* start with the block after the supers */ + blkno = SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR; /* first initialize the super so we can use it to build structures */ memset(super, 0, SCOUTFS_BLOCK_SIZE); @@ -96,45 +85,21 @@ static int write_new_fs(char *path, int fd) super->hdr.seq = cpu_to_le64(1); super->id = cpu_to_le64(SCOUTFS_SUPER_ID); uuid_generate(super->uuid); - pseudo_random_bytes(super->bloom_salts, sizeof(super->bloom_salts)); - super->total_chunks = cpu_to_le64(total_chunks); - super->ring_map_seq = super->hdr.seq; - super->ring_first_block = cpu_to_le64(0); - super->ring_active_blocks = cpu_to_le64(1); - super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK); - super->ring_seq = super->hdr.seq; - /* - * There's only the root item so we check for its bloom bits as - * we write the bloom blocks. - */ - scoutfs_calc_bloom_bits(&bits, &root_key, super->bloom_salts); - for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) { - memset(buf, 0, SCOUTFS_BLOCK_SIZE); - blm = buf; - blm->hdr = super->hdr; - - scoutfs_set_bloom_bits(blm, i, &bits); - - ret = write_block(fd, blkno, &blm->hdr); - if (ret) - goto out; - blkno++; - } - - /* write a single log segment with the root inode item */ + /* write a btree leaf root inode item */ memset(buf, 0, SCOUTFS_BLOCK_SIZE); - iblk = buf; - iblk->hdr = super->hdr; - iblk->skip_root.next[0] = cpu_to_le32((SCOUTFS_BLOOM_BLOCKS << - SCOUTFS_BLOCK_SHIFT) + - sizeof(struct scoutfs_item_block)); - item = (void *)(iblk + 1); + bt = buf; + bt->hdr = super->hdr; + bt->nr_items = cpu_to_le16(1); + + item = (void *)(bt + 1); item->key = root_key; - item->offset = cpu_to_le32(le32_to_cpu(iblk->skip_root.next[0]) + - sizeof(struct scoutfs_item)); - item->len = cpu_to_le16(sizeof(struct scoutfs_inode)); - item->skip_height = 1; + item->tnode.parent = 0; + item->tnode.left = 0; + item->tnode.right = 0; + pseudo_random_bytes(&item->tnode.prio, sizeof(item->tnode.prio)); + item->val_len = cpu_to_le16(sizeof(struct scoutfs_inode)); + inode = (void *)(item + 1); inode->nlink = cpu_to_le32(2); inode->mode = cpu_to_le32(0755 | 0040000); @@ -145,52 +110,19 @@ static int write_new_fs(char *path, int fd) inode->mtime.sec = inode->atime.sec; inode->mtime.nsec = inode->atime.nsec; - ret = write_block(fd, blkno, &iblk->hdr); - if (ret) - goto out; - blkno = round_up(blkno, SCOUTFS_BLOCKS_PER_CHUNK); + bt->treap.off = cpu_to_le16((char *)&item->tnode - (char *)&bt->treap); + bt->total_free = cpu_to_le16(SCOUTFS_BLOCK_SIZE - + ((char *)(inode + 1) - (char *)bt)); + bt->tail_free = bt->total_free; - /* write the ring block whose manifest entry references the log block */ - memset(buf, 0, SCOUTFS_BLOCK_SIZE); - ring = buf; - ring->hdr = super->hdr; - ring->nr_entries = cpu_to_le16(2); - ent = (void *)(ring + 1); - ent->type = SCOUTFS_RING_ADD_MANIFEST; - ent->len = cpu_to_le16(sizeof(*mani)); - mani = (void *)(ent + 1); - mani->blkno = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK); - mani->seq = super->hdr.seq; - mani->level = 0; - mani->first = root_key; - mani->last = root_key; - ent = (void *)(mani + 1); - ent->type = SCOUTFS_RING_BITMAP; - ent->len = cpu_to_le16(sizeof(*bm)); - bm = (void *)(ent + 1); - memset(bm->bits, 0xff, sizeof(bm->bits)); - /* the first four chunks are allocated */ - bm->bits[0] = cpu_to_le64(~15ULL); - bm->bits[1] = cpu_to_le64(~0ULL); - - ret = write_block(fd, blkno, &ring->hdr); - if (ret) - goto out; - blkno += SCOUTFS_BLOCKS_PER_CHUNK; - - /* the ring has a single chunk for now */ - memset(buf, 0, SCOUTFS_BLOCK_SIZE); - map = buf; - map->hdr = super->hdr; - map->nr_chunks = cpu_to_le32(1); - map->blknos[0] = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK); - - ret = write_block(fd, blkno, &map->hdr); + ret = write_block(fd, blkno, &bt->hdr); if (ret) goto out; /* make sure the super references everything we just wrote */ - super->ring_map_blkno = cpu_to_le64(blkno); + super->btree_root.height = 1; + super->btree_root.ref.blkno = bt->hdr.blkno; + super->btree_root.ref.seq = bt->hdr.seq; /* write the two super blocks */ for (i = 0; i < SCOUTFS_SUPER_NR; i++) { @@ -210,12 +142,10 @@ static int write_new_fs(char *path, int fd) uuid_unparse(super->uuid, uuid_str); printf("Created scoutfs filesystem:\n" - " chunk bytes: %u\n" - " total chunks: %llu\n" + " block size: %u\n" " fsid: %llx\n" " uuid: %s\n", - SCOUTFS_CHUNK_SIZE, total_chunks, - le64_to_cpu(super->hdr.fsid), uuid_str); + SCOUTFS_BLOCK_SIZE, le64_to_cpu(super->hdr.fsid), uuid_str); ret = 0; out: diff --git a/utils/src/print.c b/utils/src/print.c index 9dc322ac..ca0275a3 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -42,91 +42,28 @@ static void *read_block(int fd, u64 blkno) return buf; } -static void *read_chunk(int fd, u64 blkno) -{ - ssize_t ret; - void *buf; - - buf = malloc(SCOUTFS_CHUNK_SIZE); - if (!buf) - return NULL; - - ret = pread(fd, buf, SCOUTFS_CHUNK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT); - if (ret != SCOUTFS_CHUNK_SIZE) { - fprintf(stderr, "read blkno %llu returned %zd: %s (%d)\n", - blkno, ret, strerror(errno), errno); - free(buf); - buf = NULL; - } - - return buf; -} - -static void print_le32_list(int indent, __le32 *data, int nr) -{ - char *fmt; - int pos; - int len; - int i; - u32 d; - - printf("["); - - pos = indent; - for (i = 0; i < nr; i++) { - if (i + 1 < nr) - fmt = "%u, "; - else - fmt = "%u"; - - d = le32_to_cpu(data[i]); - len = snprintf(NULL, 0, fmt, d); - if (pos + len > 78) { - printf("\n%*c", indent, ' '); - pos = indent; - } - - printf(fmt, d); - pos += len; - } - - printf("]\n"); -} - static void print_block_header(struct scoutfs_block_header *hdr) { u32 crc = crc_block(hdr); char valid_str[40]; if (crc != le32_to_cpu(hdr->crc)) - sprintf(valid_str, "# != %08x", crc); + sprintf(valid_str, "(!= %08x) ", crc); else valid_str[0] = '\0'; - printf(" header:\n" - " crc: %08x %s\n" - " fsid: %llx\n" - " seq: %llu\n" - " blkno: %llu\n", + printf(" hdr: crc %08x %sfsid %llx seq %llu blkno %llu\n", le32_to_cpu(hdr->crc), valid_str, le64_to_cpu(hdr->fsid), le64_to_cpu(hdr->seq), le64_to_cpu(hdr->blkno)); } static void print_inode(struct scoutfs_inode *inode) { - printf(" inode:\n" - " size: %llu\n" - " blocks: %llu\n" - " nlink: %u\n" - " uid: %u\n" - " gid: %u\n" - " mode: 0%o\n" - " rdev: 0x%x\n" - " salt: 0x%x\n" - " max_dirent_hash_nr: %u\n" - " atime: %llu.%08u\n" - " ctime: %llu.%08u\n" - " mtime: %llu.%08u\n", + printf(" inode: size: %llu blocks: %llu nlink: %u\n" + " uid: %u gid: %u mode: 0%o rdev: 0x%x\n" + " salt: 0x%x max_dirent_hash_nr: %u\n" + " atime: %llu.%08u ctime: %llu.%08u\n" + " mtime: %llu.%08u\n", le64_to_cpu(inode->size), le64_to_cpu(inode->blocks), le32_to_cpu(inode->nlink), le32_to_cpu(inode->uid), le32_to_cpu(inode->gid), le32_to_cpu(inode->mode), @@ -150,271 +87,83 @@ static void print_dirent(struct scoutfs_dirent *dent, unsigned int val_len) name[i] = isprint(dent->name[i]) ? dent->name[i] : '.'; name[i] = '\0'; - printf(" dirent:\n" - " ino: %llu\n" - " type: %u\n" - " name: \"%.*s\"\n", + printf(" dirent: ino: %llu type: %u name: \"%.*s\"\n", le64_to_cpu(dent->ino), dent->type, i, name); } -static void print_item(struct scoutfs_item *item, void *val) +static void print_btree_item(unsigned int off, struct scoutfs_btree_item *item) { - printf(" item:\n" - " key: "SKF"\n" - " offset: %u\n" - " len: %u\n" - " skip_height: %u\n" - " skip_next[]: ", - SKA(&item->key), - le32_to_cpu(item->offset), - le16_to_cpu(item->len), - item->skip_height); - - print_le32_list(22, item->skip_next, item->skip_height); + printf(" item: key "SKF" val_len %u off %u tnode: parent %u left %u right %u " + "prio %x\n", + SKA(&item->key), le16_to_cpu(item->val_len), off, + le16_to_cpu(item->tnode.parent), + le16_to_cpu(item->tnode.left), + le16_to_cpu(item->tnode.right), + le32_to_cpu(item->tnode.prio)); switch(item->key.type) { case SCOUTFS_INODE_KEY: - print_inode(val); + print_inode((void *)item->val); break; case SCOUTFS_DIRENT_KEY: - print_dirent(val, le16_to_cpu(item->len)); + print_dirent((void *)item->val, le16_to_cpu(item->val_len)); break; } } -static int print_log_segment(int fd, u64 nr) -{ - struct scoutfs_item_block *iblk; - struct scoutfs_bloom_block *blm; - struct scoutfs_item *item; - char *buf; - char *val; - __le32 next; - int i; - - buf = read_chunk(fd, nr); - if (!buf) - return -ENOMEM; - - for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) { - - blm = (void *)(buf + (i << SCOUTFS_BLOCK_SHIFT)); - - printf("bloom block:\n"); - print_block_header(&blm->hdr); - } - - iblk = (void *)(buf + (SCOUTFS_BLOOM_BLOCKS << SCOUTFS_BLOCK_SHIFT)); - - printf("item block:\n"); - print_block_header(&iblk->hdr); - printf(" first: "SKF"\n" - " last: "SKF"\n" - " skip_root.next[]: ", - SKA(&iblk->first), SKA(&iblk->last)); - print_le32_list(23, iblk->skip_root.next, SCOUTFS_SKIP_HEIGHT); - - next = iblk->skip_root.next[0]; - while (next) { - item = (void *)(buf + le32_to_cpu(next)); - val = (void *)(buf + le32_to_cpu(item->offset)); - print_item(item, val); - next = item->skip_next[0]; - } - - free(buf); - - return 0; -} - -static int print_log_segments(int fd, __le64 *log_segs, u64 total_chunks) +static int print_btree_block(int fd, __le64 blkno, u8 level) { + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_block_ref *ref; + unsigned int off; int ret = 0; int err; - s64 nr; - - while ((nr = find_first_le_bit(log_segs, total_chunks)) >= 0) { - clear_le_bit(log_segs, nr); - - err = print_log_segment(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT); - if (!ret && err) - ret = err; - } - - return ret; -} - -static char *ent_type_str(u8 type) -{ - switch (type) { - case SCOUTFS_RING_ADD_MANIFEST: - return "ADD_MANIFEST"; - case SCOUTFS_RING_DEL_MANIFEST: - return "DEL_MANIFEST"; - case SCOUTFS_RING_BITMAP: - return "BITMAP"; - default: - return "(unknown)"; - } -} - -static void print_ring_entry(int fd, struct scoutfs_ring_entry *ent) -{ - struct scoutfs_manifest_entry *ment; - struct scoutfs_ring_bitmap *bm; - - printf(" entry:\n" - " type: %u # %s\n" - " len: %u\n", - ent->type, ent_type_str(ent->type), le16_to_cpu(ent->len)); - - switch(ent->type) { - case SCOUTFS_RING_ADD_MANIFEST: - ment = (void *)(ent + 1); - printf(" blkno: %llu\n" - " seq: %llu\n" - " level: %u\n" - " first: "SKF"\n" - " last: "SKF"\n", - le64_to_cpu(ment->blkno), le64_to_cpu(ment->seq), - ment->level, SKA(&ment->first), SKA(&ment->last)); - break; - case SCOUTFS_RING_DEL_MANIFEST: - ment = (void *)(ent + 1); - printf(" blkno: %llu\n" - " seq: %llu\n" - " level: %u\n" - " first: "SKF"\n" - " last: "SKF"\n", - le64_to_cpu(ment->blkno), le64_to_cpu(ment->seq), - ment->level, SKA(&ment->first), SKA(&ment->last)); - break; - case SCOUTFS_RING_BITMAP: - bm = (void *)(ent + 1); - printf(" offset: %u\n" - " bits: 0x%llx%llx\n", - le32_to_cpu(bm->offset), - le64_to_cpu(bm->bits[1]), le64_to_cpu(bm->bits[0])); - break; - } -} - -static void update_log_segs(struct scoutfs_ring_entry *ent, - __le64 *log_segs) -{ - struct scoutfs_manifest_entry *ment; - u64 bit; - - switch(ent->type) { - case SCOUTFS_RING_ADD_MANIFEST: - ment = (void *)(ent + 1); - bit = le64_to_cpu(ment->blkno) >> SCOUTFS_CHUNK_BLOCK_SHIFT; - set_le_bit(log_segs, bit); - break; - case SCOUTFS_RING_DEL_MANIFEST: - ment = (void *)(ent + 1); - bit = le64_to_cpu(ment->blkno) >> SCOUTFS_CHUNK_BLOCK_SHIFT; - clear_le_bit(log_segs, bit); - break; - } -} - -static int print_ring_block(int fd, u64 blkno, __le64 *log_segs) -{ - struct scoutfs_ring_block *ring; - struct scoutfs_ring_entry *ent; - size_t off; - int ret = 0; int i; - /* XXX just printing the first block for now */ - - ring = read_block(fd, blkno); - if (!ring) + bt = read_block(fd, le64_to_cpu(blkno)); + if (!bt) return -ENOMEM; - printf("ring block:\n"); - print_block_header(&ring->hdr); - printf(" nr_entries: %u\n", le16_to_cpu(ring->nr_entries)); + printf("btree blkno %llu\n", le64_to_cpu(blkno)); + print_block_header(&bt->hdr); + printf(" treap.off %u total_free %u tail_free %u nr_items %u\n", + le16_to_cpu(bt->treap.off), + le16_to_cpu(bt->total_free), + le16_to_cpu(bt->tail_free), + le16_to_cpu(bt->nr_items)); - off = sizeof(struct scoutfs_ring_block); - for (i = 0; i < le16_to_cpu(ring->nr_entries); i++) { - ent = (void *)((char *)ring + off); - - update_log_segs(ent, log_segs); - print_ring_entry(fd, ent); - - off += sizeof(struct scoutfs_ring_entry) + - le16_to_cpu(ent->len); - } - - free(ring); - return ret; -} - -/* - * Print all the active ring blocks that are referenced by the super - * and which were mapped by the map blocks that we printed. - */ -static int print_ring_blocks(int fd, struct scoutfs_super_block *super, - u64 *ring_blknos, __le64 *log_segs) -{ - u64 block; - u64 blkno; - u64 i; - int ret = 0; - int err; - - block = le64_to_cpu(super->ring_first_block); - - for (i = 0; i < le64_to_cpu(super->ring_active_blocks); i++) { - blkno = ring_blknos[block >> SCOUTFS_CHUNK_BLOCK_SHIFT] + - (block & SCOUTFS_CHUNK_BLOCK_MASK); - - err = print_ring_block(fd, blkno, log_segs); - if (err && !ret) - ret = err; - - if (++block == le64_to_cpu(super->ring_total_blocks)) - block = 0; - } - - return ret; -} - -/* - * print a chunk's worth of map blocks and stop if we hit a partial - * block. - */ -static int print_map_blocks(int fd, u64 blkno, u64 *ring_blknos) -{ - struct scoutfs_ring_map_block *map; - int r = 0; - int b; - int i; - - for (b = 0; SCOUTFS_BLOCKS_PER_CHUNK; b++) { - map = read_block(fd, blkno + b); - if (!map) - return -ENOMEM; - - printf("map block:\n"); - print_block_header(&map->hdr); - printf(" nr_chunks: %u\n", le32_to_cpu(map->nr_chunks)); - - printf(" blknos: "); - for (i = 0; i < le32_to_cpu(map->nr_chunks); i++, r++) { - printf(" %llu\n", le64_to_cpu(map->blknos[i])); - ring_blknos[r] = le64_to_cpu(map->blknos[i]); + /* XXX just print in offset order */ + item = (void *)(bt + 1); + for (i = 0; i < le16_to_cpu(bt->nr_items); i++) { + if (item->tnode.parent == cpu_to_le16(1)) { + i--; + } else { + off = (char *)&item->tnode - (char *)&bt->treap; + print_btree_item(off, item); } - free(map); - - if (i != SCOUTFS_RING_MAP_BLOCKS) - break; + item = (void *)&item->val[le16_to_cpu(item->val_len)]; } - return 0; + item = (void *)(bt + 1); + for (i = 0; level && i < le16_to_cpu(bt->nr_items); i++) { + if (item->tnode.parent == cpu_to_le16(1)) { + i--; + } else { + ref = (void *)item->val; + + err = print_btree_block(fd, ref->blkno, level - 1); + if (err && !ret) + ret = err; + } + + item = (void *)&item->val[le16_to_cpu(item->val_len)]; + } + + free(bt); + + return ret; } static int print_super_blocks(int fd) @@ -422,9 +171,6 @@ static int print_super_blocks(int fd) struct scoutfs_super_block *super; struct scoutfs_super_block recent = { .hdr.seq = 0 }; char uuid_str[37]; - __le64 *log_segs; - u64 *ring_blknos; - u64 total_chunks; int ret = 0; int err; int i; @@ -436,28 +182,14 @@ static int print_super_blocks(int fd) uuid_unparse(super->uuid, uuid_str); - printf("super:\n"); + printf("super blkno %llu\n", (u64)SCOUTFS_SUPER_BLKNO + i); print_block_header(&super->hdr); - printf(" id: %llx\n" - " uuid: %s\n" - " bloom_salts: ", - le64_to_cpu(super->id), - uuid_str); - print_le32_list(18, super->bloom_salts, SCOUTFS_BLOOM_SALTS); - printf(" total_chunks: %llu\n" - " ring_map_blkno: %llu\n" - " ring_map_seq: %llu\n" - " ring_first_block: %llu\n" - " ring_active_blocks: %llu\n" - " ring_total_blocks: %llu\n" - " ring_seq: %llu\n", - le64_to_cpu(super->total_chunks), - le64_to_cpu(super->ring_map_blkno), - le64_to_cpu(super->ring_map_seq), - le64_to_cpu(super->ring_first_block), - le64_to_cpu(super->ring_active_blocks), - le64_to_cpu(super->ring_total_blocks), - le64_to_cpu(super->ring_seq)); + printf(" id %llx uuid %s\n", + le64_to_cpu(super->id), uuid_str); + printf(" btree_root: height %u seq %llu blkno %llu\n", + super->btree_root.height, + le64_to_cpu(super->btree_root.ref.seq), + le64_to_cpu(super->btree_root.ref.blkno)); if (le64_to_cpu(super->hdr.seq) > le64_to_cpu(recent.hdr.seq)) memcpy(&recent, super, sizeof(recent)); @@ -466,37 +198,12 @@ static int print_super_blocks(int fd) } super = &recent; - total_chunks = le64_to_cpu(super->total_chunks); - /* - * Allocate a bitmap big enough to describe all the chunks and - * we can have at most a full chunk worth of map blocks. - */ - log_segs = calloc(1, (total_chunks + 63) / 8); - ring_blknos = calloc(1, SCOUTFS_CHUNK_SIZE); - if (!log_segs || !ring_blknos) { - ret = -ENOMEM; - goto out; - } - - err = print_map_blocks(fd, le64_to_cpu(super->ring_map_blkno), - ring_blknos); + if (super->btree_root.height) + err = print_btree_block(fd, super->btree_root.ref.blkno, + super->btree_root.height - 1); if (err && !ret) ret = err; - - err = print_ring_blocks(fd, super, ring_blknos, log_segs); - if (err && !ret) - ret = err; - - err = print_log_segments(fd, log_segs, total_chunks); - if (err && !ret) - ret = err; - -out: - if (log_segs) - free(log_segs); - if (ring_blknos) - free(ring_blknos); return ret; }