Move to btree blocks

Update mkfs and printing for the btree experiment.

Signed-off-by: Zach Brown <zab@versity.com>
Zach Brown
2016-04-12 15:02:02 -07:00
parent c4fcf40097
commit 56077b61a1
5 changed files with 165 additions and 708 deletions

@@ -1,70 +0,0 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bloom.h"
#include "crc.h"
#include "bitops.h"
/* XXX garbage hack until we have siphash */
static u32 bloom_hash(struct scoutfs_key *key, __le32 salt)
{
return crc32c(le32_to_cpu(salt), key, sizeof(struct scoutfs_key));
}
/*
* Find the bits in the bloom filter for the given key. The caller calculates
* these once and uses them to test all the blocks.
*/
void scoutfs_calc_bloom_bits(struct scoutfs_bloom_bits *bits,
struct scoutfs_key *key, __le32 *salts)
{
unsigned h_bits = 0;
unsigned int b;
unsigned s = 0;
u64 h = 0;
int i;
for (i = 0; i < SCOUTFS_BLOOM_BITS; i++) {
if (h_bits < SCOUTFS_BLOOM_BIT_WIDTH) {
h = (h << 32) | bloom_hash(key, salts[s++]);
h_bits += 32;
}
b = h & SCOUTFS_BLOOM_BIT_MASK;
h >>= SCOUTFS_BLOOM_BIT_WIDTH;
h_bits -= SCOUTFS_BLOOM_BIT_WIDTH;
bits->block[i] = (b / SCOUTFS_BLOOM_BITS_PER_BLOCK) %
SCOUTFS_BLOOM_BLOCKS;
bits->bit_off[i] = b % SCOUTFS_BLOOM_BITS_PER_BLOCK;
}
}
/*
* This interface is different than in the kernel because we don't
* have a block IO interface here yet. The caller gives us each
* bloom block and we set each bit that falls in the block.
*/
void scoutfs_set_bloom_bits(struct scoutfs_bloom_block *blm, unsigned int nr,
struct scoutfs_bloom_bits *bits)
{
int i;
for (i = 0; i < SCOUTFS_BLOOM_BITS; i++) {
if (nr == bits->block[i]) {
set_bit_le(bits->bit_off[i], blm->bits);
}
}
}
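
For context, the mkfs path removed later in this commit drove this pair in
exactly this shape: calculate the bits once for the key, then hand each
bloom block of the log segment to scoutfs_set_bloom_bits() in turn
(condensed from the mkfs hunk below; write_block() is mkfs's own helper):

scoutfs_calc_bloom_bits(&bits, &root_key, super->bloom_salts);
for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
blm = buf;
blm->hdr = super->hdr;
/* only sets the bits whose block index matches this block nr */
scoutfs_set_bloom_bits(blm, i, &bits);
ret = write_block(fd, blkno++, &blm->hdr);
if (ret)
goto out;
}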

@@ -1,14 +0,0 @@
#ifndef _BLOOM_H_
#define _BLOOM_H_
struct scoutfs_bloom_bits {
u16 bit_off[SCOUTFS_BLOOM_BITS];
u8 block[SCOUTFS_BLOOM_BITS];
};
void scoutfs_calc_bloom_bits(struct scoutfs_bloom_bits *bits,
struct scoutfs_key *key, __le32 *salts);
void scoutfs_set_bloom_bits(struct scoutfs_bloom_block *blm, unsigned int nr,
struct scoutfs_bloom_bits *bits);
#endif

@@ -6,27 +6,12 @@
/* super block id */
#define SCOUTFS_SUPER_ID 0x2e736674756f6373ULL /* "scoutfs." */
/*
* Everything is stored in and addressed as 4k fixed size blocks. This
* avoids having to manage contiguous cpu mappings of larger blocks.
* Larger structures are read and written as multiple blocks.
*/
#define SCOUTFS_BLOCK_SHIFT 12
#define SCOUTFS_BLOCK_SHIFT 14
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
/*
* The allocator works on larger chunks. Smaller metadata structures
* like the super blocks and the ring are stored in chunks.
*
* A log segment is a collection of smaller blocks (bloom filter, item blocks)
* stored in a chunk.
*/
#define SCOUTFS_CHUNK_SHIFT 22
#define SCOUTFS_CHUNK_SIZE (1 << SCOUTFS_CHUNK_SHIFT)
#define SCOUTFS_CHUNK_BLOCK_SHIFT (SCOUTFS_CHUNK_SHIFT - SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_CHUNK_BLOCK_MASK ((1 << SCOUTFS_CHUNK_BLOCK_SHIFT) - 1)
#define SCOUTFS_BLOCKS_PER_CHUNK (1 << SCOUTFS_CHUNK_BLOCK_SHIFT)
#define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE)
#define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT)
/*
* The super blocks leave some room at the start of the first block for
@@ -35,22 +20,6 @@
#define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_SUPER_NR 2
/*
* The bloom filters are statically sized. It's a tradeoff between
* storage overhead and false positive rate. At the moment we have
* as few as 1000 and as many as 18000 items in a segment. We can
* get a ~1% false positive rate (triggering header search) at the
* high end with a ~20k bloom filter.
*
* n = 18,000, p = 0.01 (1 in 100) → m = 172,532 (21.06KB), k = 7
*/
#define SCOUTFS_BLOOM_BITS 7
#define SCOUTFS_BLOOM_BIT_WIDTH 18 /* 2^18 > m */
#define SCOUTFS_BLOOM_BIT_MASK ((1 << SCOUTFS_BLOOM_BIT_WIDTH) - 1)
#define SCOUTFS_BLOOM_BLOCKS ((20 * 1024) / SCOUTFS_BLOCK_SIZE)
#define SCOUTFS_BLOOM_SALTS \
DIV_ROUND_UP(SCOUTFS_BLOOM_BITS * SCOUTFS_BLOOM_BIT_WIDTH, 32)
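/*
* A cross-check of the sizing comment above using the standard bloom
* filter formulas (the usual textbook derivation, not from this tree):
*
* m = -n * ln(p) / (ln 2)^2 = 18000 * 4.605 / 0.4805 ~= 172,532 bits
* k = (m / n) * ln 2 = 9.585 * 0.693 ~= 6.6, rounded up to 7
*
* k = 7 matches SCOUTFS_BLOOM_BITS, m/8 is the 21.06KB quoted above,
* and 2^18 = 262,144 comfortably covers m, hence the bit width of 18.
*/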
/*
* This header is found at the start of every block so that we can
* verify that it's what we were looking for. The crc and padding
@@ -65,6 +34,72 @@ struct scoutfs_block_header {
__le64 blkno;
} __packed;
/*
* We should be able to make the offset smaller if neither dirents nor
* data items use the full 64 bits.
*/
struct scoutfs_key {
__le64 inode;
u8 type;
__le64 offset;
} __packed;
/*
* Currently we sort keys by the numeric value of the types, but that
* isn't necessary. We could have an arbitrary sort order. So we don't
* have to stress about cleverly allocating the types.
*/
#define SCOUTFS_INODE_KEY 1
#define SCOUTFS_DIRENT_KEY 2
#define SCOUTFS_DATA_KEY 3
#define SCOUTFS_MAX_ITEM_LEN 2048
/*
* Block references include the sequence number so that we can detect
* readers racing with writers and so that we can tell that we don't
* need to follow a reference when traversing based on seqs.
*/
struct scoutfs_block_ref {
__le64 blkno;
__le64 seq;
} __packed;
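/*
* A minimal sketch, not in this commit, of the check that comment
* implies: a reader that raced with a writer would find a block whose
* header no longer matches the ref it followed and could retry.
*/
static inline int scoutfs_ref_is_stale(struct scoutfs_block_ref *ref,
struct scoutfs_block_header *hdr)
{
return hdr->blkno != ref->blkno || hdr->seq != ref->seq;
}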
struct scoutfs_treap_root {
__le16 off;
} __packed;
struct scoutfs_treap_node {
__le16 parent;
__le16 left;
__le16 right;
__le32 prio;
} __packed;
struct scoutfs_btree_root {
u8 height;
struct scoutfs_block_ref ref;
} __packed;
struct scoutfs_btree_block {
struct scoutfs_block_header hdr;
struct scoutfs_treap_root treap;
__le16 total_free;
__le16 tail_free;
__le16 nr_items;
} __packed;
struct scoutfs_btree_item {
struct scoutfs_key key;
struct scoutfs_treap_node tnode;
__le16 val_len;
char val[0];
} __packed;
/* Blocks are no more than half free. */
#define SCOUTFS_BTREE_FREE_LIMIT \
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 2)
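/*
* For orientation, the freshly made leaf that mkfs writes in this
* commit lays out like so:
*
* struct scoutfs_btree_block (hdr, treap root, free counts)
* struct scoutfs_btree_item (key, tnode, val_len)
* val_len bytes of item value
* tail_free bytes of free space to the end of the block
*
* treap.off and the tnode links are byte offsets from &bt->treap with
* 0 as the null, total_free counts every free byte in the block, and
* tail_free counts only the contiguous run at the tail, so the two
* are equal in a freshly packed block.
*/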
#define SCOUTFS_UUID_BYTES 16
/*
@@ -81,142 +116,11 @@ struct scoutfs_super_block {
struct scoutfs_block_header hdr;
__le64 id;
__u8 uuid[SCOUTFS_UUID_BYTES];
__le32 bloom_salts[SCOUTFS_BLOOM_SALTS];
__le64 total_chunks;
__le64 ring_map_blkno;
__le64 ring_map_seq;
__le64 ring_first_block;
__le64 ring_active_blocks;
__le64 ring_total_blocks;
__le64 ring_seq;
} __packed;
/*
* We should be able to make the offset smaller if neither dirents nor
* data items use the full 64 bits.
*/
struct scoutfs_key {
__le64 inode;
u8 type;
__le64 offset;
struct scoutfs_btree_root btree_root;
} __packed;
#define SCOUTFS_ROOT_INO 1
/*
* Currently we sort keys by the numeric value of the types, but that
* isn't necessary. We could have an arbitrary sort order. So we don't
* have to stress about cleverly allocating the types.
*/
#define SCOUTFS_INODE_KEY 1
#define SCOUTFS_DIRENT_KEY 2
#define SCOUTFS_DATA_KEY 3
struct scoutfs_ring_map_block {
struct scoutfs_block_header hdr;
__le32 nr_chunks;
__le64 blknos[0];
} __packed;
#define SCOUTFS_RING_MAP_BLOCKS \
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_ring_map_block)) / \
sizeof(__le64))
struct scoutfs_ring_entry {
u8 type;
__le16 len;
} __packed;
/*
* Ring blocks are stored in chunks described by the ring map blocks.
*
* The manifest entries describe the position of a given log segment in
* the manifest. They're keyed by the block number so that we can
* record movement of a log segment in the manifest with one ring entry
* and we can record deletion with just the block number.
*/
struct scoutfs_ring_block {
struct scoutfs_block_header hdr;
__le16 nr_entries;
} __packed;
enum {
SCOUTFS_RING_ADD_MANIFEST = 0,
SCOUTFS_RING_DEL_MANIFEST,
SCOUTFS_RING_BITMAP,
};
/*
* Including both keys might make the manifest too large. It might be
* better to only include one key and infer a block's range from the
* neighbour's key. The downside of that is that we assume that there
* isn't unused key space between blocks in a level. We might search
* blocks when we didn't need to.
*/
struct scoutfs_manifest_entry {
__le64 blkno;
__le64 seq;
__u8 level;
struct scoutfs_key first;
struct scoutfs_key last;
} __packed;
#define SCOUTFS_MANIFESTS_PER_LEVEL 10
/* 2^22 * 10^13 > 2^64 */
#define SCOUTFS_MAX_LEVEL 13
struct scoutfs_ring_bitmap {
__le32 offset;
__le64 bits[2];
} __packed;
struct scoutfs_bloom_block {
struct scoutfs_block_header hdr;
__le64 bits[0];
} __packed;
#define SCOUTFS_BLOOM_BITS_PER_BLOCK \
(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header)) / 8) * 64)
/*
* Items in log segments are sorted in a skip list by their key. We
* have a rough limit of 64k items.
*/
#define SCOUTFS_SKIP_HEIGHT 16
struct scoutfs_skip_root {
__le32 next[SCOUTFS_SKIP_HEIGHT];
} __packed;
/*
* An item block follows the bloom filter blocks at the start of a log
* segment. Its skip root references the item structs which then
* reference the item values in the rest of the block. The references
* are byte offsets from the start of the chunk.
*/
struct scoutfs_item_block {
struct scoutfs_block_header hdr;
struct scoutfs_key first;
struct scoutfs_key last;
struct scoutfs_skip_root skip_root;
} __packed;
struct scoutfs_item {
struct scoutfs_key key;
__le32 offset;
__le16 len;
u8 skip_height;
__le32 skip_next[0];
} __packed;
/*
* The maximum item length caps file data items so that they fit in
* checksummed 4k blocks with a bit of expansion room.
*/
#define SCOUTFS_MAX_ITEM_LEN \
(SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header) - 32)
struct scoutfs_timespec {
__le64 sec;
__le32 nsec;

@@ -17,7 +17,6 @@
#include "crc.h"
#include "rand.h"
#include "dev.h"
#include "bloom.h"
#include "bitops.h"
/*
@@ -44,21 +43,13 @@ static int write_new_fs(char *path, int fd)
{
struct scoutfs_super_block *super;
struct scoutfs_inode *inode;
struct scoutfs_ring_map_block *map;
struct scoutfs_ring_block *ring;
struct scoutfs_ring_entry *ent;
struct scoutfs_manifest_entry *mani;
struct scoutfs_ring_bitmap *bm;
struct scoutfs_item_block *iblk;
struct scoutfs_bloom_bits bits;
struct scoutfs_bloom_block *blm;
struct scoutfs_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_btree_item *item;
struct scoutfs_key root_key;
struct timeval tv;
char uuid_str[37];
unsigned int i;
u64 size;
u64 total_chunks;
u64 blkno;
void *buf;
int ret;
@@ -81,14 +72,12 @@ static int write_new_fs(char *path, int fd)
goto out;
}
total_chunks = size >> SCOUTFS_CHUNK_SHIFT;
root_key.inode = cpu_to_le64(SCOUTFS_ROOT_INO);
root_key.type = SCOUTFS_INODE_KEY;
root_key.offset = 0;
/* first chunk has super blocks, log segment chunk is next */
blkno = 1 << SCOUTFS_CHUNK_BLOCK_SHIFT;
/* start with the block after the supers */
blkno = SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR;
/* first initialize the super so we can use it to build structures */
memset(super, 0, SCOUTFS_BLOCK_SIZE);
@@ -96,45 +85,21 @@ static int write_new_fs(char *path, int fd)
super->hdr.seq = cpu_to_le64(1);
super->id = cpu_to_le64(SCOUTFS_SUPER_ID);
uuid_generate(super->uuid);
pseudo_random_bytes(super->bloom_salts, sizeof(super->bloom_salts));
super->total_chunks = cpu_to_le64(total_chunks);
super->ring_map_seq = super->hdr.seq;
super->ring_first_block = cpu_to_le64(0);
super->ring_active_blocks = cpu_to_le64(1);
super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK);
super->ring_seq = super->hdr.seq;
/*
* There's only the root item so we check for its bloom bits as
* we write the bloom blocks.
*/
scoutfs_calc_bloom_bits(&bits, &root_key, super->bloom_salts);
for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
blm = buf;
blm->hdr = super->hdr;
scoutfs_set_bloom_bits(blm, i, &bits);
ret = write_block(fd, blkno, &blm->hdr);
if (ret)
goto out;
blkno++;
}
/* write a single log segment with the root inode item */
/* write a btree leaf root inode item */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
iblk = buf;
iblk->hdr = super->hdr;
iblk->skip_root.next[0] = cpu_to_le32((SCOUTFS_BLOOM_BLOCKS <<
SCOUTFS_BLOCK_SHIFT) +
sizeof(struct scoutfs_item_block));
item = (void *)(iblk + 1);
bt = buf;
bt->hdr = super->hdr;
bt->nr_items = cpu_to_le16(1);
item = (void *)(bt + 1);
item->key = root_key;
item->offset = cpu_to_le32(le32_to_cpu(iblk->skip_root.next[0]) +
sizeof(struct scoutfs_item));
item->len = cpu_to_le16(sizeof(struct scoutfs_inode));
item->skip_height = 1;
item->tnode.parent = 0;
item->tnode.left = 0;
item->tnode.right = 0;
pseudo_random_bytes(&item->tnode.prio, sizeof(item->tnode.prio));
item->val_len = cpu_to_le16(sizeof(struct scoutfs_inode));
inode = (void *)(item + 1);
inode->nlink = cpu_to_le32(2);
inode->mode = cpu_to_le32(0755 | 0040000);
@@ -145,52 +110,19 @@ static int write_new_fs(char *path, int fd)
inode->mtime.sec = inode->atime.sec;
inode->mtime.nsec = inode->atime.nsec;
ret = write_block(fd, blkno, &iblk->hdr);
if (ret)
goto out;
blkno = round_up(blkno, SCOUTFS_BLOCKS_PER_CHUNK);
bt->treap.off = cpu_to_le16((char *)&item->tnode - (char *)&bt->treap);
bt->total_free = cpu_to_le16(SCOUTFS_BLOCK_SIZE -
((char *)(inode + 1) - (char *)bt));
bt->tail_free = bt->total_free;
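/*
* With the single item in place treap.off is the item's tnode as a
* byte offset from &bt->treap, and all of the free space is one run
* from the end of the inode value to the end of the block, so
* total_free and tail_free agree.
*/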
/* write the ring block whose manifest entry references the log block */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
ring = buf;
ring->hdr = super->hdr;
ring->nr_entries = cpu_to_le16(2);
ent = (void *)(ring + 1);
ent->type = SCOUTFS_RING_ADD_MANIFEST;
ent->len = cpu_to_le16(sizeof(*mani));
mani = (void *)(ent + 1);
mani->blkno = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK);
mani->seq = super->hdr.seq;
mani->level = 0;
mani->first = root_key;
mani->last = root_key;
ent = (void *)(mani + 1);
ent->type = SCOUTFS_RING_BITMAP;
ent->len = cpu_to_le16(sizeof(*bm));
bm = (void *)(ent + 1);
memset(bm->bits, 0xff, sizeof(bm->bits));
/* the first four chunks are allocated */
bm->bits[0] = cpu_to_le64(~15ULL);
bm->bits[1] = cpu_to_le64(~0ULL);
ret = write_block(fd, blkno, &ring->hdr);
if (ret)
goto out;
blkno += SCOUTFS_BLOCKS_PER_CHUNK;
/* the ring has a single chunk for now */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
map = buf;
map->hdr = super->hdr;
map->nr_chunks = cpu_to_le32(1);
map->blknos[0] = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK);
ret = write_block(fd, blkno, &map->hdr);
ret = write_block(fd, blkno, &bt->hdr);
if (ret)
goto out;
/* make sure the super references everything we just wrote */
super->ring_map_blkno = cpu_to_le64(blkno);
super->btree_root.height = 1;
super->btree_root.ref.blkno = bt->hdr.blkno;
super->btree_root.ref.seq = bt->hdr.seq;
/* write the two super blocks */
for (i = 0; i < SCOUTFS_SUPER_NR; i++) {
@@ -210,12 +142,10 @@ static int write_new_fs(char *path, int fd)
uuid_unparse(super->uuid, uuid_str);
printf("Created scoutfs filesystem:\n"
" chunk bytes: %u\n"
" total chunks: %llu\n"
" block size: %u\n"
" fsid: %llx\n"
" uuid: %s\n",
SCOUTFS_CHUNK_SIZE, total_chunks,
le64_to_cpu(super->hdr.fsid), uuid_str);
SCOUTFS_BLOCK_SIZE, le64_to_cpu(super->hdr.fsid), uuid_str);
ret = 0;
out:

@@ -42,91 +42,28 @@ static void *read_block(int fd, u64 blkno)
return buf;
}
static void *read_chunk(int fd, u64 blkno)
{
ssize_t ret;
void *buf;
buf = malloc(SCOUTFS_CHUNK_SIZE);
if (!buf)
return NULL;
ret = pread(fd, buf, SCOUTFS_CHUNK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT);
if (ret != SCOUTFS_CHUNK_SIZE) {
fprintf(stderr, "read blkno %llu returned %zd: %s (%d)\n",
blkno, ret, strerror(errno), errno);
free(buf);
buf = NULL;
}
return buf;
}
static void print_le32_list(int indent, __le32 *data, int nr)
{
char *fmt;
int pos;
int len;
int i;
u32 d;
printf("[");
pos = indent;
for (i = 0; i < nr; i++) {
if (i + 1 < nr)
fmt = "%u, ";
else
fmt = "%u";
d = le32_to_cpu(data[i]);
len = snprintf(NULL, 0, fmt, d);
if (pos + len > 78) {
printf("\n%*c", indent, ' ');
pos = indent;
}
printf(fmt, d);
pos += len;
}
printf("]\n");
}
static void print_block_header(struct scoutfs_block_header *hdr)
{
u32 crc = crc_block(hdr);
char valid_str[40];
if (crc != le32_to_cpu(hdr->crc))
sprintf(valid_str, "# != %08x", crc);
sprintf(valid_str, "(!= %08x) ", crc);
else
valid_str[0] = '\0';
printf(" header:\n"
" crc: %08x %s\n"
" fsid: %llx\n"
" seq: %llu\n"
" blkno: %llu\n",
printf(" hdr: crc %08x %sfsid %llx seq %llu blkno %llu\n",
le32_to_cpu(hdr->crc), valid_str, le64_to_cpu(hdr->fsid),
le64_to_cpu(hdr->seq), le64_to_cpu(hdr->blkno));
}
static void print_inode(struct scoutfs_inode *inode)
{
printf(" inode:\n"
" size: %llu\n"
" blocks: %llu\n"
" nlink: %u\n"
" uid: %u\n"
" gid: %u\n"
" mode: 0%o\n"
" rdev: 0x%x\n"
" salt: 0x%x\n"
" max_dirent_hash_nr: %u\n"
" atime: %llu.%08u\n"
" ctime: %llu.%08u\n"
" mtime: %llu.%08u\n",
printf(" inode: size: %llu blocks: %llu nlink: %u\n"
" uid: %u gid: %u mode: 0%o rdev: 0x%x\n"
" salt: 0x%x max_dirent_hash_nr: %u\n"
" atime: %llu.%08u ctime: %llu.%08u\n"
" mtime: %llu.%08u\n",
le64_to_cpu(inode->size), le64_to_cpu(inode->blocks),
le32_to_cpu(inode->nlink), le32_to_cpu(inode->uid),
le32_to_cpu(inode->gid), le32_to_cpu(inode->mode),
@@ -150,271 +87,83 @@ static void print_dirent(struct scoutfs_dirent *dent, unsigned int val_len)
name[i] = isprint(dent->name[i]) ? dent->name[i] : '.';
name[i] = '\0';
printf(" dirent:\n"
" ino: %llu\n"
" type: %u\n"
" name: \"%.*s\"\n",
printf(" dirent: ino: %llu type: %u name: \"%.*s\"\n",
le64_to_cpu(dent->ino), dent->type, i, name);
}
static void print_item(struct scoutfs_item *item, void *val)
static void print_btree_item(unsigned int off, struct scoutfs_btree_item *item)
{
printf(" item:\n"
" key: "SKF"\n"
" offset: %u\n"
" len: %u\n"
" skip_height: %u\n"
" skip_next[]: ",
SKA(&item->key),
le32_to_cpu(item->offset),
le16_to_cpu(item->len),
item->skip_height);
print_le32_list(22, item->skip_next, item->skip_height);
printf(" item: key "SKF" val_len %u off %u tnode: parent %u left %u right %u "
"prio %x\n",
SKA(&item->key), le16_to_cpu(item->val_len), off,
le16_to_cpu(item->tnode.parent),
le16_to_cpu(item->tnode.left),
le16_to_cpu(item->tnode.right),
le32_to_cpu(item->tnode.prio));
switch(item->key.type) {
case SCOUTFS_INODE_KEY:
print_inode(val);
print_inode((void *)item->val);
break;
case SCOUTFS_DIRENT_KEY:
print_dirent(val, le16_to_cpu(item->len));
print_dirent((void *)item->val, le16_to_cpu(item->val_len));
break;
}
}
static int print_log_segment(int fd, u64 nr)
{
struct scoutfs_item_block *iblk;
struct scoutfs_bloom_block *blm;
struct scoutfs_item *item;
char *buf;
char *val;
__le32 next;
int i;
buf = read_chunk(fd, nr);
if (!buf)
return -ENOMEM;
for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
blm = (void *)(buf + (i << SCOUTFS_BLOCK_SHIFT));
printf("bloom block:\n");
print_block_header(&blm->hdr);
}
iblk = (void *)(buf + (SCOUTFS_BLOOM_BLOCKS << SCOUTFS_BLOCK_SHIFT));
printf("item block:\n");
print_block_header(&iblk->hdr);
printf(" first: "SKF"\n"
" last: "SKF"\n"
" skip_root.next[]: ",
SKA(&iblk->first), SKA(&iblk->last));
print_le32_list(23, iblk->skip_root.next, SCOUTFS_SKIP_HEIGHT);
next = iblk->skip_root.next[0];
while (next) {
item = (void *)(buf + le32_to_cpu(next));
val = (void *)(buf + le32_to_cpu(item->offset));
print_item(item, val);
next = item->skip_next[0];
}
free(buf);
return 0;
}
static int print_log_segments(int fd, __le64 *log_segs, u64 total_chunks)
static int print_btree_block(int fd, __le64 blkno, u8 level)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_block_ref *ref;
unsigned int off;
int ret = 0;
int err;
s64 nr;
while ((nr = find_first_le_bit(log_segs, total_chunks)) >= 0) {
clear_le_bit(log_segs, nr);
err = print_log_segment(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT);
if (!ret && err)
ret = err;
}
return ret;
}
static char *ent_type_str(u8 type)
{
switch (type) {
case SCOUTFS_RING_ADD_MANIFEST:
return "ADD_MANIFEST";
case SCOUTFS_RING_DEL_MANIFEST:
return "DEL_MANIFEST";
case SCOUTFS_RING_BITMAP:
return "BITMAP";
default:
return "(unknown)";
}
}
static void print_ring_entry(int fd, struct scoutfs_ring_entry *ent)
{
struct scoutfs_manifest_entry *ment;
struct scoutfs_ring_bitmap *bm;
printf(" entry:\n"
" type: %u # %s\n"
" len: %u\n",
ent->type, ent_type_str(ent->type), le16_to_cpu(ent->len));
switch(ent->type) {
case SCOUTFS_RING_ADD_MANIFEST:
ment = (void *)(ent + 1);
printf(" blkno: %llu\n"
" seq: %llu\n"
" level: %u\n"
" first: "SKF"\n"
" last: "SKF"\n",
le64_to_cpu(ment->blkno), le64_to_cpu(ment->seq),
ment->level, SKA(&ment->first), SKA(&ment->last));
break;
case SCOUTFS_RING_DEL_MANIFEST:
ment = (void *)(ent + 1);
printf(" blkno: %llu\n"
" seq: %llu\n"
" level: %u\n"
" first: "SKF"\n"
" last: "SKF"\n",
le64_to_cpu(ment->blkno), le64_to_cpu(ment->seq),
ment->level, SKA(&ment->first), SKA(&ment->last));
break;
case SCOUTFS_RING_BITMAP:
bm = (void *)(ent + 1);
printf(" offset: %u\n"
" bits: 0x%llx%llx\n",
le32_to_cpu(bm->offset),
le64_to_cpu(bm->bits[1]), le64_to_cpu(bm->bits[0]));
break;
}
}
static void update_log_segs(struct scoutfs_ring_entry *ent,
__le64 *log_segs)
{
struct scoutfs_manifest_entry *ment;
u64 bit;
switch(ent->type) {
case SCOUTFS_RING_ADD_MANIFEST:
ment = (void *)(ent + 1);
bit = le64_to_cpu(ment->blkno) >> SCOUTFS_CHUNK_BLOCK_SHIFT;
set_le_bit(log_segs, bit);
break;
case SCOUTFS_RING_DEL_MANIFEST:
ment = (void *)(ent + 1);
bit = le64_to_cpu(ment->blkno) >> SCOUTFS_CHUNK_BLOCK_SHIFT;
clear_le_bit(log_segs, bit);
break;
}
}
static int print_ring_block(int fd, u64 blkno, __le64 *log_segs)
{
struct scoutfs_ring_block *ring;
struct scoutfs_ring_entry *ent;
size_t off;
int ret = 0;
int i;
/* XXX just printing the first block for now */
ring = read_block(fd, blkno);
if (!ring)
bt = read_block(fd, le64_to_cpu(blkno));
if (!bt)
return -ENOMEM;
printf("ring block:\n");
print_block_header(&ring->hdr);
printf(" nr_entries: %u\n", le16_to_cpu(ring->nr_entries));
printf("btree blkno %llu\n", le64_to_cpu(blkno));
print_block_header(&bt->hdr);
printf(" treap.off %u total_free %u tail_free %u nr_items %u\n",
le16_to_cpu(bt->treap.off),
le16_to_cpu(bt->total_free),
le16_to_cpu(bt->tail_free),
le16_to_cpu(bt->nr_items));
off = sizeof(struct scoutfs_ring_block);
for (i = 0; i < le16_to_cpu(ring->nr_entries); i++) {
ent = (void *)((char *)ring + off);
update_log_segs(ent, log_segs);
print_ring_entry(fd, ent);
off += sizeof(struct scoutfs_ring_entry) +
le16_to_cpu(ent->len);
}
free(ring);
return ret;
}
/*
* Print all the active ring blocks that are referenced by the super
* and which were mapped by the map blocks that we printed.
*/
static int print_ring_blocks(int fd, struct scoutfs_super_block *super,
u64 *ring_blknos, __le64 *log_segs)
{
u64 block;
u64 blkno;
u64 i;
int ret = 0;
int err;
block = le64_to_cpu(super->ring_first_block);
for (i = 0; i < le64_to_cpu(super->ring_active_blocks); i++) {
blkno = ring_blknos[block >> SCOUTFS_CHUNK_BLOCK_SHIFT] +
(block & SCOUTFS_CHUNK_BLOCK_MASK);
err = print_ring_block(fd, blkno, log_segs);
if (err && !ret)
ret = err;
if (++block == le64_to_cpu(super->ring_total_blocks))
block = 0;
}
return ret;
}
/*
* print a chunk's worth of map blocks and stop if we hit a partial
* block.
*/
static int print_map_blocks(int fd, u64 blkno, u64 *ring_blknos)
{
struct scoutfs_ring_map_block *map;
int r = 0;
int b;
int i;
for (b = 0; b < SCOUTFS_BLOCKS_PER_CHUNK; b++) {
map = read_block(fd, blkno + b);
if (!map)
return -ENOMEM;
printf("map block:\n");
print_block_header(&map->hdr);
printf(" nr_chunks: %u\n", le32_to_cpu(map->nr_chunks));
printf(" blknos: ");
for (i = 0; i < le32_to_cpu(map->nr_chunks); i++, r++) {
printf(" %llu\n", le64_to_cpu(map->blknos[i]));
ring_blknos[r] = le64_to_cpu(map->blknos[i]);
/* XXX just print in offset order */
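/*
* Items whose tnode.parent is 1 appear to be dead placeholders:
* they're skipped and, via the i--, don't count against nr_items.
*/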
item = (void *)(bt + 1);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
if (item->tnode.parent == cpu_to_le16(1)) {
i--;
} else {
off = (char *)&item->tnode - (char *)&bt->treap;
print_btree_item(off, item);
}
free(map);
if (i != SCOUTFS_RING_MAP_BLOCKS)
break;
item = (void *)&item->val[le16_to_cpu(item->val_len)];
}
return 0;
item = (void *)(bt + 1);
for (i = 0; level && i < le16_to_cpu(bt->nr_items); i++) {
if (item->tnode.parent == cpu_to_le16(1)) {
i--;
} else {
ref = (void *)item->val;
err = print_btree_block(fd, ref->blkno, level - 1);
if (err && !ret)
ret = err;
}
item = (void *)&item->val[le16_to_cpu(item->val_len)];
}
free(bt);
return ret;
}
static int print_super_blocks(int fd)
@@ -422,9 +171,6 @@ static int print_super_blocks(int fd)
struct scoutfs_super_block *super;
struct scoutfs_super_block recent = { .hdr.seq = 0 };
char uuid_str[37];
__le64 *log_segs;
u64 *ring_blknos;
u64 total_chunks;
int ret = 0;
int err;
int i;
@@ -436,28 +182,14 @@ static int print_super_blocks(int fd)
uuid_unparse(super->uuid, uuid_str);
printf("super:\n");
printf("super blkno %llu\n", (u64)SCOUTFS_SUPER_BLKNO + i);
print_block_header(&super->hdr);
printf(" id: %llx\n"
" uuid: %s\n"
" bloom_salts: ",
le64_to_cpu(super->id),
uuid_str);
print_le32_list(18, super->bloom_salts, SCOUTFS_BLOOM_SALTS);
printf(" total_chunks: %llu\n"
" ring_map_blkno: %llu\n"
" ring_map_seq: %llu\n"
" ring_first_block: %llu\n"
" ring_active_blocks: %llu\n"
" ring_total_blocks: %llu\n"
" ring_seq: %llu\n",
le64_to_cpu(super->total_chunks),
le64_to_cpu(super->ring_map_blkno),
le64_to_cpu(super->ring_map_seq),
le64_to_cpu(super->ring_first_block),
le64_to_cpu(super->ring_active_blocks),
le64_to_cpu(super->ring_total_blocks),
le64_to_cpu(super->ring_seq));
printf(" id %llx uuid %s\n",
le64_to_cpu(super->id), uuid_str);
printf(" btree_root: height %u seq %llu blkno %llu\n",
super->btree_root.height,
le64_to_cpu(super->btree_root.ref.seq),
le64_to_cpu(super->btree_root.ref.blkno));
if (le64_to_cpu(super->hdr.seq) > le64_to_cpu(recent.hdr.seq))
memcpy(&recent, super, sizeof(recent));
@@ -466,37 +198,12 @@ static int print_super_blocks(int fd)
}
super = &recent;
total_chunks = le64_to_cpu(super->total_chunks);
/*
* Allocate a bitmap big enough to describe all the chunks and
* we can have at most a full chunk worth of map blocks.
*/
log_segs = calloc(1, (total_chunks + 63) / 8);
ring_blknos = calloc(1, SCOUTFS_CHUNK_SIZE);
if (!log_segs || !ring_blknos) {
ret = -ENOMEM;
goto out;
}
err = print_map_blocks(fd, le64_to_cpu(super->ring_map_blkno),
ring_blknos);
if (super->btree_root.height)
err = print_btree_block(fd, super->btree_root.ref.blkno,
super->btree_root.height - 1);
if (err && !ret)
ret = err;
err = print_ring_blocks(fd, super, ring_blknos, log_segs);
if (err && !ret)
ret = err;
err = print_log_segments(fd, log_segs, total_chunks);
if (err && !ret)
ret = err;
out:
if (log_segs)
free(log_segs);
if (ring_blknos)
free(ring_blknos);
return ret;
}