mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-05 11:45:09 +00:00
Move to btree blocks
Update mkfs and printing for the btree experiment. Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
@@ -1,70 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "bloom.h"
|
||||
#include "crc.h"
|
||||
#include "bitops.h"
|
||||
|
||||
/* XXX garbage hack until we have siphash */
|
||||
static u32 bloom_hash(struct scoutfs_key *key, __le32 salt)
|
||||
{
|
||||
return crc32c(le32_to_cpu(salt), key, sizeof(struct scoutfs_key));
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the bits in the bloom filter for the given key. The caller calculates
|
||||
* these once and uses them to test all the blocks.
|
||||
*/
|
||||
void scoutfs_calc_bloom_bits(struct scoutfs_bloom_bits *bits,
			     struct scoutfs_key *key, __le32 *salts)
{
	__le32 *salt = salts;	/* next unused salt */
	unsigned pool_bits = 0;	/* hash bits still available in pool */
	unsigned int bit;
	u64 pool = 0;
	int nr = 0;

	while (nr < SCOUTFS_BLOOM_BITS) {
		/*
		 * Each hash gives us 32 more bits.  Only hash again, with
		 * the next salt, once the pool can't supply a full bit
		 * number.
		 */
		if (pool_bits < SCOUTFS_BLOOM_BIT_WIDTH) {
			pool <<= 32;
			pool |= bloom_hash(key, *salt);
			salt++;
			pool_bits += 32;
		}

		/* peel the next bit number off the bottom of the pool */
		bit = pool & SCOUTFS_BLOOM_BIT_MASK;
		pool >>= SCOUTFS_BLOOM_BIT_WIDTH;
		pool_bits -= SCOUTFS_BLOOM_BIT_WIDTH;

		/* split it into a bloom block and a bit within that block */
		bits->bit_off[nr] = bit % SCOUTFS_BLOOM_BITS_PER_BLOCK;
		bits->block[nr] = (bit / SCOUTFS_BLOOM_BITS_PER_BLOCK) %
				  SCOUTFS_BLOOM_BLOCKS;
		nr++;
	}
}
|
||||
|
||||
/*
|
||||
* This interface is different than in the kernel because we don't
|
||||
* have a block IO interface here yet. The caller gives us each
|
||||
* bloom block and we set each bit that falls in the block.
|
||||
*/
|
||||
void scoutfs_set_bloom_bits(struct scoutfs_bloom_block *blm, unsigned int nr,
|
||||
struct scoutfs_bloom_bits *bits)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < SCOUTFS_BLOOM_BITS; i++) {
|
||||
if (nr == bits->block[i]) {
|
||||
set_bit_le(bits->bit_off[i], blm->bits);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
#ifndef _BLOOM_H_
|
||||
#define _BLOOM_H_
|
||||
|
||||
struct scoutfs_bloom_bits {
|
||||
u16 bit_off[SCOUTFS_BLOOM_BITS];
|
||||
u8 block[SCOUTFS_BLOOM_BITS];
|
||||
};
|
||||
|
||||
void scoutfs_calc_bloom_bits(struct scoutfs_bloom_bits *bits,
|
||||
struct scoutfs_key *key, __le32 *salts);
|
||||
void scoutfs_set_bloom_bits(struct scoutfs_bloom_block *blm, unsigned int nr,
|
||||
struct scoutfs_bloom_bits *bits);
|
||||
|
||||
#endif
|
||||
@@ -6,27 +6,12 @@
|
||||
/* super block id */
|
||||
#define SCOUTFS_SUPER_ID 0x2e736674756f6373ULL /* "scoutfs." */
|
||||
|
||||
/*
|
||||
* Everything is stored in and addressed as 4k fixed size blocks. This
|
||||
* avoids having to manage contiguous cpu mappings of larger blocks.
|
||||
* Larger structures are read and written as multiple blocks.
|
||||
*/
|
||||
#define SCOUTFS_BLOCK_SHIFT 12
|
||||
#define SCOUTFS_BLOCK_SHIFT 14
|
||||
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
|
||||
#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
|
||||
|
||||
/*
|
||||
* The allocator works on larger chunks. Smaller metadata structures
|
||||
* like the super blocks and the ring are stored in chunks.
|
||||
*
|
||||
* A log segment is a collection of smaller blocks (bloom filter, item blocks)
|
||||
* stored in a chunk.
|
||||
*/
|
||||
#define SCOUTFS_CHUNK_SHIFT 22
|
||||
#define SCOUTFS_CHUNK_SIZE (1 << SCOUTFS_CHUNK_SHIFT)
|
||||
#define SCOUTFS_CHUNK_BLOCK_SHIFT (SCOUTFS_CHUNK_SHIFT - SCOUTFS_BLOCK_SHIFT)
|
||||
#define SCOUTFS_CHUNK_BLOCK_MASK ((1 << SCOUTFS_CHUNK_BLOCK_SHIFT) - 1)
|
||||
#define SCOUTFS_BLOCKS_PER_CHUNK (1 << SCOUTFS_CHUNK_BLOCK_SHIFT)
|
||||
#define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE)
|
||||
#define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT)
|
||||
|
||||
/*
|
||||
* The super blocks leave some room at the start of the first block for
|
||||
@@ -35,22 +20,6 @@
|
||||
#define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT)
|
||||
#define SCOUTFS_SUPER_NR 2
|
||||
|
||||
/*
|
||||
* The bloom filters are statically sized. It's a tradeoff between
|
||||
* storage overhead and false positive rate. At the moment we have
|
||||
* as few as 1000 and as many as 18000 items in a segment. We can
|
||||
* get a ~1% false positive rate (triggering header search) at
|
||||
* the high end with a ~20k bloom filter.
|
||||
*
|
||||
* n = 18,000, p = 0.01 (1 in 100) → m = 172,532 (21.06KB), k = 7
|
||||
*/
|
||||
#define SCOUTFS_BLOOM_BITS 7
|
||||
#define SCOUTFS_BLOOM_BIT_WIDTH 18 /* 2^18 > m */
|
||||
#define SCOUTFS_BLOOM_BIT_MASK ((1 << SCOUTFS_BLOOM_BIT_WIDTH) - 1)
|
||||
#define SCOUTFS_BLOOM_BLOCKS ((20 * 1024) / SCOUTFS_BLOCK_SIZE)
|
||||
#define SCOUTFS_BLOOM_SALTS \
|
||||
DIV_ROUND_UP(SCOUTFS_BLOOM_BITS * SCOUTFS_BLOOM_BIT_WIDTH, 32)
|
||||
|
||||
/*
|
||||
* This header is found at the start of every block so that we can
|
||||
* verify that it's what we were looking for. The crc and padding
|
||||
@@ -65,6 +34,72 @@ struct scoutfs_block_header {
|
||||
__le64 blkno;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* We should be able to make the offset smaller if neither dirents nor
|
||||
* data items use the full 64 bits.
|
||||
*/
|
||||
struct scoutfs_key {
|
||||
__le64 inode;
|
||||
u8 type;
|
||||
__le64 offset;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* Currently we sort keys by the numeric value of the types, but that
|
||||
* isn't necessary. We could have an arbitrary sort order. So we don't
|
||||
* have to stress about cleverly allocating the types.
|
||||
*/
|
||||
#define SCOUTFS_INODE_KEY 1
|
||||
#define SCOUTFS_DIRENT_KEY 2
|
||||
#define SCOUTFS_DATA_KEY 3
|
||||
|
||||
#define SCOUTFS_MAX_ITEM_LEN 2048
|
||||
|
||||
/*
|
||||
* Block references include the sequence number so that we can detect
|
||||
* readers racing with writers and so that we can tell that we don't
|
||||
* need to follow a reference when traversing based on seqs.
|
||||
*/
|
||||
struct scoutfs_block_ref {
|
||||
__le64 blkno;
|
||||
__le64 seq;
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_treap_root {
|
||||
__le16 off;
|
||||
} __packed;
|
||||
|
||||
/*
 * Per-item treap linkage stored inside a btree block.  mkfs writes a
 * lone item with parent, left and right all zero and fills prio with
 * pseudo random bytes.
 *
 * NOTE(review): the links are presumably block-relative offsets like
 * scoutfs_treap_root.off, with zero meaning "none" -- confirm against
 * the kernel implementation.
 */
struct scoutfs_treap_node {
|
||||
__le16 parent;
|
||||
__le16 left;
|
||||
__le16 right;
|
||||
__le32 prio;
|
||||
} __packed;
|
||||
|
||||
/*
 * The root of the btree as stored in the super block.  mkfs sets
 * height to 1 when the tree is a single leaf block and the printing
 * code skips the tree entirely when height is 0.  ref is the block
 * number and sequence number of the root block.
 */
struct scoutfs_btree_root {
|
||||
u8 height;
|
||||
struct scoutfs_block_ref ref;
|
||||
} __packed;
|
||||
|
||||
/*
 * The header at the start of every btree block; items follow it
 * directly.
 *
 * mkfs sets treap.off to the byte distance from the treap field to
 * the first item's tnode, sets total_free to the block bytes left
 * after the last item's value, and sets tail_free equal to total_free
 * because the only free space in a fresh block is at its end.
 * nr_items counts the items stored in the block.
 */
struct scoutfs_btree_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
struct scoutfs_treap_root treap;
|
||||
__le16 total_free;
|
||||
__le16 tail_free;
|
||||
__le16 nr_items;
|
||||
} __packed;
|
||||
|
||||
/*
 * An item in a btree block: its key, its treap linkage, and val_len
 * bytes of value that immediately follow the struct.  The printing
 * code finds the next item at &val[val_len], so items are packed back
 * to back.  When walking blocks above the leaves the printing code
 * reads the value as a struct scoutfs_block_ref to a child block.
 */
struct scoutfs_btree_item {
|
||||
struct scoutfs_key key;
|
||||
struct scoutfs_treap_node tnode;
|
||||
__le16 val_len;
|
||||
char val[0];
|
||||
} __packed;
|
||||
|
||||
/* Blocks are no more than half free. */
|
||||
#define SCOUTFS_BTREE_FREE_LIMIT \
|
||||
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 2)
|
||||
|
||||
#define SCOUTFS_UUID_BYTES 16
|
||||
|
||||
/*
|
||||
@@ -81,142 +116,11 @@ struct scoutfs_super_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
__le64 id;
|
||||
__u8 uuid[SCOUTFS_UUID_BYTES];
|
||||
__le32 bloom_salts[SCOUTFS_BLOOM_SALTS];
|
||||
__le64 total_chunks;
|
||||
__le64 ring_map_blkno;
|
||||
__le64 ring_map_seq;
|
||||
__le64 ring_first_block;
|
||||
__le64 ring_active_blocks;
|
||||
__le64 ring_total_blocks;
|
||||
__le64 ring_seq;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* We should be able to make the offset smaller if neither dirents nor
|
||||
* data items use the full 64 bits.
|
||||
*/
|
||||
struct scoutfs_key {
|
||||
__le64 inode;
|
||||
u8 type;
|
||||
__le64 offset;
|
||||
struct scoutfs_btree_root btree_root;
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_ROOT_INO 1
|
||||
|
||||
/*
|
||||
* Currently we sort keys by the numeric value of the types, but that
|
||||
* isn't necessary. We could have an arbitrary sort order. So we don't
|
||||
* have to stress about cleverly allocating the types.
|
||||
*/
|
||||
#define SCOUTFS_INODE_KEY 1
|
||||
#define SCOUTFS_DIRENT_KEY 2
|
||||
#define SCOUTFS_DATA_KEY 3
|
||||
|
||||
struct scoutfs_ring_map_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
__le32 nr_chunks;
|
||||
__le64 blknos[0];
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_RING_MAP_BLOCKS \
|
||||
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_ring_map_block)) / \
|
||||
sizeof(__le64))
|
||||
|
||||
struct scoutfs_ring_entry {
|
||||
u8 type;
|
||||
__le16 len;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* Ring blocks are stored in chunks described by the ring map blocks.
|
||||
*
|
||||
* The manifest entries describe the position of a given log segment in
|
||||
* the manifest. They're keyed by the block number so that we can
|
||||
* record movement of a log segment in the manifest with one ring entry
|
||||
* and we can record deletion with just the block number.
|
||||
*/
|
||||
struct scoutfs_ring_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
__le16 nr_entries;
|
||||
} __packed;
|
||||
|
||||
enum {
|
||||
SCOUTFS_RING_ADD_MANIFEST = 0,
|
||||
SCOUTFS_RING_DEL_MANIFEST,
|
||||
SCOUTFS_RING_BITMAP,
|
||||
};
|
||||
|
||||
/*
|
||||
* Including both keys might make the manifest too large. It might be
|
||||
* better to only include one key and infer a block's range from the
|
||||
* neighbour's key. The downside of that is that we assume that there
|
||||
* isn't unused key space between blocks in a level. We might search
|
||||
* blocks when we didn't need to.
|
||||
*/
|
||||
struct scoutfs_manifest_entry {
|
||||
__le64 blkno;
|
||||
__le64 seq;
|
||||
__u8 level;
|
||||
struct scoutfs_key first;
|
||||
struct scoutfs_key last;
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_MANIFESTS_PER_LEVEL 10
|
||||
|
||||
/* 2^22 * 10^13 > 2^64 */
|
||||
#define SCOUTFS_MAX_LEVEL 13
|
||||
|
||||
struct scoutfs_ring_bitmap {
|
||||
__le32 offset;
|
||||
__le64 bits[2];
|
||||
} __packed;
|
||||
|
||||
|
||||
struct scoutfs_bloom_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
__le64 bits[0];
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_BLOOM_BITS_PER_BLOCK \
|
||||
(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header)) / 8) * 64)
|
||||
|
||||
/*
|
||||
* Items in log segments are sorted in a skip list by their key. We
|
||||
* have a rough limit of 64k items.
|
||||
*/
|
||||
#define SCOUTFS_SKIP_HEIGHT 16
|
||||
struct scoutfs_skip_root {
|
||||
__le32 next[SCOUTFS_SKIP_HEIGHT];
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* An item block follows the bloom filter blocks at the start of a log
|
||||
* segment. Its skip root references the item structs which then
|
||||
* reference the item values in the rest of the block. The references
|
||||
* are byte offsets from the start of the chunk.
|
||||
*/
|
||||
struct scoutfs_item_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
struct scoutfs_key first;
|
||||
struct scoutfs_key last;
|
||||
struct scoutfs_skip_root skip_root;
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_item {
|
||||
struct scoutfs_key key;
|
||||
__le32 offset;
|
||||
__le16 len;
|
||||
u8 skip_height;
|
||||
__le32 skip_next[0];
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* Item size caps item file data item length so that they fit in checksummed
|
||||
* 4k blocks with a bit of expansion room.
|
||||
*/
|
||||
#define SCOUTFS_MAX_ITEM_LEN \
|
||||
(SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header) - 32)
|
||||
|
||||
struct scoutfs_timespec {
|
||||
__le64 sec;
|
||||
__le32 nsec;
|
||||
|
||||
122
utils/src/mkfs.c
122
utils/src/mkfs.c
@@ -17,7 +17,6 @@
|
||||
#include "crc.h"
|
||||
#include "rand.h"
|
||||
#include "dev.h"
|
||||
#include "bloom.h"
|
||||
#include "bitops.h"
|
||||
|
||||
/*
|
||||
@@ -44,21 +43,13 @@ static int write_new_fs(char *path, int fd)
|
||||
{
|
||||
struct scoutfs_super_block *super;
|
||||
struct scoutfs_inode *inode;
|
||||
struct scoutfs_ring_map_block *map;
|
||||
struct scoutfs_ring_block *ring;
|
||||
struct scoutfs_ring_entry *ent;
|
||||
struct scoutfs_manifest_entry *mani;
|
||||
struct scoutfs_ring_bitmap *bm;
|
||||
struct scoutfs_item_block *iblk;
|
||||
struct scoutfs_bloom_bits bits;
|
||||
struct scoutfs_bloom_block *blm;
|
||||
struct scoutfs_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_key root_key;
|
||||
struct timeval tv;
|
||||
char uuid_str[37];
|
||||
unsigned int i;
|
||||
u64 size;
|
||||
u64 total_chunks;
|
||||
u64 blkno;
|
||||
void *buf;
|
||||
int ret;
|
||||
@@ -81,14 +72,12 @@ static int write_new_fs(char *path, int fd)
|
||||
goto out;
|
||||
}
|
||||
|
||||
total_chunks = size >> SCOUTFS_CHUNK_SHIFT;
|
||||
|
||||
root_key.inode = cpu_to_le64(SCOUTFS_ROOT_INO);
|
||||
root_key.type = SCOUTFS_INODE_KEY;
|
||||
root_key.offset = 0;
|
||||
|
||||
/* first chunk has super blocks, log segment chunk is next */
|
||||
blkno = 1 << SCOUTFS_CHUNK_BLOCK_SHIFT;
|
||||
/* start with the block after the supers */
|
||||
blkno = SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR;
|
||||
|
||||
/* first initialize the super so we can use it to build structures */
|
||||
memset(super, 0, SCOUTFS_BLOCK_SIZE);
|
||||
@@ -96,45 +85,21 @@ static int write_new_fs(char *path, int fd)
|
||||
super->hdr.seq = cpu_to_le64(1);
|
||||
super->id = cpu_to_le64(SCOUTFS_SUPER_ID);
|
||||
uuid_generate(super->uuid);
|
||||
pseudo_random_bytes(super->bloom_salts, sizeof(super->bloom_salts));
|
||||
super->total_chunks = cpu_to_le64(total_chunks);
|
||||
super->ring_map_seq = super->hdr.seq;
|
||||
super->ring_first_block = cpu_to_le64(0);
|
||||
super->ring_active_blocks = cpu_to_le64(1);
|
||||
super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
super->ring_seq = super->hdr.seq;
|
||||
|
||||
/*
|
||||
* There's only the root item so we check for its bloom bits as
|
||||
* we write the bloom blocks.
|
||||
*/
|
||||
scoutfs_calc_bloom_bits(&bits, &root_key, super->bloom_salts);
|
||||
for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
blm = buf;
|
||||
blm->hdr = super->hdr;
|
||||
|
||||
scoutfs_set_bloom_bits(blm, i, &bits);
|
||||
|
||||
ret = write_block(fd, blkno, &blm->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
blkno++;
|
||||
}
|
||||
|
||||
/* write a single log segment with the root inode item */
|
||||
/* write a btree leaf root inode item */
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
iblk = buf;
|
||||
iblk->hdr = super->hdr;
|
||||
iblk->skip_root.next[0] = cpu_to_le32((SCOUTFS_BLOOM_BLOCKS <<
|
||||
SCOUTFS_BLOCK_SHIFT) +
|
||||
sizeof(struct scoutfs_item_block));
|
||||
item = (void *)(iblk + 1);
|
||||
bt = buf;
|
||||
bt->hdr = super->hdr;
|
||||
bt->nr_items = cpu_to_le16(1);
|
||||
|
||||
item = (void *)(bt + 1);
|
||||
item->key = root_key;
|
||||
item->offset = cpu_to_le32(le32_to_cpu(iblk->skip_root.next[0]) +
|
||||
sizeof(struct scoutfs_item));
|
||||
item->len = cpu_to_le16(sizeof(struct scoutfs_inode));
|
||||
item->skip_height = 1;
|
||||
item->tnode.parent = 0;
|
||||
item->tnode.left = 0;
|
||||
item->tnode.right = 0;
|
||||
pseudo_random_bytes(&item->tnode.prio, sizeof(item->tnode.prio));
|
||||
item->val_len = cpu_to_le16(sizeof(struct scoutfs_inode));
|
||||
|
||||
inode = (void *)(item + 1);
|
||||
inode->nlink = cpu_to_le32(2);
|
||||
inode->mode = cpu_to_le32(0755 | 0040000);
|
||||
@@ -145,52 +110,19 @@ static int write_new_fs(char *path, int fd)
|
||||
inode->mtime.sec = inode->atime.sec;
|
||||
inode->mtime.nsec = inode->atime.nsec;
|
||||
|
||||
ret = write_block(fd, blkno, &iblk->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
blkno = round_up(blkno, SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
bt->treap.off = cpu_to_le16((char *)&item->tnode - (char *)&bt->treap);
|
||||
bt->total_free = cpu_to_le16(SCOUTFS_BLOCK_SIZE -
|
||||
((char *)(inode + 1) - (char *)bt));
|
||||
bt->tail_free = bt->total_free;
|
||||
|
||||
/* write the ring block whose manifest entry references the log block */
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
ring = buf;
|
||||
ring->hdr = super->hdr;
|
||||
ring->nr_entries = cpu_to_le16(2);
|
||||
ent = (void *)(ring + 1);
|
||||
ent->type = SCOUTFS_RING_ADD_MANIFEST;
|
||||
ent->len = cpu_to_le16(sizeof(*mani));
|
||||
mani = (void *)(ent + 1);
|
||||
mani->blkno = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
mani->seq = super->hdr.seq;
|
||||
mani->level = 0;
|
||||
mani->first = root_key;
|
||||
mani->last = root_key;
|
||||
ent = (void *)(mani + 1);
|
||||
ent->type = SCOUTFS_RING_BITMAP;
|
||||
ent->len = cpu_to_le16(sizeof(*bm));
|
||||
bm = (void *)(ent + 1);
|
||||
memset(bm->bits, 0xff, sizeof(bm->bits));
|
||||
/* the first four chunks are allocated */
|
||||
bm->bits[0] = cpu_to_le64(~15ULL);
|
||||
bm->bits[1] = cpu_to_le64(~0ULL);
|
||||
|
||||
ret = write_block(fd, blkno, &ring->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
blkno += SCOUTFS_BLOCKS_PER_CHUNK;
|
||||
|
||||
/* the ring has a single chunk for now */
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
map = buf;
|
||||
map->hdr = super->hdr;
|
||||
map->nr_chunks = cpu_to_le32(1);
|
||||
map->blknos[0] = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
|
||||
ret = write_block(fd, blkno, &map->hdr);
|
||||
ret = write_block(fd, blkno, &bt->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/* make sure the super references everything we just wrote */
|
||||
super->ring_map_blkno = cpu_to_le64(blkno);
|
||||
super->btree_root.height = 1;
|
||||
super->btree_root.ref.blkno = bt->hdr.blkno;
|
||||
super->btree_root.ref.seq = bt->hdr.seq;
|
||||
|
||||
/* write the two super blocks */
|
||||
for (i = 0; i < SCOUTFS_SUPER_NR; i++) {
|
||||
@@ -210,12 +142,10 @@ static int write_new_fs(char *path, int fd)
|
||||
uuid_unparse(super->uuid, uuid_str);
|
||||
|
||||
printf("Created scoutfs filesystem:\n"
|
||||
" chunk bytes: %u\n"
|
||||
" total chunks: %llu\n"
|
||||
" block size: %u\n"
|
||||
" fsid: %llx\n"
|
||||
" uuid: %s\n",
|
||||
SCOUTFS_CHUNK_SIZE, total_chunks,
|
||||
le64_to_cpu(super->hdr.fsid), uuid_str);
|
||||
SCOUTFS_BLOCK_SIZE, le64_to_cpu(super->hdr.fsid), uuid_str);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
|
||||
@@ -42,91 +42,28 @@ static void *read_block(int fd, u64 blkno)
|
||||
return buf;
|
||||
}
|
||||
|
||||
/*
 * Read the full chunk that starts at the given block number into a
 * freshly allocated buffer.
 *
 * Returns the buffer, which the caller must free(), or NULL if the
 * allocation failed or the whole chunk couldn't be read.  Read
 * failures are reported on stderr.
 */
static void *read_chunk(int fd, u64 blkno)
{
	ssize_t ret;
	void *buf;

	buf = malloc(SCOUTFS_CHUNK_SIZE);
	if (!buf)
		return NULL;

	ret = pread(fd, buf, SCOUTFS_CHUNK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT);
	if (ret == SCOUTFS_CHUNK_SIZE)
		return buf;

	/*
	 * errno is only meaningful when pread() itself failed.  A short
	 * read returns a non-negative count and leaves errno untouched so
	 * printing strerror() for it would be misleading.
	 */
	if (ret < 0)
		fprintf(stderr, "read blkno %llu returned %zd: %s (%d)\n",
			blkno, ret, strerror(errno), errno);
	else
		fprintf(stderr, "read blkno %llu returned %zd: short read, expected %d bytes\n",
			blkno, ret, SCOUTFS_CHUNK_SIZE);

	free(buf);
	return NULL;
}
|
||||
|
||||
/*
 * Print nr little-endian 32bit values as a bracketed, comma separated
 * list of unsigned decimals.  The caller has already printed indent
 * columns on the current line.  The list is wrapped onto a new line,
 * padded out to the indent, when the next value would pass column 78.
 */
static void print_le32_list(int indent, __le32 *data, int nr)
{
	int col = indent;
	int i;

	printf("[");

	for (i = 0; i < nr; i++) {
		/* every value but the last is followed by a separator */
		char *fmt = (i + 1 < nr) ? "%u, " : "%u";
		u32 val = le32_to_cpu(data[i]);
		/* measure the formatted width without printing anything */
		int width = snprintf(NULL, 0, fmt, val);

		if (col + width > 78) {
			printf("\n%*c", indent, ' ');
			col = indent;
		}

		printf(fmt, val);
		col += width;
	}

	printf("]\n");
}
|
||||
|
||||
static void print_block_header(struct scoutfs_block_header *hdr)
|
||||
{
|
||||
u32 crc = crc_block(hdr);
|
||||
char valid_str[40];
|
||||
|
||||
if (crc != le32_to_cpu(hdr->crc))
|
||||
sprintf(valid_str, "# != %08x", crc);
|
||||
sprintf(valid_str, "(!= %08x) ", crc);
|
||||
else
|
||||
valid_str[0] = '\0';
|
||||
|
||||
printf(" header:\n"
|
||||
" crc: %08x %s\n"
|
||||
" fsid: %llx\n"
|
||||
" seq: %llu\n"
|
||||
" blkno: %llu\n",
|
||||
printf(" hdr: crc %08x %sfsid %llx seq %llu blkno %llu\n",
|
||||
le32_to_cpu(hdr->crc), valid_str, le64_to_cpu(hdr->fsid),
|
||||
le64_to_cpu(hdr->seq), le64_to_cpu(hdr->blkno));
|
||||
}
|
||||
|
||||
static void print_inode(struct scoutfs_inode *inode)
|
||||
{
|
||||
printf(" inode:\n"
|
||||
" size: %llu\n"
|
||||
" blocks: %llu\n"
|
||||
" nlink: %u\n"
|
||||
" uid: %u\n"
|
||||
" gid: %u\n"
|
||||
" mode: 0%o\n"
|
||||
" rdev: 0x%x\n"
|
||||
" salt: 0x%x\n"
|
||||
" max_dirent_hash_nr: %u\n"
|
||||
" atime: %llu.%08u\n"
|
||||
" ctime: %llu.%08u\n"
|
||||
" mtime: %llu.%08u\n",
|
||||
printf(" inode: size: %llu blocks: %llu nlink: %u\n"
|
||||
" uid: %u gid: %u mode: 0%o rdev: 0x%x\n"
|
||||
" salt: 0x%x max_dirent_hash_nr: %u\n"
|
||||
" atime: %llu.%08u ctime: %llu.%08u\n"
|
||||
" mtime: %llu.%08u\n",
|
||||
le64_to_cpu(inode->size), le64_to_cpu(inode->blocks),
|
||||
le32_to_cpu(inode->nlink), le32_to_cpu(inode->uid),
|
||||
le32_to_cpu(inode->gid), le32_to_cpu(inode->mode),
|
||||
@@ -150,271 +87,83 @@ static void print_dirent(struct scoutfs_dirent *dent, unsigned int val_len)
|
||||
name[i] = isprint(dent->name[i]) ? dent->name[i] : '.';
|
||||
name[i] = '\0';
|
||||
|
||||
printf(" dirent:\n"
|
||||
" ino: %llu\n"
|
||||
" type: %u\n"
|
||||
" name: \"%.*s\"\n",
|
||||
printf(" dirent: ino: %llu type: %u name: \"%.*s\"\n",
|
||||
le64_to_cpu(dent->ino), dent->type, i, name);
|
||||
}
|
||||
|
||||
static void print_item(struct scoutfs_item *item, void *val)
|
||||
static void print_btree_item(unsigned int off, struct scoutfs_btree_item *item)
|
||||
{
|
||||
printf(" item:\n"
|
||||
" key: "SKF"\n"
|
||||
" offset: %u\n"
|
||||
" len: %u\n"
|
||||
" skip_height: %u\n"
|
||||
" skip_next[]: ",
|
||||
SKA(&item->key),
|
||||
le32_to_cpu(item->offset),
|
||||
le16_to_cpu(item->len),
|
||||
item->skip_height);
|
||||
|
||||
print_le32_list(22, item->skip_next, item->skip_height);
|
||||
printf(" item: key "SKF" val_len %u off %u tnode: parent %u left %u right %u "
|
||||
"prio %x\n",
|
||||
SKA(&item->key), le16_to_cpu(item->val_len), off,
|
||||
le16_to_cpu(item->tnode.parent),
|
||||
le16_to_cpu(item->tnode.left),
|
||||
le16_to_cpu(item->tnode.right),
|
||||
le32_to_cpu(item->tnode.prio));
|
||||
|
||||
switch(item->key.type) {
|
||||
case SCOUTFS_INODE_KEY:
|
||||
print_inode(val);
|
||||
print_inode((void *)item->val);
|
||||
break;
|
||||
case SCOUTFS_DIRENT_KEY:
|
||||
print_dirent(val, le16_to_cpu(item->len));
|
||||
print_dirent((void *)item->val, le16_to_cpu(item->val_len));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Print the log segment stored in the chunk that starts at the given
 * block number: the header of each bloom block, then the item block
 * and every item reachable from the lowest level of its skip list.
 *
 * Returns 0 on success or -ENOMEM if the chunk couldn't be read.
 */
static int print_log_segment(int fd, u64 nr)
{
	struct scoutfs_item_block *iblk;
	struct scoutfs_bloom_block *blm;
	struct scoutfs_item *item;
	__le32 next;
	char *chunk;
	char *blk;
	int i;

	chunk = read_chunk(fd, nr);
	if (!chunk)
		return -ENOMEM;

	/* the bloom blocks sit at the front of the chunk */
	blk = chunk;
	for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
		blm = (void *)blk;

		printf("bloom block:\n");
		print_block_header(&blm->hdr);

		blk += SCOUTFS_BLOCK_SIZE;
	}

	/* and the item block follows right after the last bloom block */
	iblk = (void *)blk;

	printf("item block:\n");
	print_block_header(&iblk->hdr);
	printf(" first: "SKF"\n"
	       " last: "SKF"\n"
	       " skip_root.next[]: ",
	       SKA(&iblk->first), SKA(&iblk->last));
	print_le32_list(23, iblk->skip_root.next, SCOUTFS_SKIP_HEIGHT);

	/* item and value references are byte offsets from the chunk start */
	for (next = iblk->skip_root.next[0]; next; next = item->skip_next[0]) {
		item = (void *)(chunk + le32_to_cpu(next));
		print_item(item, chunk + le32_to_cpu(item->offset));
	}

	free(chunk);

	return 0;
}
|
||||
|
||||
static int print_log_segments(int fd, __le64 *log_segs, u64 total_chunks)
|
||||
static int print_btree_block(int fd, __le64 blkno, u8 level)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_block_ref *ref;
|
||||
unsigned int off;
|
||||
int ret = 0;
|
||||
int err;
|
||||
s64 nr;
|
||||
|
||||
while ((nr = find_first_le_bit(log_segs, total_chunks)) >= 0) {
|
||||
clear_le_bit(log_segs, nr);
|
||||
|
||||
err = print_log_segment(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT);
|
||||
if (!ret && err)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Return a constant, human readable name for a ring entry type. */
static char *ent_type_str(u8 type)
{
	if (type == SCOUTFS_RING_ADD_MANIFEST)
		return "ADD_MANIFEST";

	if (type == SCOUTFS_RING_DEL_MANIFEST)
		return "DEL_MANIFEST";

	if (type == SCOUTFS_RING_BITMAP)
		return "BITMAP";

	return "(unknown)";
}
|
||||
|
||||
static void print_ring_entry(int fd, struct scoutfs_ring_entry *ent)
|
||||
{
|
||||
struct scoutfs_manifest_entry *ment;
|
||||
struct scoutfs_ring_bitmap *bm;
|
||||
|
||||
printf(" entry:\n"
|
||||
" type: %u # %s\n"
|
||||
" len: %u\n",
|
||||
ent->type, ent_type_str(ent->type), le16_to_cpu(ent->len));
|
||||
|
||||
switch(ent->type) {
|
||||
case SCOUTFS_RING_ADD_MANIFEST:
|
||||
ment = (void *)(ent + 1);
|
||||
printf(" blkno: %llu\n"
|
||||
" seq: %llu\n"
|
||||
" level: %u\n"
|
||||
" first: "SKF"\n"
|
||||
" last: "SKF"\n",
|
||||
le64_to_cpu(ment->blkno), le64_to_cpu(ment->seq),
|
||||
ment->level, SKA(&ment->first), SKA(&ment->last));
|
||||
break;
|
||||
case SCOUTFS_RING_DEL_MANIFEST:
|
||||
ment = (void *)(ent + 1);
|
||||
printf(" blkno: %llu\n"
|
||||
" seq: %llu\n"
|
||||
" level: %u\n"
|
||||
" first: "SKF"\n"
|
||||
" last: "SKF"\n",
|
||||
le64_to_cpu(ment->blkno), le64_to_cpu(ment->seq),
|
||||
ment->level, SKA(&ment->first), SKA(&ment->last));
|
||||
break;
|
||||
case SCOUTFS_RING_BITMAP:
|
||||
bm = (void *)(ent + 1);
|
||||
printf(" offset: %u\n"
|
||||
" bits: 0x%llx%llx\n",
|
||||
le32_to_cpu(bm->offset),
|
||||
le64_to_cpu(bm->bits[1]), le64_to_cpu(bm->bits[0]));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void update_log_segs(struct scoutfs_ring_entry *ent,
|
||||
__le64 *log_segs)
|
||||
{
|
||||
struct scoutfs_manifest_entry *ment;
|
||||
u64 bit;
|
||||
|
||||
switch(ent->type) {
|
||||
case SCOUTFS_RING_ADD_MANIFEST:
|
||||
ment = (void *)(ent + 1);
|
||||
bit = le64_to_cpu(ment->blkno) >> SCOUTFS_CHUNK_BLOCK_SHIFT;
|
||||
set_le_bit(log_segs, bit);
|
||||
break;
|
||||
case SCOUTFS_RING_DEL_MANIFEST:
|
||||
ment = (void *)(ent + 1);
|
||||
bit = le64_to_cpu(ment->blkno) >> SCOUTFS_CHUNK_BLOCK_SHIFT;
|
||||
clear_le_bit(log_segs, bit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int print_ring_block(int fd, u64 blkno, __le64 *log_segs)
|
||||
{
|
||||
struct scoutfs_ring_block *ring;
|
||||
struct scoutfs_ring_entry *ent;
|
||||
size_t off;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
/* XXX just printing the first block for now */
|
||||
|
||||
ring = read_block(fd, blkno);
|
||||
if (!ring)
|
||||
bt = read_block(fd, le64_to_cpu(blkno));
|
||||
if (!bt)
|
||||
return -ENOMEM;
|
||||
|
||||
printf("ring block:\n");
|
||||
print_block_header(&ring->hdr);
|
||||
printf(" nr_entries: %u\n", le16_to_cpu(ring->nr_entries));
|
||||
printf("btree blkno %llu\n", le64_to_cpu(blkno));
|
||||
print_block_header(&bt->hdr);
|
||||
printf(" treap.off %u total_free %u tail_free %u nr_items %u\n",
|
||||
le16_to_cpu(bt->treap.off),
|
||||
le16_to_cpu(bt->total_free),
|
||||
le16_to_cpu(bt->tail_free),
|
||||
le16_to_cpu(bt->nr_items));
|
||||
|
||||
off = sizeof(struct scoutfs_ring_block);
|
||||
for (i = 0; i < le16_to_cpu(ring->nr_entries); i++) {
|
||||
ent = (void *)((char *)ring + off);
|
||||
|
||||
update_log_segs(ent, log_segs);
|
||||
print_ring_entry(fd, ent);
|
||||
|
||||
off += sizeof(struct scoutfs_ring_entry) +
|
||||
le16_to_cpu(ent->len);
|
||||
}
|
||||
|
||||
free(ring);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Print all the active ring blocks that are referenced by the super
|
||||
* and which were mapped by the map blocks that we printed.
|
||||
*/
|
||||
static int print_ring_blocks(int fd, struct scoutfs_super_block *super,
|
||||
u64 *ring_blknos, __le64 *log_segs)
|
||||
{
|
||||
u64 block;
|
||||
u64 blkno;
|
||||
u64 i;
|
||||
int ret = 0;
|
||||
int err;
|
||||
|
||||
block = le64_to_cpu(super->ring_first_block);
|
||||
|
||||
for (i = 0; i < le64_to_cpu(super->ring_active_blocks); i++) {
|
||||
blkno = ring_blknos[block >> SCOUTFS_CHUNK_BLOCK_SHIFT] +
|
||||
(block & SCOUTFS_CHUNK_BLOCK_MASK);
|
||||
|
||||
err = print_ring_block(fd, blkno, log_segs);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
if (++block == le64_to_cpu(super->ring_total_blocks))
|
||||
block = 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* print a chunk's worth of map blocks and stop if we hit a partial
|
||||
* block.
|
||||
*/
|
||||
static int print_map_blocks(int fd, u64 blkno, u64 *ring_blknos)
|
||||
{
|
||||
struct scoutfs_ring_map_block *map;
|
||||
int r = 0;
|
||||
int b;
|
||||
int i;
|
||||
|
||||
for (b = 0; SCOUTFS_BLOCKS_PER_CHUNK; b++) {
|
||||
map = read_block(fd, blkno + b);
|
||||
if (!map)
|
||||
return -ENOMEM;
|
||||
|
||||
printf("map block:\n");
|
||||
print_block_header(&map->hdr);
|
||||
printf(" nr_chunks: %u\n", le32_to_cpu(map->nr_chunks));
|
||||
|
||||
printf(" blknos: ");
|
||||
for (i = 0; i < le32_to_cpu(map->nr_chunks); i++, r++) {
|
||||
printf(" %llu\n", le64_to_cpu(map->blknos[i]));
|
||||
ring_blknos[r] = le64_to_cpu(map->blknos[i]);
|
||||
/* XXX just print in offset order */
|
||||
item = (void *)(bt + 1);
|
||||
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
|
||||
if (item->tnode.parent == cpu_to_le16(1)) {
|
||||
i--;
|
||||
} else {
|
||||
off = (char *)&item->tnode - (char *)&bt->treap;
|
||||
print_btree_item(off, item);
|
||||
}
|
||||
|
||||
free(map);
|
||||
|
||||
if (i != SCOUTFS_RING_MAP_BLOCKS)
|
||||
break;
|
||||
item = (void *)&item->val[le16_to_cpu(item->val_len)];
|
||||
}
|
||||
|
||||
return 0;
|
||||
item = (void *)(bt + 1);
|
||||
for (i = 0; level && i < le16_to_cpu(bt->nr_items); i++) {
|
||||
if (item->tnode.parent == cpu_to_le16(1)) {
|
||||
i--;
|
||||
} else {
|
||||
ref = (void *)item->val;
|
||||
|
||||
err = print_btree_block(fd, ref->blkno, level - 1);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
item = (void *)&item->val[le16_to_cpu(item->val_len)];
|
||||
}
|
||||
|
||||
free(bt);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int print_super_blocks(int fd)
|
||||
@@ -422,9 +171,6 @@ static int print_super_blocks(int fd)
|
||||
struct scoutfs_super_block *super;
|
||||
struct scoutfs_super_block recent = { .hdr.seq = 0 };
|
||||
char uuid_str[37];
|
||||
__le64 *log_segs;
|
||||
u64 *ring_blknos;
|
||||
u64 total_chunks;
|
||||
int ret = 0;
|
||||
int err;
|
||||
int i;
|
||||
@@ -436,28 +182,14 @@ static int print_super_blocks(int fd)
|
||||
|
||||
uuid_unparse(super->uuid, uuid_str);
|
||||
|
||||
printf("super:\n");
|
||||
printf("super blkno %llu\n", (u64)SCOUTFS_SUPER_BLKNO + i);
|
||||
print_block_header(&super->hdr);
|
||||
printf(" id: %llx\n"
|
||||
" uuid: %s\n"
|
||||
" bloom_salts: ",
|
||||
le64_to_cpu(super->id),
|
||||
uuid_str);
|
||||
print_le32_list(18, super->bloom_salts, SCOUTFS_BLOOM_SALTS);
|
||||
printf(" total_chunks: %llu\n"
|
||||
" ring_map_blkno: %llu\n"
|
||||
" ring_map_seq: %llu\n"
|
||||
" ring_first_block: %llu\n"
|
||||
" ring_active_blocks: %llu\n"
|
||||
" ring_total_blocks: %llu\n"
|
||||
" ring_seq: %llu\n",
|
||||
le64_to_cpu(super->total_chunks),
|
||||
le64_to_cpu(super->ring_map_blkno),
|
||||
le64_to_cpu(super->ring_map_seq),
|
||||
le64_to_cpu(super->ring_first_block),
|
||||
le64_to_cpu(super->ring_active_blocks),
|
||||
le64_to_cpu(super->ring_total_blocks),
|
||||
le64_to_cpu(super->ring_seq));
|
||||
printf(" id %llx uuid %s\n",
|
||||
le64_to_cpu(super->id), uuid_str);
|
||||
printf(" btree_root: height %u seq %llu blkno %llu\n",
|
||||
super->btree_root.height,
|
||||
le64_to_cpu(super->btree_root.ref.seq),
|
||||
le64_to_cpu(super->btree_root.ref.blkno));
|
||||
|
||||
if (le64_to_cpu(super->hdr.seq) > le64_to_cpu(recent.hdr.seq))
|
||||
memcpy(&recent, super, sizeof(recent));
|
||||
@@ -466,37 +198,12 @@ static int print_super_blocks(int fd)
|
||||
}
|
||||
|
||||
super = &recent;
|
||||
total_chunks = le64_to_cpu(super->total_chunks);
|
||||
|
||||
/*
|
||||
* Allocate a bitmap big enough to describe all the chunks and
|
||||
* we can have at most a full chunk worth of map blocks.
|
||||
*/
|
||||
log_segs = calloc(1, (total_chunks + 63) / 8);
|
||||
ring_blknos = calloc(1, SCOUTFS_CHUNK_SIZE);
|
||||
if (!log_segs || !ring_blknos) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = print_map_blocks(fd, le64_to_cpu(super->ring_map_blkno),
|
||||
ring_blknos);
|
||||
if (super->btree_root.height)
|
||||
err = print_btree_block(fd, super->btree_root.ref.blkno,
|
||||
super->btree_root.height - 1);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_ring_blocks(fd, super, ring_blknos, log_segs);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_log_segments(fd, log_segs, total_chunks);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
out:
|
||||
if (log_segs)
|
||||
free(log_segs);
|
||||
if (ring_blknos)
|
||||
free(ring_blknos);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user