Update mkfs and print for treap ring

Update mkfs and print now that the manifest and allocator are stored in
treaps in the ring.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2016-12-30 17:45:02 -08:00
parent 7c4bc528c6
commit 484b34057a
6 changed files with 243 additions and 171 deletions

View File

@@ -37,3 +37,12 @@ u32 crc_block(struct scoutfs_block_header *hdr)
return crc32c(~0, (char *)hdr + sizeof(hdr->crc),
SCOUTFS_BLOCK_SIZE - sizeof(hdr->crc));
}
/*
 * Compute the checksum of a treap node: everything after the leading
 * crc field through the end of the variable-length data payload.
 *
 * The covered length is found with offsetof() into the flexible data
 * array at index node->bytes, so the crc protects the fixed header
 * fields (off, gen, prio, left/right refs, bytes) and the payload,
 * but not any unused space after the node in its block.
 */
__le32 crc_node(struct scoutfs_treap_node *node)
{
	/* skip the crc field itself so the stored value doesn't feed the hash */
	unsigned int skip = sizeof(node->crc);
	/* total bytes from the start of the node through data[bytes - 1] */
	unsigned int bytes = offsetof(struct scoutfs_treap_node,
	data[le16_to_cpu(node->bytes)]);
	return cpu_to_le32(crc32c(~0, (char *)node + skip, bytes - skip));
}

View File

@@ -8,5 +8,6 @@
u32 crc32c(u32 crc, const void *data, unsigned int len);
u64 crc32c_64(u32 crc, const void *data, unsigned int len);
u32 crc_block(struct scoutfs_block_header *hdr);
__le32 crc_node(struct scoutfs_treap_node *node);
#endif

View File

@@ -51,22 +51,63 @@ struct scoutfs_block_header {
__le64 blkno;
} __packed;
struct scoutfs_ring_entry_header {
__u8 type;
__le16 len;
struct scoutfs_treap_ref {
__le64 off;
__le64 gen;
__u8 aug_bits;
} __packed;
#define SCOUTFS_RING_ADD_MANIFEST 1
#define SCOUTFS_RING_ADD_ALLOC 2
/*
* The lesser and greater bits are persistent on disk so that we can migrate
* nodes from the older half of the ring.
*
* The dirty bit is only used for in-memory nodes.
*/
#define SCOUTFS_TREAP_AUG_LESSER (1 << 0)
#define SCOUTFS_TREAP_AUG_GREATER (1 << 1)
#define SCOUTFS_TREAP_AUG_HALVES (SCOUTFS_TREAP_AUG_LESSER | \
SCOUTFS_TREAP_AUG_GREATER)
#define SCOUTFS_TREAP_AUG_DIRTY (1 << 2)
struct scoutfs_ring_add_manifest {
struct scoutfs_ring_entry_header eh;
/*
* Treap nodes are stored at byte offset in the ring of blocks described
* by the super block. Each reference contains the off and gen that it
* will find in the node for verification. Each node has the header
* and data payload covered by a crc.
*/
struct scoutfs_treap_node {
__le32 crc;
__le64 off;
__le64 gen;
__le64 prio;
struct scoutfs_treap_ref left;
struct scoutfs_treap_ref right;
__le16 bytes;
u8 data[0];
} __packed;
struct scoutfs_treap_root {
struct scoutfs_treap_ref ref;
} __packed;
/*
* This is absurdly huge. If there was only ever 1 item per segment and
* 2^64 items the tree could get this deep.
*/
#define SCOUTFS_MANIFEST_MAX_LEVEL 20
struct scoutfs_manifest {
struct scoutfs_treap_root root;
__le64 level_counts[SCOUTFS_MANIFEST_MAX_LEVEL];
} __packed;
struct scoutfs_manifest_entry {
__le64 segno;
__le64 seq;
__le16 first_key_len;
__le16 last_key_len;
__u8 level;
/* first and last key bytes */
__u8 keys[0];
} __packed;
#define SCOUTFS_ALLOC_REGION_SHIFT 8
@@ -77,27 +118,11 @@ struct scoutfs_ring_add_manifest {
* The bits need to be aligned so that the host can use native long
* bitops on the bits in memory.
*/
struct scoutfs_ring_alloc_region {
struct scoutfs_ring_entry_header eh;
struct scoutfs_alloc_region {
__le64 index;
__u8 pad[5];
__le64 bits[SCOUTFS_ALLOC_REGION_BITS / 64];
} __packed;
/*
* This is absurdly huge. If there was only ever 1 item per segment and
* 2^64 items the tree could get this deep.
*/
#define SCOUTFS_MANIFEST_MAX_LEVEL 20
/*
* The packed entries in the block are terminated by a header with a 0 length.
*/
struct scoutfs_ring_block {
struct scoutfs_block_header hdr;
struct scoutfs_ring_entry_header entries[0];
} __packed;
/*
* We really want these to be a power of two size so that they're naturally
* aligned. This ensures that they won't cross page boundaries and we
@@ -315,12 +340,13 @@ struct scoutfs_super_block {
__le64 free_blocks;
__le64 ring_blkno;
__le64 ring_blocks;
__le64 ring_index;
__le64 ring_nr;
__le64 ring_seq;
__le64 ring_tail_block;
__le64 ring_gen;
__le64 buddy_blocks;
struct scoutfs_buddy_root buddy_root;
struct scoutfs_btree_root btree_root;
struct scoutfs_treap_root alloc_treap_root;
struct scoutfs_manifest manifest;
} __packed;
#define SCOUTFS_ROOT_INO 1

View File

@@ -21,20 +21,11 @@
#include "buddy.h"
#include "item.h"
/*
* Update the block's header and write it out.
*/
static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super,
struct scoutfs_block_header *hdr)
static int write_raw_block(int fd, u64 blkno, void *blk)
{
ssize_t ret;
if (super)
*hdr = super->hdr;
hdr->blkno = cpu_to_le64(blkno);
hdr->crc = cpu_to_le32(crc_block(hdr));
ret = pwrite(fd, hdr, SCOUTFS_BLOCK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT);
ret = pwrite(fd, blk, SCOUTFS_BLOCK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT);
if (ret != SCOUTFS_BLOCK_SIZE) {
fprintf(stderr, "write to blkno %llu returned %zd: %s (%d)\n",
blkno, ret, strerror(errno), errno);
@@ -45,41 +36,55 @@ static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super,
}
/*
* Figure out how many blocks the ring will need. This goes crazy
* with the variables to make the calculation clear.
*
* XXX just a place holder. The real calculation is more like:
*
* - max size add manifest entries for all segments
* - (some day) allocator entries for all segments
* - ring block header overhead
* - ring block unused tail space overhead
*
* Update the block's header and write it out.
*/
static u64 calc_ring_blocks(u64 size)
static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super,
struct scoutfs_block_header *hdr)
{
u64 first_seg_blocks;
u64 max_entry_bytes;
u64 total_bytes;
u64 blocks;
u64 segs;
if (super)
*hdr = super->hdr;
hdr->blkno = cpu_to_le64(blkno);
hdr->crc = cpu_to_le32(crc_block(hdr));
segs = size >> SCOUTFS_SEGMENT_SHIFT;
max_entry_bytes = sizeof(struct scoutfs_ring_add_manifest) +
(2 * SCOUTFS_MAX_KEY_SIZE);
total_bytes = (segs * max_entry_bytes) * 4;
blocks = DIV_ROUND_UP(total_bytes, SCOUTFS_BLOCK_SIZE);
return write_raw_block(fd, blkno, hdr);
}
first_seg_blocks = SCOUTFS_SEGMENT_BLOCKS -
(SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR);
/*
* Figure out how many blocks the ring will need. The ring has to hold:
*
* - manifest entries for every segment with largest keys
* - allocator regions for bits to reference every segment
* - empty space at the end of blocks so nodes don't cross blocks
* - double that to account for repeatedly duplicating entries
* - double that so we can migrate everything before wrapping
*/
static u64 calc_ring_blocks(u64 segs)
{
u64 alloc_blocks;
u64 ment_blocks;
u64 block_bytes;
u64 node_bytes;
u64 regions;
return max(first_seg_blocks, blocks);
node_bytes = sizeof(struct scoutfs_treap_node) +
sizeof(struct scoutfs_manifest_entry) +
(2 * SCOUTFS_MAX_KEY_SIZE);
block_bytes = SCOUTFS_BLOCK_SIZE - (node_bytes - 1);
ment_blocks = DIV_ROUND_UP(segs * node_bytes, block_bytes);
node_bytes = sizeof(struct scoutfs_treap_node) +
sizeof(struct scoutfs_alloc_region);
regions = DIV_ROUND_UP(segs, SCOUTFS_ALLOC_REGION_BITS);
block_bytes = SCOUTFS_BLOCK_SIZE - (node_bytes - 1);
alloc_blocks = DIV_ROUND_UP(regions * node_bytes, block_bytes);
return ALIGN((ment_blocks + alloc_blocks) * 4, SCOUTFS_SEGMENT_BLOCKS);
}
/*
* Make a new file system by writing:
* - super blocks
* - ring block with manifest entry
* - ring block with manifest node
* - segment with root inode
*/
static int write_new_fs(char *path, int fd)
@@ -88,13 +93,12 @@ static int write_new_fs(char *path, int fd)
struct scoutfs_inode_key *ikey;
struct scoutfs_inode *inode;
struct scoutfs_segment_block *sblk;
struct scoutfs_ring_block *ring;
struct scoutfs_ring_add_manifest *am;
struct scoutfs_ring_alloc_region *reg;
struct scoutfs_manifest_entry *ment;
struct scoutfs_treap_node *node;
struct native_item item;
struct timeval tv;
char uuid_str[37];
unsigned int i;
void *ring;
u64 limit;
u64 size;
u64 total_blocks;
@@ -103,6 +107,7 @@ static int write_new_fs(char *path, int fd)
u64 first_segno;
__u8 *type;
int ret;
u64 i;
gettimeofday(&tv, NULL);
@@ -133,7 +138,7 @@ static int write_new_fs(char *path, int fd)
total_blocks = size / SCOUTFS_BLOCK_SIZE;
total_segs = size / SCOUTFS_SEGMENT_SIZE;
ring_blocks = calc_ring_blocks(size);
ring_blocks = calc_ring_blocks(total_segs);
/* first initialize the super so we can use it to build structures */
memset(super, 0, SCOUTFS_BLOCK_SIZE);
@@ -144,16 +149,18 @@ static int write_new_fs(char *path, int fd)
super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
super->total_blocks = cpu_to_le64(total_blocks);
super->total_segs = cpu_to_le64(total_segs);
super->alloc_uninit = cpu_to_le64(SCOUTFS_ALLOC_REGION_BITS);
super->ring_blkno = cpu_to_le64(SCOUTFS_SUPER_BLKNO + 2);
super->ring_blocks = cpu_to_le64(ring_blocks);
super->ring_nr = cpu_to_le64(1);
super->ring_seq = cpu_to_le64(1);
super->ring_tail_block = cpu_to_le64(1);
super->ring_gen = cpu_to_le64(1);
first_segno = DIV_ROUND_UP(le64_to_cpu(super->ring_blkno) +
le64_to_cpu(super->ring_blocks),
SCOUTFS_SEGMENT_BLOCKS);
/* alloc from uninit, don't need regions yet */
super->alloc_uninit = cpu_to_le64(first_segno + 1);
/* write seg with root inode */
sblk->segno = cpu_to_le64(first_segno);
sblk->max_seq = cpu_to_le64(1);
@@ -189,30 +196,29 @@ static int write_new_fs(char *path, int fd)
}
/* a single manifest entry points to the single segment */
am = (void *)ring->entries;
am->eh.type = SCOUTFS_RING_ADD_MANIFEST;
am->eh.len = cpu_to_le16(sizeof(struct scoutfs_ring_add_manifest) + 1);
am->segno = sblk->segno;
am->seq = cpu_to_le64(1);
am->first_key_len = 0;
am->last_key_len = cpu_to_le16(1);
am->level = 1;
type = (void *)(am + 1);
node = ring;
node->off = cpu_to_le64((char *)node - (char *)ring);
node->gen = cpu_to_le64(1);
node->bytes = cpu_to_le16(sizeof(struct scoutfs_manifest_entry) + 1);
pseudo_random_bytes(&node->prio, sizeof(node->prio));
ment = (void *)node->data;
ment->segno = sblk->segno;
ment->seq = cpu_to_le64(1);
ment->first_key_len = 0;
ment->last_key_len = cpu_to_le16(1);
ment->level = 1;
type = (void *)ment->keys;
*type = SCOUTFS_MAX_UNUSED_KEY;
/* a single alloc region records the first two segs as allocated */
reg = (void *)am + le16_to_cpu(am->eh.len);
reg->eh.type = SCOUTFS_RING_ADD_ALLOC;
reg->eh.len = cpu_to_le16(sizeof(struct scoutfs_ring_alloc_region));
/* initial super, ring, and first seg are all allocated */
memset(reg->bits, 0xff, sizeof(reg->bits));
for (i = 0; i <= first_segno; i++)
clear_bit_le(i, reg->bits);
node->crc = crc_node(node);
/* block is already zeroed and so contains a 0 len terminating header */
super->manifest.root.ref.off = node->off;
super->manifest.root.ref.gen = node->gen;
super->manifest.root.ref.aug_bits = SCOUTFS_TREAP_AUG_LESSER;
super->manifest.level_counts[1] = cpu_to_le64(1);
ret = write_block(fd, le64_to_cpu(super->ring_blkno), super,
&ring->hdr);
ret = write_raw_block(fd, le64_to_cpu(super->ring_blkno), ring);
if (ret)
goto out;

View File

@@ -262,87 +262,90 @@ static int print_segments(int fd, unsigned long *seg_map, u64 total_segs)
return ret;
}
static int print_ring_block(int fd, unsigned long *seg_map, u64 blkno)
enum {
TREAP_MANIFEST,
TREAP_ALLOC,
};
static void print_treap_ref(struct scoutfs_treap_ref *ref)
{
struct scoutfs_ring_alloc_region *reg;
struct scoutfs_ring_entry_header *eh;
struct scoutfs_ring_add_manifest *am;
struct scoutfs_ring_block *ring;
u32 off;
int i;
ring = read_block(fd, blkno);
if (!ring)
return -ENOMEM;
printf("ring blkno %llu\n", blkno);
print_block_header(&ring->hdr);
eh = ring->entries;
while (eh->len) {
off = (char *)eh - (char *)ring;
printf(" [%u]: type %u len %u\n",
off, eh->type, le16_to_cpu(eh->len));
switch(eh->type) {
case SCOUTFS_RING_ADD_MANIFEST:
am = (void *)eh;
printf(" add ment: segno %llu seq %llu "
"first_len %u last_len %u level %u\n",
le64_to_cpu(am->segno),
le64_to_cpu(am->seq),
le16_to_cpu(am->first_key_len),
le16_to_cpu(am->last_key_len),
am->level);
/* XXX verify, 'int nr' limits segno precision */
set_bit_le(le64_to_cpu(am->segno), seg_map);
break;
case SCOUTFS_RING_ADD_ALLOC:
reg = (void *)eh;
printf(" add alloc: index %llu bits",
le64_to_cpu(reg->index));
for (i = 0; i < array_size(reg->bits); i++)
printf(" %016llx", le64_to_cpu(reg->bits[i]));
printf("\n");
break;
}
eh = (void *)eh + le16_to_cpu(eh->len);
}
free(ring);
return 0;
printf(" off %llu gen %llu aug_bits %x",
le64_to_cpu(ref->off), le64_to_cpu(ref->gen),
ref->aug_bits);
}
static int print_ring_blocks(int fd, struct scoutfs_super_block *super,
unsigned long *seg_map)
static int print_treap_node(int fd, struct scoutfs_super_block *super,
unsigned treap, struct scoutfs_treap_ref *ref,
unsigned long *seg_map)
{
int ret = 0;
struct scoutfs_manifest_entry *ment;
struct scoutfs_alloc_region *reg;
struct scoutfs_treap_node *tnode;
char valid_str[40];
__le32 crc;
u64 blkno;
u64 index;
u64 nr;
int err;
void *blk;
u64 off;
int i;
index = le64_to_cpu(super->ring_index);
nr = le64_to_cpu(super->ring_nr);
if (!ref->gen)
return 0;
while (nr) {
blkno = le64_to_cpu(super->ring_blkno) + index;
off = le64_to_cpu(ref->off);
blkno = le64_to_cpu(super->ring_blkno) + (off >> SCOUTFS_BLOCK_SHIFT);
err = print_ring_block(fd, seg_map, blkno);
if (err && !ret)
ret = err;
blk = read_block(fd, blkno);
if (!blk)
return -ENOMEM;
if (++index == le64_to_cpu(super->ring_blocks))
index = 0;
nr--;
};
tnode = blk + (off & SCOUTFS_BLOCK_MASK);
return ret;
crc = crc_node(tnode);
if (crc != tnode->crc)
sprintf(valid_str, "(!= %08x) ", le32_to_cpu(crc));
else
valid_str[0] = '\0';
printf(" node: crc %08x %soff %llu gen %llu bytes %u prio %016llx\n"
" l:",
le32_to_cpu(tnode->crc), valid_str, le64_to_cpu(tnode->off),
le64_to_cpu(tnode->gen), le16_to_cpu(tnode->bytes),
le64_to_cpu(tnode->prio));
print_treap_ref(&tnode->left);
printf(" r:");
print_treap_ref(&tnode->right);
printf("\n");
switch(treap) {
case TREAP_MANIFEST:
ment = (void *)tnode->data;
printf(" ment: segno %llu seq %llu "
"first_len %u last_len %u level %u\n",
le64_to_cpu(ment->segno),
le64_to_cpu(ment->seq),
le16_to_cpu(ment->first_key_len),
le16_to_cpu(ment->last_key_len),
ment->level);
/* XXX verify, 'int nr' limits segno precision */
set_bit_le(le64_to_cpu(ment->segno), seg_map);
break;
case TREAP_ALLOC:
reg = (void *)tnode->data;
printf(" reg: index %llu bits",
le64_to_cpu(reg->index));
for (i = 0; i < array_size(reg->bits); i++)
printf(" %016llx", le64_to_cpu(reg->bits[i]));
printf("\n");
break;
}
print_treap_node(fd, super, treap, &tnode->left, seg_map);
print_treap_node(fd, super, treap, &tnode->right, seg_map);
free(blk);
return 0;
}
static int print_super_blocks(int fd)
@@ -351,10 +354,13 @@ static int print_super_blocks(int fd)
struct scoutfs_super_block recent = { .hdr.seq = 0 };
unsigned long *seg_map;
char uuid_str[37];
__le64 *counts;
u64 total_segs;
u64 longs;
int ret = 0;
int err;
int i;
int j;
for (i = 0; i < SCOUTFS_SUPER_NR; i++) {
super = read_block(fd, SCOUTFS_SUPER_BLKNO + i);
@@ -369,19 +375,31 @@ static int print_super_blocks(int fd)
le64_to_cpu(super->id), uuid_str);
/* XXX these are all in a crazy order */
printf(" next_ino %llu total_blocks %llu free_blocks %llu\n"
" ring_blkno %llu ring_blocks %llu ring_index %llu\n"
" ring_nr %llu ring_seq %llu alloc_uninit %llu\n"
" total_segs %llu\n",
" ring_blkno %llu ring_blocks %llu ring_tail_block %llu\n"
" ring_gen %llu alloc_uninit %llu total_segs %llu\n",
le64_to_cpu(super->next_ino),
le64_to_cpu(super->total_blocks),
le64_to_cpu(super->free_blocks),
le64_to_cpu(super->ring_blkno),
le64_to_cpu(super->ring_blocks),
le64_to_cpu(super->ring_index),
le64_to_cpu(super->ring_nr),
le64_to_cpu(super->ring_seq),
le64_to_cpu(super->ring_tail_block),
le64_to_cpu(super->ring_gen),
le64_to_cpu(super->alloc_uninit),
le64_to_cpu(super->total_segs));
printf(" alloc root:");
print_treap_ref(&super->alloc_treap_root.ref);
printf("\n");
printf(" manifest root:");
print_treap_ref(&super->manifest.root.ref);
printf("\n");
printf(" level_counts:");
counts = super->manifest.level_counts;
for (j = 0; j < SCOUTFS_MANIFEST_MAX_LEVEL; j++) {
if (le64_to_cpu(counts[j]))
printf(" %u: %llu", j, le64_to_cpu(counts[j]));
}
printf("\n");
if (le64_to_cpu(super->hdr.seq) > le64_to_cpu(recent.hdr.seq))
memcpy(&recent, super, sizeof(recent));
@@ -398,8 +416,19 @@ static int print_super_blocks(int fd)
if (!seg_map)
return -ENOMEM;
ret = print_ring_blocks(fd, super, seg_map) ?:
print_segments(fd, seg_map, total_segs);
printf("manifest treap:\n");
ret = print_treap_node(fd, super, TREAP_MANIFEST,
&super->manifest.root.ref, seg_map);
printf("alloc treap:\n");
err = print_treap_node(fd, super, TREAP_ALLOC,
&super->alloc_treap_root.ref, NULL);
if (err && !ret)
ret = err;
err = print_segments(fd, seg_map, total_segs);
if (err && !ret)
ret = err;
free(seg_map);

View File

@@ -51,6 +51,7 @@ do { \
})
/* round x up to the next multiple of y, then divide; y must be nonzero */
#define DIV_ROUND_UP(x, y) (((x) + (y) - 1) / (y))
/* round x up to the next multiple of y; y must be a power of two */
#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
#ifndef offsetof
/* fallback byte offset of a struct member; compilers provide a builtin */
#define offsetof(type, memb) ((unsigned long)&((type *)0)->memb)