Update to segment format with skiplists and bloom

Update to the format rev which has large log segments that start with
bloom filter blocks, have items linked in a skip list, and item values
stored at offsets in the block.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2016-03-23 15:23:54 -07:00
parent 463f5e5a07
commit 502783e1bc
3 changed files with 205 additions and 68 deletions

View File

@@ -13,6 +13,7 @@
*/
#define SCOUTFS_BLOCK_SHIFT 12
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
/*
* The allocator works on larger chunks. Smaller metadata structures
@@ -34,6 +35,19 @@
#define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_SUPER_NR 2
/*
* Setting 7 bits per item in a ~76KB bloom filter gives a ~1% false
* positive rate for our max of 64k items.
*
* n = 65,536, p = 0.01 (1 in 100) → m = 628,167 (76.68KB), k = 7
*/
/* number of filter bits used for each item (k in the sizing above) */
#define SCOUTFS_BLOOM_BITS 7
#define SCOUTFS_BLOOM_BIT_WIDTH 20 /* 2^20 > m */
/* masks a value down to its low SCOUTFS_BLOOM_BIT_WIDTH bits */
#define SCOUTFS_BLOOM_BIT_MASK ((1 << SCOUTFS_BLOOM_BIT_WIDTH) - 1)
/*
 * Number of whole blocks given to the filter: 76KiB / 4KiB = 19.
 * NOTE(review): 76KiB is 622,592 bits before the per-block headers are
 * subtracted, which is a little under the m quoted above and well under
 * 2^20 -- confirm how masked bit numbers are reduced to the real size.
 */
#define SCOUTFS_BLOOM_BLOCKS ((76 * 1024) / SCOUTFS_BLOCK_SIZE)
/*
 * Number of 32-bit words needed to hold BLOOM_BITS * BLOOM_BIT_WIDTH
 * bits.  This sizes the bloom_salts[] array in the super block.
 */
#define SCOUTFS_BLOOM_SALTS \
DIV_ROUND_UP(SCOUTFS_BLOOM_BITS * SCOUTFS_BLOOM_BIT_WIDTH, 32)
/*
* This header is found at the start of every block so that we can
* verify that it's what we were looking for. The crc and padding
@@ -64,6 +78,7 @@ struct scoutfs_super_block {
struct scoutfs_block_header hdr;
__le64 id;
__u8 uuid[SCOUTFS_UUID_BYTES];
__le32 bloom_salts[SCOUTFS_BLOOM_SALTS];
__le64 total_chunks;
__le64 ring_map_blkno;
__le64 ring_map_seq;
@@ -149,22 +164,43 @@ struct scoutfs_ring_bitmap {
__le64 bits[2];
} __packed;
/*
 * A bloom filter block: the common block header followed by the filter
 * bits, stored as an array of little-endian 64-bit words after the
 * header.
 */
struct scoutfs_bloom_block {
struct scoutfs_block_header hdr;
__le64 bits[0];
} __packed;
/*
 * Filter bits that fit in one bloom block: the number of whole 8-byte
 * words that remain after the header, times 64 bits per word.
 */
#define SCOUTFS_BLOOM_BITS_PER_BLOCK \
(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header)) / 8) * 64)
/*
* To start the log segments are a trivial single item block. We'll
* flesh this out into larger blocks once the rest of the architecture
* is in place.
* Items in log segments are sorted in a skip list by their key. We
* have a rough limit of 64k items.
*/
/* number of link levels stored in a skip list root */
#define SCOUTFS_SKIP_HEIGHT 16
/*
 * The head of a skip list: one little-endian 32-bit link per level.
 * mkfs stores the first item's byte offset from the start of the chunk
 * in next[0] and the print code stops walking a level when it reads a
 * link of 0.
 */
struct scoutfs_skip_root {
__le32 next[SCOUTFS_SKIP_HEIGHT];
} __packed;
/*
* An item block follows the bloom filter blocks at the start of a log
* segment chunk. Its skip list root references the item structs which
* reference the item values in the rest of the block. The references
* are byte offsets from the start of the chunk.
*/
struct scoutfs_item_block {
struct scoutfs_block_header hdr;
struct scoutfs_key first;
struct scoutfs_key last;
__le32 nr_items;
/* struct scoutfs_item_header items[0] .. */
struct scoutfs_skip_root skip_root;
} __packed;
struct scoutfs_item_header {
struct scoutfs_item {
struct scoutfs_key key;
__le32 offset;
__le16 len;
u8 skip_height;
__le32 skip_next[0];
} __packed;
struct scoutfs_timespec {

View File

@@ -17,6 +17,8 @@
#include "crc.h"
#include "rand.h"
#include "dev.h"
#include "bloom.h"
#include "bitops.h"
/*
* Update the block's header and write it out.
@@ -41,7 +43,6 @@ static int write_block(int fd, u64 blkno, struct scoutfs_block_header *hdr)
static int write_new_fs(char *path, int fd)
{
struct scoutfs_super_block *super;
struct scoutfs_block_header hdr;
struct scoutfs_inode *inode;
struct scoutfs_ring_map_block *map;
struct scoutfs_ring_block *ring;
@@ -49,7 +50,9 @@ static int write_new_fs(char *path, int fd)
struct scoutfs_ring_manifest_entry *mani;
struct scoutfs_ring_bitmap *bm;
struct scoutfs_item_block *iblk;
struct scoutfs_item_header *ihdr;
struct scoutfs_bloom_bits bits;
struct scoutfs_bloom_block *blm;
struct scoutfs_item *item;
struct scoutfs_key root_key;
struct timeval tv;
char uuid_str[37];
@@ -61,13 +64,10 @@ static int write_new_fs(char *path, int fd)
int ret;
gettimeofday(&tv, NULL);
/* crc and blkno written for each write */
hdr._pad = 0;
pseudo_random_bytes(&hdr.fsid, sizeof(hdr.fsid));
hdr.seq = cpu_to_le64(1);
buf = malloc(SCOUTFS_BLOCK_SIZE);
if (!buf) {
super = malloc(SCOUTFS_BLOCK_SIZE);
if (!buf || !super) {
ret = -errno;
fprintf(stderr, "failed to allocate a block: %s (%d)\n",
strerror(errno), errno);
@@ -87,20 +87,55 @@ static int write_new_fs(char *path, int fd)
root_key.type = SCOUTFS_INODE_KEY;
root_key.offset = 0;
/* super in log 0, first fs log block in log 1 */
/* first chunk has super blocks, log segment chunk is next */
blkno = 1 << SCOUTFS_CHUNK_BLOCK_SHIFT;
/* write a single log block with the root inode item */
/* first initialize the super so we can use it to build structures */
memset(super, 0, SCOUTFS_BLOCK_SIZE);
pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid));
super->hdr.seq = cpu_to_le64(1);
super->id = cpu_to_le64(SCOUTFS_SUPER_ID);
uuid_generate(super->uuid);
pseudo_random_bytes(super->bloom_salts, sizeof(super->bloom_salts));
super->total_chunks = cpu_to_le64(total_chunks);
super->ring_map_seq = super->hdr.seq;
super->ring_first_block = cpu_to_le64(0);
super->ring_active_blocks = cpu_to_le64(1);
super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK);
super->ring_seq = super->hdr.seq;
/*
* There's only the root item so we check for its bloom bits as
* we write the bloom blocks.
*/
scoutfs_calc_bloom_bits(&bits, &root_key, super->bloom_salts);
for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
blm = buf;
blm->hdr = super->hdr;
scoutfs_set_bloom_bits(blm, i, &bits);
ret = write_block(fd, blkno, &blm->hdr);
if (ret)
goto out;
blkno++;
}
/* write a single log segment with the root inode item */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
iblk = buf;
iblk->hdr = hdr;
iblk->first = root_key;
iblk->last = root_key;
iblk->nr_items = cpu_to_le32(1);
ihdr = (void *)(iblk + 1);
ihdr->key = root_key;
ihdr->len = cpu_to_le16(sizeof(struct scoutfs_inode));
inode = (void *)(ihdr + 1);
iblk->hdr = super->hdr;
iblk->skip_root.next[0] = cpu_to_le32((SCOUTFS_BLOOM_BLOCKS <<
SCOUTFS_BLOCK_SHIFT) +
sizeof(struct scoutfs_item_block));
item = (void *)(iblk + 1);
item->key = root_key;
item->offset = cpu_to_le32(le32_to_cpu(iblk->skip_root.next[0]) +
sizeof(struct scoutfs_item));
item->len = cpu_to_le16(sizeof(struct scoutfs_inode));
item->skip_height = 1;
inode = (void *)(item + 1);
inode->nlink = cpu_to_le32(2);
inode->mode = cpu_to_le32(0755 | 0040000);
inode->atime.sec = cpu_to_le64(tv.tv_sec);
@@ -113,18 +148,19 @@ static int write_new_fs(char *path, int fd)
ret = write_block(fd, blkno, &iblk->hdr);
if (ret)
goto out;
blkno = round_up(blkno, SCOUTFS_BLOCKS_PER_CHUNK);
/* write the ring block whose manifest entry references the log block */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
ring = buf;
ring->hdr = hdr;
ring->hdr = super->hdr;
ring->nr_entries = cpu_to_le16(2);
ent = (void *)(ring + 1);
ent->type = SCOUTFS_RING_ADD_MANIFEST;
ent->len = cpu_to_le16(sizeof(*mani));
mani = (void *)(ent + 1);
mani->blkno = cpu_to_le64(blkno);
mani->seq = hdr.seq;
mani->blkno = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK);
mani->seq = super->hdr.seq;
mani->level = 0;
mani->first = root_key;
mani->last = root_key;
@@ -137,37 +173,26 @@ static int write_new_fs(char *path, int fd)
bm->bits[0] = cpu_to_le64(~15ULL);
bm->bits[1] = cpu_to_le64(~0ULL);
blkno += SCOUTFS_BLOCKS_PER_CHUNK;
ret = write_block(fd, blkno, &ring->hdr);
if (ret)
goto out;
blkno += SCOUTFS_BLOCKS_PER_CHUNK;
/* the ring has a single chunk for now */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
map = buf;
map->hdr = hdr;
map->hdr = super->hdr;
map->nr_chunks = cpu_to_le32(1);
map->blknos[0] = cpu_to_le64(blkno);
map->blknos[0] = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK);
blkno += SCOUTFS_BLOCKS_PER_CHUNK;
ret = write_block(fd, blkno, &map->hdr);
if (ret)
goto out;
/* write the two super blocks */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
super = buf;
super->hdr = hdr;
super->id = cpu_to_le64(SCOUTFS_SUPER_ID);
uuid_generate(super->uuid);
super->total_chunks = cpu_to_le64(total_chunks);
/* make sure the super references everything we just wrote */
super->ring_map_blkno = cpu_to_le64(blkno);
super->ring_map_seq = hdr.seq;
super->ring_first_block = cpu_to_le64(0);
super->ring_active_blocks = cpu_to_le64(1);
super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK);
super->ring_seq = hdr.seq;
/* write the two super blocks */
for (i = 0; i < SCOUTFS_SUPER_NR; i++) {
super->hdr.seq = cpu_to_le64(i + 1);
ret = write_block(fd, SCOUTFS_SUPER_BLKNO + i, &super->hdr);
@@ -194,7 +219,10 @@ static int write_new_fs(char *path, int fd)
ret = 0;
out:
free(buf);
if (super)
free(super);
if (buf)
free(buf);
return ret;
}

View File

@@ -42,6 +42,57 @@ static void *read_block(int fd, u64 blkno)
return buf;
}
/*
 * Read the whole chunk that starts at block number 'blkno' into a newly
 * allocated buffer of SCOUTFS_CHUNK_SIZE bytes.
 *
 * pread() is allowed to return fewer bytes than were asked for, and can
 * be interrupted by a signal, so we keep reading until the chunk is
 * complete.  Hitting end of file before the chunk is full is reported
 * separately from a real read error because errno isn't meaningful in
 * that case.
 *
 * Returns the buffer, which the caller must free(), or NULL if the
 * allocation or the read failed.  Read failures are logged to stderr.
 */
static void *read_chunk(int fd, u64 blkno)
{
	const size_t size = SCOUTFS_CHUNK_SIZE;
	u64 start = blkno << SCOUTFS_BLOCK_SHIFT;
	size_t done = 0;
	ssize_t ret;
	char *buf;

	buf = malloc(size);
	if (!buf)
		return NULL;

	while (done < size) {
		ret = pread(fd, buf + done, size - done, start + done);
		if (ret > 0) {
			done += ret;
			continue;
		}

		/* an interrupted call made no progress, just try again */
		if (ret < 0 && errno == EINTR)
			continue;

		if (ret < 0)
			fprintf(stderr, "read blkno %llu returned %zd: %s (%d)\n",
				blkno, ret, strerror(errno), errno);
		else
			fprintf(stderr, "read blkno %llu hit end of file after %zu of %zu bytes\n",
				blkno, done, size);

		free(buf);
		return NULL;
	}

	return buf;
}
/*
 * Print 'nr' little-endian 32-bit values as a bracketed, comma
 * separated list of unsigned decimals followed by a newline.
 *
 * 'indent' is the column the caller has already printed up to.  When
 * the next element would push the line past column 78 we start a new
 * line that is padded out to 'indent' columns so that the wrapped
 * elements line up under the first line's elements.
 */
static void print_le32_list(int indent, __le32 *data, int nr)
{
	int col = indent;
	int idx;

	printf("[");

	for (idx = 0; idx < nr; idx++) {
		/* every element but the last carries a trailing separator */
		const char *fmt = (idx == nr - 1) ? "%u" : "%u, ";
		u32 val = le32_to_cpu(data[idx]);
		/* measure the element before printing it so we can wrap first */
		int width = snprintf(NULL, 0, fmt, val);

		if (col + width > 78) {
			printf("\n%*c", indent, ' ');
			col = indent;
		}

		printf(fmt, val);
		col += width;
	}

	printf("]\n");
}
static void print_block_header(struct scoutfs_block_header *hdr)
{
u32 crc = crc_block(hdr);
@@ -87,49 +138,69 @@ static void print_inode(struct scoutfs_inode *inode)
le32_to_cpu(inode->mtime.nsec));
}
static void print_item(struct scoutfs_item_header *ihdr)
static void print_item(struct scoutfs_item *item, void *val)
{
printf(" item:\n"
" key: "SKF"\n"
" len: %u\n",
SKA(&ihdr->key), le16_to_cpu(ihdr->len));
" offset: %u\n"
" len: %u\n"
" skip_height: %u\n"
" skip_next[]: ",
SKA(&item->key),
le32_to_cpu(item->offset),
le16_to_cpu(item->len),
item->skip_height);
switch(ihdr->key.type) {
print_le32_list(22, item->skip_next, item->skip_height);
switch(item->key.type) {
case SCOUTFS_INODE_KEY:
print_inode((void *)(ihdr + 1));
print_inode(val);
break;
}
}
static int print_item_block(int fd, u64 nr)
static int print_log_segment(int fd, u64 nr)
{
struct scoutfs_item_header *ihdr;
struct scoutfs_item_block *iblk;
size_t off;
struct scoutfs_bloom_block *blm;
struct scoutfs_item *item;
char *buf;
char *val;
__le32 next;
int i;
iblk = read_block(fd, nr);
if (!iblk)
buf = read_chunk(fd, nr);
if (!buf)
return -ENOMEM;
for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
blm = (void *)(buf + (i << SCOUTFS_BLOCK_SHIFT));
printf("bloom block:\n");
print_block_header(&blm->hdr);
}
iblk = (void *)(buf + (SCOUTFS_BLOOM_BLOCKS << SCOUTFS_BLOCK_SHIFT));
printf("item block:\n");
print_block_header(&iblk->hdr);
printf(" first: "SKF"\n"
" last: "SKF"\n"
" nr_items: %u\n",
SKA(&iblk->first), SKA(&iblk->last),
le32_to_cpu(iblk->nr_items));
" skip_root.next[]: ",
SKA(&iblk->first), SKA(&iblk->last));
print_le32_list(23, iblk->skip_root.next, SCOUTFS_SKIP_HEIGHT);
off = sizeof(struct scoutfs_item_block);
for (i = 0; i < le32_to_cpu(iblk->nr_items); i++) {
ihdr = (void *)((char *)iblk + off);
print_item(ihdr);
off += sizeof(struct scoutfs_item_header) +
le16_to_cpu(ihdr->len);
next = iblk->skip_root.next[0];
while (next) {
item = (void *)(buf + le32_to_cpu(next));
val = (void *)(buf + le32_to_cpu(item->offset));
print_item(item, val);
next = item->skip_next[0];
}
free(iblk);
free(buf);
return 0;
}
@@ -143,7 +214,7 @@ static int print_log_segments(int fd, __le64 *log_segs, u64 total_chunks)
while ((nr = find_first_le_bit(log_segs, total_chunks)) >= 0) {
clear_le_bit(log_segs, nr);
err = print_item_block(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT);
err = print_log_segment(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT);
if (!ret && err)
ret = err;
}
@@ -344,15 +415,17 @@ static int print_super_brick(int fd)
print_block_header(&super->hdr);
printf(" id: %llx\n"
" uuid: %s\n"
" total_chunks: %llu\n"
" bloom_salts: ",
le64_to_cpu(super->id),
uuid_str);
print_le32_list(18, super->bloom_salts, SCOUTFS_BLOOM_SALTS);
printf(" total_chunks: %llu\n"
" ring_map_blkno: %llu\n"
" ring_map_seq: %llu\n"
" ring_first_block: %llu\n"
" ring_active_blocks: %llu\n"
" ring_total_blocks: %llu\n"
" ring_seq: %llu\n",
le64_to_cpu(super->id),
uuid_str,
total_chunks,
le64_to_cpu(super->ring_map_blkno),
le64_to_cpu(super->ring_map_seq),