mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-04 03:14:02 +00:00
Update to segment format with skiplists and bloom
Update to the format rev which has large log segments that start with bloom filter blocks, have items linked in a skip list, and item values stored at offsets in the block. Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
@@ -13,6 +13,7 @@
|
||||
*/
|
||||
#define SCOUTFS_BLOCK_SHIFT 12
|
||||
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
|
||||
#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
|
||||
|
||||
/*
|
||||
* The allocator works on larger chunks. Smaller metadata structures
|
||||
@@ -34,6 +35,19 @@
|
||||
#define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT)
|
||||
#define SCOUTFS_SUPER_NR 2
|
||||
|
||||
/*
|
||||
* 7 bits in a ~76k bloom filter gives a ~1% false positive rate for our max
|
||||
* of 64k items.
|
||||
*
|
||||
* n = 65,536, p = 0.01 (1 in 100) → m = 628,167 (76.68KB), k = 7
|
||||
*/
|
||||
#define SCOUTFS_BLOOM_BITS 7
|
||||
#define SCOUTFS_BLOOM_BIT_WIDTH 20 /* 2^20 > m */
|
||||
#define SCOUTFS_BLOOM_BIT_MASK ((1 << SCOUTFS_BLOOM_BIT_WIDTH) - 1)
|
||||
#define SCOUTFS_BLOOM_BLOCKS ((76 * 1024) / SCOUTFS_BLOCK_SIZE)
|
||||
#define SCOUTFS_BLOOM_SALTS \
|
||||
DIV_ROUND_UP(SCOUTFS_BLOOM_BITS * SCOUTFS_BLOOM_BIT_WIDTH, 32)
|
||||
|
||||
/*
|
||||
* This header is found at the start of every block so that we can
|
||||
* verify that it's what we were looking for. The crc and padding
|
||||
@@ -64,6 +78,7 @@ struct scoutfs_super_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
__le64 id;
|
||||
__u8 uuid[SCOUTFS_UUID_BYTES];
|
||||
__le32 bloom_salts[SCOUTFS_BLOOM_SALTS];
|
||||
__le64 total_chunks;
|
||||
__le64 ring_map_blkno;
|
||||
__le64 ring_map_seq;
|
||||
@@ -149,22 +164,43 @@ struct scoutfs_ring_bitmap {
|
||||
__le64 bits[2];
|
||||
} __packed;
|
||||
|
||||
|
||||
struct scoutfs_bloom_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
__le64 bits[0];
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_BLOOM_BITS_PER_BLOCK \
|
||||
(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header)) / 8) * 64)
|
||||
|
||||
/*
|
||||
* To start the log segments are a trivial single item block. We'll
|
||||
* flesh this out into larger blocks once the rest of the architecture
|
||||
* is in place.
|
||||
* Items in log segments are sorted in a skip list by their key. We
|
||||
* have a rough limit of 64k items.
|
||||
*/
|
||||
#define SCOUTFS_SKIP_HEIGHT 16
|
||||
/*
 * The head of a skip list of items.  There is one next reference for
 * each of the SCOUTFS_SKIP_HEIGHT levels.  References are little-endian
 * byte offsets and a reference of 0 ends the list at that level.
 */
struct scoutfs_skip_root {
|
||||
__le32 next[SCOUTFS_SKIP_HEIGHT];
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* An item block follows the bloom filter blocks at the start of a log
|
||||
* segment chunk. Its skip list root references the item structs which
|
||||
* reference the item values in the rest of the block. The references
|
||||
* are byte offsets from the start of the chunk.
|
||||
*/
|
||||
struct scoutfs_item_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
struct scoutfs_key first;
|
||||
struct scoutfs_key last;
|
||||
__le32 nr_items;
|
||||
/* struct scoutfs_item_header items[0] .. */
|
||||
struct scoutfs_skip_root skip_root;
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_item_header {
|
||||
struct scoutfs_item {
|
||||
struct scoutfs_key key;
|
||||
__le32 offset;
|
||||
__le16 len;
|
||||
u8 skip_height;
|
||||
__le32 skip_next[0];
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_timespec {
|
||||
|
||||
102
utils/src/mkfs.c
102
utils/src/mkfs.c
@@ -17,6 +17,8 @@
|
||||
#include "crc.h"
|
||||
#include "rand.h"
|
||||
#include "dev.h"
|
||||
#include "bloom.h"
|
||||
#include "bitops.h"
|
||||
|
||||
/*
|
||||
* Update the block's header and write it out.
|
||||
@@ -41,7 +43,6 @@ static int write_block(int fd, u64 blkno, struct scoutfs_block_header *hdr)
|
||||
static int write_new_fs(char *path, int fd)
|
||||
{
|
||||
struct scoutfs_super_block *super;
|
||||
struct scoutfs_block_header hdr;
|
||||
struct scoutfs_inode *inode;
|
||||
struct scoutfs_ring_map_block *map;
|
||||
struct scoutfs_ring_block *ring;
|
||||
@@ -49,7 +50,9 @@ static int write_new_fs(char *path, int fd)
|
||||
struct scoutfs_ring_manifest_entry *mani;
|
||||
struct scoutfs_ring_bitmap *bm;
|
||||
struct scoutfs_item_block *iblk;
|
||||
struct scoutfs_item_header *ihdr;
|
||||
struct scoutfs_bloom_bits bits;
|
||||
struct scoutfs_bloom_block *blm;
|
||||
struct scoutfs_item *item;
|
||||
struct scoutfs_key root_key;
|
||||
struct timeval tv;
|
||||
char uuid_str[37];
|
||||
@@ -61,13 +64,10 @@ static int write_new_fs(char *path, int fd)
|
||||
int ret;
|
||||
|
||||
gettimeofday(&tv, NULL);
|
||||
/* crc and blkno written for each write */
|
||||
hdr._pad = 0;
|
||||
pseudo_random_bytes(&hdr.fsid, sizeof(hdr.fsid));
|
||||
hdr.seq = cpu_to_le64(1);
|
||||
|
||||
buf = malloc(SCOUTFS_BLOCK_SIZE);
|
||||
if (!buf) {
|
||||
super = malloc(SCOUTFS_BLOCK_SIZE);
|
||||
if (!buf || !super) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to allocate a block: %s (%d)\n",
|
||||
strerror(errno), errno);
|
||||
@@ -87,20 +87,55 @@ static int write_new_fs(char *path, int fd)
|
||||
root_key.type = SCOUTFS_INODE_KEY;
|
||||
root_key.offset = 0;
|
||||
|
||||
/* super in log 0, first fs log block in log 1 */
|
||||
/* first chunk has super blocks, log segment chunk is next */
|
||||
blkno = 1 << SCOUTFS_CHUNK_BLOCK_SHIFT;
|
||||
|
||||
/* write a single log block with the root inode item */
|
||||
/* first initialize the super so we can use it to build structures */
|
||||
memset(super, 0, SCOUTFS_BLOCK_SIZE);
|
||||
pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid));
|
||||
super->hdr.seq = cpu_to_le64(1);
|
||||
super->id = cpu_to_le64(SCOUTFS_SUPER_ID);
|
||||
uuid_generate(super->uuid);
|
||||
pseudo_random_bytes(super->bloom_salts, sizeof(super->bloom_salts));
|
||||
super->total_chunks = cpu_to_le64(total_chunks);
|
||||
super->ring_map_seq = super->hdr.seq;
|
||||
super->ring_first_block = cpu_to_le64(0);
|
||||
super->ring_active_blocks = cpu_to_le64(1);
|
||||
super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
super->ring_seq = super->hdr.seq;
|
||||
|
||||
/*
|
||||
* There's only the root item so we check for its bloom bits as
|
||||
* we write the bloom blocks.
|
||||
*/
|
||||
scoutfs_calc_bloom_bits(&bits, &root_key, super->bloom_salts);
|
||||
for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
blm = buf;
|
||||
blm->hdr = super->hdr;
|
||||
|
||||
scoutfs_set_bloom_bits(blm, i, &bits);
|
||||
|
||||
ret = write_block(fd, blkno, &blm->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
blkno++;
|
||||
}
|
||||
|
||||
/* write a single log segment with the root inode item */
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
iblk = buf;
|
||||
iblk->hdr = hdr;
|
||||
iblk->first = root_key;
|
||||
iblk->last = root_key;
|
||||
iblk->nr_items = cpu_to_le32(1);
|
||||
ihdr = (void *)(iblk + 1);
|
||||
ihdr->key = root_key;
|
||||
ihdr->len = cpu_to_le16(sizeof(struct scoutfs_inode));
|
||||
inode = (void *)(ihdr + 1);
|
||||
iblk->hdr = super->hdr;
|
||||
iblk->skip_root.next[0] = cpu_to_le32((SCOUTFS_BLOOM_BLOCKS <<
|
||||
SCOUTFS_BLOCK_SHIFT) +
|
||||
sizeof(struct scoutfs_item_block));
|
||||
item = (void *)(iblk + 1);
|
||||
item->key = root_key;
|
||||
item->offset = cpu_to_le32(le32_to_cpu(iblk->skip_root.next[0]) +
|
||||
sizeof(struct scoutfs_item));
|
||||
item->len = cpu_to_le16(sizeof(struct scoutfs_inode));
|
||||
item->skip_height = 1;
|
||||
inode = (void *)(item + 1);
|
||||
inode->nlink = cpu_to_le32(2);
|
||||
inode->mode = cpu_to_le32(0755 | 0040000);
|
||||
inode->atime.sec = cpu_to_le64(tv.tv_sec);
|
||||
@@ -113,18 +148,19 @@ static int write_new_fs(char *path, int fd)
|
||||
ret = write_block(fd, blkno, &iblk->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
blkno = round_up(blkno, SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
|
||||
/* write the ring block whose manifest entry references the log block */
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
ring = buf;
|
||||
ring->hdr = hdr;
|
||||
ring->hdr = super->hdr;
|
||||
ring->nr_entries = cpu_to_le16(2);
|
||||
ent = (void *)(ring + 1);
|
||||
ent->type = SCOUTFS_RING_ADD_MANIFEST;
|
||||
ent->len = cpu_to_le16(sizeof(*mani));
|
||||
mani = (void *)(ent + 1);
|
||||
mani->blkno = cpu_to_le64(blkno);
|
||||
mani->seq = hdr.seq;
|
||||
mani->blkno = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
mani->seq = super->hdr.seq;
|
||||
mani->level = 0;
|
||||
mani->first = root_key;
|
||||
mani->last = root_key;
|
||||
@@ -137,37 +173,26 @@ static int write_new_fs(char *path, int fd)
|
||||
bm->bits[0] = cpu_to_le64(~15ULL);
|
||||
bm->bits[1] = cpu_to_le64(~0ULL);
|
||||
|
||||
blkno += SCOUTFS_BLOCKS_PER_CHUNK;
|
||||
ret = write_block(fd, blkno, &ring->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
blkno += SCOUTFS_BLOCKS_PER_CHUNK;
|
||||
|
||||
/* the ring has a single chunk for now */
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
map = buf;
|
||||
map->hdr = hdr;
|
||||
map->hdr = super->hdr;
|
||||
map->nr_chunks = cpu_to_le32(1);
|
||||
map->blknos[0] = cpu_to_le64(blkno);
|
||||
map->blknos[0] = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
|
||||
blkno += SCOUTFS_BLOCKS_PER_CHUNK;
|
||||
ret = write_block(fd, blkno, &map->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/* write the two super blocks */
|
||||
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
|
||||
super = buf;
|
||||
super->hdr = hdr;
|
||||
super->id = cpu_to_le64(SCOUTFS_SUPER_ID);
|
||||
uuid_generate(super->uuid);
|
||||
super->total_chunks = cpu_to_le64(total_chunks);
|
||||
/* make sure the super references everything we just wrote */
|
||||
super->ring_map_blkno = cpu_to_le64(blkno);
|
||||
super->ring_map_seq = hdr.seq;
|
||||
super->ring_first_block = cpu_to_le64(0);
|
||||
super->ring_active_blocks = cpu_to_le64(1);
|
||||
super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK);
|
||||
super->ring_seq = hdr.seq;
|
||||
|
||||
/* write the two super blocks */
|
||||
for (i = 0; i < SCOUTFS_SUPER_NR; i++) {
|
||||
super->hdr.seq = cpu_to_le64(i + 1);
|
||||
ret = write_block(fd, SCOUTFS_SUPER_BLKNO + i, &super->hdr);
|
||||
@@ -194,7 +219,10 @@ static int write_new_fs(char *path, int fd)
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
free(buf);
|
||||
if (super)
|
||||
free(super);
|
||||
if (buf)
|
||||
free(buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -42,6 +42,57 @@ static void *read_block(int fd, u64 blkno)
|
||||
return buf;
|
||||
}
|
||||
|
||||
/*
 * Read the entire chunk that starts at block 'blkno' into a newly
 * allocated buffer of SCOUTFS_CHUNK_SIZE bytes.
 *
 * Returns the buffer, which the caller must free(), or NULL if the
 * allocation failed or the full chunk couldn't be read.  A message is
 * printed to stderr when the read fails or comes up short.
 */
static void *read_chunk(int fd, u64 blkno)
{
	ssize_t ret;
	void *buf;

	buf = malloc(SCOUTFS_CHUNK_SIZE);
	if (!buf)
		return NULL;

	ret = pread(fd, buf, SCOUTFS_CHUNK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT);
	if (ret == SCOUTFS_CHUNK_SIZE)
		return buf;

	if (ret < 0) {
		fprintf(stderr, "read blkno %llu returned %zd: %s (%d)\n",
			(unsigned long long)blkno, ret, strerror(errno),
			errno);
	} else {
		/*
		 * pread() only sets errno when it fails.  A short read
		 * succeeded, so printing errno here would describe some
		 * unrelated earlier error.
		 */
		fprintf(stderr, "read blkno %llu returned %zd: short read\n",
			(unsigned long long)blkno, ret);
	}

	free(buf);
	return NULL;
}
|
||||
|
||||
/*
 * Print 'nr' little-endian 32bit values as a "[a, b, c]" list followed
 * by a newline.
 *
 * The column counter starts at 'indent', so presumably the caller has
 * already printed that many characters of label on the current line.
 * When the next element would push the line past column 78 the list
 * continues on a new line padded out to 'indent' columns.
 */
static void print_le32_list(int indent, __le32 *data, int nr)
{
	int col = indent;
	int idx;

	printf("[");

	for (idx = 0; idx < nr; idx++) {
		/* every element but the last is followed by a separator */
		char *fmt = (idx + 1 < nr) ? "%u, " : "%u";
		u32 val = le32_to_cpu(data[idx]);
		/* measure the formatted element without printing it */
		int width = snprintf(NULL, 0, fmt, val);

		if (col + width > 78) {
			printf("\n%*c", indent, ' ');
			col = indent;
		}

		printf(fmt, val);
		col += width;
	}

	printf("]\n");
}
|
||||
|
||||
static void print_block_header(struct scoutfs_block_header *hdr)
|
||||
{
|
||||
u32 crc = crc_block(hdr);
|
||||
@@ -87,49 +138,69 @@ static void print_inode(struct scoutfs_inode *inode)
|
||||
le32_to_cpu(inode->mtime.nsec));
|
||||
}
|
||||
|
||||
static void print_item(struct scoutfs_item_header *ihdr)
|
||||
static void print_item(struct scoutfs_item *item, void *val)
|
||||
{
|
||||
printf(" item:\n"
|
||||
" key: "SKF"\n"
|
||||
" len: %u\n",
|
||||
SKA(&ihdr->key), le16_to_cpu(ihdr->len));
|
||||
" offset: %u\n"
|
||||
" len: %u\n"
|
||||
" skip_height: %u\n"
|
||||
" skip_next[]: ",
|
||||
SKA(&item->key),
|
||||
le32_to_cpu(item->offset),
|
||||
le16_to_cpu(item->len),
|
||||
item->skip_height);
|
||||
|
||||
switch(ihdr->key.type) {
|
||||
print_le32_list(22, item->skip_next, item->skip_height);
|
||||
|
||||
switch(item->key.type) {
|
||||
case SCOUTFS_INODE_KEY:
|
||||
print_inode((void *)(ihdr + 1));
|
||||
print_inode(val);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int print_item_block(int fd, u64 nr)
|
||||
static int print_log_segment(int fd, u64 nr)
|
||||
{
|
||||
struct scoutfs_item_header *ihdr;
|
||||
struct scoutfs_item_block *iblk;
|
||||
size_t off;
|
||||
struct scoutfs_bloom_block *blm;
|
||||
struct scoutfs_item *item;
|
||||
char *buf;
|
||||
char *val;
|
||||
__le32 next;
|
||||
int i;
|
||||
|
||||
iblk = read_block(fd, nr);
|
||||
if (!iblk)
|
||||
buf = read_chunk(fd, nr);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) {
|
||||
|
||||
blm = (void *)(buf + (i << SCOUTFS_BLOCK_SHIFT));
|
||||
|
||||
printf("bloom block:\n");
|
||||
print_block_header(&blm->hdr);
|
||||
}
|
||||
|
||||
iblk = (void *)(buf + (SCOUTFS_BLOOM_BLOCKS << SCOUTFS_BLOCK_SHIFT));
|
||||
|
||||
printf("item block:\n");
|
||||
print_block_header(&iblk->hdr);
|
||||
printf(" first: "SKF"\n"
|
||||
" last: "SKF"\n"
|
||||
" nr_items: %u\n",
|
||||
SKA(&iblk->first), SKA(&iblk->last),
|
||||
le32_to_cpu(iblk->nr_items));
|
||||
" skip_root.next[]: ",
|
||||
SKA(&iblk->first), SKA(&iblk->last));
|
||||
print_le32_list(23, iblk->skip_root.next, SCOUTFS_SKIP_HEIGHT);
|
||||
|
||||
off = sizeof(struct scoutfs_item_block);
|
||||
for (i = 0; i < le32_to_cpu(iblk->nr_items); i++) {
|
||||
ihdr = (void *)((char *)iblk + off);
|
||||
print_item(ihdr);
|
||||
|
||||
off += sizeof(struct scoutfs_item_header) +
|
||||
le16_to_cpu(ihdr->len);
|
||||
next = iblk->skip_root.next[0];
|
||||
while (next) {
|
||||
item = (void *)(buf + le32_to_cpu(next));
|
||||
val = (void *)(buf + le32_to_cpu(item->offset));
|
||||
print_item(item, val);
|
||||
next = item->skip_next[0];
|
||||
}
|
||||
|
||||
free(iblk);
|
||||
free(buf);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -143,7 +214,7 @@ static int print_log_segments(int fd, __le64 *log_segs, u64 total_chunks)
|
||||
while ((nr = find_first_le_bit(log_segs, total_chunks)) >= 0) {
|
||||
clear_le_bit(log_segs, nr);
|
||||
|
||||
err = print_item_block(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT);
|
||||
err = print_log_segment(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT);
|
||||
if (!ret && err)
|
||||
ret = err;
|
||||
}
|
||||
@@ -344,15 +415,17 @@ static int print_super_brick(int fd)
|
||||
print_block_header(&super->hdr);
|
||||
printf(" id: %llx\n"
|
||||
" uuid: %s\n"
|
||||
" total_chunks: %llu\n"
|
||||
" bloom_salts: ",
|
||||
le64_to_cpu(super->id),
|
||||
uuid_str);
|
||||
print_le32_list(18, super->bloom_salts, SCOUTFS_BLOOM_SALTS);
|
||||
printf(" total_chunks: %llu\n"
|
||||
" ring_map_blkno: %llu\n"
|
||||
" ring_map_seq: %llu\n"
|
||||
" ring_first_block: %llu\n"
|
||||
" ring_active_blocks: %llu\n"
|
||||
" ring_total_blocks: %llu\n"
|
||||
" ring_seq: %llu\n",
|
||||
le64_to_cpu(super->id),
|
||||
uuid_str,
|
||||
total_chunks,
|
||||
le64_to_cpu(super->ring_map_blkno),
|
||||
le64_to_cpu(super->ring_map_seq),
|
||||
|
||||
Reference in New Issue
Block a user