diff --git a/utils/src/format.h b/utils/src/format.h index bafaef80..efdfa6b9 100644 --- a/utils/src/format.h +++ b/utils/src/format.h @@ -13,6 +13,7 @@ */ #define SCOUTFS_BLOCK_SHIFT 12 #define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT) +#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1) /* * The allocator works on larger chunks. Smaller metadata structures @@ -34,6 +35,19 @@ #define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT) #define SCOUTFS_SUPER_NR 2 +/* + * Setting 7 bits per item in a ~76KB bloom filter gives a ~1% false + * positive rate for our max of 64k items. + * + * n = 65,536, p = 0.01 (1 in 100) → m = 628,167 (76.68KB), k = 7 + */ +#define SCOUTFS_BLOOM_BITS 7 +#define SCOUTFS_BLOOM_BIT_WIDTH 20 /* 2^20 > m */ +#define SCOUTFS_BLOOM_BIT_MASK ((1 << SCOUTFS_BLOOM_BIT_WIDTH) - 1) +#define SCOUTFS_BLOOM_BLOCKS ((76 * 1024) / SCOUTFS_BLOCK_SIZE) +#define SCOUTFS_BLOOM_SALTS \ + DIV_ROUND_UP(SCOUTFS_BLOOM_BITS * SCOUTFS_BLOOM_BIT_WIDTH, 32) + /* * This header is found at the start of every block so that we can * verify that it's what we were looking for. The crc and padding @@ -64,6 +78,7 @@ struct scoutfs_super_block { struct scoutfs_block_header hdr; __le64 id; __u8 uuid[SCOUTFS_UUID_BYTES]; + __le32 bloom_salts[SCOUTFS_BLOOM_SALTS]; __le64 total_chunks; __le64 ring_map_blkno; __le64 ring_map_seq; @@ -149,22 +164,43 @@ struct scoutfs_ring_bitmap { __le64 bits[2]; } __packed; + +struct scoutfs_bloom_block { + struct scoutfs_block_header hdr; + __le64 bits[0]; +} __packed; + +#define SCOUTFS_BLOOM_BITS_PER_BLOCK \ + (((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header)) / 8) * 64) + /* - * To start the log segments are a trivial single item block. We'll - * flesh this out into larger blocks once the rest of the architecture - * is in place. + * Items in log segments are sorted in a skip list by their key. We + * have a rough limit of 64k items. 
+ */ +#define SCOUTFS_SKIP_HEIGHT 16 +struct scoutfs_skip_root { + __le32 next[SCOUTFS_SKIP_HEIGHT]; +} __packed; + +/* + * An item block follows the bloom filter blocks at the start of a log + * segment chunk. Its skip list root references the item structs which + * reference the item values in the rest of the block. The references + * are byte offsets from the start of the chunk. */ struct scoutfs_item_block { struct scoutfs_block_header hdr; struct scoutfs_key first; struct scoutfs_key last; - __le32 nr_items; - /* struct scoutfs_item_header items[0] .. */ + struct scoutfs_skip_root skip_root; } __packed; -struct scoutfs_item_header { +struct scoutfs_item { struct scoutfs_key key; + __le32 offset; __le16 len; + u8 skip_height; + __le32 skip_next[0]; } __packed; struct scoutfs_timespec { diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c index ec181656..c48de7f9 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -17,6 +17,8 @@ #include "crc.h" #include "rand.h" #include "dev.h" +#include "bloom.h" +#include "bitops.h" /* * Update the block's header and write it out. 
@@ -41,7 +43,6 @@ static int write_block(int fd, u64 blkno, struct scoutfs_block_header *hdr) static int write_new_fs(char *path, int fd) { struct scoutfs_super_block *super; - struct scoutfs_block_header hdr; struct scoutfs_inode *inode; struct scoutfs_ring_map_block *map; struct scoutfs_ring_block *ring; @@ -49,7 +50,9 @@ static int write_new_fs(char *path, int fd) struct scoutfs_ring_manifest_entry *mani; struct scoutfs_ring_bitmap *bm; struct scoutfs_item_block *iblk; - struct scoutfs_item_header *ihdr; + struct scoutfs_bloom_bits bits; + struct scoutfs_bloom_block *blm; + struct scoutfs_item *item; struct scoutfs_key root_key; struct timeval tv; char uuid_str[37]; @@ -61,13 +64,10 @@ static int write_new_fs(char *path, int fd) int ret; gettimeofday(&tv, NULL); - /* crc and blkno written for each write */ - hdr._pad = 0; - pseudo_random_bytes(&hdr.fsid, sizeof(hdr.fsid)); - hdr.seq = cpu_to_le64(1); buf = malloc(SCOUTFS_BLOCK_SIZE); - if (!buf) { + super = malloc(SCOUTFS_BLOCK_SIZE); + if (!buf || !super) { ret = -errno; fprintf(stderr, "failed to allocate a block: %s (%d)\n", strerror(errno), errno); @@ -87,20 +87,55 @@ static int write_new_fs(char *path, int fd) root_key.type = SCOUTFS_INODE_KEY; root_key.offset = 0; - /* super in log 0, first fs log block in log 1 */ + /* first chunk has super blocks, log segment chunk is next */ blkno = 1 << SCOUTFS_CHUNK_BLOCK_SHIFT; - /* write a single log block with the root inode item */ + /* first initialize the super so we can use it to build structures */ + memset(super, 0, SCOUTFS_BLOCK_SIZE); + pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid)); + super->hdr.seq = cpu_to_le64(1); + super->id = cpu_to_le64(SCOUTFS_SUPER_ID); + uuid_generate(super->uuid); + pseudo_random_bytes(super->bloom_salts, sizeof(super->bloom_salts)); + super->total_chunks = cpu_to_le64(total_chunks); + super->ring_map_seq = super->hdr.seq; + super->ring_first_block = cpu_to_le64(0); + super->ring_active_blocks = cpu_to_le64(1); + 
super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK); + super->ring_seq = super->hdr.seq; + + /* + * There's only the root item so we check for its bloom bits as + * we write the bloom blocks. + */ + scoutfs_calc_bloom_bits(&bits, &root_key, super->bloom_salts); + for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) { + memset(buf, 0, SCOUTFS_BLOCK_SIZE); + blm = buf; + blm->hdr = super->hdr; + + scoutfs_set_bloom_bits(blm, i, &bits); + + ret = write_block(fd, blkno, &blm->hdr); + if (ret) + goto out; + blkno++; + } + + /* write a single log segment with the root inode item */ memset(buf, 0, SCOUTFS_BLOCK_SIZE); iblk = buf; - iblk->hdr = hdr; - iblk->first = root_key; - iblk->last = root_key; - iblk->nr_items = cpu_to_le32(1); - ihdr = (void *)(iblk + 1); - ihdr->key = root_key; - ihdr->len = cpu_to_le16(sizeof(struct scoutfs_inode)); - inode = (void *)(ihdr + 1); + iblk->hdr = super->hdr; + iblk->skip_root.next[0] = cpu_to_le32((SCOUTFS_BLOOM_BLOCKS << + SCOUTFS_BLOCK_SHIFT) + + sizeof(struct scoutfs_item_block)); + item = (void *)(iblk + 1); + item->key = iblk->first = iblk->last = root_key; /* sole item bounds the block */ + item->offset = cpu_to_le32(le32_to_cpu(iblk->skip_root.next[0]) + + sizeof(struct scoutfs_item) + sizeof(__le32)); + item->len = cpu_to_le16(sizeof(struct scoutfs_inode)); + item->skip_height = 1; /* so the value starts after skip_next[0] */ + inode = (void *)&item->skip_next[1]; inode->nlink = cpu_to_le32(2); inode->mode = cpu_to_le32(0755 | 0040000); inode->atime.sec = cpu_to_le64(tv.tv_sec); @@ -113,18 +148,19 @@ static int write_new_fs(char *path, int fd) ret = write_block(fd, blkno, &iblk->hdr); if (ret) goto out; + blkno = round_up(blkno, SCOUTFS_BLOCKS_PER_CHUNK); /* write the ring block whose manifest entry references the log block */ memset(buf, 0, SCOUTFS_BLOCK_SIZE); ring = buf; - ring->hdr = hdr; + ring->hdr = super->hdr; ring->nr_entries = cpu_to_le16(2); ent = (void *)(ring + 1); ent->type = SCOUTFS_RING_ADD_MANIFEST; ent->len = cpu_to_le16(sizeof(*mani)); mani = (void *)(ent + 1); - mani->blkno = cpu_to_le64(blkno); - mani->seq = hdr.seq; + mani->blkno 
= cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK); + mani->seq = super->hdr.seq; mani->level = 0; mani->first = root_key; mani->last = root_key; @@ -137,37 +173,26 @@ static int write_new_fs(char *path, int fd) bm->bits[0] = cpu_to_le64(~15ULL); bm->bits[1] = cpu_to_le64(~0ULL); - blkno += SCOUTFS_BLOCKS_PER_CHUNK; ret = write_block(fd, blkno, &ring->hdr); if (ret) goto out; + blkno += SCOUTFS_BLOCKS_PER_CHUNK; /* the ring has a single chunk for now */ memset(buf, 0, SCOUTFS_BLOCK_SIZE); map = buf; - map->hdr = hdr; + map->hdr = super->hdr; map->nr_chunks = cpu_to_le32(1); - map->blknos[0] = cpu_to_le64(blkno); + map->blknos[0] = cpu_to_le64(blkno - SCOUTFS_BLOCKS_PER_CHUNK); - blkno += SCOUTFS_BLOCKS_PER_CHUNK; ret = write_block(fd, blkno, &map->hdr); if (ret) goto out; - /* write the two super blocks */ - memset(buf, 0, SCOUTFS_BLOCK_SIZE); - super = buf; - super->hdr = hdr; - super->id = cpu_to_le64(SCOUTFS_SUPER_ID); - uuid_generate(super->uuid); - super->total_chunks = cpu_to_le64(total_chunks); + /* make sure the super references everything we just wrote */ super->ring_map_blkno = cpu_to_le64(blkno); - super->ring_map_seq = hdr.seq; - super->ring_first_block = cpu_to_le64(0); - super->ring_active_blocks = cpu_to_le64(1); - super->ring_total_blocks = cpu_to_le64(SCOUTFS_BLOCKS_PER_CHUNK); - super->ring_seq = hdr.seq; + /* write the two super blocks */ for (i = 0; i < SCOUTFS_SUPER_NR; i++) { super->hdr.seq = cpu_to_le64(i + 1); ret = write_block(fd, SCOUTFS_SUPER_BLKNO + i, &super->hdr); @@ -194,7 +219,10 @@ static int write_new_fs(char *path, int fd) ret = 0; out: - free(buf); + if (super) + free(super); + if (buf) + free(buf); return ret; } diff --git a/utils/src/print.c b/utils/src/print.c index fe64f18a..0a174fce 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -42,6 +42,57 @@ static void *read_block(int fd, u64 blkno) return buf; } +static void *read_chunk(int fd, u64 blkno) +{ + ssize_t ret; + void *buf; + + buf = malloc(SCOUTFS_CHUNK_SIZE); + if 
(!buf) + return NULL; + + ret = pread(fd, buf, SCOUTFS_CHUNK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT); + if (ret != SCOUTFS_CHUNK_SIZE) { + fprintf(stderr, "read blkno %llu returned %zd: %s (%d)\n", + blkno, ret, strerror(errno), errno); + free(buf); + buf = NULL; + } + + return buf; +} + +static void print_le32_list(int indent, __le32 *data, int nr) +{ + char *fmt; + int pos; + int len; + int i; + u32 d; + + printf("["); + + pos = indent; + for (i = 0; i < nr; i++) { + if (i + 1 < nr) + fmt = "%u, "; + else + fmt = "%u"; + + d = le32_to_cpu(data[i]); + len = snprintf(NULL, 0, fmt, d); + if (pos + len > 78) { + printf("\n%*c", indent, ' '); + pos = indent; + } + + printf(fmt, d); + pos += len; + } + + printf("]\n"); +} + static void print_block_header(struct scoutfs_block_header *hdr) { u32 crc = crc_block(hdr); @@ -87,49 +138,69 @@ static void print_inode(struct scoutfs_inode *inode) le32_to_cpu(inode->mtime.nsec)); } -static void print_item(struct scoutfs_item_header *ihdr) +static void print_item(struct scoutfs_item *item, void *val) { printf(" item:\n" " key: "SKF"\n" - " len: %u\n", - SKA(&ihdr->key), le16_to_cpu(ihdr->len)); + " offset: %u\n" + " len: %u\n" + " skip_height: %u\n" + " skip_next[]: ", + SKA(&item->key), + le32_to_cpu(item->offset), + le16_to_cpu(item->len), + item->skip_height); - switch(ihdr->key.type) { + print_le32_list(22, item->skip_next, item->skip_height); + + switch(item->key.type) { case SCOUTFS_INODE_KEY: - print_inode((void *)(ihdr + 1)); + print_inode(val); break; } } -static int print_item_block(int fd, u64 nr) +static int print_log_segment(int fd, u64 nr) { - struct scoutfs_item_header *ihdr; struct scoutfs_item_block *iblk; - size_t off; + struct scoutfs_bloom_block *blm; + struct scoutfs_item *item; + char *buf; + char *val; + __le32 next; int i; - iblk = read_block(fd, nr); - if (!iblk) + buf = read_chunk(fd, nr); + if (!buf) return -ENOMEM; + for (i = 0; i < SCOUTFS_BLOOM_BLOCKS; i++) { + + blm = (void *)(buf + (i << 
SCOUTFS_BLOCK_SHIFT)); + + printf("bloom block:\n"); + print_block_header(&blm->hdr); + } + + iblk = (void *)(buf + (SCOUTFS_BLOOM_BLOCKS << SCOUTFS_BLOCK_SHIFT)); + printf("item block:\n"); print_block_header(&iblk->hdr); printf(" first: "SKF"\n" " last: "SKF"\n" - " nr_items: %u\n", - SKA(&iblk->first), SKA(&iblk->last), - le32_to_cpu(iblk->nr_items)); + " skip_root.next[]: ", + SKA(&iblk->first), SKA(&iblk->last)); + print_le32_list(23, iblk->skip_root.next, SCOUTFS_SKIP_HEIGHT); - off = sizeof(struct scoutfs_item_block); - for (i = 0; i < le32_to_cpu(iblk->nr_items); i++) { - ihdr = (void *)((char *)iblk + off); - print_item(ihdr); - - off += sizeof(struct scoutfs_item_header) + - le16_to_cpu(ihdr->len); + next = iblk->skip_root.next[0]; + while (next) { + item = (void *)(buf + le32_to_cpu(next)); + val = (void *)(buf + le32_to_cpu(item->offset)); + print_item(item, val); + next = item->skip_next[0]; } - free(iblk); + free(buf); return 0; } @@ -143,7 +214,7 @@ static int print_log_segments(int fd, __le64 *log_segs, u64 total_chunks) while ((nr = find_first_le_bit(log_segs, total_chunks)) >= 0) { clear_le_bit(log_segs, nr); - err = print_item_block(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT); + err = print_log_segment(fd, nr << SCOUTFS_CHUNK_BLOCK_SHIFT); if (!ret && err) ret = err; } @@ -344,15 +415,17 @@ static int print_super_brick(int fd) print_block_header(&super->hdr); printf(" id: %llx\n" " uuid: %s\n" - " total_chunks: %llu\n" + " bloom_salts: ", + le64_to_cpu(super->id), + uuid_str); + print_le32_list(18, super->bloom_salts, SCOUTFS_BLOOM_SALTS); + printf(" total_chunks: %llu\n" " ring_map_blkno: %llu\n" " ring_map_seq: %llu\n" " ring_first_block: %llu\n" " ring_active_blocks: %llu\n" " ring_total_blocks: %llu\n" " ring_seq: %llu\n", - le64_to_cpu(super->id), - uuid_str, total_chunks, le64_to_cpu(super->ring_map_blkno), le64_to_cpu(super->ring_map_seq),