From 484b34057ad7a1984cbbacf48623d34b3e775d29 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 30 Dec 2016 17:45:02 -0800 Subject: [PATCH] Update mkfs and print for treap ring Update mkfs and print now that the manifest and allocator are stored in treaps in the ring. Signed-off-by: Zach Brown --- utils/src/crc.c | 9 +++ utils/src/crc.h | 1 + utils/src/format.h | 82 +++++++++++++------- utils/src/mkfs.c | 134 ++++++++++++++++---------------- utils/src/print.c | 187 ++++++++++++++++++++++++++------------------- utils/src/util.h | 1 + 6 files changed, 243 insertions(+), 171 deletions(-) diff --git a/utils/src/crc.c b/utils/src/crc.c index 38640fbc..2932cb88 100644 --- a/utils/src/crc.c +++ b/utils/src/crc.c @@ -37,3 +37,12 @@ u32 crc_block(struct scoutfs_block_header *hdr) return crc32c(~0, (char *)hdr + sizeof(hdr->crc), SCOUTFS_BLOCK_SIZE - sizeof(hdr->crc)); } + +__le32 crc_node(struct scoutfs_treap_node *node) +{ + unsigned int skip = sizeof(node->crc); + unsigned int bytes = offsetof(struct scoutfs_treap_node, + data[le16_to_cpu(node->bytes)]); + + return cpu_to_le32(crc32c(~0, (char *)node + skip, bytes - skip)); +} diff --git a/utils/src/crc.h b/utils/src/crc.h index 6878bf2f..b584315d 100644 --- a/utils/src/crc.h +++ b/utils/src/crc.h @@ -8,5 +8,6 @@ u32 crc32c(u32 crc, const void *data, unsigned int len); u64 crc32c_64(u32 crc, const void *data, unsigned int len); u32 crc_block(struct scoutfs_block_header *hdr); +__le32 crc_node(struct scoutfs_treap_node *node); #endif diff --git a/utils/src/format.h b/utils/src/format.h index 71027e90..c611ea32 100644 --- a/utils/src/format.h +++ b/utils/src/format.h @@ -51,22 +51,63 @@ struct scoutfs_block_header { __le64 blkno; } __packed; -struct scoutfs_ring_entry_header { - __u8 type; - __le16 len; +struct scoutfs_treap_ref { + __le64 off; + __le64 gen; + __u8 aug_bits; } __packed; -#define SCOUTFS_RING_ADD_MANIFEST 1 -#define SCOUTFS_RING_ADD_ALLOC 2 +/* + * The lesser and greater bits are persistent on disk so that we can migrate + * nodes from the older half of the ring. + * + * The dirty bit is only used for in-memory nodes. + */ +#define SCOUTFS_TREAP_AUG_LESSER (1 << 0) +#define SCOUTFS_TREAP_AUG_GREATER (1 << 1) +#define SCOUTFS_TREAP_AUG_HALVES (SCOUTFS_TREAP_AUG_LESSER | \ + SCOUTFS_TREAP_AUG_GREATER) +#define SCOUTFS_TREAP_AUG_DIRTY (1 << 2) -struct scoutfs_ring_add_manifest { - struct scoutfs_ring_entry_header eh; +/* + * Treap nodes are stored at byte offset in the ring of blocks described + * by the super block. Each reference contains the off and gen that it + * will find in the node for verification. Each node has the header + * and data payload covered by a crc. + */ +struct scoutfs_treap_node { + __le32 crc; + __le64 off; + __le64 gen; + __le64 prio; + struct scoutfs_treap_ref left; + struct scoutfs_treap_ref right; + __le16 bytes; + u8 data[0]; +} __packed; + +struct scoutfs_treap_root { + struct scoutfs_treap_ref ref; +} __packed; + +/* + * This is absurdly huge. If there was only ever 1 item per segment and + * 2^64 items the tree could get this deep. 
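+ * (Rough arithmetic, assuming ~10x growth in capacity per level: 20
+ * levels hold on the order of 10^20 entries, which exceeds
+ * 2^64 ~= 1.8 * 10^19, so 20 is a comfortable upper bound.)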
+ */ +#define SCOUTFS_MANIFEST_MAX_LEVEL 20 + +struct scoutfs_manifest { + struct scoutfs_treap_root root; + __le64 level_counts[SCOUTFS_MANIFEST_MAX_LEVEL]; +} __packed; + +struct scoutfs_manifest_entry { __le64 segno; __le64 seq; __le16 first_key_len; __le16 last_key_len; __u8 level; - /* first and last key bytes */ + __u8 keys[0]; } __packed; #define SCOUTFS_ALLOC_REGION_SHIFT 8 @@ -77,27 +118,11 @@ struct scoutfs_ring_add_manifest { * The bits need to be aligned so that the host can use native long * bitops on the bits in memory. */ -struct scoutfs_ring_alloc_region { - struct scoutfs_ring_entry_header eh; +struct scoutfs_alloc_region { __le64 index; - __u8 pad[5]; __le64 bits[SCOUTFS_ALLOC_REGION_BITS / 64]; } __packed; -/* - * This is absurdly huge. If there was only ever 1 item per segment and - * 2^64 items the tree could get this deep. - */ -#define SCOUTFS_MANIFEST_MAX_LEVEL 20 - -/* - * The packed entries in the block are terminated by a header with a 0 length. - */ -struct scoutfs_ring_block { - struct scoutfs_block_header hdr; - struct scoutfs_ring_entry_header entries[0]; -} __packed; - /* * We really want these to be a power of two size so that they're naturally * aligned. This ensures that they won't cross page boundaries and we @@ -315,12 +340,13 @@ struct scoutfs_super_block { __le64 free_blocks; __le64 ring_blkno; __le64 ring_blocks; - __le64 ring_index; - __le64 ring_nr; - __le64 ring_seq; + __le64 ring_tail_block; + __le64 ring_gen; __le64 buddy_blocks; struct scoutfs_buddy_root buddy_root; struct scoutfs_btree_root btree_root; + struct scoutfs_treap_root alloc_treap_root; + struct scoutfs_manifest manifest; } __packed; #define SCOUTFS_ROOT_INO 1 diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c index 1efc8a05..0a10d5d3 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -21,20 +21,11 @@ #include "buddy.h" #include "item.h" -/* - * Update the block's header and write it out. - */ -static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super, - struct scoutfs_block_header *hdr) +static int write_raw_block(int fd, u64 blkno, void *blk) { ssize_t ret; - if (super) - *hdr = super->hdr; - hdr->blkno = cpu_to_le64(blkno); - hdr->crc = cpu_to_le32(crc_block(hdr)); - - ret = pwrite(fd, hdr, SCOUTFS_BLOCK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT); + ret = pwrite(fd, blk, SCOUTFS_BLOCK_SIZE, blkno << SCOUTFS_BLOCK_SHIFT); if (ret != SCOUTFS_BLOCK_SIZE) { fprintf(stderr, "write to blkno %llu returned %zd: %s (%d)\n", blkno, ret, strerror(errno), errno); @@ -45,41 +36,55 @@ static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super, } /* - * Figure out how many blocks the ring will need. This goes crazy - * with the variables to make the calculation clear. - * - * XXX just a place holder. The real calculation is more like: - * - * - max size add manifest entries for all segments - * - (some day) allocator entries for all segments - * - ring block header overhead - * - ring block unused tail space overhead - * + * Update the block's header and write it out. 
*/ -static u64 calc_ring_blocks(u64 size) +static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super, + struct scoutfs_block_header *hdr) { - u64 first_seg_blocks; - u64 max_entry_bytes; - u64 total_bytes; - u64 blocks; - u64 segs; + if (super) + *hdr = super->hdr; + hdr->blkno = cpu_to_le64(blkno); + hdr->crc = cpu_to_le32(crc_block(hdr)); - segs = size >> SCOUTFS_SEGMENT_SHIFT; - max_entry_bytes = sizeof(struct scoutfs_ring_add_manifest) + - (2 * SCOUTFS_MAX_KEY_SIZE); - total_bytes = (segs * max_entry_bytes) * 4; - blocks = DIV_ROUND_UP(total_bytes, SCOUTFS_BLOCK_SIZE); + return write_raw_block(fd, blkno, hdr); +} - first_seg_blocks = SCOUTFS_SEGMENT_BLOCKS - - (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR); +/* + * Figure out how many blocks the ring will need. The ring has to hold: + * + * - manifest entries for every segment with largest keys + * - allocator regions for bits to reference every segment + * - empty space at the end of blocks so nodes don't cross blocks + * - double that to account for repeatedly duplicating entries + * - double that so we can migrate everything before wrapping + */ +static u64 calc_ring_blocks(u64 segs) +{ + u64 alloc_blocks; + u64 ment_blocks; + u64 block_bytes; + u64 node_bytes; + u64 regions; - return max(first_seg_blocks, blocks); + node_bytes = sizeof(struct scoutfs_treap_node) + + sizeof(struct scoutfs_manifest_entry) + + (2 * SCOUTFS_MAX_KEY_SIZE); + block_bytes = SCOUTFS_BLOCK_SIZE - (node_bytes - 1); + ment_blocks = DIV_ROUND_UP(segs * node_bytes, block_bytes); + + node_bytes = sizeof(struct scoutfs_treap_node) + + sizeof(struct scoutfs_alloc_region); + regions = DIV_ROUND_UP(segs, SCOUTFS_ALLOC_REGION_BITS); + block_bytes = SCOUTFS_BLOCK_SIZE - (node_bytes - 1); + alloc_blocks = DIV_ROUND_UP(regions * node_bytes, block_bytes); + + return ALIGN((ment_blocks + alloc_blocks) * 4, SCOUTFS_SEGMENT_BLOCKS); } /* * Make a new file system by writing: * - super blocks - * - ring block with manifest entry + * - ring block with manifest node * - segment with root inode */ static int write_new_fs(char *path, int fd) @@ -88,13 +93,12 @@ static int write_new_fs(char *path, int fd) struct scoutfs_inode_key *ikey; struct scoutfs_inode *inode; struct scoutfs_segment_block *sblk; - struct scoutfs_ring_block *ring; - struct scoutfs_ring_add_manifest *am; - struct scoutfs_ring_alloc_region *reg; + struct scoutfs_manifest_entry *ment; + struct scoutfs_treap_node *node; struct native_item item; struct timeval tv; char uuid_str[37]; - unsigned int i; + void *ring; u64 limit; u64 size; u64 total_blocks; @@ -103,6 +107,7 @@ static int write_new_fs(char *path, int fd) u64 first_segno; __u8 *type; int ret; + u64 i; gettimeofday(&tv, NULL); @@ -133,7 +138,7 @@ static int write_new_fs(char *path, int fd) total_blocks = size / SCOUTFS_BLOCK_SIZE; total_segs = size / SCOUTFS_SEGMENT_SIZE; - ring_blocks = calc_ring_blocks(size); + ring_blocks = calc_ring_blocks(total_segs); /* first initialize the super so we can use it to build structures */ memset(super, 0, SCOUTFS_BLOCK_SIZE); @@ -144,16 +149,18 @@ static int write_new_fs(char *path, int fd) super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1); super->total_blocks = cpu_to_le64(total_blocks); super->total_segs = cpu_to_le64(total_segs); - super->alloc_uninit = cpu_to_le64(SCOUTFS_ALLOC_REGION_BITS); super->ring_blkno = cpu_to_le64(SCOUTFS_SUPER_BLKNO + 2); super->ring_blocks = cpu_to_le64(ring_blocks); - super->ring_nr = cpu_to_le64(1); - super->ring_seq = cpu_to_le64(1); + super->ring_tail_block = 
cpu_to_le64(1); + super->ring_gen = cpu_to_le64(1); first_segno = DIV_ROUND_UP(le64_to_cpu(super->ring_blkno) + le64_to_cpu(super->ring_blocks), SCOUTFS_SEGMENT_BLOCKS); + /* alloc from uninit, don't need regions yet */ + super->alloc_uninit = cpu_to_le64(first_segno + 1); + /* write seg with root inode */ sblk->segno = cpu_to_le64(first_segno); sblk->max_seq = cpu_to_le64(1); @@ -189,30 +196,29 @@ static int write_new_fs(char *path, int fd) } /* a single manifest entry points to the single segment */ - am = (void *)ring->entries; - am->eh.type = SCOUTFS_RING_ADD_MANIFEST; - am->eh.len = cpu_to_le16(sizeof(struct scoutfs_ring_add_manifest) + 1); - am->segno = sblk->segno; - am->seq = cpu_to_le64(1); - am->first_key_len = 0; - am->last_key_len = cpu_to_le16(1); - am->level = 1; - type = (void *)(am + 1); + node = ring; + node->off = cpu_to_le64((char *)node - (char *)ring); + node->gen = cpu_to_le64(1); + node->bytes = cpu_to_le16(sizeof(struct scoutfs_manifest_entry) + 1); + pseudo_random_bytes(&node->prio, sizeof(node->prio)); + + ment = (void *)node->data; + ment->segno = sblk->segno; + ment->seq = cpu_to_le64(1); + ment->first_key_len = 0; + ment->last_key_len = cpu_to_le16(1); + ment->level = 1; + type = (void *)ment->keys; *type = SCOUTFS_MAX_UNUSED_KEY; - /* a single alloc region records the first two segs as allocated */ - reg = (void *)am + le16_to_cpu(am->eh.len); - reg->eh.type = SCOUTFS_RING_ADD_ALLOC; - reg->eh.len = cpu_to_le16(sizeof(struct scoutfs_ring_alloc_region)); - /* initial super, ring, and first seg are all allocated */ - memset(reg->bits, 0xff, sizeof(reg->bits)); - for (i = 0; i <= first_segno; i++) - clear_bit_le(i, reg->bits); + node->crc = crc_node(node); - /* block is already zeroed and so contains a 0 len terminating header */ + super->manifest.root.ref.off = node->off; + super->manifest.root.ref.gen = node->gen; + super->manifest.root.ref.aug_bits = SCOUTFS_TREAP_AUG_LESSER; + super->manifest.level_counts[1] = cpu_to_le64(1); - ret = write_block(fd, le64_to_cpu(super->ring_blkno), super, - &ring->hdr); + ret = write_raw_block(fd, le64_to_cpu(super->ring_blkno), ring); if (ret) goto out; diff --git a/utils/src/print.c b/utils/src/print.c index bff7c592..14de06d7 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -262,87 +262,90 @@ static int print_segments(int fd, unsigned long *seg_map, u64 total_segs) return ret; } -static int print_ring_block(int fd, unsigned long *seg_map, u64 blkno) +enum { + TREAP_MANIFEST, + TREAP_ALLOC, +}; + +static void print_treap_ref(struct scoutfs_treap_ref *ref) { - struct scoutfs_ring_alloc_region *reg; - struct scoutfs_ring_entry_header *eh; - struct scoutfs_ring_add_manifest *am; - struct scoutfs_ring_block *ring; - u32 off; - int i; - - ring = read_block(fd, blkno); - if (!ring) - return -ENOMEM; - - printf("ring blkno %llu\n", blkno); - print_block_header(&ring->hdr); - - eh = ring->entries; - while (eh->len) { - off = (char *)eh - (char *)ring; - printf(" [%u]: type %u len %u\n", - off, eh->type, le16_to_cpu(eh->len)); - - switch(eh->type) { - - case SCOUTFS_RING_ADD_MANIFEST: - am = (void *)eh; - printf(" add ment: segno %llu seq %llu " - "first_len %u last_len %u level %u\n", - le64_to_cpu(am->segno), - le64_to_cpu(am->seq), - le16_to_cpu(am->first_key_len), - le16_to_cpu(am->last_key_len), - am->level); - - /* XXX verify, 'int nr' limits segno precision */ - set_bit_le(le64_to_cpu(am->segno), seg_map); - break; - - case SCOUTFS_RING_ADD_ALLOC: - reg = (void *)eh; - printf(" add alloc: index %llu bits", - 
le64_to_cpu(reg->index)); - for (i = 0; i < array_size(reg->bits); i++) - printf(" %016llx", le64_to_cpu(reg->bits[i])); - printf("\n"); - break; - } - - eh = (void *)eh + le16_to_cpu(eh->len); - } - - free(ring); - - return 0; + printf(" off %llu gen %llu aug_bits %x", + le64_to_cpu(ref->off), le64_to_cpu(ref->gen), + ref->aug_bits); } -static int print_ring_blocks(int fd, struct scoutfs_super_block *super, - unsigned long *seg_map) +static int print_treap_node(int fd, struct scoutfs_super_block *super, + unsigned treap, struct scoutfs_treap_ref *ref, + unsigned long *seg_map) { - int ret = 0; + struct scoutfs_manifest_entry *ment; + struct scoutfs_alloc_region *reg; + struct scoutfs_treap_node *tnode; + char valid_str[40]; + __le32 crc; u64 blkno; - u64 index; - u64 nr; - int err; + void *blk; + u64 off; + int i; - index = le64_to_cpu(super->ring_index); - nr = le64_to_cpu(super->ring_nr); + if (!ref->gen) + return 0; - while (nr) { - blkno = le64_to_cpu(super->ring_blkno) + index; + off = le64_to_cpu(ref->off); + blkno = le64_to_cpu(super->ring_blkno) + (off >> SCOUTFS_BLOCK_SHIFT); - err = print_ring_block(fd, seg_map, blkno); - if (err && !ret) - ret = err; + blk = read_block(fd, blkno); + if (!blk) + return -ENOMEM; - if (++index == le64_to_cpu(super->ring_blocks)) - index = 0; - nr--; - }; + tnode = blk + (off & SCOUTFS_BLOCK_MASK); - return ret; + crc = crc_node(tnode); + if (crc != tnode->crc) + sprintf(valid_str, "(!= %08x) ", le32_to_cpu(crc)); + else + valid_str[0] = '\0'; + + printf(" node: crc %08x %soff %llu gen %llu bytes %u prio %016llx\n" + " l:", + le32_to_cpu(tnode->crc), valid_str, le64_to_cpu(tnode->off), + le64_to_cpu(tnode->gen), le16_to_cpu(tnode->bytes), + le64_to_cpu(tnode->prio)); + print_treap_ref(&tnode->left); + printf(" r:"); + print_treap_ref(&tnode->right); + printf("\n"); + + switch(treap) { + case TREAP_MANIFEST: + ment = (void *)tnode->data; + printf(" ment: segno %llu seq %llu " + "first_len %u last_len %u level %u\n", + le64_to_cpu(ment->segno), + le64_to_cpu(ment->seq), + le16_to_cpu(ment->first_key_len), + le16_to_cpu(ment->last_key_len), + ment->level); + /* XXX verify, 'int nr' limits segno precision */ + set_bit_le(le64_to_cpu(ment->segno), seg_map); + break; + + case TREAP_ALLOC: + reg = (void *)tnode->data; + printf(" reg: index %llu bits", + le64_to_cpu(reg->index)); + for (i = 0; i < array_size(reg->bits); i++) + printf(" %016llx", le64_to_cpu(reg->bits[i])); + printf("\n"); + break; + } + + print_treap_node(fd, super, treap, &tnode->left, seg_map); + print_treap_node(fd, super, treap, &tnode->right, seg_map); + + free(blk); + + return 0; } static int print_super_blocks(int fd) @@ -351,10 +354,13 @@ static int print_super_blocks(int fd) struct scoutfs_super_block recent = { .hdr.seq = 0 }; unsigned long *seg_map; char uuid_str[37]; + __le64 *counts; u64 total_segs; u64 longs; int ret = 0; + int err; int i; + int j; for (i = 0; i < SCOUTFS_SUPER_NR; i++) { super = read_block(fd, SCOUTFS_SUPER_BLKNO + i); @@ -369,19 +375,31 @@ static int print_super_blocks(int fd) le64_to_cpu(super->id), uuid_str); /* XXX these are all in a crazy order */ printf(" next_ino %llu total_blocks %llu free_blocks %llu\n" - " ring_blkno %llu ring_blocks %llu ring_index %llu\n" - " ring_nr %llu ring_seq %llu alloc_uninit %llu\n" - " total_segs %llu\n", + " ring_blkno %llu ring_blocks %llu ring_tail_block %llu\n" + " ring_gen %llu alloc_uninit %llu total_segs %llu\n", le64_to_cpu(super->next_ino), le64_to_cpu(super->total_blocks), le64_to_cpu(super->free_blocks), 
le64_to_cpu(super->ring_blkno), le64_to_cpu(super->ring_blocks), - le64_to_cpu(super->ring_index), - le64_to_cpu(super->ring_nr), - le64_to_cpu(super->ring_seq), + le64_to_cpu(super->ring_tail_block), + le64_to_cpu(super->ring_gen), le64_to_cpu(super->alloc_uninit), le64_to_cpu(super->total_segs)); + printf(" alloc root:"); + print_treap_ref(&super->alloc_treap_root.ref); + printf("\n"); + printf(" manifest root:"); + print_treap_ref(&super->manifest.root.ref); + printf("\n"); + + printf(" level_counts:"); + counts = super->manifest.level_counts; + for (j = 0; j < SCOUTFS_MANIFEST_MAX_LEVEL; j++) { + if (le64_to_cpu(counts[j])) + printf(" %u: %llu", j, le64_to_cpu(counts[j])); + } + printf("\n"); if (le64_to_cpu(super->hdr.seq) > le64_to_cpu(recent.hdr.seq)) memcpy(&recent, super, sizeof(recent)); @@ -398,8 +416,19 @@ static int print_super_blocks(int fd) if (!seg_map) return -ENOMEM; - ret = print_ring_blocks(fd, super, seg_map) ?: - print_segments(fd, seg_map, total_segs); + printf("manifest treap:\n"); + ret = print_treap_node(fd, super, TREAP_MANIFEST, + &super->manifest.root.ref, seg_map); + + printf("alloc treap:\n"); + err = print_treap_node(fd, super, TREAP_ALLOC, + &super->alloc_treap_root.ref, NULL); + if (err && !ret) + ret = err; + + err = print_segments(fd, seg_map, total_segs); + if (err && !ret) + ret = err; free(seg_map); diff --git a/utils/src/util.h b/utils/src/util.h index daa28a98..ee6516e7 100644 --- a/utils/src/util.h +++ b/utils/src/util.h @@ -51,6 +51,7 @@ do { \ }) #define DIV_ROUND_UP(x, y) (((x) + (y) - 1) / (y)) +#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) #ifndef offsetof #define offsetof(type, memb) ((unsigned long)&((type *)0)->memb)
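
For readers following the new layout, here is a minimal sketch of how a treap
reference resolves to and verifies its node, using the same offset arithmetic
as print_treap_node() above. The check_treap_ref() helper name and the
explicit off/gen comparison are illustrative assumptions, not part of the
patch:

/*
 * Sketch: resolve a treap ref to its node in the ring and verify it.
 * Relies on read_block()/crc_node() from the patch; the off/gen check
 * follows the format.h comment that refs carry both for verification.
 */
static int check_treap_ref(int fd, struct scoutfs_super_block *super,
			   struct scoutfs_treap_ref *ref)
{
	struct scoutfs_treap_node *node;
	u64 off = le64_to_cpu(ref->off);
	u64 blkno;
	void *blk;
	int ret = 0;

	if (!ref->gen)
		return 0;	/* null ref, nothing to check */

	/* node offsets are byte offsets from the start of the ring */
	blkno = le64_to_cpu(super->ring_blkno) + (off >> SCOUTFS_BLOCK_SHIFT);
	blk = read_block(fd, blkno);
	if (!blk)
		return -ENOMEM;

	node = blk + (off & SCOUTFS_BLOCK_MASK);

	/* a stale or corrupt node won't match the ref or its own crc */
	if (node->off != ref->off || node->gen != ref->gen ||
	    node->crc != crc_node(node))
		ret = -EIO;

	free(blk);
	return ret;
}

Carrying the expected off and gen in each reference lets a reader notice when
a node has since been overwritten by the wrapping ring before trusting its
payload.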