#ifndef _SCOUTFS_FORMAT_H_
#define _SCOUTFS_FORMAT_H_

/* statfs(2) f_type */
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */
/* super block id */
#define SCOUTFS_SUPER_ID 0x2e736674756f6373ULL /* "scoutfs." */

/*
 * The super block and ring blocks are a fixed 4k.
 */
#define SCOUTFS_BLOCK_SHIFT 12
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
#define SCOUTFS_BLOCKS_PER_PAGE (PAGE_SIZE / SCOUTFS_BLOCK_SIZE)

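/*
 * Not part of the format: a sketch of the byte/block conversions that
 * fall out of the macros above, assuming the kernel's u64/u32 types.
 * The helper names are illustrative.
 */
static inline u64 scoutfs_blkno_of(u64 byte_off)
{
	return byte_off >> SCOUTFS_BLOCK_SHIFT;
}

static inline u32 scoutfs_blk_off_of(u64 byte_off)
{
	return byte_off & SCOUTFS_BLOCK_MASK;
}
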
/*
 * FS data is stored in segments.  For now they're a fixed size;
 * eventually they'll be dynamic.
 */
#define SCOUTFS_SEGMENT_SHIFT 20
#define SCOUTFS_SEGMENT_SIZE (1 << SCOUTFS_SEGMENT_SHIFT)
#define SCOUTFS_SEGMENT_MASK (SCOUTFS_SEGMENT_SIZE - 1)
#define SCOUTFS_SEGMENT_PAGES (SCOUTFS_SEGMENT_SIZE / PAGE_SIZE)
#define SCOUTFS_SEGMENT_BLOCKS (SCOUTFS_SEGMENT_SIZE / SCOUTFS_BLOCK_SIZE)

#define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE)
#define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT)

/*
 * The super blocks leave some room at the start of the first block for
 * platform structures like boot loaders.
 */
#define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_SUPER_NR 2
#define SCOUTFS_BUDDY_BLKNO (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR)

#define SCOUTFS_MAX_TRANS_BLOCKS (128 * 1024 * 1024 / SCOUTFS_BLOCK_SIZE)

/*
 * This header is found at the start of every block so that we can
 * verify that it's what we were looking for.  The crc and padding
 * start the block so that the crc calculation operates on a nice
 * 64bit aligned region.
 */
struct scoutfs_block_header {
	__le32 crc;
	__le32 _pad;
	__le64 fsid;
	__le64 seq;
	__le64 blkno;
} __packed;

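/*
 * Not part of the format: a sketch of verifying a block against its
 * header.  It assumes the crc covers the block after the crc and
 * padding (the 64bit aligned region mentioned above), a crc32c() like
 * the kernel's <linux/crc32c.h>, and a guessed ~0 seed.
 */
static inline bool scoutfs_block_hdr_valid(struct scoutfs_block_header *hdr,
					   u64 fsid, u64 blkno)
{
	u32 crc = crc32c(~0, (char *)&hdr->fsid, SCOUTFS_BLOCK_SIZE -
			 offsetof(struct scoutfs_block_header, fsid));

	return le32_to_cpu(hdr->crc) == crc &&
	       le64_to_cpu(hdr->fsid) == fsid &&
	       le64_to_cpu(hdr->blkno) == blkno;
}
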
struct scoutfs_ring_entry_header {
	__u8 type;
	__le16 len;
} __packed;

#define SCOUTFS_RING_ADD_MANIFEST 1
#define SCOUTFS_RING_ADD_ALLOC 2

struct scoutfs_ring_add_manifest {
	struct scoutfs_ring_entry_header eh;
	__le64 segno;
	__le64 seq;
	__le16 first_key_len;
	__le16 last_key_len;
	__u8 level;
	/* first and last key bytes */
} __packed;

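/*
 * Not part of the format: the first and last keys are stored as raw
 * bytes immediately after the fixed fields, so finding them is just
 * pointer math.  Illustrative helper names.
 */
static inline void *scoutfs_ament_first_key(struct scoutfs_ring_add_manifest *am)
{
	return am + 1;
}

static inline void *scoutfs_ament_last_key(struct scoutfs_ring_add_manifest *am)
{
	return (char *)(am + 1) + le16_to_cpu(am->first_key_len);
}
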
#define SCOUTFS_ALLOC_REGION_SHIFT 8
#define SCOUTFS_ALLOC_REGION_BITS (1 << SCOUTFS_ALLOC_REGION_SHIFT)
#define SCOUTFS_ALLOC_REGION_MASK (SCOUTFS_ALLOC_REGION_BITS - 1)

/*
 * The bits need to be aligned so that the host can use native long
 * bitops on the bits in memory.
 */
struct scoutfs_ring_alloc_region {
	struct scoutfs_ring_entry_header eh;
	__le64 index;
	__u8 pad[5];
	__le64 bits[SCOUTFS_ALLOC_REGION_BITS / 64];
} __packed;

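/*
 * Not part of the format: assuming each region's bitmap covers a run
 * of SCOUTFS_ALLOC_REGION_BITS block numbers starting at index times
 * the region size, the first block a region describes would be:
 */
static inline u64 scoutfs_region_first_blkno(struct scoutfs_ring_alloc_region *reg)
{
	return le64_to_cpu(reg->index) << SCOUTFS_ALLOC_REGION_SHIFT;
}
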
/*
 * This is absurdly huge.  If there were only ever one item per segment
 * and 2^64 items then the tree could get this deep.
 */
#define SCOUTFS_MANIFEST_MAX_LEVEL 20

/*
 * The packed entries in the block are terminated by a header with a
 * zero length.
 */
struct scoutfs_ring_block {
	struct scoutfs_block_header hdr;
	struct scoutfs_ring_entry_header entries[0];
} __packed;

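/*
 * Not part of the format: a sketch of walking the packed entries up to
 * the zero length terminator, assuming each entry's len covers its
 * header and payload.
 */
static inline void scoutfs_ring_walk(struct scoutfs_ring_block *ring)
{
	struct scoutfs_ring_entry_header *eh = ring->entries;

	while (le16_to_cpu(eh->len)) {
		/* eh->type is SCOUTFS_RING_ADD_MANIFEST or _ALLOC */
		eh = (void *)((char *)eh + le16_to_cpu(eh->len));
	}
}
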
/*
 * We really want these to be a power of two size so that they're
 * naturally aligned.  This ensures that they won't cross page
 * boundaries and we can use pointers to them in the page vecs that
 * make up segments without funny business.
 *
 * We limit segment sizes to 8 megs (23 bits) and value lengths to 512
 * bytes (9 bits).  The item offsets and lengths then take up 64 bits.
 *
 * We then operate on the items in nice native on-stack structs.
 */
struct scoutfs_segment_item {
	__le64 seq;
	__le32 key_off_len;
	__le32 val_off_len;
} __packed;

#define SCOUTFS_SEGMENT_ITEM_OFF_SHIFT 9
#define SCOUTFS_SEGMENT_ITEM_LEN_MASK ((1 << SCOUTFS_SEGMENT_ITEM_OFF_SHIFT)-1)

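/*
 * Not part of the format: helpers showing how an off_len field packs a
 * 23 bit byte offset above the 9 bit length, filling the 32 bit field.
 */
static inline u32 scoutfs_off_len(u32 off, u32 len)
{
	return (off << SCOUTFS_SEGMENT_ITEM_OFF_SHIFT) | len;
}

static inline u32 scoutfs_off_len_off(u32 off_len)
{
	return off_len >> SCOUTFS_SEGMENT_ITEM_OFF_SHIFT;
}

static inline u32 scoutfs_off_len_len(u32 off_len)
{
	return off_len & SCOUTFS_SEGMENT_ITEM_LEN_MASK;
}
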
/*
 * Each large segment starts with a segment block that describes the
 * rest of the blocks that make up the segment.
 */
struct scoutfs_segment_block {
	__le32 crc;
	__le32 _padding;
	__le64 segno;
	__le64 max_seq;
	__le32 nr_items;
	__le32 _moar_pads;
	struct scoutfs_segment_item items[0];
	/* packed keys */
	/* packed vals */
} __packed;

/*
 * Block references include the sequence number so that we can detect
 * readers racing with writers and so that we can tell that we don't
 * need to follow a reference when traversing based on seqs.
 */
struct scoutfs_block_ref {
	__le64 blkno;
	__le64 seq;
} __packed;

/*
 * If the block was full of bits the largest possible order would be
 * the block size shift + 3 (BITS_PER_BYTE).  But the header uses up
 * some space and the buddy bits cost two bits per block, so we lose an
 * order.  Then +1 because this is the number of orders, not the
 * greatest order.  For 4k blocks that's 12 + 3 - 2 + 1 = 14 orders.
 */
#define SCOUTFS_BUDDY_ORDERS (SCOUTFS_BLOCK_SHIFT + 3 - 2 + 1)

struct scoutfs_buddy_block {
	struct scoutfs_block_header hdr;
	__le16 first_set[SCOUTFS_BUDDY_ORDERS];
	__u8 level;
	__u8 __pad[3]; /* naturally align bits */
	union {
		struct scoutfs_buddy_slot {
			__le64 seq;
			__le16 free_orders;
			/* XXX seems like we could hide a bit somewhere */
			__u8 blkno_off;
		} __packed slots[0];
		__le64 bits[0];
	} __packed;
} __packed;

/*
 * Each buddy leaf block references order 0 blocks with half of its
 * bitmap.  The other half of the bits are used for the higher orders.
 */
#define SCOUTFS_BUDDY_ORDER0_BITS \
	(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) * 8) / 2)

#define SCOUTFS_BUDDY_SLOTS \
	((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) / \
	 sizeof(struct scoutfs_buddy_slot))

struct scoutfs_buddy_root {
	struct scoutfs_buddy_slot slot;
	__u8 height;
} __packed;

/* ((SCOUTFS_BUDDY_SLOTS^5) * SCOUTFS_BUDDY_ORDER0_BITS) > 2^52 */
#define SCOUTFS_BUDDY_MAX_HEIGHT 6

/*
 * We should be able to make the offset smaller if neither dirents nor
 * data items use the full 64 bits.
 */
struct scoutfs_key {
	__le64 inode;
	__u8 type;
	__le64 offset;
} __packed;

/*
 * Currently we sort keys by the numeric value of the types, but that
 * isn't necessary.  We could have an arbitrary sort order, so we don't
 * have to stress about cleverly allocating the types.
 */
#define SCOUTFS_INODE_KEY 1
#define SCOUTFS_XATTR_KEY 2
#define SCOUTFS_XATTR_NAME_HASH_KEY 3
#define SCOUTFS_XATTR_VAL_HASH_KEY 4
#define SCOUTFS_DIRENT_KEY 5
#define SCOUTFS_READDIR_KEY 6
#define SCOUTFS_LINK_BACKREF_KEY 7
#define SCOUTFS_SYMLINK_KEY 8
#define SCOUTFS_EXTENT_KEY 9
#define SCOUTFS_ORPHAN_KEY 10
#define SCOUTFS_MAX_UNUSED_KEY 255

#define SCOUTFS_MAX_ITEM_LEN 512

/* value is struct scoutfs_inode */
struct scoutfs_inode_key {
	__u8 type;
	__be64 ino;
} __packed;

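/*
 * Not part of the format: key structs use big endian fields,
 * presumably so that a plain memcmp() sorts them correctly.  A sketch
 * of filling one in.
 */
static inline void scoutfs_init_inode_key(struct scoutfs_inode_key *ikey,
					  u64 ino)
{
	ikey->type = SCOUTFS_INODE_KEY;
	ikey->ino = cpu_to_be64(ino);
}
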
/* value is struct scoutfs_dirent without the name */
struct scoutfs_dirent_key {
	__u8 type;
	__be64 ino;
	__u8 name[0];
} __packed;

/* value is struct scoutfs_dirent with the name */
struct scoutfs_readdir_key {
	__u8 type;
	__be64 ino;
	__be64 pos;
} __packed;

struct scoutfs_btree_root {
	__u8 height;
	struct scoutfs_block_ref ref;
} __packed;

/*
 * @free_end: records the byte offset of the first byte after the free
 * space in the block between the header and the first item.  New items
 * are allocated by subtracting the space they need.
 *
 * @free_reclaim: records the number of bytes of free space amongst the
 * items after free_end.  If a block is compacted then this much new
 * free space would be reclaimed.
 */
struct scoutfs_btree_block {
	struct scoutfs_block_header hdr;
	__le16 free_end;
	__le16 free_reclaim;
	__le16 nr_items;
	__le16 item_offs[0];
} __packed;

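/*
 * Not part of the format: a sketch of the free_end allocation
 * described above.  Item space is carved off the end of the free
 * region by subtracting from free_end; no bounds checking or
 * free_reclaim accounting here.
 */
static inline u16 scoutfs_btree_take_space(struct scoutfs_btree_block *bt,
					   u16 bytes)
{
	u16 off = le16_to_cpu(bt->free_end) - bytes;

	bt->free_end = cpu_to_le16(off);
	return off;
}
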
/*
 * The item sequence number is set to the dirty block's sequence number
 * when the item is modified.  It is not changed by splits or merges.
 */
struct scoutfs_btree_item {
	struct scoutfs_key key;
	__le64 seq;
	__le16 val_len;
	char val[0];
} __packed;

/* Blocks are no more than half free. */
#define SCOUTFS_BTREE_FREE_LIMIT \
	((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 2)

/* XXX does this exist upstream somewhere? */
#define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))

#define SCOUTFS_BTREE_MAX_ITEMS \
	((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / \
	 (member_sizeof(struct scoutfs_btree_block, item_offs[0]) + \
	  sizeof(struct scoutfs_btree_item)))

/*
 * We can calculate the max tree depth by calculating how many leaf
 * blocks the tree could reference.  The block device can only
 * reference 2^64 bytes.  The tallest parent tree has half full parent
 * blocks.
 *
 * So we have the relation:
 *
 *   ceil(max_items / 2) ^ (max_depth - 1) >= 2^64 / block_size
 *
 * and solve for depth:
 *
 *   max_depth = ceil(log_{ceil(max_items / 2)}(2^64 / block_size)) + 1
 */
#define SCOUTFS_BTREE_MAX_DEPTH 10

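/*
 * Working the numbers for the 4k blocks above (recheck if the structs
 * change): the packed btree block header is 38 bytes and each item
 * costs a 2 byte offset plus a 27 byte struct, so max_items is
 * (4096 - 38) / 29 = 139.  Half full parents give a fanout of 70, and
 * 2^64 / 4096 = 2^52 leaves need ceil(log_70(2^52)) = 9 parent levels,
 * hence the max depth of 9 + 1 = 10.
 */
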
#define SCOUTFS_UUID_BYTES 16

/*
 * The ring fields describe the statically allocated ring log.  The
 * head and tail indexes are logical 4k block offsets inside the ring.
 * The head block should contain the seq.
 */
struct scoutfs_super_block {
	struct scoutfs_block_header hdr;
	__le64 id;
	__u8 uuid[SCOUTFS_UUID_BYTES];
	__le64 next_ino;
	__le64 alloc_uninit;
	__le64 total_segs;
	__le64 total_blocks;
	__le64 free_blocks;
	__le64 ring_blkno;
	__le64 ring_blocks;
	__le64 ring_index;
	__le64 ring_nr;
	__le64 ring_seq;
	__le64 buddy_blocks;
	struct scoutfs_buddy_root buddy_root;
	struct scoutfs_btree_root btree_root;
} __packed;

#define SCOUTFS_ROOT_INO 1

struct scoutfs_timespec {
	__le64 sec;
	__le32 nsec;
} __packed;

/*
 * @data_version: incremented every time the contents of a file could
 * have changed.  It is exposed via an ioctl and is then provided as an
 * argument to data functions to protect racing modification.
 *
 * XXX
 *	- otime?
 *	- compat flags?
 *	- version?
 *	- generation?
 *	- be more careful with rdev?
 */
struct scoutfs_inode {
	__le64 size;
	__le64 blocks;
	__le64 link_counter;
	__le64 data_version;
	__le32 nlink;
	__le32 uid;
	__le32 gid;
	__le32 mode;
	__le32 rdev;
	__le32 salt;
	struct scoutfs_timespec atime;
	struct scoutfs_timespec ctime;
	struct scoutfs_timespec mtime;
} __packed;

/* like the block size, a reasonable min PATH_MAX across platforms */
#define SCOUTFS_SYMLINK_MAX_SIZE 4096

/*
 * Dirents are stored in items with an offset of the hash of their
 * name.  Colliding names are packed into the value.
 */
struct scoutfs_dirent {
	__le64 ino;
	__le64 counter;
	__u8 type;
	__u8 name[0];
} __packed;

/*
 * Dirent items are stored at keys with the offset set to the hash of
 * the name.  Creation can find that hash values collide and will
 * attempt to linearly probe this many following hash values looking
 * for an unused value.
 *
 * In small directories this doesn't really matter because hash values
 * will so very rarely collide.  At around 50k items we start to see
 * our first collisions.  16 slots is still pretty quick to scan in the
 * btree and it gets us up into the hundreds of millions of entries
 * before ENOSPC is returned as we run out of hash values.  A sketch of
 * the probe follows the readdir position masks below.
 */
#define SCOUTFS_DIRENT_COLL_NR 16

#define SCOUTFS_NAME_LEN 255

/* S32_MAX avoids the (int) sign bit and might avoid sloppy bugs */
#define SCOUTFS_LINK_MAX S32_MAX

/*
 * We only use 31 bits for readdir positions so that we don't confuse
 * old signed 32bit f_pos applications or those on the other side of
 * network protocols that have limited readdir positions.
 */

#define SCOUTFS_DIRENT_OFF_BITS 31
#define SCOUTFS_DIRENT_OFF_MASK ((1U << SCOUTFS_DIRENT_OFF_BITS) - 1)
/* getdents returns the next pos with an entry, no entry at (f_pos)~0 */
#define SCOUTFS_DIRENT_LAST_POS (INT_MAX - 1)

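/*
 * Not part of the format: a sketch of the SCOUTFS_DIRENT_COLL_NR
 * linear probe described above, with a caller-provided test for
 * whether a dirent item already exists at a hash offset.
 */
static inline u64 scoutfs_dirent_probe(u64 hash, bool (*used)(u64 off))
{
	u64 off;
	int i;

	for (i = 0; i < SCOUTFS_DIRENT_COLL_NR; i++) {
		off = (hash + i) & SCOUTFS_DIRENT_OFF_MASK;
		if (!used(off))
			return off;
	}
	return ~0ULL; /* caller maps this to -ENOSPC */
}
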
enum {
	SCOUTFS_DT_FIFO = 0,
	SCOUTFS_DT_CHR,
	SCOUTFS_DT_DIR,
	SCOUTFS_DT_BLK,
	SCOUTFS_DT_REG,
	SCOUTFS_DT_LNK,
	SCOUTFS_DT_SOCK,
	SCOUTFS_DT_WHT,
};

#define SCOUTFS_MAX_XATTR_LEN 255
#define SCOUTFS_XATTR_NAME_HASH_MASK 7ULL

struct scoutfs_xattr {
	__u8 name_len;
	__u8 value_len;
	__u8 name[0];
} __packed;

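/*
 * Not part of the format: assuming the value bytes immediately follow
 * the name bytes, finding the value and the xattr's total packed size
 * is simple arithmetic.
 */
static inline __u8 *scoutfs_xattr_value(struct scoutfs_xattr *xat)
{
	return xat->name + xat->name_len;
}

static inline unsigned int scoutfs_xattr_bytes(struct scoutfs_xattr *xat)
{
	return sizeof(struct scoutfs_xattr) + xat->name_len + xat->value_len;
}
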
struct scoutfs_extent {
	__le64 blkno;
	__le64 len;
	__u8 flags;
} __packed;

#define SCOUTFS_EXTENT_FLAG_OFFLINE (1 << 0)

/*
 * Link backrefs give us a way to find all the hard links that refer
 * to a target inode.  They're stored at an offset determined by an
 * advancing counter in their inode.
 */
struct scoutfs_link_backref {
	__le64 ino;
	__le64 offset;
} __packed;

#define SCOUTFS_MAX_KEY_SIZE \
	offsetof(struct scoutfs_dirent_key, name[SCOUTFS_NAME_LEN])

#endif /* _SCOUTFS_FORMAT_H_ */