mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-09 05:13:18 +00:00
Add basic file data support by implementing the address space file and page read and write methods. This passes basic read/write tests but is only the seed of a final implementation. Signed-off-by: Zach Brown <zab@versity.com>
291 lines
7.5 KiB
C
291 lines
7.5 KiB
C
#ifndef _SCOUTFS_FORMAT_H_
#define _SCOUTFS_FORMAT_H_

/*
 * Identifying constants.  Each value spells its ASCII tag when its
 * bytes are read from least significant to most significant.
 */

/* statfs(2) f_type */
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */

/* super block id */
#define SCOUTFS_SUPER_ID 0x2e736674756f6373ULL /* "scoutfs." */

/*
 * Everything is stored in and addressed as 4k fixed size blocks.  This
 * avoids having to manage contiguous cpu mappings of larger blocks.
 * Larger structures are read and written as multiple blocks.
 */
#define SCOUTFS_BLOCK_SHIFT 12
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)

/*
 * The allocator works on larger chunks.  Smaller metadata structures
 * like the super blocks and the ring are stored in chunks.
 *
 * A log segment is a collection of smaller blocks (bloom filter, item
 * blocks) stored in a chunk.
 *
 * The CHUNK_BLOCK macros express a chunk in units of blocks and so
 * depend on SCOUTFS_BLOCK_SHIFT.
 */
#define SCOUTFS_CHUNK_SHIFT 22
#define SCOUTFS_CHUNK_SIZE (1 << SCOUTFS_CHUNK_SHIFT)
#define SCOUTFS_CHUNK_BLOCK_SHIFT (SCOUTFS_CHUNK_SHIFT - SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_CHUNK_BLOCK_MASK ((1 << SCOUTFS_CHUNK_BLOCK_SHIFT) - 1)
#define SCOUTFS_BLOCKS_PER_CHUNK (1 << SCOUTFS_CHUNK_BLOCK_SHIFT)

/*
 * The super blocks leave some room (64KiB worth of blocks) ahead of
 * them for platform structures like boot loaders.  There are
 * SCOUTFS_SUPER_NR super blocks.
 */
#define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_SUPER_NR 2

/*
 * 7 bits in a ~76k bloom filter gives ~1% false positive for our max
 * of 64k items.
 *
 * n = 65,536, p = 0.01 (1 in 100) -> m = 628,167 (76.68KB), k = 7
 *
 * SCOUTFS_BLOOM_SALTS is the number of 32bit salts needed to cover
 * SCOUTFS_BLOOM_BITS values of SCOUTFS_BLOOM_BIT_WIDTH bits each.
 *
 * NOTE(review): 76KiB is 19 4k blocks, and with the 32 byte block
 * header subtracted from each (see SCOUTFS_BLOOM_BITS_PER_BLOCK) that
 * is 19 * 32,512 = 617,728 usable bits, slightly under m.  Confirm
 * this is intended.
 */
#define SCOUTFS_BLOOM_BITS 7
#define SCOUTFS_BLOOM_BIT_WIDTH 20 /* 2^20 > m */
#define SCOUTFS_BLOOM_BIT_MASK ((1 << SCOUTFS_BLOOM_BIT_WIDTH) - 1)
#define SCOUTFS_BLOOM_BLOCKS ((76 * 1024) / SCOUTFS_BLOCK_SIZE)
#define SCOUTFS_BLOOM_SALTS \
	DIV_ROUND_UP(SCOUTFS_BLOOM_BITS * SCOUTFS_BLOOM_BIT_WIDTH, 32)

/*
|
|
* This header is found at the start of every block so that we can
|
|
* verify that it's what we were looking for. The crc and padding
|
|
* starts the block so that its calculation operations on a nice 64bit
|
|
* aligned region.
|
|
*/
|
|
struct scoutfs_block_header {
|
|
__le32 crc;
|
|
__le32 _pad;
|
|
__le64 fsid;
|
|
__le64 seq;
|
|
__le64 blkno;
|
|
} __packed;
|
|
|
|
/* size in bytes of the uuid stored in the super block */
#define SCOUTFS_UUID_BYTES 16

/*
|
|
* The super is stored in a pair of blocks in the first chunk on the
|
|
* device.
|
|
*
|
|
* The ring map blocks describe the chunks that make up the ring.
|
|
*
|
|
* The rest of the ring fields describe the state of the ring blocks
|
|
* that are stored in their chunks. The active portion of the ring
|
|
* describes the current state of the system and is replayed on mount.
|
|
*/
|
|
struct scoutfs_super_block {
|
|
struct scoutfs_block_header hdr;
|
|
__le64 id;
|
|
__u8 uuid[SCOUTFS_UUID_BYTES];
|
|
__le32 bloom_salts[SCOUTFS_BLOOM_SALTS];
|
|
__le64 total_chunks;
|
|
__le64 ring_map_blkno;
|
|
__le64 ring_map_seq;
|
|
__le64 ring_first_block;
|
|
__le64 ring_active_blocks;
|
|
__le64 ring_total_blocks;
|
|
__le64 ring_seq;
|
|
} __packed;
|
|
|
|
/*
|
|
* We should be able to make the offset smaller if neither dirents nor
|
|
* data items use the full 64 bits.
|
|
*/
|
|
struct scoutfs_key {
|
|
__le64 inode;
|
|
u8 type;
|
|
__le64 offset;
|
|
} __packed;
|
|
|
|
/* inode number of the root inode */
#define SCOUTFS_ROOT_INO 1

/*
 * Currently we sort keys by the numeric value of the types, but that
 * isn't necessary.  We could have an arbitrary sort order.  So we don't
 * have to stress about cleverly allocating the types.
 */
#define SCOUTFS_INODE_KEY 1
#define SCOUTFS_DIRENT_KEY 2
#define SCOUTFS_DATA_KEY 3

struct scoutfs_ring_map_block {
|
|
struct scoutfs_block_header hdr;
|
|
__le32 nr_chunks;
|
|
__le64 blknos[0];
|
|
} __packed;
|
|
|
|
#define SCOUTFS_RING_MAP_BLOCKS \
|
|
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_ring_map_block)) / \
|
|
sizeof(__le64))
|
|
|
|
struct scoutfs_ring_entry {
|
|
u8 type;
|
|
__le16 len;
|
|
} __packed;
|
|
|
|
/*
|
|
* Ring blocks are stored in chunks described by the ring map blocks.
|
|
*
|
|
* The manifest entries describe the position of a given log segment in
|
|
* the manifest. They're keyed by the block number so that we can
|
|
* record movement of a log segment in the manifest with one ring entry
|
|
* and we can record deletion with just the block number.
|
|
*/
|
|
struct scoutfs_ring_block {
|
|
struct scoutfs_block_header hdr;
|
|
__le16 nr_entries;
|
|
} __packed;
|
|
|
|
/* ring entry types, numbered consecutively from 0 */
enum {
	SCOUTFS_RING_ADD_MANIFEST = 0,
	SCOUTFS_RING_DEL_MANIFEST,
	SCOUTFS_RING_BITMAP,
};

/*
|
|
* Including both keys might make the manifest too large. It might be
|
|
* better to only include one key and infer a block's range from the
|
|
* neighbour's key. The downside of that is that we assume that there
|
|
* isn't unused key space between blocks in a level. We might search
|
|
* blocks when we didn't need to.
|
|
*/
|
|
struct scoutfs_ring_manifest_entry {
|
|
__le64 blkno;
|
|
__le64 seq;
|
|
__u8 level;
|
|
struct scoutfs_key first;
|
|
struct scoutfs_key last;
|
|
} __packed;
|
|
|
|
struct scoutfs_ring_del_manifest {
|
|
__le64 blkno;
|
|
} __packed;
|
|
|
|
/* 2^22 * 10^13 > 2^64 */
#define SCOUTFS_MAX_LEVEL 13

struct scoutfs_ring_bitmap {
|
|
__le32 offset;
|
|
__le64 bits[2];
|
|
} __packed;
|
|
|
|
|
|
struct scoutfs_bloom_block {
|
|
struct scoutfs_block_header hdr;
|
|
__le64 bits[0];
|
|
} __packed;
|
|
|
|
#define SCOUTFS_BLOOM_BITS_PER_BLOCK \
|
|
(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header)) / 8) * 64)
|
|
|
|
/*
|
|
* Items in log segments are sorted in a skip list by their key. We
|
|
* have a rough limit of 64k items.
|
|
*/
|
|
#define SCOUTFS_SKIP_HEIGHT 16
|
|
struct scoutfs_skip_root {
|
|
__le32 next[SCOUTFS_SKIP_HEIGHT];
|
|
} __packed;
|
|
|
|
/*
|
|
* An item block follows the bloom filter blocks at the start of a log
|
|
* segment. Its skip root references the item structs which then
|
|
* reference the item values in the rest of the block. The references
|
|
* are byte offsets from the start of the chunk.
|
|
*/
|
|
struct scoutfs_item_block {
|
|
struct scoutfs_block_header hdr;
|
|
struct scoutfs_key first;
|
|
struct scoutfs_key last;
|
|
struct scoutfs_skip_root skip_root;
|
|
} __packed;
|
|
|
|
struct scoutfs_item {
|
|
struct scoutfs_key key;
|
|
__le32 offset;
|
|
__le16 len;
|
|
u8 skip_height;
|
|
__le32 skip_next[0];
|
|
} __packed;
|
|
|
|
/*
 * The max item length caps the length of file data items so that they
 * fit in checksummed 4k blocks with a bit of expansion room (32 bytes
 * beyond the block header).
 */
#define SCOUTFS_MAX_ITEM_LEN \
	(SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header) - 32)

struct scoutfs_timespec {
|
|
__le64 sec;
|
|
__le32 nsec;
|
|
} __packed;
|
|
|
|
/*
|
|
* XXX
|
|
* - otime?
|
|
* - compat flags?
|
|
* - version?
|
|
* - generation?
|
|
* - be more careful with rdev?
|
|
*/
|
|
struct scoutfs_inode {
|
|
__le64 size;
|
|
__le64 blocks;
|
|
__le32 nlink;
|
|
__le32 uid;
|
|
__le32 gid;
|
|
__le32 mode;
|
|
__le32 rdev;
|
|
__le32 salt;
|
|
__u8 max_dirent_hash_nr;
|
|
struct scoutfs_timespec atime;
|
|
struct scoutfs_timespec ctime;
|
|
struct scoutfs_timespec mtime;
|
|
} __packed;
|
|
|
|
#define SCOUTFS_ROOT_INO 1
|
|
|
|
/*
|
|
* Dirents are stored in items with an offset of the hash of their name.
|
|
* Colliding names are packed into the value.
|
|
*/
|
|
struct scoutfs_dirent {
|
|
__le64 ino;
|
|
__u8 type;
|
|
__u8 name[0];
|
|
} __packed;
|
|
|
|
/*
 * The max number of dirent hash values determines the overhead of
 * lookups in very large directories.  With 31bit offsets the number
 * of entries stored before enospc tends to plateau around 200 million
 * entries around 8 functions.  That seems OK for now.
 */
#define SCOUTFS_MAX_DENT_HASH_NR 8
#define SCOUTFS_NAME_LEN 255

/*
 * We only use 31 bits for readdir positions so that we don't confuse
 * old signed 32bit f_pos applications or those on the other side of
 * network protocols that have limited readdir positions.
 */

#define SCOUTFS_DIRENT_OFF_BITS 31
#define SCOUTFS_DIRENT_OFF_MASK ((1U << SCOUTFS_DIRENT_OFF_BITS) - 1)
/* getdents returns next pos with an entry, no entry at (f_pos)~0 */
#define SCOUTFS_DIRENT_LAST_POS (INT_MAX - 1)

/* dirent type values, numbered consecutively from 0 */
enum {
	SCOUTFS_DT_FIFO = 0,
	SCOUTFS_DT_CHR,
	SCOUTFS_DT_DIR,
	SCOUTFS_DT_BLK,
	SCOUTFS_DT_REG,
	SCOUTFS_DT_LNK,
	SCOUTFS_DT_SOCK,
	SCOUTFS_DT_WHT,
};

#endif