Files
scoutfs/kmod/src/format.h
Zach Brown 9cf87ee571 scoutfs: add basic file page cache read and write
Add basic file data support by implementing the address space file and
page read and write methods.  This passes basic read/write tests but is
only the seed of a final implementation.

Signed-off-by: Zach Brown <zab@versity.com>
2016-03-26 10:58:06 -07:00

291 lines
7.5 KiB
C

#ifndef _SCOUTFS_FORMAT_H_
#define _SCOUTFS_FORMAT_H_
/*
* statfs(2) f_type. Stored as little-endian bytes the value spells the
* ASCII string "SCOU".
*/
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */
/*
* super block id. Stored as little-endian bytes the value spells the
* ASCII string "scoutfs.".
*/
#define SCOUTFS_SUPER_ID 0x2e736674756f6373ULL /* "scoutfs." */
/*
 * All storage is read, written and addressed in fixed size 4KiB
 * blocks.  Sticking to small blocks means we never have to manage
 * contiguous cpu mappings of anything larger; bigger structures are
 * simply read and written as a run of blocks.
 */
#define SCOUTFS_BLOCK_SHIFT		12
#define SCOUTFS_BLOCK_SIZE		(1 << SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_BLOCK_MASK		(SCOUTFS_BLOCK_SIZE - 1)

/*
 * Allocation is done in larger 4MiB chunks.  Smaller metadata
 * structures like the super blocks and the ring live inside chunks.
 *
 * A log segment is one chunk holding a collection of smaller blocks
 * (bloom filter blocks, item blocks).
 *
 * The CHUNK_BLOCK values describe the 1024 blocks inside a chunk: the
 * shift converts between block and chunk numbers and the mask extracts
 * a block's index within its chunk.
 */
#define SCOUTFS_CHUNK_SHIFT		22
#define SCOUTFS_CHUNK_SIZE		(1 << SCOUTFS_CHUNK_SHIFT)
#define SCOUTFS_CHUNK_BLOCK_SHIFT	(SCOUTFS_CHUNK_SHIFT - SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_BLOCKS_PER_CHUNK	(1 << SCOUTFS_CHUNK_BLOCK_SHIFT)
#define SCOUTFS_CHUNK_BLOCK_MASK	(SCOUTFS_BLOCKS_PER_CHUNK - 1)

/*
 * The super blocks start 64KiB into the device, which leaves room in
 * front of them for platform structures like boot loaders.  There are
 * SCOUTFS_SUPER_NR super blocks.
 */
#define SCOUTFS_SUPER_BLKNO		((64 * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_SUPER_NR		2
/*
* 7 bits in a ~76k bloom filter gives ~1% false positive for our max
* of 64k items.
*
* n = 65,536, p = 0.01 (1 in 100) → m = 628,167 (76.68KB), k = 7
*
* SCOUTFS_BLOOM_BITS is k, the number of filter bits per item.
* SCOUTFS_BLOOM_BIT_WIDTH is the number of bits in one bit index and
* SCOUTFS_BLOOM_BIT_MASK masks a value down to that width.
* SCOUTFS_BLOOM_BLOCKS is 76KiB expressed in blocks.
* SCOUTFS_BLOOM_SALTS is the number of 32bit words needed to cover
* BITS * BIT_WIDTH = 140 bits. DIV_ROUND_UP is not defined in this
* file; with the usual round-up division meaning the result is 5.
*
* NOTE(review): with 4096 byte blocks SCOUTFS_BLOOM_BLOCKS is 19, and
* 19 * SCOUTFS_BLOOM_BITS_PER_BLOCK (32512) = 617,728 bits, which is a
* little less than the m quoted above because every bloom block also
* carries a block header -- confirm the ~1% estimate still holds.
*/
#define SCOUTFS_BLOOM_BITS 7
#define SCOUTFS_BLOOM_BIT_WIDTH 20 /* 2^20 > m */
#define SCOUTFS_BLOOM_BIT_MASK ((1 << SCOUTFS_BLOOM_BIT_WIDTH) - 1)
#define SCOUTFS_BLOOM_BLOCKS ((76 * 1024) / SCOUTFS_BLOCK_SIZE)
#define SCOUTFS_BLOOM_SALTS \
DIV_ROUND_UP(SCOUTFS_BLOOM_BITS * SCOUTFS_BLOOM_BIT_WIDTH, 32)
/*
* This header is found at the start of every block so that we can
* verify that it's what we were looking for. The crc and padding
* start the block so that the crc calculation operates on a nice 64bit
* aligned region.
*
* The fields add up to 32 bytes.
*
* NOTE(review): presumably the crc covers the bytes that follow the
* crc and _pad fields -- confirm against the code that computes it.
*/
struct scoutfs_block_header {
__le32 crc;
__le32 _pad;
__le64 fsid;
__le64 seq;
__le64 blkno;
} __packed;
/* Number of bytes in the super block's uuid array. */
#define SCOUTFS_UUID_BYTES 16
/*
* The super is stored in a pair of blocks in the first chunk on the
* device.
*
* The ring map blocks describe the chunks that make up the ring.
*
* The rest of the ring fields describe the state of the ring blocks
* that are stored in their chunks. The active portion of the ring
* describes the current state of the system and is replayed on mount.
*
* bloom_salts holds SCOUTFS_BLOOM_SALTS 32bit words.
*
* NOTE(review): presumably id holds SCOUTFS_SUPER_ID and the two copies
* of the super are told apart by the seq in their block header --
* confirm against the code that reads the super.
*/
struct scoutfs_super_block {
struct scoutfs_block_header hdr;
__le64 id;
__u8 uuid[SCOUTFS_UUID_BYTES];
__le32 bloom_salts[SCOUTFS_BLOOM_SALTS];
__le64 total_chunks;
__le64 ring_map_blkno;
__le64 ring_map_seq;
__le64 ring_first_block;
__le64 ring_active_blocks;
__le64 ring_total_blocks;
__le64 ring_seq;
} __packed;
/*
 * A key names an item by the inode it belongs to, the item's type (one
 * of the SCOUTFS_*_KEY values) and a 64bit offset whose meaning depends
 * on the type.
 *
 * We should be able to make the offset smaller if neither dirents nor
 * data items use the full 64 bits.
 *
 * The type is declared as __u8, like the other single byte fields in
 * this header, so that the format only relies on the exported
 * double-underscore integer types and not on the kernel-internal u8.
 * The struct is packed, so the layout is inode, type, offset with no
 * padding in between.
 */
struct scoutfs_key {
	__le64 inode;
	__u8 type;
	__le64 offset;
} __packed;
/*
 * Item key types, the values used in a key's type field.
 *
 * Currently we sort keys by the numeric value of the types, but that
 * isn't necessary.  We could have an arbitrary sort order.  So we don't
 * have to stress about cleverly allocating the types.
 */
#define SCOUTFS_INODE_KEY	1
#define SCOUTFS_DIRENT_KEY	2
#define SCOUTFS_DATA_KEY	3
/*
* A ring map block: the block header, a count, and then an array of
* 64bit block numbers that fills the rest of the block.
*
* NOTE(review): presumably nr_chunks is the number of valid entries in
* blknos[] and each entry is the first block of a chunk of the ring --
* confirm against the ring code.
*
* The struct is declared __packed, so blknos[] starts right after the
* 32bit nr_chunks and is not naturally 8 byte aligned (assuming
* __packed is the usual packed attribute).
*/
struct scoutfs_ring_map_block {
struct scoutfs_block_header hdr;
__le32 nr_chunks;
__le64 blknos[0];
} __packed;
/*
* Number of __le64 entries that fit in blknos[] in one block: the block
* size minus the fixed part of the struct, divided by the entry size.
* Despite the name this counts array entries, not map blocks.
*/
#define SCOUTFS_RING_MAP_BLOCKS \
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_ring_map_block)) / \
sizeof(__le64))
/*
 * Header of a single entry in a ring block.
 *
 * NOTE(review): presumably type is one of the SCOUTFS_RING_* values and
 * len is the length in bytes of the entry payload that follows this
 * header -- confirm against the ring code.
 *
 * The type is declared as __u8, like the other single byte fields in
 * this header, so that the format only relies on the exported
 * double-underscore integer types and not on the kernel-internal u8.
 */
struct scoutfs_ring_entry {
	__u8 type;
	__le16 len;
} __packed;
/*
* Ring blocks are stored in chunks described by the ring map blocks.
*
* The manifest entries describe the position of a given log segment in
* the manifest. They're keyed by the block number so that we can
* record movement of a log segment in the manifest with one ring entry
* and we can record deletion with just the block number.
*
* NOTE(review): presumably nr_entries counts the scoutfs_ring_entry
* records that follow this header in the block -- confirm against the
* ring code.
*/
struct scoutfs_ring_block {
struct scoutfs_block_header hdr;
__le16 nr_entries;
} __packed;
/*
 * Types of ring entries.
 *
 * NOTE(review): presumably these are the values stored in the type
 * field of struct scoutfs_ring_entry -- confirm against the ring code.
 *
 * Each value is written out so that adding a type in the middle can't
 * silently renumber the ones that follow.
 */
enum {
	SCOUTFS_RING_ADD_MANIFEST	= 0,
	SCOUTFS_RING_DEL_MANIFEST	= 1,
	SCOUTFS_RING_BITMAP		= 2,
};
/*
* Including both keys might make the manifest too large. It might be
* better to only include one key and infer a block's range from the
* neighbour's key. The downside of that is that we assume that there
* isn't unused key space between blocks in a level. We might search
* blocks when we didn't need to.
*
* blkno is the block number that the manifest entry is keyed by, and
* first and last are the two keys discussed above.
*
* NOTE(review): presumably level is the log segment's level in the
* manifest and is bounded by SCOUTFS_MAX_LEVEL -- confirm against the
* manifest code.
*/
struct scoutfs_ring_manifest_entry {
__le64 blkno;
__le64 seq;
__u8 level;
struct scoutfs_key first;
struct scoutfs_key last;
} __packed;
/*
* Manifest entries are keyed by block number, so the record of a
* deletion carries nothing but the block number.
*/
struct scoutfs_ring_del_manifest {
__le64 blkno;
} __packed;
/*
* 2^22 * 10^13 > 2^64
*
* NOTE(review): presumably 2^22 is the chunk size in bytes
* (SCOUTFS_CHUNK_SHIFT is 22) and 10 is the assumed growth factor from
* one level to the next, so 13 levels are enough to cover a 64bit byte
* space -- confirm against the manifest code.
*/
#define SCOUTFS_MAX_LEVEL 13
/*
* A ring entry payload carrying 128 bits of bitmap in two 64bit words
* along with a 32bit offset.
*
* NOTE(review): presumably this is the payload of SCOUTFS_RING_BITMAP
* entries and offset selects which 128 bit region of the allocation
* bitmap the bits describe -- confirm against the ring code.
*/
struct scoutfs_ring_bitmap {
__le32 offset;
__le64 bits[2];
} __packed;
/*
* One block of a log segment's bloom filter: the block header followed
* by the filter bits as an array of 64bit words that fills the rest of
* the block.
*/
struct scoutfs_bloom_block {
struct scoutfs_block_header hdr;
__le64 bits[0];
} __packed;
/*
* Bits available in one bloom block: the number of whole 64bit words
* that fit after the header, times 64. With 4096 byte blocks and the
* 32 byte header that is 508 * 64 = 32512 bits.
*/
#define SCOUTFS_BLOOM_BITS_PER_BLOCK \
(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header)) / 8) * 64)
/*
* Items in log segments are sorted in a skip list by their key. We
* have a rough limit of 64k items.
*
* The root holds one 32bit next reference for each of the
* SCOUTFS_SKIP_HEIGHT levels of the list.
*
* NOTE(review): a height of 16 presumably follows from the 64k (2^16)
* item limit -- confirm.
*/
#define SCOUTFS_SKIP_HEIGHT 16
struct scoutfs_skip_root {
__le32 next[SCOUTFS_SKIP_HEIGHT];
} __packed;
/*
* An item block follows the bloom filter blocks at the start of a log
* segment. Its skip root references the item structs which then
* reference the item values in the rest of the block. The references
* are byte offsets from the start of the chunk.
*
* NOTE(review): presumably first and last are the smallest and largest
* keys of the items in the segment -- confirm against the code that
* writes item blocks.
*/
struct scoutfs_item_block {
struct scoutfs_block_header hdr;
struct scoutfs_key first;
struct scoutfs_key last;
struct scoutfs_skip_root skip_root;
} __packed;
/*
 * An item in a log segment: its key, a reference to its value, and its
 * links in the skip list that sorts the segment's items.
 *
 * NOTE(review): presumably offset is the byte offset of the value from
 * the start of the chunk, len is the value's length in bytes, and
 * skip_next[] has skip_height entries, one per skip list level this
 * item is linked into -- confirm against the segment code.
 *
 * skip_height is declared as __u8, like the other single byte fields
 * in this header, so that the format only relies on the exported
 * double-underscore integer types and not on the kernel-internal u8.
 */
struct scoutfs_item {
	struct scoutfs_key key;
	__le32 offset;
	__le16 len;
	__u8 skip_height;
	__le32 skip_next[0];
} __packed;
/*
* The max item length caps the length of file data items so that they
* fit in checksummed 4k blocks with a bit of expansion room.
*
* With 4096 byte blocks and the 32 byte block header this is
* 4096 - 32 - 32 = 4032 bytes.
*/
#define SCOUTFS_MAX_ITEM_LEN \
(SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header) - 32)
/*
* A timestamp as 64bit seconds and 32bit nanoseconds. The fields add
* up to 12 bytes.
*/
struct scoutfs_timespec {
__le64 sec;
__le32 nsec;
} __packed;
/*
* Inode attributes.
*
* NOTE(review): presumably this is the value of a SCOUTFS_INODE_KEY
* item, salt seeds the hashing of dirent names, and max_dirent_hash_nr
* is bounded by SCOUTFS_MAX_DENT_HASH_NR -- confirm against the inode
* and dirent code.
*
* XXX
* - otime?
* - compat flags?
* - version?
* - generation?
* - be more careful with rdev?
*/
struct scoutfs_inode {
__le64 size;
__le64 blocks;
__le32 nlink;
__le32 uid;
__le32 gid;
__le32 mode;
__le32 rdev;
__le32 salt;
__u8 max_dirent_hash_nr;
struct scoutfs_timespec atime;
struct scoutfs_timespec ctime;
struct scoutfs_timespec mtime;
} __packed;
/* Inode number of the root inode (presumably the root directory). */
#define SCOUTFS_ROOT_INO 1
/*
* Dirents are stored in items with an offset of the hash of their name.
* Colliding names are packed into the value.
*
* The name bytes follow the fixed fields; no name length is stored in
* this struct.
*
* NOTE(review): presumably type is one of the SCOUTFS_DT_* values and
* the name length is derived from the length of the item value --
* confirm against the dirent code.
*/
struct scoutfs_dirent {
__le64 ino;
__u8 type;
__u8 name[0];
} __packed;
/*
* The max number of dirent hash values determines the overhead of
* lookups in very large directories. With 31bit offsets the number
* of entries stored before enospc tends to plateau around 200 million
* entries at around 8 hash functions. That seems OK for now.
*/
#define SCOUTFS_MAX_DENT_HASH_NR 8
/*
* Longest dirent name.
* NOTE(review): presumably counted in bytes -- confirm.
*/
#define SCOUTFS_NAME_LEN 255
/*
* We only use 31 bits for readdir positions so that we don't confuse
* old signed 32bit f_pos applications or those on the other side of
* network protocols that have limited readdir positions.
*
* SCOUTFS_DIRENT_OFF_MASK evaluates to 0x7fffffff.
*/
#define SCOUTFS_DIRENT_OFF_BITS 31
#define SCOUTFS_DIRENT_OFF_MASK ((1U << SCOUTFS_DIRENT_OFF_BITS) - 1)
/*
* getdents returns next pos with an entry, no entry at (f_pos)~0.
* INT_MAX is not defined in this file and has to come from the
* includer.
*/
#define SCOUTFS_DIRENT_LAST_POS (INT_MAX - 1)
/*
 * Directory entry types.
 *
 * These are small dense values; they are not the same numbers as the
 * DT_* constants from <dirent.h> (DT_FIFO is 1 and DT_DIR is 4 there).
 *
 * NOTE(review): presumably these are what is stored in the type field
 * of struct scoutfs_dirent and they get translated to DT_* for
 * readdir -- confirm against the dirent code.
 *
 * Each value is written out so that adding a type in the middle can't
 * silently renumber the ones that follow.
 */
enum {
	SCOUTFS_DT_FIFO	= 0,
	SCOUTFS_DT_CHR	= 1,
	SCOUTFS_DT_DIR	= 2,
	SCOUTFS_DT_BLK	= 3,
	SCOUTFS_DT_REG	= 4,
	SCOUTFS_DT_LNK	= 5,
	SCOUTFS_DT_SOCK	= 6,
	SCOUTFS_DT_WHT	= 7,
};
#endif