mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-11 06:00:19 +00:00
Tests such as quorum-heartbeat-timeout were failing with EIO messages in dmesg output due to expected errors during forced unmount. Use ENOLINK instead, and filter all errors from dmesg with this errno (67). Signed-off-by: Chris Kirby <ckirby@versity.com>
1255 lines
36 KiB
C
1255 lines
36 KiB
C
#ifndef _SCOUTFS_FORMAT_H_
|
|
#define _SCOUTFS_FORMAT_H_
|
|
|
|
/*
|
|
* The format version defines the format of structures on devices,
|
|
* structures that are communicated over the wire, and the protocol
|
|
* behind the structures.
|
|
*/
|
|
#define SCOUTFS_FORMAT_VERSION_MIN 1
|
|
#define SCOUTFS_FORMAT_VERSION_MIN_STR __stringify(SCOUTFS_FORMAT_VERSION_MIN)
|
|
#define SCOUTFS_FORMAT_VERSION_MAX 2
|
|
#define SCOUTFS_FORMAT_VERSION_MAX_STR __stringify(SCOUTFS_FORMAT_VERSION_MAX)
|
|
|
|
#define SCOUTFS_FORMAT_VERSION_FEAT_RETENTION 2
|
|
#define SCOUTFS_FORMAT_VERSION_FEAT_PROJECT_ID 2
|
|
#define SCOUTFS_FORMAT_VERSION_FEAT_QUOTA 2
|
|
#define SCOUTFS_FORMAT_VERSION_FEAT_INDX_TAG 2
|
|
|
|
/* statfs(2) f_type */
|
|
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */
|
|
|
|
/* block header magic values, chosen at random */
|
|
#define SCOUTFS_BLOCK_MAGIC_SUPER 0x103c428b
|
|
#define SCOUTFS_BLOCK_MAGIC_BTREE 0xe597f96d
|
|
#define SCOUTFS_BLOCK_MAGIC_BLOOM 0x31995604
|
|
#define SCOUTFS_BLOCK_MAGIC_SRCH_BLOCK 0x897e4a7d
|
|
#define SCOUTFS_BLOCK_MAGIC_SRCH_PARENT 0xb23a2a05
|
|
#define SCOUTFS_BLOCK_MAGIC_ALLOC_LIST 0x8a93ac83
|
|
#define SCOUTFS_BLOCK_MAGIC_QUORUM 0xbc310868
|
|
|
|
/*
|
|
* The super block, quorum block, and file data allocation granularity
|
|
* use the smaller 4KB block.
|
|
*/
|
|
#define SCOUTFS_BLOCK_SM_SHIFT 12
|
|
#define SCOUTFS_BLOCK_SM_SIZE (1 << SCOUTFS_BLOCK_SM_SHIFT)
|
|
#define SCOUTFS_BLOCK_SM_MASK (SCOUTFS_BLOCK_SM_SIZE - 1)
|
|
#define SCOUTFS_BLOCK_SM_PER_PAGE (PAGE_SIZE / SCOUTFS_BLOCK_SM_SIZE)
|
|
#define SCOUTFS_BLOCK_SM_SECTOR_SHIFT (SCOUTFS_BLOCK_SM_SHIFT - 9)
|
|
#define SCOUTFS_BLOCK_SM_SECTORS (1 << SCOUTFS_BLOCK_SM_SECTOR_SHIFT)
|
|
#define SCOUTFS_BLOCK_SM_MAX (U64_MAX >> SCOUTFS_BLOCK_SM_SHIFT)
|
|
#define SCOUTFS_BLOCK_SM_PAGES_PER (SCOUTFS_BLOCK_SM_SIZE / PAGE_SIZE)
|
|
#define SCOUTFS_BLOCK_SM_PAGE_ORDER (SCOUTFS_BLOCK_SM_SHIFT - PAGE_SHIFT)
|
|
|
|
/*
|
|
* The radix and btree structures, and the forest bloom block, use the
|
|
* larger 64KB metadata block size.
|
|
*/
|
|
#define SCOUTFS_BLOCK_LG_SHIFT 16
|
|
#define SCOUTFS_BLOCK_LG_SIZE (1 << SCOUTFS_BLOCK_LG_SHIFT)
|
|
#define SCOUTFS_BLOCK_LG_MASK (SCOUTFS_BLOCK_LG_SIZE - 1)
|
|
#define SCOUTFS_BLOCK_LG_PER_PAGE (PAGE_SIZE / SCOUTFS_BLOCK_LG_SIZE)
|
|
#define SCOUTFS_BLOCK_LG_SECTOR_SHIFT (SCOUTFS_BLOCK_LG_SHIFT - 9)
|
|
#define SCOUTFS_BLOCK_LG_SECTORS (1 << SCOUTFS_BLOCK_LG_SECTOR_SHIFT)
|
|
#define SCOUTFS_BLOCK_LG_MAX (U64_MAX >> SCOUTFS_BLOCK_LG_SHIFT)
|
|
#define SCOUTFS_BLOCK_LG_PAGES_PER (SCOUTFS_BLOCK_LG_SIZE / PAGE_SIZE)
|
|
#define SCOUTFS_BLOCK_LG_PAGE_ORDER (SCOUTFS_BLOCK_LG_SHIFT - PAGE_SHIFT)
|
|
|
|
#define SCOUTFS_BLOCK_SM_LG_SHIFT (SCOUTFS_BLOCK_LG_SHIFT - \
|
|
SCOUTFS_BLOCK_SM_SHIFT)
|
|
|
|
|
|
/*
|
|
* The super block leaves some room before the first block for platform
|
|
* structures like boot loaders.
|
|
*/
|
|
#define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
|
|
|
|
/*
|
|
* A small number of quorum blocks follow the super block, enough of
|
|
* them to match the starting offset of the super block so the region is
|
|
* aligned to the power of two that contains it.
|
|
*/
|
|
#define SCOUTFS_QUORUM_BLKNO (SCOUTFS_SUPER_BLKNO + 1)
|
|
#define SCOUTFS_QUORUM_BLOCKS (SCOUTFS_SUPER_BLKNO - 1)
|
|
|
|
/*
|
|
* Free metadata blocks start after the quorum blocks
|
|
*/
|
|
#define SCOUTFS_META_DEV_START_BLKNO \
|
|
((SCOUTFS_QUORUM_BLKNO + SCOUTFS_QUORUM_BLOCKS) >> \
|
|
SCOUTFS_BLOCK_SM_LG_SHIFT)
|
|
|
|
/*
|
|
* Start data on the data device aligned as well.
|
|
*/
|
|
#define SCOUTFS_DATA_DEV_START_BLKNO ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
|
|
|
|
|
|
#define SCOUTFS_UNIQUE_NAME_MAX_BYTES 64 /* includes null */
|
|
|
|
/*
|
|
* Base types used by other structures.
|
|
*/
|
|
/*
 * A timestamp stored as a 64-bit count of seconds and a 32-bit count of
 * nanoseconds.  The explicit __pad bytes follow the 32-bit nsec field.
 * It is embedded in struct scoutfs_inode (atime, ctime, mtime, crtime)
 * and in the quorum block events.
 */
struct scoutfs_timespec {
|
|
__le64 sec;
|
|
__le32 nsec;
|
|
__u8 __pad[4];
|
|
};
|
|
|
|
enum scoutfs_inet_family {
|
|
SCOUTFS_AF_NONE = 0,
|
|
SCOUTFS_AF_IPV4 = 1,
|
|
SCOUTFS_AF_IPV6 = 2,
|
|
};
|
|
|
|
struct scoutfs_inet_addr4 {
|
|
__le16 family;
|
|
__le16 port;
|
|
__le32 addr;
|
|
};
|
|
|
|
/*
|
|
* Not yet supported by code.
|
|
*/
|
|
struct scoutfs_inet_addr6 {
|
|
__le16 family;
|
|
__le16 port;
|
|
__u8 addr[16];
|
|
__le32 flow_info;
|
|
__le32 scope_id;
|
|
__u8 __pad[4];
|
|
};
|
|
|
|
union scoutfs_inet_addr {
|
|
struct scoutfs_inet_addr4 v4;
|
|
struct scoutfs_inet_addr6 v6;
|
|
};
|
|
|
|
/*
|
|
* This header is stored at the start of btree blocks and the super
|
|
* block for verification. The crc field is not included in the
|
|
* calculation of the crc.
|
|
*/
|
|
struct scoutfs_block_header {
|
|
__le32 crc;
|
|
__le32 magic;
|
|
__le64 fsid;
|
|
__le64 seq;
|
|
__le64 blkno;
|
|
};
|
|
|
|
/*
|
|
* A reference to a block. The corresponding fields in the block_header
|
|
* must match after having read the block contents.
|
|
*/
|
|
struct scoutfs_block_ref {
|
|
__le64 blkno;
|
|
__le64 seq;
|
|
};
|
|
|
|
/*
|
|
* scoutfs identifies all file system metadata items by a small key
|
|
* struct.
|
|
*
|
|
* Each item type maps their logical structures to the fixed fields in
|
|
* sort order. This lets us print keys without needing per-type
|
|
* formats.
|
|
*
|
|
* The keys are compared by considering the fields in struct order from
|
|
* most to least significant. They are considered a multi precision
|
|
* value when navigating the keys in ordered key space. We can
|
|
* increment them, subtract them from each other, etc.
|
|
*/
|
|
struct scoutfs_key {
|
|
__le64 _sk_first;
|
|
__le64 _sk_second;
|
|
__le64 _sk_third;
|
|
__u8 _sk_fourth;
|
|
__u8 sk_zone;
|
|
__u8 sk_type;
|
|
__u8 __pad[5];
|
|
};
|
|
|
|
/* inode index */
|
|
#define skii_major _sk_second
|
|
#define skii_ino _sk_third
|
|
|
|
/* node orphan inode */
|
|
#define sko_rid _sk_first
|
|
#define sko_ino _sk_second
|
|
|
|
/* quota rules */
|
|
#define skqr_hash _sk_second
|
|
#define skqr_coll_nr _sk_third
|
|
|
|
/* xattr totl */
|
|
#define skxt_a _sk_first
|
|
#define skxt_b _sk_second
|
|
#define skxt_c _sk_third
|
|
|
|
/* inode */
|
|
#define ski_ino _sk_first
|
|
|
|
/* xattr parts */
|
|
#define skx_ino _sk_first
|
|
#define skx_name_hash _sk_second
|
|
#define skx_id _sk_third
|
|
#define skx_part _sk_fourth
|
|
|
|
/* directory entries */
|
|
#define skd_ino _sk_first
|
|
#define skd_major _sk_second
|
|
#define skd_minor _sk_third
|
|
|
|
/* symlink target */
|
|
#define sks_ino _sk_first
|
|
#define sks_nr _sk_second
|
|
|
|
/* data extents */
|
|
#define skdx_ino _sk_first
|
|
#define skdx_end _sk_second
|
|
#define skdx_len _sk_third
|
|
|
|
/* log trees */
|
|
#define sklt_rid _sk_first
|
|
#define sklt_nr _sk_second
|
|
|
|
/* mounted clients */
|
|
#define skmc_rid _sk_first
|
|
|
|
/* free extents by blkno */
|
|
#define skfb_end _sk_first
|
|
#define skfb_len _sk_second
|
|
/* free extents by order */
|
|
#define skfo_revord _sk_first
|
|
#define skfo_end _sk_second
|
|
#define skfo_len _sk_third
|
|
|
|
struct scoutfs_avl_root {
|
|
__le16 node;
|
|
};
|
|
|
|
struct scoutfs_avl_node {
|
|
__le16 parent;
|
|
__le16 left;
|
|
__le16 right;
|
|
__u8 height;
|
|
__u8 __pad[1];
|
|
};
|
|
|
|
/* when we split we want to have multiple items on each side */
|
|
#define SCOUTFS_BTREE_MAX_VAL_LEN 896
|
|
|
|
/*
|
|
* A 4EB test image measured a worst case height of 17. This is plenty
|
|
* generous.
|
|
*/
|
|
#define SCOUTFS_BTREE_MAX_HEIGHT 20
|
|
|
|
/*
|
|
* A height of X means that the first block read will have level X-1 and
|
|
* the leaves will have level 0.
|
|
*/
|
|
struct scoutfs_btree_root {
|
|
struct scoutfs_block_ref ref;
|
|
__u8 height;
|
|
__u8 __pad[7];
|
|
};
|
|
|
|
struct scoutfs_btree_item {
|
|
struct scoutfs_avl_node node;
|
|
struct scoutfs_key key;
|
|
__le64 seq;
|
|
__le16 val_off;
|
|
__le16 val_len;
|
|
__u8 flags;
|
|
__u8 __pad[3];
|
|
};
|
|
|
|
#define SCOUTFS_ITEM_FLAG_DELETION (1 << 0)
|
|
|
|
struct scoutfs_btree_block {
|
|
struct scoutfs_block_header hdr;
|
|
struct scoutfs_avl_root item_root;
|
|
__le16 nr_items;
|
|
__le16 total_item_bytes;
|
|
__le16 mid_free_len;
|
|
__u8 level;
|
|
__u8 __pad[7];
|
|
struct scoutfs_btree_item items[];
|
|
/* leaf blocks have a fixed size item offset hash table at the end */
|
|
};
|
|
|
|
#define SCOUTFS_BTREE_VALUE_ALIGN 8
|
|
|
|
/*
|
|
* Try to aim for a 75% load in a leaf full of items with no value.
|
|
* We'll almost never see this because most items have values and most
|
|
* blocks aren't full.
|
|
*/
|
|
#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR_UNALIGNED \
|
|
((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) / \
|
|
(sizeof(struct scoutfs_btree_item) + (sizeof(__le16))) * 100 / 75)
|
|
#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR \
|
|
(round_up(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR_UNALIGNED, \
|
|
SCOUTFS_BTREE_VALUE_ALIGN))
|
|
#define SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES \
|
|
(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR * sizeof(__le16))
|
|
|
|
/*
|
|
* first_nr tracks the nr of the first block in the list and is used for
|
|
* allocation sizing. total_nr is the sum of the nr of all the blocks in
|
|
* the list and is used for calculating total free block counts.
|
|
*/
|
|
struct scoutfs_alloc_list_head {
|
|
struct scoutfs_block_ref ref;
|
|
__le64 total_nr;
|
|
__le32 first_nr;
|
|
__le32 flags;
|
|
};
|
|
|
|
|
|
/*
|
|
* While the main allocator uses extent items in btree blocks, metadata
|
|
* allocations for a single transaction are recorded in arrays in
|
|
* blocks. This limits the number of allocations and frees needed to
|
|
* cow and modify the structure. The blocks can be stored in a list
|
|
* which lets us create a persistent log of pending frees that are
|
|
* generated as we cow btree blocks to insert freed extents.
|
|
*
|
|
* The array floats in the block so that both adding and removing blknos
|
|
* only modifies an index.
|
|
*/
|
|
struct scoutfs_alloc_list_block {
|
|
struct scoutfs_block_header hdr;
|
|
struct scoutfs_block_ref next;
|
|
__le32 start;
|
|
__le32 nr;
|
|
__le64 blknos[]; /* naturally aligned for sorting */
|
|
};
|
|
|
|
#define SCOUTFS_ALLOC_LIST_MAX_BLOCKS \
|
|
((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_alloc_list_block)) / \
|
|
(member_sizeof(struct scoutfs_alloc_list_block, blknos[0])))
|
|
|
|
/*
|
|
* These can safely be initialized to all-zeros.
|
|
*/
|
|
struct scoutfs_alloc_root {
|
|
__le64 total_len;
|
|
__le32 flags;
|
|
__le32 _pad;
|
|
struct scoutfs_btree_root root;
|
|
};
|
|
|
|
/* Shared by _alloc_list_head and _alloc_root */
|
|
#define SCOUTFS_ALLOC_FLAG_LOW (1U << 0)
|
|
|
|
/* types of allocators, exposed to alloc_detail ioctl */
|
|
#define SCOUTFS_ALLOC_OWNER_NONE 0
|
|
#define SCOUTFS_ALLOC_OWNER_SERVER 1
|
|
#define SCOUTFS_ALLOC_OWNER_MOUNT 2
|
|
#define SCOUTFS_ALLOC_OWNER_SRCH 3
|
|
#define SCOUTFS_ALLOC_OWNER_LOG_MERGE 4
|
|
|
|
/*
 * Value stored for a mounted client: the client's address and a flags
 * byte, padded out explicitly.
 *
 * NOTE(review): presumably stored under SCOUTFS_MOUNTED_CLIENT_ZONE keys
 * using skmc_rid, and flags presumably carries
 * SCOUTFS_MOUNTED_CLIENT_QUORUM -- confirm against the server code.
 */
struct scoutfs_mounted_client_btree_val {
|
|
union scoutfs_inet_addr addr;
|
|
__u8 flags;
|
|
__u8 __pad[7];
|
|
};
|
|
|
|
#define SCOUTFS_MOUNTED_CLIENT_QUORUM (1 << 0)
|
|
|
|
/*
|
|
* srch files are a contiguous run of blocks with compressed entries
|
|
* described by a dense parent radix. The files can be stored in
|
|
* log_tree items when the files contain unsorted entries written by
|
|
* mounts during their transactions. Sorted files of increasing size
|
|
* are kept in a btree off the super for searching and further
|
|
* compacting.
|
|
*/
|
|
struct scoutfs_srch_entry {
|
|
__le64 hash;
|
|
__le64 ino;
|
|
__le64 id;
|
|
};
|
|
|
|
#define SCOUTFS_SRCH_ENTRY_MAX_BYTES (2 + (sizeof(__u64) * 3))
|
|
|
|
struct scoutfs_srch_file {
|
|
struct scoutfs_srch_entry first;
|
|
struct scoutfs_srch_entry last;
|
|
struct scoutfs_block_ref ref;
|
|
__le64 blocks;
|
|
__le64 entries;
|
|
__u8 height;
|
|
__u8 __pad[7];
|
|
};
|
|
|
|
struct scoutfs_srch_parent {
|
|
struct scoutfs_block_header hdr;
|
|
struct scoutfs_block_ref refs[];
|
|
};
|
|
|
|
#define SCOUTFS_SRCH_PARENT_REFS \
|
|
((SCOUTFS_BLOCK_LG_SIZE - \
|
|
offsetof(struct scoutfs_srch_parent, refs)) / \
|
|
sizeof(struct scoutfs_block_ref))
|
|
|
|
struct scoutfs_srch_block {
|
|
struct scoutfs_block_header hdr;
|
|
struct scoutfs_srch_entry first;
|
|
struct scoutfs_srch_entry last;
|
|
struct scoutfs_srch_entry tail;
|
|
__le32 entry_nr;
|
|
__le32 entry_bytes;
|
|
__u8 entries[];
|
|
};
|
|
|
|
/*
|
|
* Decoding loads final small deltas with full __u64 loads. Rather than
|
|
* check the size before each load we stop coding entries past the point
|
|
* where a full size entry could overflow the block. A final entry can
|
|
* start at this byte count and consume the rest of the block, though
|
|
* it's unlikely.
|
|
*/
|
|
#define SCOUTFS_SRCH_BLOCK_SAFE_BYTES \
|
|
(SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_srch_block) - \
|
|
SCOUTFS_SRCH_ENTRY_MAX_BYTES)
|
|
|
|
#define SCOUTFS_SRCH_LOG_BLOCK_LIMIT (1024 * 1024 / SCOUTFS_BLOCK_LG_SIZE)
|
|
#define SCOUTFS_SRCH_COMPACT_ORDER 2
|
|
#define SCOUTFS_SRCH_COMPACT_NR (1 << SCOUTFS_SRCH_COMPACT_ORDER)
|
|
|
|
/*
|
|
* A persistent record of a srch file compaction operation in progress.
|
|
*
|
|
* When compacting log files blk and pos aren't used. When compacting
|
|
* sorted files blk is the logical block number and pos is the byte
|
|
* offset of the next entry. When deleting files pos is the height of
|
|
* the level that we're deleting, and blk is the logical block offset of
|
|
* the next parent ref array index to descend through.
|
|
*/
|
|
struct scoutfs_srch_compact {
|
|
struct scoutfs_alloc_list_head meta_avail;
|
|
struct scoutfs_alloc_list_head meta_freed;
|
|
__le64 id;
|
|
__u8 nr;
|
|
__u8 flags;
|
|
__u8 __pad[6];
|
|
struct scoutfs_srch_file out;
|
|
struct scoutfs_srch_compact_input {
|
|
struct scoutfs_srch_file sfl;
|
|
__le64 blk;
|
|
__le64 pos;
|
|
} in[SCOUTFS_SRCH_COMPACT_NR];
|
|
};
|
|
|
|
/* server -> client: combine input log file entries into output file */
|
|
#define SCOUTFS_SRCH_COMPACT_FLAG_LOG (1 << 0)
|
|
/* server -> client: combine input sorted file entries into output file */
|
|
#define SCOUTFS_SRCH_COMPACT_FLAG_SORTED (1 << 1)
|
|
/* server -> client: delete input files */
|
|
#define SCOUTFS_SRCH_COMPACT_FLAG_DELETE (1 << 2)
|
|
/* client -> server: compaction phase (LOG,SORTED,DELETE) done */
|
|
#define SCOUTFS_SRCH_COMPACT_FLAG_DONE (1 << 4)
|
|
/* client -> server: compaction failed */
|
|
#define SCOUTFS_SRCH_COMPACT_FLAG_ERROR (1 << 5)
|
|
|
|
#define SCOUTFS_DATA_ALLOC_MAX_ZONES 1024
|
|
#define SCOUTFS_DATA_ALLOC_ZONE_BYTES DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 8)
|
|
#define SCOUTFS_DATA_ALLOC_ZONE_LE64S DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 64)
|
|
|
|
/*
|
|
* XXX I imagine we should rename these now that they've evolved to track
|
|
* all the btrees that clients use during a transaction. It's not just
|
|
* about item logs, it's about clients making changes to trees.
|
|
*
|
|
* @get_trans_seq, @commit_trans_seq: These two sequence numbers
|
|
* determine if a transaction is currently open for the mount that owns
|
|
* the log_trees struct. get_trans_seq is advanced by the server as the
|
|
* transaction is opened. The server sets commit_trans_seq equal to
|
|
* get_ as the transaction is committed.
|
|
*/
|
|
struct scoutfs_log_trees {
|
|
struct scoutfs_alloc_list_head meta_avail;
|
|
struct scoutfs_alloc_list_head meta_freed;
|
|
struct scoutfs_btree_root item_root;
|
|
struct scoutfs_block_ref bloom_ref;
|
|
struct scoutfs_alloc_root data_avail;
|
|
struct scoutfs_alloc_root data_freed;
|
|
struct scoutfs_srch_file srch_file;
|
|
__le64 data_alloc_zone_blocks;
|
|
__le64 data_alloc_zones[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
|
|
__le64 inode_count_delta;
|
|
__le64 get_trans_seq;
|
|
__le64 commit_trans_seq;
|
|
__le64 max_item_seq;
|
|
__le64 finalize_seq;
|
|
__le64 rid;
|
|
__le64 nr;
|
|
__le64 flags;
|
|
};
|
|
|
|
#define SCOUTFS_LOG_TREES_FINALIZED (1ULL << 0)
|
|
|
|
/* FS items are limited by the max btree value length */
|
|
#define SCOUTFS_MAX_VAL_SIZE SCOUTFS_BTREE_MAX_VAL_LEN
|
|
|
|
struct scoutfs_bloom_block {
|
|
struct scoutfs_block_header hdr;
|
|
__le64 total_set;
|
|
__le64 bits[];
|
|
};
|
|
|
|
/*
|
|
* Item log trees are accompanied by a block of bits that make up a
|
|
* bloom filter which indicate if the item log trees may contain items
|
|
* covered by a lock. The log trees should be finalized and merged long
|
|
* before the bloom filters fill up and start returning excessive false
|
|
* positives.
|
|
*/
|
|
#define SCOUTFS_FOREST_BLOOM_NRS 3
|
|
#define SCOUTFS_FOREST_BLOOM_BITS \
|
|
(((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_bloom_block)) / \
|
|
member_sizeof(struct scoutfs_bloom_block, bits[0])) * \
|
|
member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8)
|
|
#define SCOUTFS_FOREST_BLOOM_FUNC_BITS (SCOUTFS_BLOCK_LG_SHIFT + 3)
|
|
|
|
/*
|
|
* A private server btree item which records the status of a log merge
|
|
* operation that is in progress.
|
|
*/
|
|
struct scoutfs_log_merge_status {
|
|
struct scoutfs_key next_range_key;
|
|
__le64 nr_requests;
|
|
__le64 nr_complete;
|
|
__le64 seq;
|
|
};
|
|
|
|
/*
|
|
* A request is sent to the client and stored in a server btree item to
|
|
* record resources that would be reclaimed if the client failed. It
|
|
* has all the inputs needed for the client to perform its portion of a
|
|
* merge.
|
|
*/
|
|
struct scoutfs_log_merge_request {
|
|
struct scoutfs_alloc_list_head meta_avail;
|
|
struct scoutfs_alloc_list_head meta_freed;
|
|
struct scoutfs_btree_root logs_root;
|
|
struct scoutfs_btree_root root;
|
|
struct scoutfs_key start;
|
|
struct scoutfs_key end;
|
|
__le64 input_seq;
|
|
__le64 rid;
|
|
__le64 seq;
|
|
__le64 flags;
|
|
};
|
|
|
|
/* request root is subtree of fs root at parent, restricted merging modifications */
|
|
#define SCOUTFS_LOG_MERGE_REQUEST_SUBTREE (1ULL << 0)
|
|
|
|
/*
|
|
* The output of a client's merge of log btree items into a subtree
|
|
* rooted at a parent in the fs_root. The client sends it to the
|
|
* server, who stores it in a btree item for later splicing/rebalancing.
|
|
*/
|
|
struct scoutfs_log_merge_complete {
|
|
struct scoutfs_alloc_list_head meta_avail;
|
|
struct scoutfs_alloc_list_head meta_freed;
|
|
struct scoutfs_btree_root root;
|
|
struct scoutfs_key start;
|
|
struct scoutfs_key end;
|
|
struct scoutfs_key remain;
|
|
__le64 rid;
|
|
__le64 seq;
|
|
__le64 flags;
|
|
};
|
|
|
|
/* merge failed, ignore completion and reclaim stored request */
|
|
#define SCOUTFS_LOG_MERGE_COMP_ERROR (1ULL << 0)
|
|
/* merge didn't complete range, restart from remain */
|
|
#define SCOUTFS_LOG_MERGE_COMP_REMAIN (1ULL << 1)
|
|
|
|
/*
|
|
* Range items record the ranges of the fs keyspace that still need to
|
|
* be merged. They're added as a merge starts, removed as requests are
|
|
* sent and added back if the request didn't consume its entire range.
|
|
*/
|
|
struct scoutfs_log_merge_range {
|
|
struct scoutfs_key start;
|
|
struct scoutfs_key end;
|
|
};
|
|
|
|
struct scoutfs_log_merge_freeing {
|
|
struct scoutfs_btree_root root;
|
|
struct scoutfs_key key;
|
|
__le64 seq;
|
|
};
|
|
|
|
/*
|
|
* Keys are first sorted by major key zones.
|
|
*/
|
|
#define SCOUTFS_INODE_INDEX_ZONE 4
|
|
#define SCOUTFS_ORPHAN_ZONE 8
|
|
#define SCOUTFS_QUOTA_ZONE 10
|
|
#define SCOUTFS_XATTR_TOTL_ZONE 12
|
|
#define SCOUTFS_XATTR_INDX_ZONE 14
|
|
#define SCOUTFS_FS_ZONE 16
|
|
#define SCOUTFS_LOCK_ZONE 20
|
|
/* Items only stored in server btrees */
|
|
#define SCOUTFS_LOG_TREES_ZONE 24
|
|
#define SCOUTFS_MOUNTED_CLIENT_ZONE 28
|
|
#define SCOUTFS_SRCH_ZONE 32
|
|
#define SCOUTFS_FREE_EXTENT_BLKNO_ZONE 36
|
|
#define SCOUTFS_FREE_EXTENT_ORDER_ZONE 40
|
|
/* Items only stored in log merge server btrees */
|
|
#define SCOUTFS_LOG_MERGE_STATUS_ZONE 44
|
|
#define SCOUTFS_LOG_MERGE_RANGE_ZONE 48
|
|
#define SCOUTFS_LOG_MERGE_REQUEST_ZONE 52
|
|
#define SCOUTFS_LOG_MERGE_COMPLETE_ZONE 56
|
|
#define SCOUTFS_LOG_MERGE_FREEING_ZONE 60
|
|
|
|
/* inode index zone */
|
|
#define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 4
|
|
#define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 8
|
|
|
|
/* orphan zone, redundant type used for clarity */
|
|
#define SCOUTFS_ORPHAN_TYPE 4
|
|
|
|
/* quota zone */
|
|
#define SCOUTFS_QUOTA_RULE_TYPE 4
|
|
|
|
/* fs zone */
|
|
#define SCOUTFS_INODE_TYPE 4
|
|
#define SCOUTFS_XATTR_TYPE 8
|
|
#define SCOUTFS_DIRENT_TYPE 12
|
|
#define SCOUTFS_READDIR_TYPE 16
|
|
#define SCOUTFS_LINK_BACKREF_TYPE 20
|
|
#define SCOUTFS_SYMLINK_TYPE 24
|
|
#define SCOUTFS_DATA_EXTENT_TYPE 28
|
|
|
|
/* lock zone, only ever found in lock ranges, never in persistent items */
|
|
#define SCOUTFS_RENAME_TYPE 4
|
|
|
|
/* srch zone, only in server btrees */
|
|
#define SCOUTFS_SRCH_LOG_TYPE 4
|
|
#define SCOUTFS_SRCH_BLOCKS_TYPE 8
|
|
#define SCOUTFS_SRCH_PENDING_TYPE 12
|
|
#define SCOUTFS_SRCH_BUSY_TYPE 16
|
|
|
|
/* file data extents have end and len in key */
|
|
struct scoutfs_data_extent_val {
|
|
__le64 blkno;
|
|
__u8 flags;
|
|
__u8 __pad[7];
|
|
};
|
|
|
|
#define SEF_OFFLINE (1 << 0)
|
|
#define SEF_UNWRITTEN (1 << 1)
|
|
#define SEF_UNKNOWN (U8_MAX << 2)
|
|
|
|
/*
|
|
* The first xattr part item has a header that describes the xattr. The
|
|
* name and value are then packed into the following bytes in the first
|
|
* part item and overflow into the values of the rest of the part items.
|
|
*/
|
|
struct scoutfs_xattr {
|
|
__le16 val_len;
|
|
__u8 name_len;
|
|
__u8 __pad[5];
|
|
__u8 name[];
|
|
};
|
|
|
|
/*
|
|
* .totl. xattrs are mapped to items. The dotted u64s in the xattr name
|
|
* map to the item key. The item value total is the sum of all the
|
|
* xattr values. The item value count records the number of xattrs
|
|
* contributing to the total and is used when combining logged items to
|
|
* determine if totals are being created or destroyed.
|
|
*/
|
|
struct scoutfs_xattr_totl_val {
|
|
__le64 total;
|
|
__le64 count;
|
|
};
|
|
|
|
#define SQ_RF_TOTL_COUNT (1 << 0)
|
|
#define SQ_RF__UNKNOWN (~((1 << 1) - 1))
|
|
|
|
#define SQ_NS_LITERAL 0
|
|
#define SQ_NS_PROJ 1
|
|
#define SQ_NS_UID 2
|
|
#define SQ_NS_GID 3
|
|
#define SQ_NS__NR 4
|
|
#define SQ_NS__NR_SELECT (SQ_NS__NR - 1) /* !literal */
|
|
|
|
#define SQ_NF_SELECT (1 << 0)
|
|
#define SQ_NF__UNKNOWN (~((1 << 1) - 1))
|
|
|
|
#define SQ_OP_INODE 0
|
|
#define SQ_OP_DATA 1
|
|
#define SQ_OP__NR 2
|
|
|
|
struct scoutfs_quota_rule_val {
|
|
__le64 name_val[3];
|
|
__le64 limit;
|
|
__u8 prio;
|
|
__u8 op;
|
|
__u8 rule_flags;
|
|
__u8 name_source[3];
|
|
__u8 name_flags[3];
|
|
__u8 _pad[7];
|
|
};
|
|
|
|
/* XXX does this exist upstream somewhere? */
|
|
#define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))
|
|
|
|
#define SCOUTFS_UUID_BYTES 16
|
|
|
|
#define SCOUTFS_QUORUM_MAX_SLOTS 15
|
|
|
|
/*
|
|
* To elect a leader, members race to have their variable election
|
|
* timeouts expire. If they're first to send a vote request with a
|
|
* greater term to a majority of waiting members they'll be elected with
|
|
* a majority. If the timeouts are too close, the vote may be split and
|
|
* everyone will wait for another cycle of variable timeouts to expire.
|
|
*
|
|
* These determine how long it will take to elect a leader once there's
|
|
* no evidence of a server (no leader quorum blocks on mount; heartbeat
|
|
* timeout expired.)
|
|
*/
|
|
#define SCOUTFS_QUORUM_ELECT_MIN_MS 250
|
|
#define SCOUTFS_QUORUM_ELECT_VAR_MS 100
|
|
|
|
/*
|
|
* Once a leader is elected they send heartbeat messages to all quorum
|
|
* members at regular intervals to force members to wait the much longer
|
|
* heartbeat timeout. Once the heartbeat timeout expires without
|
|
* receiving a heartbeat message a member will start an election.
|
|
*
|
|
* These determine how long it could take members to notice that a
|
|
* leader has gone silent and start to elect a new leader. The
|
|
* heartbeat timeout can be changed at run time by options.
|
|
*/
|
|
#define SCOUTFS_QUORUM_HB_IVAL_MS 100
|
|
#define SCOUTFS_QUORUM_MIN_HB_TIMEO_MS (2 * MSEC_PER_SEC)
|
|
#define SCOUTFS_QUORUM_DEF_HB_TIMEO_MS (10 * MSEC_PER_SEC)
|
|
#define SCOUTFS_QUORUM_MAX_HB_TIMEO_MS (60 * MSEC_PER_SEC)
|
|
|
|
/*
|
|
* A newly elected leader will give fencing some time before giving up and
|
|
* shutting down.
|
|
*/
|
|
#define SCOUTFS_QUORUM_FENCE_TO_MS (15 * MSEC_PER_SEC)
|
|
|
|
/*
 * A message exchanged between quorum members.  It carries the fsid, a
 * version, the sender's term, a message type, the sending slot in
 * from, and a trailing crc.
 *
 * NOTE(review): type is presumably one of the SCOUTFS_QUORUM_MSG_*
 * values and version presumably matches struct scoutfs_quorum_config's
 * version -- confirm against the quorum code.
 */
struct scoutfs_quorum_message {
|
|
__le64 fsid;
|
|
__le64 version;
|
|
__le64 term;
|
|
__u8 type;
|
|
__u8 from;
|
|
__u8 __pad[2];
|
|
__le32 crc;
|
|
};
|
|
|
|
/* a candidate requests a vote */
|
|
#define SCOUTFS_QUORUM_MSG_REQUEST_VOTE 0
|
|
/* followers send votes to candidates */
|
|
#define SCOUTFS_QUORUM_MSG_VOTE 1
|
|
/* elected leaders broadcast heartbeats to delay elections */
|
|
#define SCOUTFS_QUORUM_MSG_HEARTBEAT 2
|
|
/* leaders broadcast as they leave to break heartbeat timeout */
|
|
#define SCOUTFS_QUORUM_MSG_RESIGNATION 3
|
|
#define SCOUTFS_QUORUM_MSG_INVALID 4
|
|
|
|
/*
|
|
* The version is currently always 0, but will be used by mounts to
|
|
* discover that membership has changed.
|
|
*/
|
|
struct scoutfs_quorum_config {
|
|
__le64 version;
|
|
struct scoutfs_quorum_slot {
|
|
union scoutfs_inet_addr addr;
|
|
} slots[SCOUTFS_QUORUM_MAX_SLOTS];
|
|
};
|
|
|
|
enum {
|
|
SCOUTFS_QUORUM_EVENT_BEGIN, /* quorum service starting up */
|
|
SCOUTFS_QUORUM_EVENT_TERM, /* updated persistent term */
|
|
SCOUTFS_QUORUM_EVENT_ELECT, /* won election */
|
|
SCOUTFS_QUORUM_EVENT_FENCE, /* server fenced others */
|
|
SCOUTFS_QUORUM_EVENT_STOP, /* server stopped */
|
|
SCOUTFS_QUORUM_EVENT_END, /* quorum service shutting down */
|
|
SCOUTFS_QUORUM_EVENT_NR,
|
|
};
|
|
|
|
struct scoutfs_quorum_block {
|
|
struct scoutfs_block_header hdr;
|
|
__le64 write_nr;
|
|
struct scoutfs_quorum_block_event {
|
|
__le64 write_nr;
|
|
__le64 rid;
|
|
__le64 term;
|
|
struct scoutfs_timespec ts;
|
|
} events[SCOUTFS_QUORUM_EVENT_NR];
|
|
};
|
|
|
|
/*
|
|
* Tunable options that apply to the entire system. They can be set in
|
|
* mkfs or in sysfs files which send an rpc to the server to make the
|
|
* change. The super version defines the options that exist.
|
|
*
|
|
* @set_bits: bits for each 64bit starting offset after set_bits
|
|
* indicate which logical option is set.
|
|
*
|
|
* @data_alloc_zone_blocks: if set, the data device is logically divided
|
|
* into contiguous zones of this many blocks. Data allocation will try
|
|
* and isolate allocated extents for each mount to their own zone. The
|
|
* zone size must be larger than the data alloc high water mark and
|
|
* large enough such that the number of zones is kept within its static
|
|
* limit.
|
|
*/
|
|
struct scoutfs_volume_options {
|
|
__le64 set_bits;
|
|
__le64 data_alloc_zone_blocks;
|
|
__le64 __future_expansion[63];
|
|
};
|
|
|
|
#define scoutfs_volopt_nr(field) \
|
|
((offsetof(struct scoutfs_volume_options, field) - \
|
|
(offsetof(struct scoutfs_volume_options, set_bits) + \
|
|
member_sizeof(struct scoutfs_volume_options, set_bits))) / sizeof(__le64))
|
|
#define scoutfs_volopt_bit(field) \
|
|
(1ULL << scoutfs_volopt_nr(field))
|
|
|
|
#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR \
|
|
scoutfs_volopt_nr(data_alloc_zone_blocks)
|
|
#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT \
|
|
scoutfs_volopt_bit(data_alloc_zone_blocks)
|
|
|
|
#define SCOUTFS_VOLOPT_EXPANSION_BITS \
|
|
(~(scoutfs_volopt_bit(__future_expansion) - 1))
|
|
|
|
#define SCOUTFS_FLAG_IS_META_BDEV 0x01
|
|
|
|
struct scoutfs_super_block {
|
|
struct scoutfs_block_header hdr;
|
|
__le64 id;
|
|
__le64 fmt_vers;
|
|
__le64 flags;
|
|
__u8 uuid[SCOUTFS_UUID_BYTES];
|
|
__le64 seq;
|
|
__le64 next_ino;
|
|
__le64 inode_count;
|
|
__le64 total_meta_blocks; /* both static and dynamic */
|
|
__le64 total_data_blocks;
|
|
struct scoutfs_quorum_config qconf;
|
|
struct scoutfs_alloc_root meta_alloc[2];
|
|
struct scoutfs_alloc_root data_alloc;
|
|
struct scoutfs_alloc_list_head server_meta_avail[2];
|
|
struct scoutfs_alloc_list_head server_meta_freed[2];
|
|
struct scoutfs_btree_root fs_root;
|
|
struct scoutfs_btree_root logs_root;
|
|
struct scoutfs_btree_root log_merge;
|
|
struct scoutfs_btree_root mounted_clients;
|
|
struct scoutfs_btree_root srch_root;
|
|
struct scoutfs_volume_options volopt;
|
|
};
|
|
|
|
#define SCOUTFS_ROOT_INO 1
|
|
|
|
|
|
/*
|
|
* @meta_seq: advanced the first time an inode is updated in a given
|
|
* transaction. It can only advance again after the inode is written
|
|
* and a new transaction opens.
|
|
*
|
|
* @data_seq: advanced the first time a file's data (or size) is
|
|
* modified in a given transaction. It can only advance again after the
|
|
* file is written and a new transaction opens.
|
|
*
|
|
* @data_version: incremented every time the contents of a file could
|
|
* have changed. It is exposed via an ioctl and is then provided as an
|
|
* argument to data functions to protect racing modification.
|
|
*
|
|
* @online_blocks: The number of fixed 4k blocks currently allocated and
|
|
* storing data in the volume.
|
|
*
|
|
* @offline_blocks: The number of fixed 4k blocks that could be made
|
|
* online by staging.
|
|
*/
|
|
struct scoutfs_inode {
|
|
__le64 size;
|
|
__le64 meta_seq;
|
|
__le64 data_seq;
|
|
__le64 data_version;
|
|
__le64 online_blocks;
|
|
__le64 offline_blocks;
|
|
__le64 next_readdir_pos;
|
|
__le64 next_xattr_id;
|
|
__le64 version;
|
|
__le32 nlink;
|
|
__le32 uid;
|
|
__le32 gid;
|
|
__le32 mode;
|
|
__le32 rdev;
|
|
__le32 flags;
|
|
struct scoutfs_timespec atime;
|
|
struct scoutfs_timespec ctime;
|
|
struct scoutfs_timespec mtime;
|
|
struct scoutfs_timespec crtime;
|
|
__le64 proj;
|
|
};
|
|
|
|
#define SCOUTFS_INODE_FMT_V1_BYTES offsetof(struct scoutfs_inode, proj)
|
|
|
|
/*
|
|
* There are so few versions that we don't mind doing this work inline
|
|
* so that both utils and kernel can share these. Mounting has already
|
|
* checked that the format version is within the supported min and max,
|
|
* so these functions only deal with size variance within that band.
|
|
*/
|
|
/* Returns the native written inode size for the given format version, 0 for bad version */
|
|
static inline int scoutfs_inode_vers_bytes(__u64 fmt_vers)
|
|
{
|
|
if (fmt_vers == 1)
|
|
return SCOUTFS_INODE_FMT_V1_BYTES;
|
|
else
|
|
return sizeof(struct scoutfs_inode);
|
|
}
|
|
/*
|
|
* Returns true if bytes is a valid inode size to read from the given
|
|
* version. The given version must be greater than the version that
|
|
* introduced the size.
|
|
*/
|
|
static inline int scoutfs_inode_valid_vers_bytes(__u64 fmt_vers, int bytes)
|
|
{
|
|
return (bytes == sizeof(struct scoutfs_inode) && fmt_vers == SCOUTFS_FORMAT_VERSION_MAX) ||
|
|
(bytes == SCOUTFS_INODE_FMT_V1_BYTES);
|
|
}
|
|
|
|
#define SCOUTFS_INO_FLAG_TRUNCATE 0x1
|
|
#define SCOUTFS_INO_FLAG_RETENTION 0x2
|
|
|
|
#define SCOUTFS_ROOT_INO 1
|
|
|
|
/* like the block size, a reasonable min PATH_MAX across platforms */
|
|
#define SCOUTFS_SYMLINK_MAX_SIZE 4096
|
|
|
|
/*
|
|
* Dirents are stored in multiple places to isolate contention when
|
|
* performing different operations: hashed by name for creation and
|
|
* lookup, at incrementing positions for readdir and resolving inodes to
|
|
* paths. Each entry has all the metadata needed to reference all the
|
|
* items (so an entry cached by lookup can be used to unlink all the
|
|
* items).
|
|
*/
|
|
struct scoutfs_dirent {
|
|
__le64 ino;
|
|
__le64 hash;
|
|
__le64 pos;
|
|
__u8 type;
|
|
__u8 __pad[7];
|
|
__u8 name[];
|
|
};
|
|
|
|
#define SCOUTFS_NAME_LEN 255

/* S32_MAX avoids the (int) sign bit and might avoid sloppy bugs */
#define SCOUTFS_LINK_MAX S32_MAX

/* entries begin after . and .. */
#define SCOUTFS_DIRENT_FIRST_POS 2
/* getdents returns next pos with an entry, no entry at (f_pos)~0 */
#define SCOUTFS_DIRENT_LAST_POS (U64_MAX - 1)

/*
 * Directory entry types.  Only the first enumerator has an explicit
 * value so the rest count up from it; reordering or inserting entries
 * changes every value that follows.
 */
enum scoutfs_dentry_type {
	SCOUTFS_DT_FIFO = 0,
	SCOUTFS_DT_CHR,
	SCOUTFS_DT_DIR,
	SCOUTFS_DT_BLK,
	SCOUTFS_DT_REG,
	SCOUTFS_DT_LNK,
	SCOUTFS_DT_SOCK,
	SCOUTFS_DT_WHT,
};

#define SCOUTFS_XATTR_MAX_NAME_LEN 255
#define SCOUTFS_XATTR_MAX_VAL_LEN 65535
#define SCOUTFS_XATTR_MAX_PART_SIZE SCOUTFS_MAX_VAL_SIZE
#define SCOUTFS_XATTR_MAX_TOTL_U64 23 /* octal U64_MAX */

/*
 * The number of parts of at most SCOUTFS_XATTR_MAX_PART_SIZE bytes
 * needed to hold the xattr header, name, and value.  The arguments are
 * parenthesized so that callers can pass arbitrary expressions without
 * operator precedence changing the sum.
 */
#define SCOUTFS_XATTR_NR_PARTS(name_len, val_len)			\
	DIV_ROUND_UP(sizeof(struct scoutfs_xattr) + (name_len) + (val_len), \
		     (unsigned int)SCOUTFS_XATTR_MAX_PART_SIZE)

/* the group nr is a power of two so that nr - 1 is usable as a mask */
#define SCOUTFS_LOCK_INODE_GROUP_NR 1024
#define SCOUTFS_LOCK_INODE_GROUP_MASK (SCOUTFS_LOCK_INODE_GROUP_NR - 1)
#define SCOUTFS_LOCK_SEQ_GROUP_MASK ((1ULL << 10) - 1)

/*
 * messages over the wire.
 */

/*
|
|
* Greetings verify identity of communicating nodes. The sender sends
|
|
* their credentials and the receiver verifies them.
|
|
*
|
|
* @server_term: The raft term that elected the server. Initially 0
|
|
* from the client, sent by the server, then sent by the client as it
|
|
* tries to reconnect. Used to identify a client reconnecting to both
|
|
* the same serer after receiving a greeting response and to a new
|
|
* server after failover.
|
|
*
|
|
* @rid: The client's random id that was generated once as the mount
|
|
* started up. This identifies a specific remote mount across
|
|
* connections and servers. It's set to the client's rid in both the
|
|
* request and response for consistency.
|
|
*/
|
|
struct scoutfs_net_greeting {
|
|
__le64 fsid;
|
|
__le64 fmt_vers;
|
|
__le64 server_term;
|
|
__le64 rid;
|
|
__le64 flags;
|
|
};
|
|
|
|
/* FLAG_INVALID is every 64bit flag bit above the two defined flags */
#define SCOUTFS_NET_GREETING_FLAG_FAREWELL (1 << 0)
#define SCOUTFS_NET_GREETING_FLAG_QUORUM (1 << 1)
#define SCOUTFS_NET_GREETING_FLAG_INVALID (~(__u64)0 << 2)

/*
|
|
* This header precedes and describes all network messages sent over
|
|
* sockets.
|
|
*
|
|
* @seq: A sequence number that is increased for each message queued for
|
|
* send on the sender. The sender will never reorder messages in the
|
|
* send queue so this will always increase in recv on the receiver. The
|
|
* receiver can use this to drop messages that arrived twice after being
|
|
* resent across a newly connected socket for a given connection.
|
|
*
|
|
* @recv_seq: The sequence number of the last received message. The
|
|
* receiver is sending this to the sender in every message. The sender
|
|
* uses them to drop responses which have been delivered.
|
|
*
|
|
* @id: An increasing identifier that is set in each request. Responses
|
|
* specify the request that they're responding to.
|
|
*
|
|
* Error is only set to a translated errno and will only be found in
|
|
* response messages.
|
|
*/
|
|
struct scoutfs_net_header {
|
|
__le64 seq;
|
|
__le64 recv_seq;
|
|
__le64 id;
|
|
__le16 data_len;
|
|
__u8 cmd;
|
|
__u8 flags;
|
|
__u8 error;
|
|
__u8 __pad[3];
|
|
__u8 data[];
|
|
};
|
|
|
|
/* FLAGS_UNKNOWN is U8_MAX shifted up past the one defined flag bit */
#define SCOUTFS_NET_FLAG_RESPONSE (1 << 0)
#define SCOUTFS_NET_FLAGS_UNKNOWN (U8_MAX << 1)

/*
 * Network command numbers.  Only the first enumerator has an explicit
 * value so each command's number depends on its position in the list.
 * SCOUTFS_NET_CMD_UNKNOWN is last and so is one greater than the
 * highest command defined here.
 */
enum scoutfs_net_cmd {
	SCOUTFS_NET_CMD_GREETING = 0,
	SCOUTFS_NET_CMD_ALLOC_INODES,
	SCOUTFS_NET_CMD_GET_LOG_TREES,
	SCOUTFS_NET_CMD_COMMIT_LOG_TREES,
	SCOUTFS_NET_CMD_SYNC_LOG_TREES,
	SCOUTFS_NET_CMD_GET_ROOTS,
	SCOUTFS_NET_CMD_GET_LAST_SEQ,
	SCOUTFS_NET_CMD_LOCK,
	SCOUTFS_NET_CMD_LOCK_RECOVER,
	SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
	SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
	SCOUTFS_NET_CMD_GET_LOG_MERGE,
	SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
	SCOUTFS_NET_CMD_OPEN_INO_MAP,
	SCOUTFS_NET_CMD_GET_VOLOPT,
	SCOUTFS_NET_CMD_SET_VOLOPT,
	SCOUTFS_NET_CMD_CLEAR_VOLOPT,
	SCOUTFS_NET_CMD_RESIZE_DEVICES,
	SCOUTFS_NET_CMD_STATFS,
	SCOUTFS_NET_CMD_FAREWELL,
	SCOUTFS_NET_CMD_UNKNOWN,
};

/*
 * Define a macro to evaluate another macro for each of the errnos we
 * translate over the wire.  This lets us keep our enum in sync with the
 * mapping arrays to and from host errnos.
 */
#define EXPAND_EACH_NET_ERRNO		\
	EXPAND_NET_ERRNO(ENOENT)	\
	EXPAND_NET_ERRNO(ENOMEM)	\
	EXPAND_NET_ERRNO(EIO)		\
	EXPAND_NET_ERRNO(ENOSPC)	\
	EXPAND_NET_ERRNO(EINVAL)	\
	EXPAND_NET_ERRNO(ENOLINK)

/*
 * Expand the list into one SCOUTFS_NET_ERR_ enumerator per errno.  The
 * name is token pasted so the enumerators are named after the errno
 * symbols rather than their host values.  NONE is 0, the listed errnos
 * follow in list order, and UNKNOWN comes last.
 */
#undef EXPAND_NET_ERRNO
#define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,
enum scoutfs_net_errors {
	SCOUTFS_NET_ERR_NONE = 0,
	EXPAND_EACH_NET_ERRNO
	SCOUTFS_NET_ERR_UNKNOWN,
};

/* arbitrarily chosen to be safely less than mss and allow 1k with header */
#define SCOUTFS_NET_MAX_DATA_LEN 1100

/*
|
|
* When there's no more free inodes this will be sent with ino = ~0 and
|
|
* nr = 0.
|
|
*/
|
|
struct scoutfs_net_inode_alloc {
|
|
__le64 ino;
|
|
__le64 nr;
|
|
};
|
|
|
|
struct scoutfs_net_roots {
|
|
struct scoutfs_btree_root fs_root;
|
|
struct scoutfs_btree_root logs_root;
|
|
struct scoutfs_btree_root srch_root;
|
|
};
|
|
|
|
struct scoutfs_net_resize_devices {
|
|
__le64 new_total_meta_blocks;
|
|
__le64 new_total_data_blocks;
|
|
};
|
|
|
|
struct scoutfs_net_statfs {
|
|
__u8 uuid[SCOUTFS_UUID_BYTES];
|
|
__le64 free_meta_blocks;
|
|
__le64 total_meta_blocks;
|
|
__le64 free_data_blocks;
|
|
__le64 total_data_blocks;
|
|
__le64 inode_count;
|
|
};
|
|
|
|
struct scoutfs_net_lock {
|
|
struct scoutfs_key key;
|
|
__le64 write_seq;
|
|
__u8 old_mode;
|
|
__u8 new_mode;
|
|
__u8 __pad[6];
|
|
};
|
|
|
|
struct scoutfs_net_lock_recover {
|
|
__le16 nr;
|
|
__u8 __pad[6];
|
|
struct scoutfs_net_lock locks[];
|
|
};
|
|
|
|
/*
 * The number of whole lock entries that fit in the largest message data
 * payload after the recover header.
 */
#define SCOUTFS_NET_LOCK_MAX_RECOVER_NR					\
	((SCOUTFS_NET_MAX_DATA_LEN - sizeof(struct scoutfs_net_lock_recover)) /\
	 sizeof(struct scoutfs_net_lock))

/*
 * some enums for tracing.  SLT_NR is last and so equals the number of
 * values that precede it.
 */
enum scoutfs_lock_trace {
	SLT_CLIENT,
	SLT_SERVER,
	SLT_GRANT,
	SLT_INVALIDATE,
	SLT_REQUEST,
	SLT_RESPONSE,
	SLT_NR,
};

/*
 * Read and write locks operate as you'd expect.  Multiple readers can
 * hold read locks while writers are excluded.  A single writer can hold
 * a write lock which excludes other readers and writers.  Writers can
 * read while holding a write lock.
 *
 * Multiple writers can hold write only locks but they can not read,
 * they can only generate dirty items.  It's used when the system has
 * other means of knowing that it's safe to overwrite items.
 *
 * The null mode provides no access and is used to destroy locks.
 *
 * SCOUTFS_LOCK_INVALID is last and so is one greater than the highest
 * mode defined here.
 */
enum scoutfs_lock_mode {
	SCOUTFS_LOCK_NULL = 0,
	SCOUTFS_LOCK_READ,
	SCOUTFS_LOCK_WRITE,
	SCOUTFS_LOCK_WRITE_ONLY,
	SCOUTFS_LOCK_INVALID,
};

/*
|
|
* Scoutfs file handle structure - this can be copied out to userspace
|
|
* via open by handle or put on the wire from NFS.
|
|
*/
|
|
struct scoutfs_fid {
|
|
__le64 ino;
|
|
__le64 parent_ino;
|
|
};
|
|
|
|
/*
 * File handle type values.
 * NOTE(review): presumably these tell a struct scoutfs_fid that only
 * has ino apart from one that also has parent_ino; confirm against the
 * export code.
 */
#define FILEID_SCOUTFS 0x81
#define FILEID_SCOUTFS_WITH_PARENT 0x82

/*
 * Identifiers for sources of corruption that can generate messages.
 * SC_NR_SOURCES is last and so equals the number of sources.
 */
enum scoutfs_corruption_sources {
	SC_DIRENT_NAME_LEN = 0,
	SC_DIRENT_BACKREF_NAME_LEN,
	SC_DIRENT_READDIR_NAME_LEN,
	SC_SYMLINK_INODE_SIZE,
	SC_SYMLINK_MISSING_ITEM,
	SC_SYMLINK_NOT_NULL_TERM,
	SC_BTREE_BLOCK_LEVEL,
	SC_BTREE_NO_CHILD_REF,
	SC_INODE_BLOCK_COUNTS,
	SC_NR_SOURCES,
};

/* the number of longs needed to have one bit for each source */
#define SC_NR_LONGS DIV_ROUND_UP(SC_NR_SOURCES, BITS_PER_LONG)

/*
 * The size of an open inode map is derived from the shift: the number
 * of bits, the mask of a bit's index within a map, and the number of
 * 64bit words that hold the bits.
 */
#define SCOUTFS_OPEN_INO_MAP_SHIFT 10
#define SCOUTFS_OPEN_INO_MAP_BITS (1 << SCOUTFS_OPEN_INO_MAP_SHIFT)
#define SCOUTFS_OPEN_INO_MAP_MASK (SCOUTFS_OPEN_INO_MAP_BITS - 1)
#define SCOUTFS_OPEN_INO_MAP_LE64S (SCOUTFS_OPEN_INO_MAP_BITS / 64)

/*
|
|
* The request and response conversation is as follows:
|
|
*
|
|
* client[init] -> server:
|
|
* group_nr = G
|
|
* req_id = 0 (I)
|
|
* server -> client[*]
|
|
* group_nr = G
|
|
* req_id = R
|
|
* client[*] -> server
|
|
* group_nr = G (I)
|
|
* req_id = R
|
|
* bits
|
|
* server -> client[init]
|
|
* group_nr = G (I)
|
|
* req_id = R (I)
|
|
* bits
|
|
*
|
|
* Many of the fields in individual messages are ignored ("I") because
|
|
* the net id or the omap req_id can be used to identify the
|
|
* conversation. We always include them on the wire to make inspected
|
|
* messages easier to follow.
|
|
*/
|
|
struct scoutfs_open_ino_map_args {
|
|
__le64 group_nr;
|
|
__le64 req_id;
|
|
};
|
|
|
|
struct scoutfs_open_ino_map {
|
|
struct scoutfs_open_ino_map_args args;
|
|
__le64 bits[SCOUTFS_OPEN_INO_MAP_LE64S];
|
|
};
|
|
|
|
#endif
|