mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-06 12:06:26 +00:00
scoutfs-utils: update format.h for quorum
Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
@@ -38,6 +38,33 @@
|
||||
*/
|
||||
#define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
|
||||
|
||||
/*
|
||||
* A reasonably large region of aligned quorum blocks follow the super
|
||||
* block.
|
||||
*/
|
||||
#define SCOUTFS_QUORUM_BLKNO ((128ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
|
||||
#define SCOUTFS_QUORUM_BLOCKS ((128ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
|
||||
#define SCOUTFS_QUORUM_MAX_SLOTS SCOUTFS_QUORUM_BLOCKS
|
||||
|
||||
/*
|
||||
* Base types used by other structures.
|
||||
*/
|
||||
struct scoutfs_timespec {
|
||||
__le64 sec;
|
||||
__le32 nsec;
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_betimespec {
|
||||
__be64 sec;
|
||||
__be32 nsec;
|
||||
} __packed;
|
||||
|
||||
/* XXX ipv6 */
|
||||
struct scoutfs_inet_addr {
|
||||
__le32 addr;
|
||||
__le16 port;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* This header is stored at the start of btree blocks and the super
|
||||
* block for verification. The crc is calculated by zeroing the crc and
|
||||
@@ -340,22 +367,72 @@ struct scoutfs_xattr {
|
||||
__u8 name[0];
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_betimespec {
|
||||
__be64 sec;
|
||||
__be32 nsec;
|
||||
} __packed;
|
||||
|
||||
/* XXX does this exist upstream somewhere? */
|
||||
#define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))
|
||||
|
||||
#define SCOUTFS_UUID_BYTES 16
|
||||
#define SCOUTFS_UNIQUE_NAME_MAX_BYTES 64 /* includes null */
|
||||
|
||||
/* XXX ipv6 */
|
||||
struct scoutfs_inet_addr {
|
||||
__le32 addr;
|
||||
__le16 port;
|
||||
/*
|
||||
* During each quorum voting interval the fabric has to process 2 reads
|
||||
* and a write for each voting mount. The only reason we limit the
|
||||
* number of active quorum mounts is to limit the number of IOs per
|
||||
* interval. We use a pretty conservative interval given that IOs will
|
||||
* generally be faster than our constant and we'll have fewer active
|
||||
* than the max.
|
||||
*/
|
||||
#define SCOUTFS_QUORUM_MAX_ACTIVE 7
|
||||
#define SCOUTFS_QUORUM_IO_LATENCY_MS 10
|
||||
#define SCOUTFS_QUORUM_INTERVAL_MS \
|
||||
(SCOUTFS_QUORUM_MAX_ACTIVE * 3 * SCOUTFS_QUORUM_IO_LATENCY_MS)
|
||||
|
||||
/*
|
||||
* Each mount that is found in the quorum config in the super block can
|
||||
* write to quorum blocks indicating which mount they vote for as
|
||||
* the leader.
|
||||
*
|
||||
* @config_gen: references the config gen in the super block
|
||||
* @write_nr: incremented for every write, only 0 when never written
|
||||
* @elected_nr: incremented when elected, 0 otherwise
|
||||
* @vote_slot: the active config slot that the writer is voting for
|
||||
*/
|
||||
struct scoutfs_quorum_block {
|
||||
__le64 fsid;
|
||||
__le64 blkno;
|
||||
__le64 config_gen;
|
||||
__le64 write_nr;
|
||||
__le64 elected_nr;
|
||||
__le32 crc;
|
||||
__u8 vote_slot;
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_QUORUM_MAX_SLOTS SCOUTFS_QUORUM_BLOCKS
|
||||
|
||||
/*
|
||||
* Each quorum voter is described by a slot which corresponds to the
|
||||
* block that the voter will write to.
|
||||
*
|
||||
* The stale flag is used to support config migration. A new
|
||||
* configuration is written in free slots and the old configuration is
|
||||
* marked stale. Stale slots can only be reclaimed once we have
|
||||
* evidence that the named mount won't try and write to it by seeing it
|
||||
* write to other slots or connect with the new gen.
|
||||
*/
|
||||
struct scoutfs_quorum_config {
|
||||
__le64 gen;
|
||||
struct scoutfs_quorum_slot {
|
||||
__u8 name[SCOUTFS_UNIQUE_NAME_MAX_BYTES];
|
||||
struct scoutfs_inet_addr addr;
|
||||
__u8 vote_priority;
|
||||
__u8 flags;
|
||||
} __packed slots[SCOUTFS_QUORUM_MAX_SLOTS];
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_QUORUM_SLOT_ACTIVE (1 << 0)
|
||||
#define SCOUTFS_QUORUM_SLOT_STALE (1 << 1)
|
||||
#define SCOUTFS_QUORUM_SLOT_FLAGS_UNKNOWN (U8_MAX << 2)
|
||||
|
||||
struct scoutfs_super_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
__le64 id;
|
||||
@@ -373,14 +450,11 @@ struct scoutfs_super_block {
|
||||
struct scoutfs_btree_root alloc_root;
|
||||
struct scoutfs_manifest manifest;
|
||||
struct scoutfs_inet_addr server_addr;
|
||||
struct scoutfs_quorum_config quorum_config;
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_ROOT_INO 1
|
||||
|
||||
struct scoutfs_timespec {
|
||||
__le64 sec;
|
||||
__le32 nsec;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* @meta_seq: advanced the first time an inode is updated in a given
|
||||
|
||||
199
utils/src/mkfs.c
199
utils/src/mkfs.c
@@ -1,4 +1,5 @@
|
||||
#include <unistd.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@@ -10,6 +11,11 @@
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <getopt.h>
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "cmd.h"
|
||||
@@ -157,7 +163,7 @@ static char *size_str(u64 nr, unsigned size)
|
||||
* - btree ring blocks with manifest and allocator btree blocks
|
||||
* - segment with root inode items
|
||||
*/
|
||||
static int write_new_fs(char *path, int fd)
|
||||
static int write_new_fs(char *path, int fd, struct scoutfs_quorum_config *conf)
|
||||
{
|
||||
struct scoutfs_super_block *super;
|
||||
struct scoutfs_key *ino_key;
|
||||
@@ -170,10 +176,13 @@ static int write_new_fs(char *path, int fd)
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_btree_item *btitem;
|
||||
struct scoutfs_segment_item *item;
|
||||
struct scoutfs_quorum_slot *slot;
|
||||
struct scoutfs_key key;
|
||||
struct in_addr in;
|
||||
__le32 *prev_link;
|
||||
struct timeval tv;
|
||||
char uuid_str[37];
|
||||
void *zeros;
|
||||
u64 blkno;
|
||||
u64 limit;
|
||||
u64 size;
|
||||
@@ -184,13 +193,15 @@ static int write_new_fs(char *path, int fd)
|
||||
u64 free_start;
|
||||
u64 free_len;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
super = calloc(1, SCOUTFS_BLOCK_SIZE);
|
||||
bt = calloc(1, SCOUTFS_BLOCK_SIZE);
|
||||
sblk = calloc(1, SCOUTFS_SEGMENT_SIZE);
|
||||
if (!super || !bt || !sblk) {
|
||||
zeros = calloc(1, SCOUTFS_SEGMENT_SIZE);
|
||||
if (!super || !bt || !sblk || !zeros) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to allocate block mem: %s (%d)\n",
|
||||
strerror(errno), errno);
|
||||
@@ -229,6 +240,9 @@ static int write_new_fs(char *path, int fd)
|
||||
super->next_node_id = cpu_to_le64(1);
|
||||
super->next_compact_id = cpu_to_le64(1);
|
||||
|
||||
super->quorum_config = *conf;
|
||||
super->quorum_config.gen = cpu_to_le64(1);
|
||||
|
||||
/* align the btree ring to the segment after the super */
|
||||
blkno = round_up(SCOUTFS_SUPER_BLKNO + 1, SCOUTFS_SEGMENT_BLOCKS);
|
||||
/* first usable segno follows manifest ring */
|
||||
@@ -388,6 +402,16 @@ static int write_new_fs(char *path, int fd)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* zero out quorum blocks */
|
||||
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
|
||||
ret = write_raw_block(fd, SCOUTFS_QUORUM_BLKNO + i, zeros);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "error zeroing quorum block: %s (%d)\n",
|
||||
strerror(-errno), -errno);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* write the super block */
|
||||
super->hdr.seq = cpu_to_le64(1);
|
||||
ret = write_block(fd, SCOUTFS_SUPER_BLKNO, NULL, &super->hdr);
|
||||
@@ -423,6 +447,19 @@ static int write_new_fs(char *path, int fd)
|
||||
SIZE_ARGS(le64_to_cpu(super->free_blocks),
|
||||
SCOUTFS_BLOCK_SIZE));
|
||||
|
||||
printf(" quorum slots:\n");
|
||||
for (i = 0; i < array_size(super->quorum_config.slots); i++) {
|
||||
slot = &super->quorum_config.slots[i];
|
||||
if (slot->flags == 0)
|
||||
continue;
|
||||
|
||||
in.s_addr = htonl(le32_to_cpu(slot->addr.addr));
|
||||
|
||||
printf(" [%2u]: name %s priority %u addr:port %s:%u\n",
|
||||
i, slot->name, slot->vote_priority,
|
||||
inet_ntoa(in), le16_to_cpu(slot->addr.port));
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
if (super)
|
||||
@@ -431,20 +468,174 @@ out:
|
||||
free(bt);
|
||||
if (sblk)
|
||||
free(sblk);
|
||||
if (zeros)
|
||||
free(zeros);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct option long_ops[] = {
|
||||
{ "quorum_slot", 1, NULL, 'Q' },
|
||||
{ NULL, 0, NULL, 0}
|
||||
};
|
||||
|
||||
enum { NAME, PRIORITY, ADDR, PORT };
|
||||
|
||||
static int parse_quorum_slot(struct scoutfs_quorum_config *conf, char *arg)
|
||||
{
|
||||
struct scoutfs_quorum_slot *slot;
|
||||
struct scoutfs_quorum_slot *sl;
|
||||
struct in_addr in;
|
||||
unsigned long port;
|
||||
int free_slot;
|
||||
char *save;
|
||||
char *tok;
|
||||
char *dup;
|
||||
char *s;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
dup = strdup(arg);
|
||||
if (!dup) {
|
||||
printf("allocation failure while parsing quorum slot '%s'\n",
|
||||
arg);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
for (i = 0; i < array_size(conf->slots); i++) {
|
||||
if (conf->slots[i].flags == 0)
|
||||
break;
|
||||
}
|
||||
if (i == array_size(conf->slots)) {
|
||||
printf("too many quorum slots provided\n");
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
slot = &conf->slots[i];
|
||||
free_slot = i;
|
||||
|
||||
slot->addr.port = cpu_to_le16(23853); /* randomly chosen */
|
||||
|
||||
for (save = NULL, s = dup, i = NAME; i <= PORT; i++, s = NULL) {
|
||||
tok = strtok_r(s, ":", &save);
|
||||
|
||||
if (tok == NULL)
|
||||
break;
|
||||
|
||||
/* assume flags and a default port */
|
||||
if (i == PORT && !isdigit(tok[0]))
|
||||
i = PRIORITY;
|
||||
|
||||
switch(i) {
|
||||
case NAME:
|
||||
if (strlen(tok) >= SCOUTFS_UNIQUE_NAME_MAX_BYTES) {
|
||||
printf("quorum slot name too long: %s\n", tok);
|
||||
return -EINVAL;
|
||||
}
|
||||
strcpy((char *)slot->name, tok);
|
||||
break;
|
||||
|
||||
case PRIORITY:
|
||||
slot->vote_priority = strtoul(tok, NULL, 0);
|
||||
if (slot->vote_priority > 255) {
|
||||
printf("invalid quorum slot priority: %s\n",
|
||||
tok);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
|
||||
case ADDR:
|
||||
if (inet_aton(tok, &in) == 0) {
|
||||
printf("invalid quorum slot address: %s\n", tok);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
slot->addr.addr = cpu_to_le32(htonl(in.s_addr));
|
||||
break;
|
||||
|
||||
case PORT:
|
||||
port = strtoul(tok, NULL, 0);
|
||||
if (port == 0 || port >= 65535) {
|
||||
printf("invalid quorum slot port: %s\n", tok);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
slot->addr.port = cpu_to_le16(port);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (slot->name[0] == '\0') {
|
||||
printf("quorum slot must specify name: %s\n", arg);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (slot->addr.addr == 0) {
|
||||
printf("quorum slot must specify address: %s\n", arg);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (i = 0; i < free_slot; i++) {
|
||||
sl = &conf->slots[i];
|
||||
|
||||
if (strcmp((char *)slot->name, (char *)sl->name) == 0) {
|
||||
printf("duplicate quorum slot name: %s\n", arg);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (memcmp(&slot->addr, &sl->addr, sizeof(slot->addr)) == 0) {
|
||||
printf("duplicate quorum slot addr: %s\n", arg);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
slot->flags = SCOUTFS_QUORUM_SLOT_ACTIVE;
|
||||
ret = 0;
|
||||
out:
|
||||
free(dup);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int mkfs_func(int argc, char *argv[])
|
||||
{
|
||||
struct scoutfs_quorum_config conf = {0,};
|
||||
bool have_quorum = false;
|
||||
char *path = argv[1];
|
||||
int ret;
|
||||
int fd;
|
||||
int c;
|
||||
|
||||
if (argc != 2) {
|
||||
while ((c = getopt_long(argc, argv, "Q:", long_ops, NULL)) != -1) {
|
||||
switch (c) {
|
||||
case 'Q':
|
||||
ret = parse_quorum_slot(&conf, optarg);
|
||||
if (ret)
|
||||
return ret;
|
||||
have_quorum = true;
|
||||
break;
|
||||
case '?':
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
if (optind >= argc) {
|
||||
printf("scoutfs: mkfs: a single path argument is required\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
path = argv[optind];
|
||||
|
||||
if (!have_quorum) {
|
||||
printf("must configure quorum with --quorum_slot|-Q options\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
fd = open(path, O_RDWR | O_EXCL);
|
||||
if (fd < 0) {
|
||||
ret = -errno;
|
||||
@@ -453,7 +644,7 @@ static int mkfs_func(int argc, char *argv[])
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = write_new_fs(path, fd);
|
||||
ret = write_new_fs(path, fd, &conf);
|
||||
close(fd);
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -9,6 +9,9 @@
|
||||
#include <stdarg.h>
|
||||
#include <ctype.h>
|
||||
#include <uuid/uuid.h>
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
@@ -440,9 +443,68 @@ static int print_btree(int fd, struct scoutfs_super_block *super, char *which,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
|
||||
{
|
||||
struct scoutfs_quorum_block *blk;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
|
||||
blkno = SCOUTFS_QUORUM_BLKNO + i;
|
||||
blk = read_block(fd, blkno);
|
||||
if (!blk) {
|
||||
ret = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
if (blk->fsid != 0 || blk->write_nr != 0) {
|
||||
printf("quorum block blkno %llu\n"
|
||||
" fsid %llx blkno %llu config_gen %llu crc 0x%08x\n"
|
||||
" write_nr %llu elected_nr %llu vote_slot %u\n",
|
||||
blkno, le64_to_cpu(blk->fsid),
|
||||
le64_to_cpu(blk->blkno),
|
||||
le64_to_cpu(blk->config_gen),
|
||||
le32_to_cpu(blk->crc),
|
||||
le64_to_cpu(blk->write_nr),
|
||||
le64_to_cpu(blk->elected_nr),
|
||||
blk->vote_slot);
|
||||
}
|
||||
|
||||
free(blk);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void print_slot_flags(unsigned long flags)
|
||||
{
|
||||
if (flags == 0) {
|
||||
printf("-");
|
||||
return;
|
||||
}
|
||||
|
||||
while (flags) {
|
||||
if (flags & SCOUTFS_QUORUM_SLOT_ACTIVE) {
|
||||
printf("active");
|
||||
flags &= ~SCOUTFS_QUORUM_SLOT_ACTIVE;
|
||||
|
||||
} else if (flags & SCOUTFS_QUORUM_SLOT_STALE) {
|
||||
printf("stale");
|
||||
flags &= ~SCOUTFS_QUORUM_SLOT_STALE;
|
||||
}
|
||||
|
||||
if (flags)
|
||||
printf(",");
|
||||
}
|
||||
}
|
||||
|
||||
static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
|
||||
{
|
||||
struct scoutfs_quorum_slot *slot;
|
||||
char uuid_str[37];
|
||||
struct in_addr in;
|
||||
u64 count;
|
||||
int i;
|
||||
|
||||
@@ -491,6 +553,22 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
|
||||
printf(" %u: %llu", i, count);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf(" quorum_config:\n gen: %llu\n",
|
||||
le64_to_cpu(super->quorum_config.gen));
|
||||
for (i = 0; i < array_size(super->quorum_config.slots); i++) {
|
||||
slot = &super->quorum_config.slots[i];
|
||||
if (slot->flags == 0)
|
||||
continue;
|
||||
|
||||
in.s_addr = htonl(le32_to_cpu(slot->addr.addr));
|
||||
|
||||
printf(" [%2u]: name %s priority %u addr %s:%u flags ",
|
||||
i, slot->name, slot->vote_priority, inet_ntoa(in),
|
||||
le16_to_cpu(slot->addr.port));
|
||||
print_slot_flags(slot->flags);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
static int print_volume(int fd)
|
||||
@@ -516,8 +594,12 @@ static int print_volume(int fd)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = print_btree(fd, super, "alloc", &super->alloc_root,
|
||||
print_alloc_item, NULL);
|
||||
ret = print_quorum_blocks(fd, super);
|
||||
|
||||
err = print_btree(fd, super, "alloc", &super->alloc_root,
|
||||
print_alloc_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_btree(fd, super, "manifest", &super->manifest.root,
|
||||
print_manifest_entry, seg_map);
|
||||
|
||||
@@ -8,5 +8,5 @@
|
||||
|
||||
void pseudo_random_bytes(void *data, unsigned int len)
|
||||
{
|
||||
RAND_pseudo_bytes(data, len);
|
||||
RAND_bytes(data, len);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user