scoutfs-utils: update format.h for quorum

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2018-10-10 11:09:48 -07:00
parent f59dfe8b73
commit ea969a5dde
4 changed files with 366 additions and 19 deletions

View File

@@ -38,6 +38,33 @@
*/
#define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
/*
* A reasonably large region of aligned quorum blocks follows the super
* block.
*/
#define SCOUTFS_QUORUM_BLKNO ((128ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_QUORUM_BLOCKS ((128ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_QUORUM_MAX_SLOTS SCOUTFS_QUORUM_BLOCKS
/*
* Base types used by other structures.
*/
/*
 * Packed on-disk timestamp whose fields are stored little-endian.
 * Presumably sec is whole seconds and nsec the nanosecond remainder --
 * confirm against the code that fills it in.
 */
struct scoutfs_timespec {
	__le64 sec;
	__le32 nsec;
} __packed;
/*
 * Big-endian counterpart of struct scoutfs_timespec: same fields and
 * packing, but stored big-endian.
 */
struct scoutfs_betimespec {
	__be64 sec;
	__be32 nsec;
} __packed;
/* XXX ipv6 */
/*
 * Packed on-disk IPv4 address and port.  Both fields hold the host
 * order value converted to little-endian; users convert with
 * le32_to_cpu()/le16_to_cpu() and then to network order as needed.
 */
struct scoutfs_inet_addr {
	__le32 addr;
	__le16 port;
} __packed;
/*
* This header is stored at the start of btree blocks and the super
* block for verification. The crc is calculated by zeroing the crc and
@@ -340,22 +367,72 @@ struct scoutfs_xattr {
__u8 name[0];
} __packed;
struct scoutfs_betimespec {
__be64 sec;
__be32 nsec;
} __packed;
/* XXX does this exist upstream somewhere? */
#define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))
#define SCOUTFS_UUID_BYTES 16
#define SCOUTFS_UNIQUE_NAME_MAX_BYTES 64 /* includes null */
/* XXX ipv6 */
struct scoutfs_inet_addr {
__le32 addr;
__le16 port;
/*
* During each quorum voting interval the fabric has to process 2 reads
* and a write for each voting mount. The only reason we limit the
* number of active quorum mounts is to limit the number of IOs per
* interval. We use a pretty conservative interval given that IOs will
* generally be faster than our constant and we'll have fewer active
* than the max.
*/
#define SCOUTFS_QUORUM_MAX_ACTIVE 7
#define SCOUTFS_QUORUM_IO_LATENCY_MS 10
#define SCOUTFS_QUORUM_INTERVAL_MS \
(SCOUTFS_QUORUM_MAX_ACTIVE * 3 * SCOUTFS_QUORUM_IO_LATENCY_MS)
/*
* Each mount that is found in the quorum config in the super block can
* write to quorum blocks indicating which mount they vote for as
* the leader.
*
* @config_gen: references the config gen in the super block
* @write_nr: incremented for every write, only 0 when never written
* @elected_nr: incremented when elected, 0 otherwise
* @vote_slot: the active config slot that the writer is voting for
*/
/*
 * Packed on-disk layout of a quorum block; all multi-byte fields are
 * little-endian.  A block with zero fsid and zero write_nr is treated
 * as never written by the print command.
 */
struct scoutfs_quorum_block {
	__le64 fsid;		/* presumably matches the super block's fsid -- confirm */
	__le64 blkno;		/* presumably this block's own block number -- confirm */
	__le64 config_gen;	/* config gen in the super block this vote refers to */
	__le64 write_nr;	/* incremented for every write, 0 only if never written */
	__le64 elected_nr;	/* incremented when elected, 0 otherwise */
	__le32 crc;		/* NOTE(review): coverage of the crc isn't visible here */
	__u8 vote_slot;		/* active config slot the writer is voting for */
} __packed;
#define SCOUTFS_QUORUM_MAX_SLOTS SCOUTFS_QUORUM_BLOCKS
/*
* Each quorum voter is described by a slot which corresponds to the
* block that the voter will write to.
*
* The stale flag is used to support config migration. A new
* configuration is written in free slots and the old configuration is
* marked stale. Stale slots can only be reclaimed once we have
* evidence that the named mount won't try and write to it by seeing it
* write to other slots or connect with the new gen.
*/
/*
 * Packed on-disk quorum configuration stored in the super block: a
 * generation number followed by a fixed array of voter slots.
 */
struct scoutfs_quorum_config {
	__le64 gen;	/* config generation; mkfs writes the first config as gen 1 */
	struct scoutfs_quorum_slot {
		/* null terminated unique name of the mount */
		__u8 name[SCOUTFS_UNIQUE_NAME_MAX_BYTES];
		/* address and port associated with the mount */
		struct scoutfs_inet_addr addr;
		__u8 vote_priority;
		/* SCOUTFS_QUORUM_SLOT_* bits; a slot with no flags set is unused */
		__u8 flags;
	} __packed slots[SCOUTFS_QUORUM_MAX_SLOTS];
} __packed;
#define SCOUTFS_QUORUM_SLOT_ACTIVE (1 << 0)
#define SCOUTFS_QUORUM_SLOT_STALE (1 << 1)
#define SCOUTFS_QUORUM_SLOT_FLAGS_UNKNOWN (U8_MAX << 2)
struct scoutfs_super_block {
struct scoutfs_block_header hdr;
__le64 id;
@@ -373,14 +450,11 @@ struct scoutfs_super_block {
struct scoutfs_btree_root alloc_root;
struct scoutfs_manifest manifest;
struct scoutfs_inet_addr server_addr;
struct scoutfs_quorum_config quorum_config;
} __packed;
#define SCOUTFS_ROOT_INO 1
struct scoutfs_timespec {
__le64 sec;
__le32 nsec;
} __packed;
/*
* @meta_seq: advanced the first time an inode is updated in a given

View File

@@ -1,4 +1,5 @@
#include <unistd.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
@@ -10,6 +11,11 @@
#include <sys/stat.h>
#include <unistd.h>
#include <assert.h>
#include <getopt.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <ctype.h>
#include "sparse.h"
#include "cmd.h"
@@ -157,7 +163,7 @@ static char *size_str(u64 nr, unsigned size)
* - btree ring blocks with manifest and allocator btree blocks
* - segment with root inode items
*/
static int write_new_fs(char *path, int fd)
static int write_new_fs(char *path, int fd, struct scoutfs_quorum_config *conf)
{
struct scoutfs_super_block *super;
struct scoutfs_key *ino_key;
@@ -170,10 +176,13 @@ static int write_new_fs(char *path, int fd)
struct scoutfs_btree_block *bt;
struct scoutfs_btree_item *btitem;
struct scoutfs_segment_item *item;
struct scoutfs_quorum_slot *slot;
struct scoutfs_key key;
struct in_addr in;
__le32 *prev_link;
struct timeval tv;
char uuid_str[37];
void *zeros;
u64 blkno;
u64 limit;
u64 size;
@@ -184,13 +193,15 @@ static int write_new_fs(char *path, int fd)
u64 free_start;
u64 free_len;
int ret;
int i;
gettimeofday(&tv, NULL);
super = calloc(1, SCOUTFS_BLOCK_SIZE);
bt = calloc(1, SCOUTFS_BLOCK_SIZE);
sblk = calloc(1, SCOUTFS_SEGMENT_SIZE);
if (!super || !bt || !sblk) {
zeros = calloc(1, SCOUTFS_SEGMENT_SIZE);
if (!super || !bt || !sblk || !zeros) {
ret = -errno;
fprintf(stderr, "failed to allocate block mem: %s (%d)\n",
strerror(errno), errno);
@@ -229,6 +240,9 @@ static int write_new_fs(char *path, int fd)
super->next_node_id = cpu_to_le64(1);
super->next_compact_id = cpu_to_le64(1);
super->quorum_config = *conf;
super->quorum_config.gen = cpu_to_le64(1);
/* align the btree ring to the segment after the super */
blkno = round_up(SCOUTFS_SUPER_BLKNO + 1, SCOUTFS_SEGMENT_BLOCKS);
/* first usable segno follows manifest ring */
@@ -388,6 +402,16 @@ static int write_new_fs(char *path, int fd)
goto out;
}
/* zero out quorum blocks */
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
ret = write_raw_block(fd, SCOUTFS_QUORUM_BLKNO + i, zeros);
if (ret < 0) {
fprintf(stderr, "error zeroing quorum block: %s (%d)\n",
strerror(-errno), -errno);
goto out;
}
}
/* write the super block */
super->hdr.seq = cpu_to_le64(1);
ret = write_block(fd, SCOUTFS_SUPER_BLKNO, NULL, &super->hdr);
@@ -423,6 +447,19 @@ static int write_new_fs(char *path, int fd)
SIZE_ARGS(le64_to_cpu(super->free_blocks),
SCOUTFS_BLOCK_SIZE));
printf(" quorum slots:\n");
for (i = 0; i < array_size(super->quorum_config.slots); i++) {
slot = &super->quorum_config.slots[i];
if (slot->flags == 0)
continue;
in.s_addr = htonl(le32_to_cpu(slot->addr.addr));
printf(" [%2u]: name %s priority %u addr:port %s:%u\n",
i, slot->name, slot->vote_priority,
inet_ntoa(in), le16_to_cpu(slot->addr.port));
}
ret = 0;
out:
if (super)
@@ -431,20 +468,174 @@ out:
free(bt);
if (sblk)
free(sblk);
if (zeros)
free(zeros);
return ret;
}
/*
 * Long options understood by mkfs.  --quorum_slot requires an argument
 * and is reported by getopt_long() as the short option 'Q'.  The
 * all-zero entry terminates the table.
 */
static struct option long_ops[] = {
	{
		.name = "quorum_slot",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'Q',
	},
	{
		.name = NULL,
		.has_arg = no_argument,
		.flag = NULL,
		.val = 0,
	},
};
/*
 * Positions of the ':' separated fields in a quorum slot argument, in
 * the order that parse_quorum_slot() consumes them.
 */
enum { NAME, PRIORITY, ADDR, PORT };
static int parse_quorum_slot(struct scoutfs_quorum_config *conf, char *arg)
{
struct scoutfs_quorum_slot *slot;
struct scoutfs_quorum_slot *sl;
struct in_addr in;
unsigned long port;
int free_slot;
char *save;
char *tok;
char *dup;
char *s;
int ret;
int i;
dup = strdup(arg);
if (!dup) {
printf("allocation failure while parsing quorum slot '%s'\n",
arg);
return -EINVAL;
}
for (i = 0; i < array_size(conf->slots); i++) {
if (conf->slots[i].flags == 0)
break;
}
if (i == array_size(conf->slots)) {
printf("too many quorum slots provided\n");
ret = -EINVAL;
goto out;
}
slot = &conf->slots[i];
free_slot = i;
slot->addr.port = cpu_to_le16(23853); /* randomly chosen */
for (save = NULL, s = dup, i = NAME; i <= PORT; i++, s = NULL) {
tok = strtok_r(s, ":", &save);
if (tok == NULL)
break;
/* assume flags and a default port */
if (i == PORT && !isdigit(tok[0]))
i = PRIORITY;
switch(i) {
case NAME:
if (strlen(tok) >= SCOUTFS_UNIQUE_NAME_MAX_BYTES) {
printf("quorum slot name too long: %s\n", tok);
return -EINVAL;
}
strcpy((char *)slot->name, tok);
break;
case PRIORITY:
slot->vote_priority = strtoul(tok, NULL, 0);
if (slot->vote_priority > 255) {
printf("invalid quorum slot priority: %s\n",
tok);
ret = -EINVAL;
goto out;
}
break;
case ADDR:
if (inet_aton(tok, &in) == 0) {
printf("invalid quorum slot address: %s\n", tok);
ret = -EINVAL;
goto out;
}
slot->addr.addr = cpu_to_le32(htonl(in.s_addr));
break;
case PORT:
port = strtoul(tok, NULL, 0);
if (port == 0 || port >= 65535) {
printf("invalid quorum slot port: %s\n", tok);
ret = -EINVAL;
goto out;
}
slot->addr.port = cpu_to_le16(port);
break;
}
}
if (slot->name[0] == '\0') {
printf("quorum slot must specify name: %s\n", arg);
ret = -EINVAL;
goto out;
}
if (slot->addr.addr == 0) {
printf("quorum slot must specify address: %s\n", arg);
ret = -EINVAL;
goto out;
}
for (i = 0; i < free_slot; i++) {
sl = &conf->slots[i];
if (strcmp((char *)slot->name, (char *)sl->name) == 0) {
printf("duplicate quorum slot name: %s\n", arg);
ret = -EINVAL;
goto out;
}
if (memcmp(&slot->addr, &sl->addr, sizeof(slot->addr)) == 0) {
printf("duplicate quorum slot addr: %s\n", arg);
ret = -EINVAL;
goto out;
}
}
slot->flags = SCOUTFS_QUORUM_SLOT_ACTIVE;
ret = 0;
out:
free(dup);
return ret;
}
static int mkfs_func(int argc, char *argv[])
{
struct scoutfs_quorum_config conf = {0,};
bool have_quorum = false;
char *path = argv[1];
int ret;
int fd;
int c;
if (argc != 2) {
while ((c = getopt_long(argc, argv, "Q:", long_ops, NULL)) != -1) {
switch (c) {
case 'Q':
ret = parse_quorum_slot(&conf, optarg);
if (ret)
return ret;
have_quorum = true;
break;
case '?':
default:
return -EINVAL;
}
}
if (optind >= argc) {
printf("scoutfs: mkfs: a single path argument is required\n");
return -EINVAL;
}
path = argv[optind];
if (!have_quorum) {
printf("must configure quorum with --quorum_slot|-Q options\n");
return -EINVAL;
}
fd = open(path, O_RDWR | O_EXCL);
if (fd < 0) {
ret = -errno;
@@ -453,7 +644,7 @@ static int mkfs_func(int argc, char *argv[])
return ret;
}
ret = write_new_fs(path, fd);
ret = write_new_fs(path, fd, &conf);
close(fd);
return ret;

View File

@@ -9,6 +9,9 @@
#include <stdarg.h>
#include <ctype.h>
#include <uuid/uuid.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include "sparse.h"
#include "util.h"
@@ -440,9 +443,68 @@ static int print_btree(int fd, struct scoutfs_super_block *super, char *which,
return ret;
}
/*
 * Read every block in the quorum region and print the fields of those
 * that have a non-zero fsid or write_nr.  Blocks with both fields zero
 * are skipped without output.
 *
 * The super block argument isn't used.
 *
 * Returns 0 once all the blocks have been read, or -ENOMEM as soon as
 * read_block() fails to return a block.
 */
static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
{
	struct scoutfs_quorum_block *blk;
	u64 blkno;
	int ret = 0;
	int i;

	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
		blkno = SCOUTFS_QUORUM_BLKNO + i;
		blk = read_block(fd, blkno);
		if (!blk) {
			ret = -ENOMEM;
			break;
		}

		/* comparing against zero doesn't need an endian conversion */
		if (blk->fsid != 0 || blk->write_nr != 0) {
			printf("quorum block blkno %llu\n"
			       " fsid %llx blkno %llu config_gen %llu crc 0x%08x\n"
			       " write_nr %llu elected_nr %llu vote_slot %u\n",
			       blkno, le64_to_cpu(blk->fsid),
			       le64_to_cpu(blk->blkno),
			       le64_to_cpu(blk->config_gen),
			       le32_to_cpu(blk->crc),
			       le64_to_cpu(blk->write_nr),
			       le64_to_cpu(blk->elected_nr),
			       blk->vote_slot);
		}

		free(blk);
	}

	return ret;
}
/*
 * Print the quorum slot flags as a comma separated list of names with
 * no trailing newline.  "-" is printed when no flags are set.
 *
 * Any bits that don't have a name are printed together as a single
 * hex value at the end of the list, rather than spinning forever
 * trying to find a name for them.
 */
static void print_slot_flags(unsigned long flags)
{
	const char *sep = "";

	if (flags == 0) {
		printf("-");
		return;
	}

	if (flags & SCOUTFS_QUORUM_SLOT_ACTIVE) {
		printf("%sactive", sep);
		sep = ",";
		flags &= ~(unsigned long)SCOUTFS_QUORUM_SLOT_ACTIVE;
	}

	if (flags & SCOUTFS_QUORUM_SLOT_STALE) {
		printf("%sstale", sep);
		sep = ",";
		flags &= ~(unsigned long)SCOUTFS_QUORUM_SLOT_STALE;
	}

	/* whatever is left has no name */
	if (flags)
		printf("%sunknown(0x%lx)", sep, flags);
}
static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
{
struct scoutfs_quorum_slot *slot;
char uuid_str[37];
struct in_addr in;
u64 count;
int i;
@@ -491,6 +553,22 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
printf(" %u: %llu", i, count);
}
printf("\n");
printf(" quorum_config:\n gen: %llu\n",
le64_to_cpu(super->quorum_config.gen));
for (i = 0; i < array_size(super->quorum_config.slots); i++) {
slot = &super->quorum_config.slots[i];
if (slot->flags == 0)
continue;
in.s_addr = htonl(le32_to_cpu(slot->addr.addr));
printf(" [%2u]: name %s priority %u addr %s:%u flags ",
i, slot->name, slot->vote_priority, inet_ntoa(in),
le16_to_cpu(slot->addr.port));
print_slot_flags(slot->flags);
printf("\n");
}
}
static int print_volume(int fd)
@@ -516,8 +594,12 @@ static int print_volume(int fd)
goto out;
}
ret = print_btree(fd, super, "alloc", &super->alloc_root,
print_alloc_item, NULL);
ret = print_quorum_blocks(fd, super);
err = print_btree(fd, super, "alloc", &super->alloc_root,
print_alloc_item, NULL);
if (err && !ret)
ret = err;
err = print_btree(fd, super, "manifest", &super->manifest.root,
print_manifest_entry, seg_map);

View File

@@ -8,5 +8,5 @@
/*
 * Fill the buffer at @data with @len random bytes from OpenSSL's
 * RAND_bytes().
 *
 * NOTE(review): RAND_bytes() can fail and its return value is ignored
 * because this function has no way to report an error -- callers get
 * whatever was left in the buffer in that case.
 */
void pseudo_random_bytes(void *data, unsigned int len)
{
	RAND_bytes(data, len);
}