mirror of
https://github.com/versity/scoutfs.git
synced 2026-04-30 09:56:55 +00:00
Previously quorum configuration specified the number of votes needed to elect the leader. This was an excessive amount of freedom in the configuration of the cluster which created all sorts of problems which had to be designed around. Most acutely, though, it required a probabilistic mechanism for mounts to persistently record that they're starting a server so that future servers could find and possibly fence them. They would write to a lot of quorum blocks and trust that it was unlikely that future servers would overwrite all of their written blocks. Overwriting was always possible, which would be bad enough, but it also required so much IO that we had to use long election timeouts to avoid spurious fencing. These longer timeouts had already gone wrong on some storage configurations, leading to hung mounts. To fix this and other problems we see coming, like live membership changes, we now specifically configure the number and identity of mounts which will be participating in quorum voting. With specific identities, mounts now have a corresponding specific block they can write to and which future servers can read from to see if they're still running. We change the quorum config in the super block from a single quorum_count to an array of quorum slots which specify the address of the mount that is assigned to that slot. The mount argument to specify a quorum voter changes from "server_addr=$addr" to "quorum_slot_nr=$nr" which specifies the mount's slot. The slot's address is used for UDP election messages and TCP server connections. Now that we specifically have configured unique IP addresses for all the quorum members, we can use UDP messages to send and receive the vote messages in the raft protocol to elect a leader. The quorum code doesn't have to read and write disk block votes and is a more reasonable core loop that either waits for received network messages or timeouts to advance the raft election state machine. 
The quorum blocks are now used for slots to store their persistent raft term and to set their leader state. We have event fields in the block to record the timestamp of the most recent interesting events that happened to the slot. Now that raft doesn't use IO, we can leave the quorum election work running in the background. The raft work in the quorum members is always running so we can use a much more typical raft implementation with heartbeats. Critically, this decouples the client and election life cycles. Quorum is always running and is responsible for starting and stopping the server. The client repeatedly tries to connect to a server; it has nothing to do with deciding to participate in quorum. Finally, we add a quorum/status sysfs file which shows the state of the quorum raft protocol in a member mount and has the last messages that were sent to or received from the other members. Signed-off-by: Zach Brown <zab@versity.com>
188 lines
3.9 KiB
C
188 lines
3.9 KiB
C
/*
|
|
* Copyright (C) 2017 Versity Software, Inc. All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License v2 as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/types.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/debugfs.h>
|
|
#include <linux/namei.h>
|
|
|
|
#include <linux/parser.h>
|
|
#include <linux/inet.h>
|
|
#include <linux/string.h>
|
|
#include <linux/in.h>
|
|
|
|
#include "msg.h"
|
|
#include "options.h"
|
|
#include "super.h"
|
|
|
|
static const match_table_t tokens = {
|
|
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
|
{Opt_metadev_path, "metadev_path=%s"},
|
|
{Opt_err, NULL}
|
|
};
|
|
|
|
/*
 * Per-super-block state of the options code.  It is allocated by
 * scoutfs_options_setup(), stored in the scoutfs_sb_info "options"
 * field, and released by scoutfs_options_destroy().
 */
struct options_sb_info {
	/* "options" directory created under the super's debug_root */
	struct dentry *debugfs_dir;
};
|
|
|
|
/*
 * Look up a u32 option value by token.
 *
 * Neither argument is consulted: every call emits a one-time kernel
 * warning and returns 0.
 * NOTE(review): presumably a placeholder kept because no u32 options
 * currently exist -- confirm against callers.
 */
u32 scoutfs_option_u32(struct super_block *sb, int token)
{
	WARN_ON_ONCE(1);

	return 0;
}
|
|
|
|
/*
 * Duplicate the path in an option substring and check that it names a
 * block device.
 *
 * The path is resolved with kern_path(), following a trailing symlink.
 * On success 0 is returned and *bdev_path_ret is set to the newly
 * allocated copy of the path, which the caller must kfree().  On
 * failure a negative errno is returned, the copy is freed, and
 * *bdev_path_ret is left untouched:
 *   -ENOMEM   the path string could not be duplicated
 *   -ENOTBLK  the path resolved to something other than a block device
 *   other     whatever kern_path() returned
 */
static int parse_bdev_path(struct super_block *sb, substring_t *substr,
			   char **bdev_path_ret)
{
	struct path resolved;
	char *name;
	int ret;

	name = match_strdup(substr);
	if (!name) {
		scoutfs_err(sb, "bdev string dup failed");
		return -ENOMEM;
	}

	ret = kern_path(name, LOOKUP_FOLLOW, &resolved);
	if (ret) {
		scoutfs_err(sb, "path %s not found for bdev: error %d",
			    name, ret);
		goto out;
	}

	/* only the inode's mode is needed, so the path ref is dropped here */
	if (!S_ISBLK(d_inode(resolved.dentry)->i_mode)) {
		scoutfs_err(sb, "path %s for bdev is not a block device",
			    name);
		ret = -ENOTBLK;
	}
	path_put(&resolved);

out:
	if (ret < 0)
		kfree(name);
	else
		*bdev_path_ret = name;

	return ret;
}
|
|
|
|
int scoutfs_parse_options(struct super_block *sb, char *options,
|
|
struct mount_options *parsed)
|
|
{
|
|
substring_t args[MAX_OPT_ARGS];
|
|
int nr;
|
|
int token;
|
|
char *p;
|
|
int ret;
|
|
|
|
/* Set defaults */
|
|
memset(parsed, 0, sizeof(*parsed));
|
|
parsed->quorum_slot_nr = -1;
|
|
|
|
while ((p = strsep(&options, ",")) != NULL) {
|
|
if (!*p)
|
|
continue;
|
|
|
|
token = match_token(p, tokens, args);
|
|
switch (token) {
|
|
case Opt_quorum_slot_nr:
|
|
|
|
if (parsed->quorum_slot_nr != -1) {
|
|
scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
|
|
return -EINVAL;
|
|
}
|
|
|
|
ret = match_int(args, &nr);
|
|
if (ret < 0 || nr < 0 ||
|
|
nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
|
|
scoutfs_err(sb, "invalid quorum_slot_nr option, must be between 0 and %u",
|
|
SCOUTFS_QUORUM_MAX_SLOTS - 1);
|
|
if (ret == 0)
|
|
ret = -EINVAL;
|
|
return ret;
|
|
}
|
|
parsed->quorum_slot_nr = nr;
|
|
break;
|
|
case Opt_metadev_path:
|
|
|
|
ret = parse_bdev_path(sb, &args[0],
|
|
&parsed->metadev_path);
|
|
if (ret < 0)
|
|
return ret;
|
|
break;
|
|
default:
|
|
scoutfs_err(sb, "Unknown or malformed option, \"%s\"",
|
|
p);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!parsed->metadev_path) {
|
|
scoutfs_err(sb, "Required mount option \"metadev_path\" not found");
|
|
return -EINVAL;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int scoutfs_options_setup(struct super_block *sb)
|
|
{
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
struct options_sb_info *osi;
|
|
int ret;
|
|
|
|
osi = kzalloc(sizeof(struct options_sb_info), GFP_KERNEL);
|
|
if (!osi)
|
|
return -ENOMEM;
|
|
|
|
sbi->options = osi;
|
|
|
|
osi->debugfs_dir = debugfs_create_dir("options", sbi->debug_root);
|
|
if (!osi->debugfs_dir) {
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
ret = 0;
|
|
out:
|
|
if (ret)
|
|
scoutfs_options_destroy(sb);
|
|
return ret;
|
|
}
|
|
|
|
void scoutfs_options_destroy(struct super_block *sb)
|
|
{
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
struct options_sb_info *osi = sbi->options;
|
|
|
|
if (osi) {
|
|
if (osi->debugfs_dir)
|
|
debugfs_remove_recursive(osi->debugfs_dir);
|
|
kfree(osi);
|
|
sbi->options = NULL;
|
|
}
|
|
}
|