mirror of https://github.com/versity/scoutfs.git
Quorum members will try to elect a new leader when they don't receive heartbeats from the currently elected leader. This timeout is short to encourage restoring service promptly. Heartbeats are sent from the quorum worker thread and are delayed while it synchronously starts up the server, which includes fencing previous servers. If fence requests take too long then heartbeats will be delayed long enough for remaining quorum members to elect a new leader while the recently elected server is still busy fencing.

To fix this we decouple server startup from the quorum main thread. Server starting and stopping becomes asynchronous so the quorum thread is able to send heartbeats while the server work is off starting up and fencing.

The server used to call into quorum to clear a flag as it exited. We remove that mechanism and have the server maintain a running status that quorum can query. We add some state to the quorum work to track the asynchronous state of the server. This lets the quorum protocol change roles immediately as needed while remembering that there is a server running that needs to be acted on.

The server used to also call into quorum to update quorum blocks. This is a read-modify-write operation that has to be serialized. Now that we have both the server starting up and the quorum work running, they can't both perform these read-modify-write cycles. Instead we have the quorum work own all the block updates, and it queries the server status to determine when it should update the quorum block to indicate that the server has fenced or shut down.

Signed-off-by: Zach Brown <zab@versity.com>
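A rough sketch of the resulting division of labor, using the server and quorum helpers that appear in quorum.c below. This is illustrative only; just_elected, leader, server_was_started, term, and server_start_term stand in for the worker's own role and term tracking, and all role, timeout, and error handling is omitted:

	/* quorum worker: the only writer of this slot's quorum block */
	if (just_elected) {
		update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, term, true);
		scoutfs_server_start(sb, term);	/* asynchronous, returns immediately */
	}

	/* poll the server's status instead of having it call back into quorum */
	if (leader && scoutfs_server_is_up(sb))
		update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, term, true);

	if (server_was_started && scoutfs_server_is_down(sb))
		update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, server_start_term, true);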
/*
 * Copyright (C) 2019 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/delay.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/hrtimer.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>

#include "format.h"
#include "msg.h"
#include "counters.h"
#include "quorum.h"
#include "server.h"
#include "block.h"
#include "net.h"
#include "sysfs.h"
#include "fence.h"
#include "scoutfs_trace.h"

/*
 * This quorum subsystem is responsible for ensuring that only one
 * server is ever running among the mounts and has exclusive read/write
 * access to the server structures in the metadata device.
 *
 * A specific set of mounts are quorum members as indicated by the
 * quorum_slot_nr mount option. That option refers to the slot in the
 * super block that contains their configuration. Only these mounts
 * participate in the election of the leader.
 *
 * As each quorum member mounts it starts background work that uses a
 * simplified raft leader election protocol to elect a leader. Each
 * mount listens on a udp socket at the address found in its slot in the
 * super block. It then sends and receives raft messages to and from
 * the other slot addresses in the super block. As the protocol
 * progresses eventually a mount will receive enough votes to become the
 * leader. We're not using the full key-value store of raft, just the
 * leadership election. Much of the functionality matches the raft
 * concepts (roles, messages, timeouts) but there are no key-value logs
 * to synchronize.
 *
 * Once elected leader, the mount now has to ensure that it's the only
 * running server. There could be previously elected servers still
 * running (maybe they've deadlocked, or lost network communications).
 * In addition to a configuration slot in the super block, each quorum
 * member also has a known block location that represents their slot.
 * The block contains an array of events which are updated during the
 * lifetime of the quorum agent. The elected leader sets its elected
 * event and can then start the server.
 *
 * It's critical to raft elections that a participant's term not go
 * backwards in time so each mount also uses its quorum block to store
 * the greatest term it has used in messages.
 *
 * The quorum work still runs in the background while the server is
 * running. The leader quorum work will regularly send heartbeat
 * messages to the other quorum members to keep them from electing a new
 * leader. If the server shuts down, or the mount disappears, the other
 * quorum members will stop receiving heartbeats and will elect a new
 * leader.
 *
 * Typically we require a strict majority of the configured quorum
 * members to elect a leader. However, for simple usability, we do
 * allow a majority of 1 when there are only one or two quorum members.
 * In the two member case this can lead to split elections where each
 * mount races to elect itself as leader and attempt to fence the other.
 * The random election timeouts in raft make this unlikely, but it is
 * possible.
 */

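/*
 * A condensed, illustrative view of the loop in scoutfs_quorum_worker()
 * below. Heartbeats, resignations, and quorum block updates are left
 * out; only the role transitions are shown:
 *
 *	for (;;) {
 *		recv_msg(sb, &msg, qst.timeout);
 *		if (msg.term > qst.term)
 *			become a follower in msg.term
 *		if (qst.role != LEADER && election timeout expired)
 *			become a candidate: qst.term++, request votes
 *		if (qst.role == CANDIDATE && votes >= votes_needed)
 *			become the leader: start the server, send heartbeats
 *	}
 */
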
/*
 * The fields of the message that the receiver can use after the message
 * has been validated.
 */
struct quorum_host_msg {
	u64 term;
	u8 type;
	u8 from;
};

struct last_msg {
	struct quorum_host_msg msg;
	ktime_t ts;
};

enum quorum_role { FOLLOWER, CANDIDATE, LEADER };

struct quorum_status {
	enum quorum_role role;
	u64 term;
	u64 server_start_term;
	int server_event;
	int vote_for;
	unsigned long vote_bits;
	ktime_t timeout;
};

struct quorum_info {
	struct super_block *sb;
	struct work_struct work;
	struct socket *sock;
	bool shutdown;

	int our_quorum_slot_nr;
	int votes_needed;

	spinlock_t show_lock;
	struct quorum_status show_status;
	struct last_msg last_send[SCOUTFS_QUORUM_MAX_SLOTS];
	struct last_msg last_recv[SCOUTFS_QUORUM_MAX_SLOTS];

	struct scoutfs_sysfs_attrs ssa;
};

#define DECLARE_QUORUM_INFO(sb, name) \
	struct quorum_info *name = SCOUTFS_SB(sb)->quorum_info
#define DECLARE_QUORUM_INFO_KOBJ(kobj, name) \
	DECLARE_QUORUM_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)

static bool quorum_slot_present(struct scoutfs_super_block *super, int i)
{
	BUG_ON(i < 0 || i >= SCOUTFS_QUORUM_MAX_SLOTS);

	return super->qconf.slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
}

static ktime_t election_timeout(void)
{
	return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_ELECT_MIN_MS +
			    prandom_u32_max(SCOUTFS_QUORUM_ELECT_VAR_MS));
}

static ktime_t heartbeat_interval(void)
{
	return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_IVAL_MS);
}

static ktime_t heartbeat_timeout(void)
{
	return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_TIMEO_MS);
}

static int create_socket(struct super_block *sb)
{
	DECLARE_QUORUM_INFO(sb, qinf);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct socket *sock = NULL;
	struct sockaddr_in sin;
	int addrlen;
	int ret;

	ret = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (ret) {
		scoutfs_err(sb, "quorum couldn't create udp socket: %d", ret);
		goto out;
	}

	sock->sk->sk_allocation = GFP_NOFS;

	scoutfs_quorum_slot_sin(super, qinf->our_quorum_slot_nr, &sin);

	addrlen = sizeof(sin);
	ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen);
	if (ret) {
		scoutfs_err(sb, "quorum failed to bind udp socket to "SIN_FMT": %d",
			    SIN_ARG(&sin), ret);
		goto out;
	}

out:
	if (ret < 0 && sock) {
		sock_release(sock);
		sock = NULL;
	}
	qinf->sock = sock;
	return ret;
}

static __le32 quorum_message_crc(struct scoutfs_quorum_message *qmes)
{
	/* crc up to the crc field at the end */
	unsigned int len = offsetof(struct scoutfs_quorum_message, crc);

	return cpu_to_le32(crc32c(~0, qmes, len));
}

static void send_msg_members(struct super_block *sb, int type, u64 term,
			     int only)
{
	DECLARE_QUORUM_INFO(sb, qinf);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	ktime_t now;
	int i;

	struct scoutfs_quorum_message qmes = {
		.fsid = super->hdr.fsid,
		.term = cpu_to_le64(term),
		.type = type,
		.from = qinf->our_quorum_slot_nr,
	};
	struct kvec kv = {
		.iov_base = &qmes,
		.iov_len = sizeof(qmes),
	};
	struct sockaddr_in sin;
	struct msghdr mh = {
		.msg_iov = (struct iovec *)&kv,
		.msg_iovlen = 1,
		.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
		.msg_name = &sin,
		.msg_namelen = sizeof(sin),
	};

	trace_scoutfs_quorum_send_message(sb, term, type, only);

	qmes.crc = quorum_message_crc(&qmes);

	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
		if (!quorum_slot_present(super, i) ||
		    (only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
			continue;

		scoutfs_quorum_slot_sin(super, i, &sin);
		now = ktime_get();
		kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);

		spin_lock(&qinf->show_lock);
		qinf->last_send[i].msg.term = term;
		qinf->last_send[i].msg.type = type;
		qinf->last_send[i].ts = now;
		spin_unlock(&qinf->show_lock);

		if (i == only)
			break;
	}
}

#define send_msg_to(sb, type, term, nr) send_msg_members(sb, type, term, nr)
#define send_msg_others(sb, type, term) send_msg_members(sb, type, term, -1)

/*
 * The caller passes in their absolute timeout which we translate to a
 * relative timeval for RCVTIMEO. It defines a 0.0 timeval as blocking
 * indefinitely so we're careful to set dontwait if we happen to hit a
 * 0.0 timeval.
 */
static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
		    ktime_t abs_to)
{
	DECLARE_QUORUM_INFO(sb, qinf);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_quorum_message qmes;
	struct timeval tv;
	ktime_t rel_to;
	ktime_t now;
	int ret;

	struct kvec kv = {
		.iov_base = &qmes,
		.iov_len = sizeof(struct scoutfs_quorum_message),
	};
	struct msghdr mh = {
		.msg_iov = (struct iovec *)&kv,
		.msg_iovlen = 1,
		.msg_flags = MSG_NOSIGNAL,
	};

	memset(msg, 0, sizeof(*msg));

	now = ktime_get();
	if (ktime_before(now, abs_to))
		rel_to = ktime_sub(abs_to, now);
	else
		rel_to = ns_to_ktime(0);

	tv = ktime_to_timeval(rel_to);
	if (tv.tv_sec == 0 && tv.tv_usec == 0) {
		mh.msg_flags |= MSG_DONTWAIT;
	} else {
		ret = kernel_setsockopt(qinf->sock, SOL_SOCKET, SO_RCVTIMEO,
					(char *)&tv, sizeof(tv));
		if (ret < 0)
			return ret;
	}

	ret = kernel_recvmsg(qinf->sock, &mh, &kv, 1, kv.iov_len, mh.msg_flags);
	if (ret < 0)
		return ret;

	now = ktime_get();

	if (ret != sizeof(qmes) ||
	    qmes.crc != quorum_message_crc(&qmes) ||
	    qmes.fsid != super->hdr.fsid ||
	    qmes.type >= SCOUTFS_QUORUM_MSG_INVALID ||
	    qmes.from >= SCOUTFS_QUORUM_MAX_SLOTS ||
	    !quorum_slot_present(super, qmes.from)) {
		/* should we be trying to open a new socket? */
		scoutfs_inc_counter(sb, quorum_recv_invalid);
		return -EAGAIN;
	}

	msg->term = le64_to_cpu(qmes.term);
	msg->type = qmes.type;
	msg->from = qmes.from;

	trace_scoutfs_quorum_recv_message(sb, msg->term, msg->type, msg->from);

	spin_lock(&qinf->show_lock);
	qinf->last_recv[msg->from].msg = *msg;
	qinf->last_recv[msg->from].ts = now;
	spin_unlock(&qinf->show_lock);

	return 0;
}

/*
 * Read and verify block fields before giving it to the caller. We
 * should have exclusive write access to the block. We know that
 * something has gone horribly wrong if we don't see our rid in the
 * begin event after we've written it as we started up.
 */
static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_quorum_block *blk,
			     bool check_rid)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super = &sbi->super;
	const u64 rid = sbi->rid;
	char msg[150];
	__le32 crc;
	int ret;

	if (WARN_ON_ONCE(blkno < SCOUTFS_QUORUM_BLKNO) ||
	    WARN_ON_ONCE(blkno >= (SCOUTFS_QUORUM_BLKNO +
				   SCOUTFS_QUORUM_BLOCKS)))
		return -EINVAL;

	ret = scoutfs_block_read_sm(sb, sbi->meta_bdev, blkno,
				    &blk->hdr, sizeof(*blk), &crc);
	if (ret < 0) {
		scoutfs_err(sb, "quorum block read error %d", ret);
		goto out;
	}

	/* detect invalid blocks */
	if (blk->hdr.crc != crc)
		snprintf(msg, sizeof(msg), "blk crc %08x != %08x",
			 le32_to_cpu(blk->hdr.crc), le32_to_cpu(crc));
	else if (le32_to_cpu(blk->hdr.magic) != SCOUTFS_BLOCK_MAGIC_QUORUM)
		snprintf(msg, sizeof(msg), "blk magic %08x != %08x",
			 le32_to_cpu(blk->hdr.magic), SCOUTFS_BLOCK_MAGIC_QUORUM);
	else if (blk->hdr.fsid != super->hdr.fsid)
		snprintf(msg, sizeof(msg), "blk fsid %016llx != %016llx",
			 le64_to_cpu(blk->hdr.fsid), le64_to_cpu(super->hdr.fsid));
	else if (le64_to_cpu(blk->hdr.blkno) != blkno)
		snprintf(msg, sizeof(msg), "blk blkno %llu != %llu",
			 le64_to_cpu(blk->hdr.blkno), blkno);
	else if (check_rid && le64_to_cpu(blk->events[SCOUTFS_QUORUM_EVENT_BEGIN].rid) != rid)
		snprintf(msg, sizeof(msg), "quorum block begin rid %016llx != our rid %016llx, are multiple mounts configured with this slot?",
			 le64_to_cpu(blk->events[SCOUTFS_QUORUM_EVENT_BEGIN].rid), rid);
	else
		msg[0] = '\0';

	if (msg[0] != '\0') {
		scoutfs_err(sb, "read invalid quorum block, %s", msg);
		ret = -EIO;
		goto out;
	}

out:
	return ret;
}

/*
 * It's really important in raft elections that the term not go
 * backwards in time. We achieve this by having each participant record
 * the greatest term they've seen in their quorum block. It's also
 * important that participants agree on the greatest term. It can
 * happen that one gets ahead of the rest, perhaps by being forcefully
 * shut down after having just been elected. As everyone starts up it's
 * possible to have N-1 have term T-1 while just one participant thinks
 * the term is T. That single participant will ignore all messages
 * from older terms. If its timeout is greater than the others' it can
 * immediately override the election of the majority and request votes
 * and become elected.
 *
 * A best-effort workaround is to have everyone try and start from the
 * greatest term that they can find in everyone's blocks. If it works
 * then you avoid having those with greater terms ignore others. If it
 * doesn't work the elections will eventually stabilize after rocky
 * periods of fencing from what looks like concurrent elections.
 */
static void read_greatest_term(struct super_block *sb, u64 *term)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super = &sbi->super;
	struct scoutfs_quorum_block blk;
	int ret;
	int e;
	int s;

	*term = 0;

	for (s = 0; s < SCOUTFS_QUORUM_MAX_SLOTS; s++) {
		if (!quorum_slot_present(super, s))
			continue;

		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + s, &blk, false);
		if (ret < 0)
			continue;

		for (e = 0; e < ARRAY_SIZE(blk.events); e++) {
			if (blk.events[e].rid)
				*term = max(*term, le64_to_cpu(blk.events[e].term));
		}
	}
}

static void set_quorum_block_event(struct super_block *sb, struct scoutfs_quorum_block *blk,
				   int event, u64 term)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_quorum_block_event *ev;
	struct timespec64 ts;

	if (WARN_ON_ONCE(event < 0 || event >= SCOUTFS_QUORUM_EVENT_NR))
		return;

	getnstimeofday64(&ts);
	le64_add_cpu(&blk->write_nr, 1);

	ev = &blk->events[event];
	ev->write_nr = blk->write_nr;
	ev->rid = cpu_to_le64(sbi->rid);
	ev->term = cpu_to_le64(term);
	ev->ts.sec = cpu_to_le64(ts.tv_sec);
	ev->ts.nsec = cpu_to_le32(ts.tv_nsec);
}

static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_quorum_block *blk)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

	if (WARN_ON_ONCE(blkno < SCOUTFS_QUORUM_BLKNO) ||
	    WARN_ON_ONCE(blkno >= (SCOUTFS_QUORUM_BLKNO +
				   SCOUTFS_QUORUM_BLOCKS)))
		return -EINVAL;

	return scoutfs_block_write_sm(sb, sbi->meta_bdev, blkno, &blk->hdr, sizeof(*blk));
}

/*
 * Read the caller's slot's quorum block, make a change, and write it
 * back out.
 */
static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid)
{
	DECLARE_QUORUM_INFO(sb, qinf);
	u64 blkno = SCOUTFS_QUORUM_BLKNO + qinf->our_quorum_slot_nr;
	struct scoutfs_quorum_block blk;
	int ret;

	ret = read_quorum_block(sb, blkno, &blk, check_rid);
	if (ret == 0) {
		set_quorum_block_event(sb, &blk, event, term);
		ret = write_quorum_block(sb, blkno, &blk);
		if (ret < 0)
			scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu",
				    ret, blkno, event, term);
	} else {
		scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu",
			    ret, blkno, event, term);
	}

	return ret;
}

/*
 * The calling server has been elected and has started running but can't
 * yet assume that it has exclusive access to the metadata device. We
 * read all the quorum blocks looking for previously elected leaders to
 * fence so that we're the only leader running.
 *
 * We're relying on the invariant that there can't be two mounts running
 * with the same slot nr at the same time. With this constraint there
 * can be at most two previous leaders per slot that need to be fenced:
 * a persistent record of an old mount on the slot, and an active mount.
 *
 * If we start fence requests then we only wait for them to complete
 * before returning. The server will reclaim their resources once it is
 * up and running and will call us to update the fence event. If we
 * don't start fence requests then we update the fence event
 * immediately, the server has nothing more to do.
 *
 * Quorum will be sending heartbeats while we wait for fencing. That
 * keeps us from being fenced while we allow userspace fencing to take a
 * reasonably long time. We still want to timeout eventually.
 */
int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
{
#define NR_OLD 2
	struct scoutfs_quorum_block_event old[SCOUTFS_QUORUM_MAX_SLOTS][NR_OLD] = {{{0,}}};
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super = &sbi->super;
	struct scoutfs_quorum_block blk;
	struct sockaddr_in sin;
	const u64 rid = sbi->rid;
	bool fence_started = false;
	u64 fenced = 0;
	__le64 fence_rid;
	int ret = 0;
	int err;
	int i;
	int j;

	BUILD_BUG_ON(SCOUTFS_QUORUM_BLOCKS < SCOUTFS_QUORUM_MAX_SLOTS);

	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
		if (!quorum_slot_present(super, i))
			continue;

		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
		if (ret < 0)
			goto out;

		/* elected leader still running */
		if (le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term) >
		    le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term))
			old[i][0] = blk.events[SCOUTFS_QUORUM_EVENT_ELECT];

		/* persistent record of previous server before elected */
		if ((le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term) >
		     le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) &&
		    (le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term) <
		     le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term)))
			old[i][1] = blk.events[SCOUTFS_QUORUM_EVENT_FENCE];

		/* find greatest term that has fenced everything before it */
		fenced = max(fenced, le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term));
	}

	/* now actually fence any old leaders which haven't been fenced yet */
	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
		for (j = 0; j < NR_OLD; j++) {
			if (le64_to_cpu(old[i][j].term) == 0 ||	     /* uninitialized */
			    le64_to_cpu(old[i][j].term) < fenced ||  /* already fenced */
			    le64_to_cpu(old[i][j].term) > term ||    /* newer than us */
			    le64_to_cpu(old[i][j].rid) == rid)	     /* us */
				continue;

			scoutfs_inc_counter(sb, quorum_fence_leader);
			scoutfs_quorum_slot_sin(super, i, &sin);
			fence_rid = old[i][j].rid;

			scoutfs_info(sb, "fencing previous leader "SCSBF" at term %llu in slot %u with address "SIN_FMT,
				     SCSB_LEFR_ARGS(super->hdr.fsid, fence_rid),
				     le64_to_cpu(old[i][j].term), i, SIN_ARG(&sin));
			ret = scoutfs_fence_start(sb, le64_to_cpu(fence_rid), sin.sin_addr.s_addr,
						  SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
			if (ret < 0)
				goto out;
			fence_started = true;
		}
	}

out:
	err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
	if (ret == 0)
		ret = err;

	if (ret < 0)
		scoutfs_inc_counter(sb, quorum_fence_error);

	return ret;
}

/*
 * The main quorum task maintains its private status. It seemed cleaner
 * to occasionally copy the status for showing in sysfs/debugfs files
 * than to have the two lock access to shared status. The show copy is
 * updated after being modified before the quorum task sleeps for a
 * significant amount of time, either waiting on timeouts or interacting
 * with the server.
 */
static void update_show_status(struct quorum_info *qinf, struct quorum_status *qst)
{
	spin_lock(&qinf->show_lock);
	qinf->show_status = *qst;
	spin_unlock(&qinf->show_lock);
}

/*
 * The quorum work always runs in the background of quorum member
 * mounts. It's responsible for starting and stopping the server if
 * it's elected leader. While it's leader it sends heartbeats to
 * suppress other quorum work from standing for election.
 */
static void scoutfs_quorum_worker(struct work_struct *work)
{
	struct quorum_info *qinf = container_of(work, struct quorum_info, work);
	struct super_block *sb = qinf->sb;
	struct sockaddr_in unused;
	struct quorum_host_msg msg;
	struct quorum_status qst = {0,};
	int ret;
	int err;

	/* recording votes from slots as native single word bitmap */
	BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);

	/* start out as a follower */
	qst.role = FOLLOWER;
	qst.vote_for = -1;

	/* read our starting term from greatest in all events in all slots */
	read_greatest_term(sb, &qst.term);

	/* see if there's a server to choose heartbeat or election timeout */
	if (scoutfs_quorum_server_sin(sb, &unused) == 0)
		qst.timeout = heartbeat_timeout();
	else
		qst.timeout = election_timeout();

	/* record that we're up and running, readers check that it isn't updated */
	ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_BEGIN, qst.term, false);
	if (ret < 0)
		goto out;

	while (!(qinf->shutdown || scoutfs_forcing_unmount(sb))) {

		update_show_status(qinf, &qst);

		ret = recv_msg(sb, &msg, qst.timeout);
		if (ret < 0) {
			if (ret != -ETIMEDOUT && ret != -EAGAIN) {
				scoutfs_err(sb, "quorum recvmsg err %d", ret);
				scoutfs_inc_counter(sb, quorum_recv_error);
				goto out;
			}
			msg.type = SCOUTFS_QUORUM_MSG_INVALID;
			ret = 0;
		}

		/* ignore messages from older terms */
		if (msg.type != SCOUTFS_QUORUM_MSG_INVALID &&
		    msg.term < qst.term)
			msg.type = SCOUTFS_QUORUM_MSG_INVALID;

		trace_scoutfs_quorum_loop(sb, qst.role, qst.term, qst.vote_for,
					  qst.vote_bits,
					  ktime_to_timespec64(qst.timeout));

		/* receiving greater terms resets term, becomes follower */
		if (msg.type != SCOUTFS_QUORUM_MSG_INVALID &&
		    msg.term > qst.term) {
			if (qst.role == LEADER) {
				scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
					     msg.type, msg.from, msg.term, qst.term);
			}
			qst.role = FOLLOWER;
			qst.term = msg.term;
			qst.vote_for = -1;
			qst.vote_bits = 0;
			scoutfs_inc_counter(sb, quorum_term_follower);

			if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT)
				qst.timeout = heartbeat_timeout();
			else
				qst.timeout = election_timeout();

			/* store our increased term */
			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_TERM, qst.term, true);
			if (ret < 0)
				goto out;
		}

		/* followers and candidates start new election on timeout */
		if (qst.role != LEADER &&
		    ktime_after(ktime_get(), qst.timeout)) {
			/* .. but only if their server has stopped */
			if (!scoutfs_server_is_down(sb)) {
				qst.timeout = election_timeout();
				scoutfs_inc_counter(sb, quorum_candidate_server_stopping);
				continue;
			}

			qst.role = CANDIDATE;
			qst.term++;
			qst.vote_for = -1;
			qst.vote_bits = 0;
			set_bit(qinf->our_quorum_slot_nr, &qst.vote_bits);
			send_msg_others(sb, SCOUTFS_QUORUM_MSG_REQUEST_VOTE,
					qst.term);
			qst.timeout = election_timeout();
			scoutfs_inc_counter(sb, quorum_send_request);

			/* store our increased term */
			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_TERM, qst.term, true);
			if (ret < 0)
				goto out;
		}

		/* candidates count votes in their term */
		if (qst.role == CANDIDATE &&
		    msg.type == SCOUTFS_QUORUM_MSG_VOTE) {
			if (test_and_set_bit(msg.from, &qst.vote_bits)) {
				scoutfs_warn(sb, "already received vote from %u in term %llu, are there multiple mounts with quorum_slot_nr=%u?",
					     msg.from, qst.term, msg.from);
			}
			scoutfs_inc_counter(sb, quorum_recv_vote);
		}

		/*
		 * Candidates become leaders when they receive enough
		 * votes. (Possibly only counting their own vote in
		 * single vote majorities.)
		 */
		if (qst.role == CANDIDATE &&
		    hweight_long(qst.vote_bits) >= qinf->votes_needed) {
			qst.role = LEADER;
			scoutfs_inc_counter(sb, quorum_elected);

			/* send heartbeat before server starts */
			send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT,
					qst.term);
			qst.timeout = heartbeat_interval();

			update_show_status(qinf, &qst);

			/* record that we've been elected before starting up server */
			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
			if (ret < 0)
				goto out;

			qst.server_start_term = qst.term;
			qst.server_event = SCOUTFS_QUORUM_EVENT_ELECT;
			scoutfs_server_start(sb, qst.term);
		}

		/*
		 * This leader's server is up, having finished fencing
		 * previous leaders. We update the fence event with the
		 * current term to let future leaders know that previous
		 * servers have been fenced.
		 */
		if (qst.role == LEADER && qst.server_event != SCOUTFS_QUORUM_EVENT_FENCE &&
		    scoutfs_server_is_up(sb)) {
			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, qst.term, true);
			if (ret < 0)
				goto out;
			qst.server_event = SCOUTFS_QUORUM_EVENT_FENCE;
		}

		/*
		 * Stop a running server if we're no longer leader in
		 * its term.
		 */
		if (!(qst.role == LEADER && qst.term == qst.server_start_term) &&
		    scoutfs_server_is_running(sb)) {
			scoutfs_server_stop(sb);
		}

		/*
		 * A previously running server has stopped. The quorum
		 * protocol might have shut it down by changing roles or
		 * it might have stopped on its own, perhaps on errors.
		 * If we're still a leader then we become a follower and
		 * send resignations to encourage the next election.
		 * Always update the _STOP event to stop connections and
		 * fencing.
		 */
		if (qst.server_start_term > 0 && scoutfs_server_is_down(sb)) {
			if (qst.role == LEADER) {
				qst.role = FOLLOWER;
				qst.vote_for = -1;
				qst.vote_bits = 0;
				qst.timeout = election_timeout();
				scoutfs_inc_counter(sb, quorum_server_shutdown);

				send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
						qst.server_start_term);
				scoutfs_inc_counter(sb, quorum_send_resignation);
			}

			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
						  qst.server_start_term, true);
			if (ret < 0)
				goto out;

			qst.server_start_term = 0;
		}

		/* leaders regularly send heartbeats to delay elections */
		if (qst.role == LEADER &&
		    ktime_after(ktime_get(), qst.timeout)) {
			send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT,
					qst.term);
			qst.timeout = heartbeat_interval();
			scoutfs_inc_counter(sb, quorum_send_heartbeat);
		}

		/* receiving heartbeats extends timeout, delaying elections */
		if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
			qst.timeout = heartbeat_timeout();
			scoutfs_inc_counter(sb, quorum_recv_heartbeat);
		}

		/* receiving a resignation from server starts election */
		if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
		    qst.role == FOLLOWER &&
		    msg.term == qst.term) {
			qst.timeout = election_timeout();
			scoutfs_inc_counter(sb, quorum_recv_resignation);
		}

		/* followers vote once per term */
		if (qst.role == FOLLOWER &&
		    msg.type == SCOUTFS_QUORUM_MSG_REQUEST_VOTE &&
		    qst.vote_for == -1) {
			qst.vote_for = msg.from;
			send_msg_to(sb, SCOUTFS_QUORUM_MSG_VOTE, qst.term,
				    msg.from);
			scoutfs_inc_counter(sb, quorum_send_vote);
		}
	}

	update_show_status(qinf, &qst);

	/* always try to stop a running server as we stop */
	if (scoutfs_server_is_running(sb)) {
		scoutfs_server_stop_wait(sb);
		send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, qst.term);

		if (qst.server_start_term > 0) {
			err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
						  qst.server_start_term, true);
			if (err < 0 && ret == 0)
				ret = err;
		}
	}

	/* record that this slot no longer has an active quorum */
	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
out:
	if (ret < 0) {
		scoutfs_err(sb, "quorum service saw error %d, shutting down. This mount is no longer participating in quorum. It should be remounted to restore service.",
			    ret);
	}
}

/*
 * Clients read quorum blocks looking for the leader with a server whose
 * address it can try and connect to.
 *
 * There can be records of multiple previous elected leaders if the
 * current server hasn't yet fenced any old servers. We use the elected
 * leader with the greatest elected term. If we get it wrong the
 * connection will timeout and the client will try again.
 */
int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super = &sbi->super;
	struct scoutfs_quorum_block blk;
	u64 elect_term;
	u64 term = 0;
	int ret = 0;
	int i;

	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
		if (!quorum_slot_present(super, i))
			continue;

		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
		if (ret < 0) {
			scoutfs_err(sb, "error reading quorum block nr %u: %d",
				    i, ret);
			goto out;
		}

		elect_term = le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term);
		if (elect_term > term &&
		    elect_term > le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) {
			term = elect_term;
			scoutfs_quorum_slot_sin(super, i, sin);
			continue;
		}
	}

	if (term == 0)
		ret = -ENOENT;

out:
	return ret;
}

/*
 * The number of votes needed for a member to reach quorum and be
 * elected the leader: a majority of the number of present slots in the
 * super block.
 */
u8 scoutfs_quorum_votes_needed(struct super_block *sb)
{
	DECLARE_QUORUM_INFO(sb, qinf);

	return qinf->votes_needed;
}

void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
			     struct sockaddr_in *sin)
{
	BUG_ON(i < 0 || i >= SCOUTFS_QUORUM_MAX_SLOTS);

	scoutfs_addr_to_sin(sin, &super->qconf.slots[i].addr);
}

static char *role_str(int role)
{
	static char *roles[] = {
		[FOLLOWER] = "follower",
		[CANDIDATE] = "candidate",
		[LEADER] = "leader",
	};

	if (role < 0 || role >= ARRAY_SIZE(roles) || !roles[role])
		return "invalid";

	return roles[role];
}

#define snprintf_ret(buf, size, retp, fmt...)				\
do {									\
	__typeof__(buf) _buf = buf;					\
	__typeof__(size) _size = size;					\
	__typeof__(retp) _retp = retp;					\
	__typeof__(*retp) _ret = *_retp;				\
	__typeof__(*retp) _len;						\
									\
	if (_ret >= 0 && _ret < _size) {				\
		_len = snprintf(_buf + _ret, _size - _ret, ##fmt);	\
		if (_len < 0)						\
			_ret = _len;					\
		else							\
			_ret += _len;					\
		*_retp = _ret;						\
	}								\
} while (0)

static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
			   char *buf)
{
	DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
	struct quorum_status qst;
	struct last_msg last;
	struct timespec64 ts;
	const ktime_t now = ktime_get();
	size_t size;
	int ret;
	int i;

	spin_lock(&qinf->show_lock);
	qst = qinf->show_status;
	spin_unlock(&qinf->show_lock);

	size = PAGE_SIZE;
	ret = 0;

	snprintf_ret(buf, size, &ret, "quorum_slot_nr %u\n",
		     qinf->our_quorum_slot_nr);
	snprintf_ret(buf, size, &ret, "term %llu\n",
		     qst.term);
	snprintf_ret(buf, size, &ret, "server_start_term %llu\n", qst.server_start_term);
	snprintf_ret(buf, size, &ret, "server_event %d\n", qst.server_event);
	snprintf_ret(buf, size, &ret, "role %d (%s)\n",
		     qst.role, role_str(qst.role));
	snprintf_ret(buf, size, &ret, "vote_for %d\n",
		     qst.vote_for);
	snprintf_ret(buf, size, &ret, "vote_bits 0x%lx (count %lu)\n",
		     qst.vote_bits, hweight_long(qst.vote_bits));
	ts = ktime_to_timespec64(ktime_sub(qst.timeout, now));
	snprintf_ret(buf, size, &ret, "timeout_in_secs %lld.%09u\n",
		     (s64)ts.tv_sec, (int)ts.tv_nsec);

	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
		spin_lock(&qinf->show_lock);
		last = qinf->last_send[i];
		spin_unlock(&qinf->show_lock);

		if (last.msg.term == 0)
			continue;

		ts = ktime_to_timespec64(ktime_sub(now, last.ts));
		snprintf_ret(buf, size, &ret,
			     "last_send to %u term %llu type %u secs_since %lld.%09u\n",
			     i, last.msg.term, last.msg.type,
			     (s64)ts.tv_sec, (int)ts.tv_nsec);
	}

	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
		spin_lock(&qinf->show_lock);
		last = qinf->last_recv[i];
		spin_unlock(&qinf->show_lock);

		if (last.msg.term == 0)
			continue;

		ts = ktime_to_timespec64(ktime_sub(now, last.ts));
		snprintf_ret(buf, size, &ret,
			     "last_recv from %u term %llu type %u secs_since %lld.%09u\n",
			     i, last.msg.term, last.msg.type,
			     (s64)ts.tv_sec, (int)ts.tv_nsec);
	}

	return ret;
}
SCOUTFS_ATTR_RO(status);

static ssize_t is_leader_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
{
	DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);

	return snprintf(buf, PAGE_SIZE, "%u",
			!!(qinf->show_status.role == LEADER));
}
SCOUTFS_ATTR_RO(is_leader);

static struct attribute *quorum_attrs[] = {
	SCOUTFS_ATTR_PTR(status),
	SCOUTFS_ATTR_PTR(is_leader),
	NULL,
};

static inline bool valid_ipv4_unicast(__be32 addr)
{
	return !(ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
		 ipv4_is_zeronet(addr) || ipv4_is_local_multicast(addr));
}

static inline bool valid_ipv4_port(__be16 port)
{
	return port != 0 && be16_to_cpu(port) != U16_MAX;
}

static int verify_quorum_slots(struct super_block *sb)
{
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	char slots[(SCOUTFS_QUORUM_MAX_SLOTS * 3) + 1];
	DECLARE_QUORUM_INFO(sb, qinf);
	struct sockaddr_in other;
	struct sockaddr_in sin;
	int found = 0;
	int ret;
	int i;
	int j;

	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
		if (!quorum_slot_present(super, i))
			continue;

		scoutfs_quorum_slot_sin(super, i, &sin);

		if (!valid_ipv4_unicast(sin.sin_addr.s_addr)) {
			scoutfs_err(sb, "quorum slot #%d has invalid ipv4 unicast address: "SIN_FMT,
				    i, SIN_ARG(&sin));
			return -EINVAL;
		}

		if (!valid_ipv4_port(sin.sin_port)) {
			scoutfs_err(sb, "quorum slot #%d has invalid ipv4 port number: "SIN_FMT,
				    i, SIN_ARG(&sin));
			return -EINVAL;
		}

		for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
			if (!quorum_slot_present(super, j))
				continue;

			scoutfs_quorum_slot_sin(super, j, &other);

			if (sin.sin_addr.s_addr == other.sin_addr.s_addr &&
			    sin.sin_port == other.sin_port) {
				scoutfs_err(sb, "quorum slots #%u and #%u have the same address: "SIN_FMT,
					    i, j, SIN_ARG(&sin));
				return -EINVAL;
			}
		}

		found++;
	}

	if (found == 0) {
		scoutfs_err(sb, "no populated quorum slots in superblock");
		return -EINVAL;
	}

	if (!quorum_slot_present(super, qinf->our_quorum_slot_nr)) {
		char *str = slots;
		*str = '\0';
		for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
			if (quorum_slot_present(super, i)) {
				ret = snprintf(str, &slots[ARRAY_SIZE(slots)] - str, "%c%u",
					       str == slots ? ' ' : ',', i);
				if (ret < 2 || ret > 3) {
					scoutfs_err(sb, "error gathering populated slots");
					return -EINVAL;
				}
				str += ret;
			}
		}
		scoutfs_err(sb, "quorum_slot_nr=%u option references unused slot, must be one of the following configured slots:%s",
			    qinf->our_quorum_slot_nr, slots);
		return -EINVAL;
	}

	/*
	 * Always require a majority except in the pathological cases of
	 * 1 or 2 members.
	 */
	if (found < 3)
		qinf->votes_needed = 1;
	else
		qinf->votes_needed = (found / 2) + 1;

	return 0;
}

/*
 * Once this schedules the quorum worker it can be elected leader and
 * start the server, possibly before this returns.
 */
int scoutfs_quorum_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_mount_options opts;
	struct quorum_info *qinf;
	int ret;

	scoutfs_options_read(sb, &opts);
	if (opts.quorum_slot_nr < 0)
		return 0;

	qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
	if (!qinf) {
		ret = -ENOMEM;
		goto out;
	}

	spin_lock_init(&qinf->show_lock);
	INIT_WORK(&qinf->work, scoutfs_quorum_worker);
	scoutfs_sysfs_init_attrs(sb, &qinf->ssa);
	/* static for the lifetime of the mount */
	qinf->our_quorum_slot_nr = opts.quorum_slot_nr;

	sbi->quorum_info = qinf;
	qinf->sb = sb;

	ret = verify_quorum_slots(sb);
	if (ret < 0)
		goto out;

	/* create in setup so errors cause mount to fail */
	ret = create_socket(sb);
	if (ret < 0)
		goto out;

	ret = scoutfs_sysfs_create_attrs(sb, &qinf->ssa, quorum_attrs,
					 "quorum");
	if (ret < 0)
		goto out;

	schedule_work(&qinf->work);

out:
	if (ret)
		scoutfs_quorum_destroy(sb);

	return ret;
}

/*
 * Shut down the quorum worker and destroy all our resources.
 *
 * This is called after client destruction which only completes once
 * farewell requests are resolved. That only happens for a quorum member
 * once it isn't needed for quorum.
 *
 * The work is the only place that starts the server, and it stops the
 * server as it exits, so we can wait for it to finish and know that no
 * server can be running to call back into us as it shuts down.
 */
void scoutfs_quorum_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct quorum_info *qinf = SCOUTFS_SB(sb)->quorum_info;

	if (qinf) {
		qinf->shutdown = true;
		flush_work(&qinf->work);

		scoutfs_sysfs_destroy_attrs(sb, &qinf->ssa);
		if (qinf->sock)
			sock_release(qinf->sock);

		kfree(qinf);
		sbi->quorum_info = NULL;
	}
}