diff --git a/kmod/src/client.c b/kmod/src/client.c index 4bdbc15e..12c21399 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -34,13 +34,10 @@ /* * The client is responsible for maintaining a connection to the server. - * This includes managing quorum elections that determine which client - * should run the server that all the clients connect to. */ #define CLIENT_CONNECT_DELAY_MS (MSEC_PER_SEC / 10) #define CLIENT_CONNECT_TIMEOUT_MS (1 * MSEC_PER_SEC) -#define CLIENT_QUORUM_TIMEOUT_MS (5 * MSEC_PER_SEC) struct client_info { struct super_block *sb; @@ -303,27 +300,17 @@ out: * to the server. It's queued on mount and disconnect and we requeue * the work if the work fails and we're not shutting down. * - * In the typical case a mount reads the super blocks and finds the - * address of the currently running server and connects to it. - * Non-quorum member clients who can't connect will keep trying - * alternating reading the address and getting connect timeouts. - * - * Quorum members will try to elect a leader if they can't connect to - * the server. When then can't connect and are able to elect a leader - * then a new server is started. The new server will write its address - * in the super and everyone will be able to connect. + * We ask quorum for an address to try and connect to. If there isn't + * one, or it fails, we back off a bit before trying again. * * There's a tricky bit of coordination required to safely unmount. * Clients need to tell the server that they won't be coming back with a - * farewell request. Once a client receives its farewell response it - * can exit. But a majority of quorum members need to stick around to - * elect a server to process all their farewell requests. This is - * coordinated by having the greeting tell the server that a client is a - * quorum member. The server then holds on to farewell requests from - * members until only requests from the final quorum remain. These - * farewell responses are only sent after updating an unmount barrier in - * the super to indicate to the final quorum that they can safely exit - * without having received a farewell response over the network. + * farewell request. Once the server processes a farewell request from + * the client it can forget the client. If the connection is broken + * before the client gets the farewell response it doesn't want to + * reconnect to send it again.. instead the client can read the metadata + * device to check for the lack of an item which indicates that the + * server has processed its farewell. */ static void scoutfs_client_connect_worker(struct work_struct *work) { @@ -333,11 +320,9 @@ static void scoutfs_client_connect_worker(struct work_struct *work) struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = NULL; struct mount_options *opts = &sbi->opts; - const bool am_quorum = opts->server_addr.sin_addr.s_addr != 0; + const bool am_quorum = opts->quorum_slot_nr >= 0; struct scoutfs_net_greeting greet; struct sockaddr_in sin; - ktime_t timeout_abs; - u64 elected_term; int ret; super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS); @@ -359,36 +344,14 @@ static void scoutfs_client_connect_worker(struct work_struct *work) goto out; } - /* try to connect to the super's server address */ - scoutfs_addr_to_sin(&sin, &super->server_addr); - if (sin.sin_addr.s_addr != 0 && sin.sin_port != 0) - ret = scoutfs_net_connect(sb, client->conn, &sin, - CLIENT_CONNECT_TIMEOUT_MS); - else - ret = -ENOTCONN; - - if (ret < 0) { - /* non-quorum members will delay then retry connect */ - if (!am_quorum) - goto out; - - /* quorum members try to elect a leader */ - /* make sure local server isn't writing super during votes */ - scoutfs_server_stop(sb); - - timeout_abs = ktime_add_ms(ktime_get(), - CLIENT_QUORUM_TIMEOUT_MS); - - ret = scoutfs_quorum_election(sb, timeout_abs, - le64_to_cpu(super->quorum_server_term), - &elected_term); - /* start the server if we were asked to */ - if (elected_term > 0) - ret = scoutfs_server_start(sb, &opts->server_addr, - elected_term); - ret = -ENOTCONN; + ret = scoutfs_quorum_server_sin(sb, &sin); + if (ret < 0) + goto out; + + ret = scoutfs_net_connect(sb, client->conn, &sin, + CLIENT_CONNECT_TIMEOUT_MS); + if (ret < 0) goto out; - } /* send a greeting to verify endpoints of each connection */ greet.fsid = super->hdr.fsid; diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 819cf67c..f6aa6b3b 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -139,18 +139,21 @@ EXPAND_COUNTER(net_recv_invalid_message) \ EXPAND_COUNTER(net_recv_messages) \ EXPAND_COUNTER(net_unknown_request) \ - EXPAND_COUNTER(quorum_cycle) \ - EXPAND_COUNTER(quorum_elected_leader) \ - EXPAND_COUNTER(quorum_election_timeout) \ - EXPAND_COUNTER(quorum_failure) \ - EXPAND_COUNTER(quorum_read_block) \ - EXPAND_COUNTER(quorum_read_block_error) \ + EXPAND_COUNTER(quorum_elected) \ + EXPAND_COUNTER(quorum_fence_error) \ + EXPAND_COUNTER(quorum_fence_leader) \ EXPAND_COUNTER(quorum_read_invalid_block) \ - EXPAND_COUNTER(quorum_saw_super_leader) \ - EXPAND_COUNTER(quorum_timedout) \ - EXPAND_COUNTER(quorum_write_block) \ - EXPAND_COUNTER(quorum_write_block_error) \ - EXPAND_COUNTER(quorum_fenced) \ + EXPAND_COUNTER(quorum_recv_error) \ + EXPAND_COUNTER(quorum_recv_heartbeat) \ + EXPAND_COUNTER(quorum_recv_invalid) \ + EXPAND_COUNTER(quorum_recv_resignation) \ + EXPAND_COUNTER(quorum_recv_vote) \ + EXPAND_COUNTER(quorum_send_heartbeat) \ + EXPAND_COUNTER(quorum_send_resignation) \ + EXPAND_COUNTER(quorum_send_request) \ + EXPAND_COUNTER(quorum_send_vote) \ + EXPAND_COUNTER(quorum_server_shutdown) \ + EXPAND_COUNTER(quorum_term_follower) \ EXPAND_COUNTER(server_commit_hold) \ EXPAND_COUNTER(server_commit_queue) \ EXPAND_COUNTER(server_commit_worker) \ diff --git a/kmod/src/format.h b/kmod/src/format.h index 79fb8611..5f0f24d5 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -14,6 +14,7 @@ #define SCOUTFS_BLOCK_MAGIC_SRCH_BLOCK 0x897e4a7d #define SCOUTFS_BLOCK_MAGIC_SRCH_PARENT 0xb23a2a05 #define SCOUTFS_BLOCK_MAGIC_ALLOC_LIST 0x8a93ac83 +#define SCOUTFS_BLOCK_MAGIC_QUORUM 0xbc310868 /* * The super block, quorum block, and file data allocation granularity @@ -54,15 +55,19 @@ #define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT) /* - * A reasonably large region of aligned quorum blocks follow the super - * block. Each voting cycle reads the entire region so we don't want it - * to be too enormous. 256K seems like a reasonably chunky single IO. - * The number of blocks in the region also determines the number of - * mounts that have a reasonable probability of not overwriting each - * other's random block locations. + * A small number of quorum blocks follow the super block, enough of + * them to match the starting offset of the super block so the region is + * aligned to the power of two that contains it. */ -#define SCOUTFS_QUORUM_BLKNO ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT) -#define SCOUTFS_QUORUM_BLOCKS ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT) +#define SCOUTFS_QUORUM_BLKNO (SCOUTFS_SUPER_BLKNO + 1) +#define SCOUTFS_QUORUM_BLOCKS (SCOUTFS_SUPER_BLKNO - 1) + +/* + * Free metadata blocks start after the quorum blocks + */ +#define SCOUTFS_META_DEV_START_BLKNO \ + ((SCOUTFS_QUORUM_BLKNO + SCOUTFS_QUORUM_BLOCKS) >> \ + SCOUTFS_BLOCK_SM_LG_SHIFT) /* * Start data on the data device aligned as well. @@ -537,49 +542,77 @@ struct scoutfs_xattr { #define SCOUTFS_UUID_BYTES 16 -/* - * Mounts read all the quorum blocks and write to one random quorum - * block during a cycle. The min cycle time limits the per-mount iop - * load during elections. The random cycle delay makes it less likely - * that mounts will read and write at the same time and miss each - * other's writes. An election only completes if a quorum of mounts - * vote for a leader before any of their elections timeout. This is - * made less likely by the probability that mounts will overwrite each - * others random block locations. The max quorum count limits that - * probability. 9 mounts only have a 55% chance of writing to unique 4k - * blocks in a 256k region. The election timeout is set to include - * enough cycles to usually complete the election. Once a leader is - * elected it spends a number of cycles writing out blocks with itself - * logged as a leader. This reduces the possibility that servers - * will have their log entries overwritten and not be fenced. - */ -#define SCOUTFS_QUORUM_MAX_COUNT 9 -#define SCOUTFS_QUORUM_CYCLE_LO_MS 10 -#define SCOUTFS_QUORUM_CYCLE_HI_MS 20 -#define SCOUTFS_QUORUM_TERM_LO_MS 250 -#define SCOUTFS_QUORUM_TERM_HI_MS 500 -#define SCOUTFS_QUORUM_ELECTED_LOG_CYCLES 10 +#define SCOUTFS_QUORUM_MAX_SLOTS 15 -struct scoutfs_quorum_block { +/* + * To elect a leader, members race to have their variable election + * timeouts expire. If they're first to send a vote request with a + * greater term to a majority of waiting members they'll be elected with + * a majority. If the timeouts are too close, the vote may be split and + * everyone will wait for another cycle of variable timeouts to expire. + * + * These determine how long it will take to elect a leader once there's + * no evidence of a server (no leader quorum blocks on mount; heartbeat + * timeout expired.) + */ +#define SCOUTFS_QUORUM_ELECT_MIN_MS 250 +#define SCOUTFS_QUORUM_ELECT_VAR_MS 100 + +/* + * Once a leader is elected they send out heartbeats at regular + * intervals to force members to wait the much longer heartbeat timeout. + * Once heartbeat timeout expires without receiving a heartbeat they'll + * switch over the performing elections. + * + * These determine how long it could take members to notice that a + * leader has gone silent and start to elect a new leader. + */ +#define SCOUTFS_QUORUM_HB_IVAL_MS 100 +#define SCOUTFS_QUORUM_HB_TIMEO_MS (5 * MSEC_PER_SEC) + +struct scoutfs_quorum_message { __le64 fsid; - __le64 blkno; + __le64 version; __le64 term; - __le64 write_nr; - __le64 voter_rid; - __le64 vote_for_rid; + __u8 type; + __u8 from; + __u8 __pad[2]; __le32 crc; - __u8 log_nr; - __u8 __pad[3]; - struct scoutfs_quorum_log { - __le64 term; - __le64 rid; - struct scoutfs_inet_addr addr; - } log[0]; }; -#define SCOUTFS_QUORUM_LOG_MAX \ - ((SCOUTFS_BLOCK_SM_SIZE - sizeof(struct scoutfs_quorum_block)) / \ - sizeof(struct scoutfs_quorum_log)) +/* a candidate requests a vote */ +#define SCOUTFS_QUORUM_MSG_REQUEST_VOTE 0 +/* followers send votes to candidates */ +#define SCOUTFS_QUORUM_MSG_VOTE 1 +/* elected leaders broadcast heartbeats to delay elections */ +#define SCOUTFS_QUORUM_MSG_HEARTBEAT 2 +/* leaders broadcast as they leave to break heartbeat timeout */ +#define SCOUTFS_QUORUM_MSG_RESIGNATION 3 +#define SCOUTFS_QUORUM_MSG_INVALID 4 + +/* + * The version is currently always 0, but will be used by mounts to + * discover that membership has changed. + */ +struct scoutfs_quorum_config { + __le64 version; + struct scoutfs_quorum_slot { + struct scoutfs_inet_addr addr; + } slots[SCOUTFS_QUORUM_MAX_SLOTS]; +}; + +struct scoutfs_quorum_block { + struct scoutfs_block_header hdr; + __le64 term; + __le64 random_write_mark; + __le64 flags; + struct scoutfs_quorum_block_event { + __le64 rid; + struct scoutfs_timespec ts; + } write, update_term, set_leader, clear_leader, fenced; +}; + +#define SCOUTFS_QUORUM_BLOCK_LEADER (1 << 0) #define SCOUTFS_FLAG_IS_META_BDEV 0x01 @@ -597,12 +630,8 @@ struct scoutfs_super_block { __le64 total_data_blocks; __le64 first_data_blkno; __le64 last_data_blkno; - __le64 quorum_fenced_term; - __le64 quorum_server_term; __le64 unmount_barrier; - __u8 quorum_count; - __u8 __pad[7]; - struct scoutfs_inet_addr server_addr; + struct scoutfs_quorum_config qconf; struct scoutfs_alloc_root meta_alloc[2]; struct scoutfs_alloc_root data_alloc; struct scoutfs_alloc_list_head server_meta_avail[2]; diff --git a/kmod/src/options.c b/kmod/src/options.c index 4d698b36..757f9f97 100644 --- a/kmod/src/options.c +++ b/kmod/src/options.c @@ -28,7 +28,7 @@ #include "super.h" static const match_table_t tokens = { - {Opt_server_addr, "server_addr=%s"}, + {Opt_quorum_slot_nr, "quorum_slot_nr=%s"}, {Opt_metadev_path, "metadev_path=%s"}, {Opt_err, NULL} }; @@ -43,46 +43,6 @@ u32 scoutfs_option_u32(struct super_block *sb, int token) return 0; } -/* The caller's string is null terminted and can be clobbered */ -static int parse_ipv4(struct super_block *sb, char *str, - struct sockaddr_in *sin) -{ - unsigned long port = 0; - __be32 addr; - char *c; - int ret; - - /* null term port, if specified */ - c = strchr(str, ':'); - if (c) - *c = '\0'; - - /* parse addr */ - addr = in_aton(str); - if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) || - ipv4_is_zeronet(addr) || - ipv4_is_local_multicast(addr)) { - scoutfs_err(sb, "invalid unicast ipv4 address: %s", str); - return -EINVAL; - } - - /* parse port, if specified */ - if (c) { - c++; - ret = kstrtoul(c, 0, &port); - if (ret != 0 || port == 0 || port >= U16_MAX) { - scoutfs_err(sb, "invalid port in ipv4 address: %s", c); - return -EINVAL; - } - } - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = addr; - sin->sin_port = cpu_to_be16(port); - - return 0; -} - static int parse_bdev_path(struct super_block *sb, substring_t *substr, char **bdev_path_ret) { @@ -132,14 +92,15 @@ out: int scoutfs_parse_options(struct super_block *sb, char *options, struct mount_options *parsed) { - char ipstr[INET_ADDRSTRLEN + 1]; substring_t args[MAX_OPT_ARGS]; + int nr; int token; char *p; int ret; /* Set defaults */ memset(parsed, 0, sizeof(*parsed)); + parsed->quorum_slot_nr = -1; while ((p = strsep(&options, ",")) != NULL) { if (!*p) @@ -147,12 +108,23 @@ int scoutfs_parse_options(struct super_block *sb, char *options, token = match_token(p, tokens, args); switch (token) { - case Opt_server_addr: + case Opt_quorum_slot_nr: - match_strlcpy(ipstr, args, ARRAY_SIZE(ipstr)); - ret = parse_ipv4(sb, ipstr, &parsed->server_addr); - if (ret < 0) + if (parsed->quorum_slot_nr != -1) { + scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one."); + return -EINVAL; + } + + ret = match_int(args, &nr); + if (ret < 0 || nr < 0 || + nr >= SCOUTFS_QUORUM_MAX_SLOTS) { + scoutfs_err(sb, "invalid quorum_slot_nr option, must be between 0 and %u", + SCOUTFS_QUORUM_MAX_SLOTS - 1); + if (ret == 0) + ret = -EINVAL; return ret; + } + parsed->quorum_slot_nr = nr; break; case Opt_metadev_path: diff --git a/kmod/src/options.h b/kmod/src/options.h index b62be4d3..d948b5b7 100644 --- a/kmod/src/options.h +++ b/kmod/src/options.h @@ -6,13 +6,13 @@ #include "format.h" enum scoutfs_mount_options { - Opt_server_addr, + Opt_quorum_slot_nr, Opt_metadev_path, Opt_err, }; struct mount_options { - struct sockaddr_in server_addr; + int quorum_slot_nr; char *metadev_path; }; diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c index 43c398d9..e9c5ca3f 100644 --- a/kmod/src/quorum.c +++ b/kmod/src/quorum.c @@ -13,752 +13,1042 @@ #include #include #include -#include #include -#include -#include #include #include #include #include -#include +#include +#include +#include +#include +#include #include "format.h" #include "msg.h" #include "counters.h" #include "quorum.h" #include "server.h" +#include "block.h" #include "net.h" #include "sysfs.h" #include "scoutfs_trace.h" /* - * scoutfs mounts communicate through a region of preallocated blocks to - * elect a leader who starts the server. Mounts which have been - * configured with a server address and which can't connect to a server - * attempt to form a quorum to elect a new leader who starts a new + * This quorum subsystem is responsible for ensuring that only one + * server is ever running among the mounts and has exclusive read/write + * access to the server structures in the metadata device. + * + * A specific set of mounts are quorum members as indicated by the + * quorum_slot_nr mount option. That option refers to the slot in the + * super block that contains their configuration. Only these mounts + * participate in the election of the leader. + * + * As each quorum member mounts it starts background work that uses a + * simplified raft leader election protocol to elect a leader. Each + * mount listens on a udp socket at the address found in its slot in the + * super block. It then sends and receives raft messages to and from + * the other slot addresses in the super block. As the protocol + * progresses eventually a mount will receive enough votes to become the + * leader. We're not using the full key-value store of raft, just the + * leadership election. Much of the functionality matches the raft + * concepts (roles, messages, timeouts) but there's no key value logs to + * synchronize. + * + * Once elected leader, the mount now has to ensure that it's the only + * running server. There could be previously elected servers still + * running (maybe they've deadlocked, or lost network communications). + * In addition to a configuration slot in the super block, each quorum + * member also has a known block location that represents their slot. + * They set a flag in their block indicating that they've been elected + * leader, then read slots for all the other blocks looking for + * previously active leaders to fence. After that it can start the * server. * - * The mounts participating in the election use a variant of the raft - * election protocol to establish quorum and elect a leader. We use - * block reads and writes instead of network messages. Mounts read all - * the blocks looking for messages to receive. Mounts write their vote - * to a random block in the region to send a message to all other - * mounts. Unlikely collisions are analogous to lossy networks losing - * messages and are handled by the protocol. + * It's critical to raft elections that a participant's term not go + * backwards in time so each mount also uses its quorum block to store + * the greatest term it has used in messages. * - * We allow a "majority" of 1 voter when there are less than three - * possible voters. This lets a simple network establish quorum. If - * the raft quorum timeouts align to leaders could both elect themselves - * and race to fence each other. In the worst case they could continue - * to do this indefinitely but it's unlikely as it would require a - * sequence of identical random raft timeouts. + * The quorum work still runs in the background while the server is + * running. The leader quorum work will regularly send heartbeat + * messages to the other quorum members to keep them from electing a new + * leader. If the server shuts down, or the mount disappears, the other + * quorum members will stop receiving heartbeats and will elect a new + * leader. * - * One of the reasons we use block reads and writes as the quorum - * communication medium is that it lets us leave behind a shared - * persistent log of previous election results. This then lets a newly - * elected leader fence all previously elected leaders that haven't - * shutdown so that they can safely assume exclusive access to the - * shared device. Every written block includes a log of election - * results. Every voter merges the log from every block it reads the - * block it writes. A leader doesn't attempt to fence until it's spent - * a few cycles writing blocks with itself as the log entry. This gives - * other voters time to migrate the log entry through the blocks. - * - * Once a leader is elected it fences any previously elected leaders - * still present in the log it merged while reading all the voting - * blocks. Once they've fenced they update the super block record of - * the latest term that has been fenced. This trims the log over time - * and keeps from attempting to fence the same mounts multiple times. - * As the server later shuts down it writes its term into the super to - * stop it from being fenced. - * - * The final complication comes during unmount. Clients exit after the - * server responds to their farewell request. But a majority of clients - * need to be present to elect a server to process farewell requests. - * The server knows which clients will attempt to vote for quorum and - * only responds to their farewell requests once they're no longer - * needed to elect a server -- either there's still quorum remaining of - * other mounts or the only mounts remaining are all quorum voters that - * have sent farewell requests. Before sending these final responses - * the server updates an unmount_barrier field in the super. If clients - * that are waiting for a farewell response see the unmount barrier - * increment they know that their farewell has been processed and they - * can assume a successful farewell response and exit cleanly. - * - * XXX: - actually fence + * Typically we require a strict majority of the configured quorum + * members to elect a leader. However, for simple usability, we do + * allow a majority of 1 when there are only one or two quorum members. + * In the two member case this can lead to split elections where each + * mount races to elect itself as leader and attempt to fence the other. + * The random election timeouts in raft make this unlikely, but it is + * possible. */ -struct quorum_info { - struct scoutfs_sysfs_attrs ssa; - - bool is_leader; +/* + * The fields of the message that the receiver can use after the message + * has been validated. + */ +struct quorum_host_msg { + u64 term; + u8 type; + u8 from; }; +struct last_msg { + struct quorum_host_msg msg; + struct timespec64 ts; +}; + +enum quorum_role { FOLLOWER, CANDIDATE, LEADER }; + +struct quorum_status { + enum quorum_role role; + u64 term; + int vote_for; + unsigned long vote_bits; + ktime_t timeout; +}; + +struct quorum_info { + struct super_block *sb; + struct work_struct work; + struct socket *sock; + bool shutdown; + + unsigned long flags; + int votes_needed; + + spinlock_t show_lock; + struct quorum_status show_status; + struct last_msg last_send[SCOUTFS_QUORUM_MAX_SLOTS]; + struct last_msg last_recv[SCOUTFS_QUORUM_MAX_SLOTS]; + + struct scoutfs_sysfs_attrs ssa; +}; + +#define QINF_FLAG_SERVER 0 + #define DECLARE_QUORUM_INFO(sb, name) \ struct quorum_info *name = SCOUTFS_SB(sb)->quorum_info #define DECLARE_QUORUM_INFO_KOBJ(kobj, name) \ DECLARE_QUORUM_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name) -/* - * Return an absolute ktime timeout expires value in the future after a - * random duration between hi and lo where both limits are possible. - */ -static ktime_t random_to(u32 lo, u32 hi) +static bool quorum_slot_present(struct scoutfs_super_block *super, int i) { - return ktime_add_ms(ktime_get(), lo + prandom_u32_max((hi + 1) - lo)); + BUG_ON(i < 0 || i > SCOUTFS_QUORUM_MAX_SLOTS); + + return super->qconf.slots[i].addr.addr != 0; } -/* - * The caller is about to read all the quorum blocks. We invalidate any - * cached blocks and issue one large contiguous read to repopulate the - * cache. The caller then uses normal __bread to read each block. I'm - * not a huge fan of the plug but I couldn't get the individual - * readahead requests merged without it. - */ -static void readahead_quorum_blocks(struct super_block *sb) +static ktime_t election_timeout(void) { + return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_ELECT_MIN_MS + + prandom_u32_max(SCOUTFS_QUORUM_ELECT_VAR_MS)); +} + +static ktime_t heartbeat_interval(void) +{ + return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_IVAL_MS); +} + +static ktime_t heartbeat_timeout(void) +{ + return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_TIMEO_MS); +} + +static int create_socket(struct super_block *sb) +{ + DECLARE_QUORUM_INFO(sb, qinf); struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buffer_head *bh; - struct blk_plug plug; - int i; - - blk_start_plug(&plug); - - for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) { - bh = __getblk(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + i, - SCOUTFS_BLOCK_SM_SIZE); - if (!bh) - continue; - - lock_buffer(bh); - clear_buffer_uptodate(bh); - unlock_buffer(bh); - - ll_rw_block(READA | REQ_META | REQ_PRIO, 1, &bh); - brelse(bh); - } - - blk_finish_plug(&plug); -} - -struct quorum_block_head { - struct list_head head; - union { - struct scoutfs_quorum_block blk; - u8 bytes[SCOUTFS_BLOCK_SM_SIZE]; - }; -}; - -static void free_quorum_blocks(struct list_head *blocks) -{ - struct quorum_block_head *qbh; - struct quorum_block_head *tmp; - - list_for_each_entry_safe(qbh, tmp, blocks, head) { - list_del_init(&qbh->head); - kfree(qbh); - } -} - -/* - * Callers don't mind us clobbering the crc temporarily. - */ -static __le32 quorum_block_crc(struct scoutfs_quorum_block *blk) -{ - __le32 calc_crc; - __le32 blk_crc; - - blk_crc = blk->crc; - blk->crc = 0; - calc_crc = cpu_to_le32(crc32c(~0, blk, sizeof(*blk))); - blk->crc = blk_crc; - - return calc_crc; -} - -static size_t quorum_block_bytes(struct scoutfs_quorum_block *blk) -{ - return offsetof(struct scoutfs_quorum_block, - log[blk->log_nr]); -} - -static bool invalid_quorum_block(struct buffer_head *bh, - struct scoutfs_quorum_block *blk) -{ - return bh->b_size != SCOUTFS_BLOCK_SM_SIZE || - sizeof(struct scoutfs_quorum_block) > SCOUTFS_BLOCK_SM_SIZE || - quorum_block_crc(blk) != blk->crc || - le64_to_cpu(blk->blkno) != bh->b_blocknr || - blk->term == 0 || - blk->log_nr > SCOUTFS_QUORUM_LOG_MAX || - quorum_block_bytes(blk) > SCOUTFS_BLOCK_SM_SIZE; -} - -/* true if a is stale and should be ignored */ -static bool stale_quorum_block(struct scoutfs_quorum_block *a, - struct scoutfs_quorum_block *b) -{ - if (le64_to_cpu(a->term) < le64_to_cpu(b->term)) - return true; - - if (le64_to_cpu(a->voter_rid) == le64_to_cpu(b->voter_rid) && - le64_to_cpu(a->write_nr) <= le64_to_cpu(b->write_nr)) - return true; - - return false; -} - -/* - * Get the most recent blocks from all the voters for the most recent term. - * We ignore any corrupt blocks, blocks not for our fsid, previous terms, - * and previous writes from a rid in the current term. - */ -static int read_quorum_blocks(struct super_block *sb, struct list_head *blocks) -{ - struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_quorum_block *blk; - struct quorum_block_head *qbh; - struct quorum_block_head *tmp; - struct buffer_head *bh = NULL; - LIST_HEAD(stale); - int ret; - int i; - - readahead_quorum_blocks(sb); - - for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) { - brelse(bh); - bh = __bread(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + i, - SCOUTFS_BLOCK_SM_SIZE); - if (!bh) { - scoutfs_inc_counter(sb, quorum_read_block_error); - ret = -EIO; - goto out; - } - blk = (void *)(bh->b_data); - - /* ignore unwritten blocks or blocks for other filesystems */ - if (blk->voter_rid == 0 || blk->fsid != super->hdr.fsid) - continue; - - if (invalid_quorum_block(bh, blk)) { - scoutfs_inc_counter(sb, quorum_read_invalid_block); - continue; - } - - list_for_each_entry_safe(qbh, tmp, blocks, head) { - if (stale_quorum_block(blk, &qbh->blk)) { - blk = NULL; - break; - } - - if (stale_quorum_block(&qbh->blk, blk)) - list_move(&qbh->head, &stale); - } - free_quorum_blocks(&stale); - - if (!blk) - continue; - - qbh = kmalloc(sizeof(struct quorum_block_head), - GFP_NOFS); - if (!qbh) { - ret = -ENOMEM; - goto out; - } - - memcpy(&qbh->blk, blk, quorum_block_bytes(blk)); - list_add_tail(&qbh->head, blocks); - } - - list_for_each_entry(qbh, blocks, head) { - trace_scoutfs_quorum_read_block(sb, &qbh->blk); - scoutfs_inc_counter(sb, quorum_read_block); - } - - ret = 0; -out: - brelse(bh); - if (ret < 0) - free_quorum_blocks(blocks); - return ret; -} - -/* - * Synchronously write a single quorum block. The caller has provided - * the meaningful fields for the write. We fill in the fsid, blkno, and - * crc for every write and zero the rest of the block. - */ -static int write_quorum_block(struct super_block *sb, - struct scoutfs_quorum_block *our_blk) -{ - struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_quorum_block *blk; - struct buffer_head *bh = NULL; - size_t size; + struct mount_options *opts = &sbi->opts; + struct scoutfs_super_block *super = &sbi->super; + struct socket *sock = NULL; + struct sockaddr_in sin; + int addrlen; int ret; - BUILD_BUG_ON(sizeof(struct scoutfs_quorum_block) > - SCOUTFS_BLOCK_SM_SIZE); - - bh = __getblk(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + - prandom_u32_max(SCOUTFS_QUORUM_BLOCKS), - SCOUTFS_BLOCK_SM_SIZE); - if (bh == NULL) { - ret = -EIO; + ret = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (ret) { + scoutfs_err(sb, "quorum couldn't create udp socket: %d", ret); goto out; } - size = quorum_block_bytes(our_blk); - if (WARN_ON_ONCE(size > SCOUTFS_BLOCK_SM_SIZE || size > bh->b_size)) { - ret = -EIO; + sock->sk->sk_allocation = GFP_NOFS; + + scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin); + + addrlen = sizeof(sin); + ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen); + if (ret) { + scoutfs_err(sb, "quorum failed to bind udp socket to "SIN_FMT": %d", + SIN_ARG(&sin), ret); goto out; } - blk = (void *)bh->b_data; - memset(blk, 0, bh->b_size); - memcpy(blk, our_blk, size); - - blk->fsid = super->hdr.fsid; - blk->blkno = cpu_to_le64(bh->b_blocknr); - blk->crc = quorum_block_crc(blk); - - lock_buffer(bh); - set_buffer_mapped(bh); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); - - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - ret = -EIO; - else - ret = 0; - - if (ret == 0) { - trace_scoutfs_quorum_write_block(sb, blk); - scoutfs_inc_counter(sb, quorum_write_block); - } out: - if (ret) - scoutfs_inc_counter(sb, quorum_write_block_error); - brelse(bh); + if (ret < 0 && sock) { + sock_release(sock); + sock = NULL; + } + qinf->sock = sock; return ret; } -/* - * Returns true if there's an entry for the given election. - */ -static bool log_contains(struct scoutfs_quorum_block *blk, u64 term, u64 rid) +static __le32 quorum_message_crc(struct scoutfs_quorum_message *qmes) { - int i; + /* crc up to the crc field at the end */ + unsigned int len = offsetof(struct scoutfs_quorum_message, crc); - for (i = 0; i < blk->log_nr; i++) { - if (le64_to_cpu(blk->log[i].term) == term && - le64_to_cpu(blk->log[i].rid) == rid) - return true; - } - - return false; + return cpu_to_le32(crc32c(~0, qmes, len)); } -/* add an entry to the log, returning error if it's full */ -static int log_add(struct scoutfs_quorum_block *blk, u64 term, u64 rid, - struct scoutfs_inet_addr *addr) -{ - int i; - - if (log_contains(blk, term, rid)) - return 0; - - if (blk->log_nr == SCOUTFS_QUORUM_LOG_MAX) - return -ENOSPC; - - i = blk->log_nr++; - blk->log[i].term = cpu_to_le64(term); - blk->log[i].rid = cpu_to_le64(rid); - blk->log[i].addr = *addr; - - return 0; -} - -/* migrate live log entries between blocks, returning err if full */ -static int log_merge(struct scoutfs_quorum_block *our_blk, - struct scoutfs_quorum_block *blk, - u64 fenced_term) -{ - int ret; - int i; - - for (i = 0; i < blk->log_nr; i++) { - if (le64_to_cpu(blk->log[i].term) > fenced_term) { - ret = log_add(our_blk, le64_to_cpu(blk->log[i].term), - le64_to_cpu(blk->log[i].rid), - &blk->log[i].addr); - if (ret < 0) - return ret; - } - } - - return 0; -} - -/* Remove old log entries for a voter before a given term. */ -static void log_purge(struct scoutfs_quorum_block *blk, u64 term, u64 rid) -{ - int i; - - for (i = 0; i < blk->log_nr; i++) { - if (le64_to_cpu(blk->log[i].term) < term && - le64_to_cpu(blk->log[i].rid) == rid) { - if (i != blk->log_nr - 1) - swap(blk->log[i], blk->log[blk->log_nr - 1]); - blk->log_nr--; - i--; /* continue from swapped in entry */ - } - } -} - - -/* - * The caller received a majority of votes and has been elected. Before - * assuming exclusive write access to the device we fence the winners of - * any previous elections still present in the log. Once they're fenced - * we re-read the super and update the fenced_term to indicate that - * those previous elections can be ignored and purged from the log. - * - * We can be attempting this concurrently with both previous and future - * elected leaders. The leader with the greatest elected term will win - * and fence all previous elected leaders. - * - * We clobber the caller's block as we go to not fence rids multiple times. - */ -static int fence_previous(struct super_block *sb, - struct scoutfs_quorum_block *blk, - u64 our_rid, u64 fenced_term, u64 term) -{ - struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct sockaddr_in their_sin; - int ret; - int i; - - for (i = 0; i < blk->log_nr; i++) { - if (le64_to_cpu(blk->log[i].rid) != our_rid && - le64_to_cpu(blk->log[i].term) > fenced_term && - le64_to_cpu(blk->log[i].term) < term) { - - scoutfs_inc_counter(sb, quorum_fenced); - scoutfs_addr_to_sin(&their_sin, &blk->log[i].addr); - scoutfs_err(sb, "fencing "SCSBF" at "SIN_FMT, - SCSB_LEFR_ARGS(super->hdr.fsid, - blk->log[i].rid), - SIN_ARG(&their_sin)); - - log_purge(blk, term, le64_to_cpu(blk->log[i].rid)); - i = -1; /* start over */ - } - } - - /* update fenced term now that we have exclusive access */ - ret = 0; - super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS); - if (super) { - ret = scoutfs_read_super(sb, super); - if (ret == 0) { - super->quorum_fenced_term = cpu_to_le64(term - 1); - ret = scoutfs_write_super(sb, super); - - } - kfree(super); - } else { - ret = -ENOMEM; - } - - if (ret != 0) { - scoutfs_err(sb, "failed to update fenced_term in super, this mount will probably be fenced"); - } - - return ret; -} - - - -/* - * The calling voting mount couldn't connect to a server. Participate - * in a raft election to chose a mount to start a new server. If a - * majority of other mounts join us then one of us will be elected and - * our caller will start the server. - * - * Voting members read the blocks at regular intervals. If they see a - * new election they vote for that candidate for the remainder of the - * election. If the election timeout expires they will start a new - * election and vote for themselves. Eventually a sufficient majority - * sees a new election and all vote in the majority for that candidate. - * - * The calling client may have just failed to connect to an elected - * address in the super block. We assume that server is dead and ignore - * it when trying to elect a new leader. But we eventually return with - * a timeout because the server could actually be fine and the client - * could have had communication to the server restored. - * - * We return success if we see a new server elected. If we are elected - * we set the caller's elected_term so they know to start the server. - */ -int scoutfs_quorum_election(struct super_block *sb, ktime_t timeout_abs, - u64 prev_term, u64 *elected_term) +static void send_msg_members(struct super_block *sb, int type, u64 term, + int only) { DECLARE_QUORUM_INFO(sb, qinf); struct mount_options *opts = &SCOUTFS_SB(sb)->opts; - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = NULL; - struct scoutfs_quorum_block *our_blk = NULL; - struct scoutfs_quorum_block *blk; - struct quorum_block_head *qbh; - struct scoutfs_inet_addr addr; - enum { VOTER, CANDIDATE }; - ktime_t cycle_to; - ktime_t term_to; - LIST_HEAD(blocks); - u64 vote_for_write_nr; - u64 vote_for_rid; - u64 write_nr; - u64 term; - int log_cycles = 0; - int votes; - int role; + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct timespec64 ts; + int i; + + struct scoutfs_quorum_message qmes = { + .fsid = super->hdr.fsid, + .term = cpu_to_le64(term), + .type = type, + .from = opts->quorum_slot_nr, + }; + struct kvec kv = { + .iov_base = &qmes, + .iov_len = sizeof(qmes), + }; + struct sockaddr_in sin; + struct msghdr mh = { + .msg_iov = (struct iovec *)&kv, + .msg_iovlen = 1, + .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, + .msg_name = &sin, + .msg_namelen = sizeof(sin), + }; + + trace_scoutfs_quorum_send_message(sb, term, type, only); + + qmes.crc = quorum_message_crc(&qmes); + + ts = ktime_to_timespec64(ktime_get()); + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + if (!quorum_slot_present(super, i) || + (only >= 0 && i != only) || i == opts->quorum_slot_nr) + continue; + + scoutfs_quorum_slot_sin(super, i, &sin); + kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len); + + spin_lock(&qinf->show_lock); + qinf->last_send[i].msg.term = term; + qinf->last_send[i].msg.type = type; + qinf->last_send[i].ts = ts; + spin_unlock(&qinf->show_lock); + + if (i == only) + break; + } +} + +#define send_msg_to(sb, type, term, nr) send_msg_members(sb, type, term, nr) +#define send_msg_others(sb, type, term) send_msg_members(sb, type, term, -1) + +/* + * The caller passes in their absolute timeout which we translate to a + * relative timeval for RCVTIMEO. It defines a 0.0 timeval as blocking + * indefinitely so we're careful to set dontwait if we happen to hit a + * 0.0 timeval. + */ +static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg, + ktime_t abs_to) +{ + DECLARE_QUORUM_INFO(sb, qinf); + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_quorum_message qmes; + struct timeval tv; + ktime_t rel_to; + ktime_t now; int ret; - *elected_term = 0; + struct kvec kv = { + .iov_base = &qmes, + .iov_len = sizeof(struct scoutfs_quorum_message), + }; + struct msghdr mh = { + .msg_iov = (struct iovec *)&kv, + .msg_iovlen = 1, + .msg_flags = MSG_NOSIGNAL, + }; - trace_scoutfs_quorum_election(sb, prev_term); + memset(msg, 0, sizeof(*msg)); - super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS); - our_blk = kmalloc(SCOUTFS_BLOCK_SM_SIZE, GFP_NOFS); - if (!super || !our_blk) { - ret = -ENOMEM; - goto out; - } + now = ktime_get(); + if (ktime_before(now, abs_to)) + rel_to = ktime_sub(abs_to, now); + else + rel_to = ns_to_ktime(0); - /* start out as a passive voter */ - role = VOTER; - term = 0; - write_nr = 0; - vote_for_rid = 0; - vote_for_write_nr = 0; - - /* we'll become a candidate if we don't see another candidate */ - term_to = random_to(SCOUTFS_QUORUM_TERM_LO_MS, - SCOUTFS_QUORUM_TERM_HI_MS); - - for (;;) { - memset(our_blk, 0, SCOUTFS_BLOCK_SM_SIZE); - - scoutfs_inc_counter(sb, quorum_cycle); - - ret = scoutfs_read_super(sb, super); - if (ret) - goto out; - - /* done if we see evidence of a new server */ - if (le64_to_cpu(super->quorum_server_term) > prev_term) { - scoutfs_inc_counter(sb, quorum_saw_super_leader); - ret = 0; - goto out; - } - - /* done if we couldn't elect anyone */ - if (ktime_after(ktime_get(), timeout_abs)) { - scoutfs_inc_counter(sb, quorum_timedout); - ret = -ETIMEDOUT; - goto out; - } - - /* become a candidate if the election times out */ - if (ktime_after(ktime_get(), term_to)) { - scoutfs_inc_counter(sb, quorum_election_timeout); - term_to = random_to(SCOUTFS_QUORUM_TERM_LO_MS, - SCOUTFS_QUORUM_TERM_HI_MS); - role = CANDIDATE; - term++; - vote_for_rid = sbi->rid; - log_cycles = 0; - } - - free_quorum_blocks(&blocks); - ret = read_quorum_blocks(sb, &blocks); + tv = ktime_to_timeval(rel_to); + if (tv.tv_sec == 0 && tv.tv_usec == 0) { + mh.msg_flags |= MSG_DONTWAIT; + } else { + ret = kernel_setsockopt(qinf->sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof(tv)); if (ret < 0) - goto out; - - votes = 0; - - list_for_each_entry(qbh, &blocks, head) { - blk = &qbh->blk; - - /* - * Become a voter for a candidate the first time - * we see a new term. - * - * And also if we're a candidate and see a - * higher rid candidate in our term. This - * minimizes instability when two quorums are - * possible and race to elect two leaders. This - * is only barely reasonable when accepting the - * risk of instability in two mount - * configurations. - */ - if ((le64_to_cpu(blk->term) > term) || - (role == CANDIDATE && - le64_to_cpu(blk->term) == term && - blk->voter_rid == blk->vote_for_rid && - le64_to_cpu(blk->voter_rid) > sbi->rid)) { - role = VOTER; - term = le64_to_cpu(blk->term); - vote_for_rid = le64_to_cpu(blk->vote_for_rid); - vote_for_write_nr = 0; - votes = 0; - log_cycles = 0; - } - - /* candidate writes suppress voter election timers */ - if (role == VOTER && - blk->voter_rid == blk->vote_for_rid && - le64_to_cpu(blk->write_nr) > vote_for_write_nr) { - term_to = random_to(SCOUTFS_QUORUM_TERM_LO_MS, - SCOUTFS_QUORUM_TERM_HI_MS); - vote_for_write_nr = le64_to_cpu(blk->write_nr); - } - - /* count our votes */ - if (role == CANDIDATE && - le64_to_cpu(blk->vote_for_rid) == sbi->rid) { - votes++; - } - - /* try to write greater write_nr */ - write_nr = max(write_nr, le64_to_cpu(blk->write_nr)); - } - - trace_scoutfs_quorum_election_vote(sb, role, term, - vote_for_rid, votes, - log_cycles, - super->quorum_count); - - /* first merge logs from all votes this term */ - list_for_each_entry(qbh, &blocks, head) { - blk = &qbh->blk; - - ret = log_merge(our_blk, blk, - le64_to_cpu(super->quorum_fenced_term)); - if (ret < 0) - goto out; - } - - /* remove logs for voters that can't be servers */ - list_for_each_entry(qbh, &blocks, head) { - blk = &qbh->blk; - - if (blk->voter_rid != blk->vote_for_rid) - log_purge(our_blk, le64_to_cpu(blk->term), - le64_to_cpu(blk->voter_rid)); - } - - /* add ourselves to the log when we see vote quorum */ - if (role == CANDIDATE && votes >= super->quorum_count) { - scoutfs_addr_from_sin(&addr, &opts->server_addr); - ret = log_add(our_blk, term, vote_for_rid, &addr); - if (ret < 0) - goto out; - log_cycles++; /* will be written *this* cycle */ - } - - /* elected candidates can proceed after their log cycles */ - if (role == CANDIDATE && - log_cycles > SCOUTFS_QUORUM_ELECTED_LOG_CYCLES) { - /* our_blk is clobbered */ - ret = fence_previous(sb, our_blk, sbi->rid, - le64_to_cpu(super->quorum_fenced_term), - term); - if (ret < 0) - goto out; - scoutfs_inc_counter(sb, quorum_elected_leader); - qinf->is_leader = true; - *elected_term = term; - goto out; - } - - /* write our block every cycle */ - if (term > 0) { - our_blk->term = cpu_to_le64(term); - write_nr++; - our_blk->write_nr = cpu_to_le64(write_nr); - our_blk->voter_rid = cpu_to_le64(sbi->rid); - our_blk->vote_for_rid = cpu_to_le64(vote_for_rid); - - ret = write_quorum_block(sb, our_blk); - if (ret < 0) - goto out; - } - - /* add a small random delay to each cycle */ - cycle_to = random_to(SCOUTFS_QUORUM_CYCLE_LO_MS, - SCOUTFS_QUORUM_CYCLE_HI_MS); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&cycle_to, HRTIMER_MODE_ABS); + return ret; } -out: - free_quorum_blocks(&blocks); - kfree(super); - kfree(our_blk); + ret = kernel_recvmsg(qinf->sock, &mh, &kv, 1, kv.iov_len, mh.msg_flags); + if (ret < 0) + return ret; - trace_scoutfs_quorum_election_ret(sb, ret, *elected_term); - if (ret) - scoutfs_inc_counter(sb, quorum_failure); + if (ret != sizeof(qmes) || + qmes.crc != quorum_message_crc(&qmes) || + qmes.fsid != super->hdr.fsid || + qmes.type >= SCOUTFS_QUORUM_MSG_INVALID || + qmes.from >= SCOUTFS_QUORUM_MAX_SLOTS || + !quorum_slot_present(super, qmes.from)) { + /* should we be trying to open a new socket? */ + scoutfs_inc_counter(sb, quorum_recv_invalid); + return -EAGAIN; + } + + msg->term = le64_to_cpu(qmes.term); + msg->type = qmes.type; + msg->from = qmes.from; + + trace_scoutfs_quorum_recv_message(sb, msg->term, msg->type, msg->from); + + spin_lock(&qinf->show_lock); + qinf->last_recv[msg->from].msg = *msg; + qinf->last_recv[msg->from].ts = ktime_to_timespec64(ktime_get()); + spin_unlock(&qinf->show_lock); + + return 0; +} + +/* + * The caller can provide a mark that they're using to track their + * written blocks. It's updated as they write the block and we can + * compare it with what we read to see if there have been unexpected + * intervening writes to the block -- the caller is supposed to have + * exclusive access to the block (or was fenced). + */ +static int read_quorum_block(struct super_block *sb, u64 blkno, + struct scoutfs_quorum_block *blk, __le64 *mark) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + __le32 crc; + int ret; + + if (WARN_ON_ONCE(blkno < SCOUTFS_QUORUM_BLKNO) || + WARN_ON_ONCE(blkno >= (SCOUTFS_QUORUM_BLKNO + + SCOUTFS_QUORUM_BLOCKS))) + return -EINVAL; + + ret = scoutfs_block_read_sm(sb, sbi->meta_bdev, blkno, + &blk->hdr, sizeof(*blk), &crc); + + /* detect invalid blocks */ + if (ret == 0 && + ((blk->hdr.crc != crc) || + (le32_to_cpu(blk->hdr.magic) != SCOUTFS_BLOCK_MAGIC_QUORUM) || + (blk->hdr.fsid != super->hdr.fsid) || + (le64_to_cpu(blk->hdr.blkno) != blkno))) { + scoutfs_inc_counter(sb, quorum_read_invalid_block); + ret = -EIO; + } + + if (mark && *mark != 0 && blk->random_write_mark != *mark) { + scoutfs_err(sb, "read unexpected quorum block write mark, are multiple mounts configured with the same slot?"); + ret = -EIO; + } + + if (ret < 0) + scoutfs_err(sb, "quorum block read error %d", ret); return ret; } -void scoutfs_quorum_clear_leader(struct super_block *sb) +static void set_quorum_block_event(struct super_block *sb, + struct scoutfs_quorum_block *blk, + struct scoutfs_quorum_block_event *ev) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct timespec64 ts; + + getnstimeofday64(&ts); + + ev->rid = cpu_to_le64(sbi->rid); + ev->ts.sec = cpu_to_le64(ts.tv_sec); + ev->ts.nsec = cpu_to_le32(ts.tv_nsec); +} + +/* + * Every time we write a block we update the write stamp and random + * write mark so readers can see our write. + */ +static int write_quorum_block(struct super_block *sb, u64 blkno, + struct scoutfs_quorum_block *blk, __le64 *mark) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + int ret; + + if (WARN_ON_ONCE(blkno < SCOUTFS_QUORUM_BLKNO) || + WARN_ON_ONCE(blkno >= (SCOUTFS_QUORUM_BLKNO + + SCOUTFS_QUORUM_BLOCKS))) + return -EINVAL; + + do { + get_random_bytes(&blk->random_write_mark, + sizeof(blk->random_write_mark)); + } while (blk->random_write_mark == 0); + + if (mark) + *mark = blk->random_write_mark; + + set_quorum_block_event(sb, blk, &blk->write); + + ret = scoutfs_block_write_sm(sb, sbi->meta_bdev, blkno, + &blk->hdr, sizeof(*blk)); + if (ret < 0) + scoutfs_err(sb, "quorum block write error %d", ret); + + return ret; +} + +/* + * Read the caller's slot's current quorum block, make a change, and + * write it back out. If the caller provides a mark it can cause read + * errors if we read a mark that doesn't match the last mark that the + * caller wrote. + */ +static int update_quorum_block(struct super_block *sb, u64 blkno, + __le64 *mark, int role, u64 term) +{ + struct scoutfs_quorum_block blk; + u64 flags; + u64 bits; + u64 set; + int ret; + + ret = read_quorum_block(sb, blkno, &blk, mark); + if (ret == 0) { + if (blk.term != cpu_to_le64(term)) { + blk.term = cpu_to_le64(term); + set_quorum_block_event(sb, &blk, &blk.update_term); + } + + flags = le64_to_cpu(blk.flags); + bits = SCOUTFS_QUORUM_BLOCK_LEADER; + set = role == LEADER ? SCOUTFS_QUORUM_BLOCK_LEADER : 0; + if ((flags & bits) != set) + set_quorum_block_event(sb, &blk, + set ? &blk.set_leader : + &blk.clear_leader); + blk.flags = cpu_to_le64((flags & ~bits) | set); + + ret = write_quorum_block(sb, blkno, &blk, mark); + } + + return ret; +} + + +/* + * The calling server has been elected and updated their block, but + * can't yet assume that it has exclusive access to the metadata device. + * We read all the quorum blocks looking for previously elected leaders + * to fence so that we're the only leader running. + */ +static int fence_leader_blocks(struct super_block *sb) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct mount_options *opts = &sbi->opts; + struct scoutfs_quorum_block blk; + struct sockaddr_in sin; + u64 blkno; + int ret = 0; + int i; + + BUILD_BUG_ON(SCOUTFS_QUORUM_BLOCKS < SCOUTFS_QUORUM_MAX_SLOTS); + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + if (i == opts->quorum_slot_nr) + continue; + + blkno = SCOUTFS_QUORUM_BLKNO + i; + ret = read_quorum_block(sb, blkno, &blk, NULL); + if (ret < 0) + goto out; + + if (!(le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER)) + continue; + + scoutfs_inc_counter(sb, quorum_fence_leader); + scoutfs_quorum_slot_sin(super, i, &sin); + + scoutfs_err(sb, "fencing "SCSBF" at "SIN_FMT, + SCSB_LEFR_ARGS(super->hdr.fsid, blk.set_leader.rid), + SIN_ARG(&sin)); + + blk.flags &= ~cpu_to_le64(SCOUTFS_QUORUM_BLOCK_LEADER); + set_quorum_block_event(sb, &blk, &blk.fenced); + + ret = write_quorum_block(sb, blkno, &blk, NULL); + if (ret < 0) + goto out; + } + +out: + if (ret < 0) { + scoutfs_err(sb, "error %d fencing active", ret); + scoutfs_inc_counter(sb, quorum_fence_error); + } + + return ret; +} + +/* + * The quorum work always runs in the background of quorum member + * mounts. It's responsible for starting and stopping the server if + * it's elected leader, and the server can call back into it to let it + * know that it has shut itself down (perhaps due to error) so that the + * work should stop sending heartbeats. + */ +static void scoutfs_quorum_worker(struct work_struct *work) +{ + struct quorum_info *qinf = container_of(work, struct quorum_info, work); + struct super_block *sb = qinf->sb; + struct mount_options *opts = &SCOUTFS_SB(sb)->opts; + struct scoutfs_quorum_block blk; + struct sockaddr_in unused; + struct quorum_host_msg msg; + struct quorum_status qst; + __le64 mark; + u64 blkno; + int ret; + + /* recording votes from slots as native single word bitmap */ + BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG); + + /* get our starting term from our persistent block */ + mark = 0; + blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr; + ret = read_quorum_block(sb, blkno, &blk, &mark); + if (ret < 0) + goto out; + + /* start out as a follower */ + qst.role = FOLLOWER; + qst.term = le64_to_cpu(blk.term); + qst.vote_for = -1; + qst.vote_bits = 0; + + /* see if there's a server to chose heartbeat or election timeout */ + if (scoutfs_quorum_server_sin(sb, &unused) == 0) + qst.timeout = heartbeat_timeout(); + else + qst.timeout = election_timeout(); + + while (!qinf->shutdown) { + + ret = recv_msg(sb, &msg, qst.timeout); + if (ret < 0) { + if (ret != -ETIMEDOUT && ret != -EAGAIN) { + scoutfs_err(sb, "quorum recvmsg err %d", ret); + scoutfs_inc_counter(sb, quorum_recv_error); + goto out; + } + msg.type = SCOUTFS_QUORUM_MSG_INVALID; + ret = 0; + } + + /* ignore messages from older terms */ + if (msg.type != SCOUTFS_QUORUM_MSG_INVALID && + msg.term < qst.term) + msg.type = SCOUTFS_QUORUM_MSG_INVALID; + + /* if the server has shutdown we become follower */ + if (!test_bit(QINF_FLAG_SERVER, &qinf->flags) && + qst.role == LEADER) { + qst.role = FOLLOWER; + qst.vote_for = -1; + qst.vote_bits = 0; + qst.timeout = election_timeout(); + scoutfs_inc_counter(sb, quorum_server_shutdown); + + send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, + qst.term); + scoutfs_inc_counter(sb, quorum_send_resignation); + + ret = update_quorum_block(sb, blkno, &mark, + qst.role, qst.term); + if (ret < 0) + goto out; + } + + spin_lock(&qinf->show_lock); + qinf->show_status = qst; + spin_unlock(&qinf->show_lock); + + trace_scoutfs_quorum_loop(sb, qst.role, qst.term, qst.vote_for, + qst.vote_bits, + ktime_to_timespec64(qst.timeout)); + + /* receiving greater terms resets term, becomes follower */ + if (msg.type != SCOUTFS_QUORUM_MSG_INVALID && + msg.term > qst.term) { + if (qst.role == LEADER) { + scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.", + msg.type, msg.from, msg.term, qst.term); + scoutfs_server_stop(sb); + } + qst.role = FOLLOWER; + qst.term = msg.term; + qst.vote_for = -1; + qst.vote_bits = 0; + scoutfs_inc_counter(sb, quorum_term_follower); + + if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) + qst.timeout = heartbeat_timeout(); + else + qst.timeout = election_timeout(); + + /* store our increased term */ + ret = update_quorum_block(sb, blkno, &mark, + qst.role, qst.term); + if (ret < 0) + goto out; + } + + /* followers and candidates start new election on timeout */ + if (qst.role != LEADER && + ktime_after(ktime_get(), qst.timeout)) { + qst.role = CANDIDATE; + qst.term++; + qst.vote_for = -1; + qst.vote_bits = 0; + set_bit(opts->quorum_slot_nr, &qst.vote_bits); + send_msg_others(sb, SCOUTFS_QUORUM_MSG_REQUEST_VOTE, + qst.term); + qst.timeout = election_timeout(); + scoutfs_inc_counter(sb, quorum_send_request); + } + + /* candidates count votes in their term */ + if (qst.role == CANDIDATE && + msg.type == SCOUTFS_QUORUM_MSG_VOTE) { + if (test_bit(msg.from, &qst.vote_bits)) { + scoutfs_warn(sb, "already received vote from %u in term %llu, are there multiple mounts with quorum_slot_nr=%u?", + msg.from, qst.term, msg.from); + } + set_bit(msg.from, &qst.vote_bits); + scoutfs_inc_counter(sb, quorum_recv_vote); + } + + /* + * Candidates become leaders when they receive enough + * votes. (Possibly only counting their own vote in + * single vote majorities.) + */ + if (qst.role == CANDIDATE && + hweight_long(qst.vote_bits) >= qinf->votes_needed) { + qst.role = LEADER; + scoutfs_inc_counter(sb, quorum_elected); + + /* send heartbeat before server starts */ + send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT, + qst.term); + qst.timeout = heartbeat_interval(); + + /* set our leader flag and fence */ + ret = update_quorum_block(sb, blkno, &mark, + qst.role, qst.term) ?: + fence_leader_blocks(sb); + if (ret < 0) + goto out; + + /* make very sure server is fully shut down */ + scoutfs_server_stop(sb); + /* set server bit before server shutdown could clear */ + set_bit(QINF_FLAG_SERVER, &qinf->flags); + + ret = scoutfs_server_start(sb, qst.term); + if (ret < 0) { + scoutfs_err(sb, "server startup failed with %d", + ret); + goto out; + } + } + + /* leaders regularly send heartbeats to delay elections */ + if (qst.role == LEADER && + ktime_after(ktime_get(), qst.timeout)) { + send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT, + qst.term); + qst.timeout = heartbeat_interval(); + scoutfs_inc_counter(sb, quorum_send_heartbeat); + } + + /* receiving heartbeats extends timeout, delaying elections */ + if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) { + qst.timeout = heartbeat_timeout(); + scoutfs_inc_counter(sb, quorum_recv_heartbeat); + } + + /* receiving a resignation from server starts election */ + if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION && + qst.role == FOLLOWER && + msg.term == qst.term) { + qst.timeout = election_timeout(); + scoutfs_inc_counter(sb, quorum_recv_resignation); + } + + /* followers vote once per term */ + if (qst.role == FOLLOWER && + msg.type == SCOUTFS_QUORUM_MSG_REQUEST_VOTE && + qst.vote_for == -1) { + qst.vote_for = msg.from; + send_msg_to(sb, SCOUTFS_QUORUM_MSG_VOTE, qst.term, + msg.from); + scoutfs_inc_counter(sb, quorum_send_vote); + } + } + + /* always try to stop a running server as we stop */ + if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) { + scoutfs_server_stop(sb); + send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, + qst.term); + } + + /* always try to clear leader block as we stop to avoid fencing */ + if (qst.role == LEADER) { + ret = update_quorum_block(sb, blkno, &mark, + FOLLOWER, qst.term); + if (ret < 0) + goto out; + } +out: + if (ret < 0) { + scoutfs_err(sb, "quorum service saw error %d, shutting down. Cluster will be degraded until this slot is remounted to restart the quorum service", + ret); + } +} + +/* + * Set a flag for the quorum work's next iteration to indicate that the + * server has shutdown and that it should step down as leader, update + * quorum blocks, and stop sending heartbeats. + */ +void scoutfs_quorum_server_shutdown(struct super_block *sb) { DECLARE_QUORUM_INFO(sb, qinf); - qinf->is_leader = false; + set_bit(QINF_FLAG_SERVER, &qinf->flags); } +/* + * Clients read quorum blocks looking for the leader with a server whose + * address it can try and connect to. + * + * There can be multiple running servers if a client checks before a + * server has had a chance to fence any old servers. We try to use the + * block with the most recent timestamp. If we get it wrong the + * connection will timeout and the client will try again, presumably + * finding a single server block. + */ +int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct scoutfs_quorum_block blk; + struct timespec64 recent = {0,}; + struct timespec64 ts; + int ret; + int i; + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, + NULL); + if (ret < 0) { + scoutfs_err(sb, "error reading quorum block nr %u: %d", + i, ret); + goto out; + } + + ts.tv_sec = le64_to_cpu(blk.set_leader.ts.sec); + ts.tv_nsec = le32_to_cpu(blk.set_leader.ts.nsec); + + if ((le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER) && + (timespec64_to_ns(&ts) > timespec64_to_ns(&recent))) { + recent = ts; + scoutfs_quorum_slot_sin(super, i, sin); + continue; + } + } + + if (timespec64_to_ns(&recent) == 0) + ret = -ENOENT; + +out: + return ret; +} + +/* + * The number of votes needed for a member to reach quorum and be + * elected the leader: a majority of the number of present slots in the + * super block. + */ +u8 scoutfs_quorum_votes_needed(struct super_block *sb) +{ + DECLARE_QUORUM_INFO(sb, qinf); + + return qinf->votes_needed; +} + +void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i, + struct sockaddr_in *sin) +{ + BUG_ON(i < 0 || i >= SCOUTFS_QUORUM_MAX_SLOTS); + + scoutfs_addr_to_sin(sin, &super->qconf.slots[i].addr); +} + +static char *role_str(int role) +{ + static char *roles[] = { + [FOLLOWER] = "follower", + [CANDIDATE] = "candidate", + [LEADER] = "leader", + }; + + if (role < 0 || role > ARRAY_SIZE(roles) || !roles[role]) + return "invalid"; + + return roles[role]; +} + +#define snprintf_ret(buf, size, retp, fmt...) \ +do { \ + __typeof__(buf) _buf = buf; \ + __typeof__(size) _size = size; \ + __typeof__(retp) _retp = retp; \ + __typeof__(*retp) _ret = *_retp; \ + __typeof__(*retp) _len; \ + \ + if (_ret >= 0 && _ret < _size) { \ + _len = snprintf(_buf + _ret, _size - _ret, ##fmt); \ + if (_len < 0) \ + _ret = _len; \ + else \ + _ret += _len; \ + *_retp = _ret; \ + } \ +} while (0) + +static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + DECLARE_QUORUM_INFO_KOBJ(kobj, qinf); + struct mount_options *opts = &SCOUTFS_SB(qinf->sb)->opts; + struct quorum_status qst; + struct last_msg last; + struct timespec64 ts; + size_t size; + int ret; + int i; + + spin_lock(&qinf->show_lock); + qst = qinf->show_status; + spin_unlock(&qinf->show_lock); + + size = PAGE_SIZE; + ret = 0; + + snprintf_ret(buf, size, &ret, "quorum_slot_nr %u\n", + opts->quorum_slot_nr); + snprintf_ret(buf, size, &ret, "term %llu\n", + qst.term); + snprintf_ret(buf, size, &ret, "role %d (%s)\n", + qst.role, role_str(qst.role)); + snprintf_ret(buf, size, &ret, "vote_for %d\n", + qst.vote_for); + snprintf_ret(buf, size, &ret, "vote_bits 0x%lx (count %lu)\n", + qst.vote_bits, hweight_long(qst.vote_bits)); + ts = ktime_to_timespec64(qst.timeout); + snprintf_ret(buf, size, &ret, "timeout %llu.%u\n", + (u64)ts.tv_sec, (int)ts.tv_nsec); + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + spin_lock(&qinf->show_lock); + last = qinf->last_send[i]; + spin_unlock(&qinf->show_lock); + + if (last.msg.term == 0) + continue; + + snprintf_ret(buf, size, &ret, + "last_send to %u term %llu type %u ts %llu.%u\n", + i, last.msg.term, last.msg.type, + (u64)last.ts.tv_sec, (int)last.ts.tv_nsec); + } + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + spin_lock(&qinf->show_lock); + last = qinf->last_recv[i]; + spin_unlock(&qinf->show_lock); + + if (last.msg.term == 0) + continue; + snprintf_ret(buf, size, &ret, + "last_recv from %u term %llu type %u ts %llu.%u\n", + i, last.msg.term, last.msg.type, + (u64)last.ts.tv_sec, (int)last.ts.tv_nsec); + } + + return ret; +} +SCOUTFS_ATTR_RO(status); + static ssize_t is_leader_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { DECLARE_QUORUM_INFO_KOBJ(kobj, qinf); - return snprintf(buf, PAGE_SIZE, "%u", !!qinf->is_leader); + return snprintf(buf, PAGE_SIZE, "%u", + !!(qinf->show_status.role == LEADER)); } SCOUTFS_ATTR_RO(is_leader); static struct attribute *quorum_attrs[] = { + SCOUTFS_ATTR_PTR(status), SCOUTFS_ATTR_PTR(is_leader), NULL, }; +static inline bool valid_ipv4_unicast(__be32 addr) +{ + return !(ipv4_is_multicast(addr) && ipv4_is_lbcast(addr) && + ipv4_is_zeronet(addr) && ipv4_is_local_multicast(addr)); +} + +static inline bool valid_ipv4_port(__be16 port) +{ + return port != 0 && be16_to_cpu(port) != U16_MAX; +} + +static int verify_quorum_slots(struct super_block *sb) +{ + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + DECLARE_QUORUM_INFO(sb, qinf); + struct sockaddr_in other; + struct sockaddr_in sin; + int found = 0; + int i; + int j; + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + if (!quorum_slot_present(super, i)) + continue; + + scoutfs_quorum_slot_sin(super, i, &sin); + + if (!valid_ipv4_unicast(sin.sin_addr.s_addr)) { + scoutfs_err(sb, "quorum slot #%d has invalid ipv4 unicast address: "SIN_FMT, + i, SIN_ARG(&sin)); + return -EINVAL; + } + + if (!valid_ipv4_port(sin.sin_port)) { + scoutfs_err(sb, "quorum slot #%d has invalid ipv4 port number:"SIN_FMT, + i, SIN_ARG(&sin)); + return -EINVAL; + } + + for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) { + scoutfs_quorum_slot_sin(super, j, &other); + + if (sin.sin_addr.s_addr == other.sin_addr.s_addr && + sin.sin_port == other.sin_port) { + scoutfs_err(sb, "quorum slots #%u and #%u have the same address: "SIN_FMT, + i, j, SIN_ARG(&sin)); + return -EINVAL; + } + } + + found++; + } + + if (found == 0) { + scoutfs_err(sb, "no populated quorum slots in superblock"); + return -EINVAL; + } + + /* + * Always require a majority except in the pathological cases of + * 1 or 2 members. + */ + if (found < 3) + qinf->votes_needed = 1; + else + qinf->votes_needed = (found / 2) + 1; + + return 0; +} + +/* + * Once this schedules the quorum worker it can be elected leader and + * start the server, possibly before this returns. + */ int scoutfs_quorum_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct mount_options *opts = &sbi->opts; struct quorum_info *qinf; int ret; + if (opts->quorum_slot_nr < 0) + return 0; + qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL); if (!qinf) { ret = -ENOMEM; goto out; } + + spin_lock_init(&qinf->show_lock); + INIT_WORK(&qinf->work, scoutfs_quorum_worker); scoutfs_sysfs_init_attrs(sb, &qinf->ssa); sbi->quorum_info = qinf; + qinf->sb = sb; + + ret = verify_quorum_slots(sb); + if (ret < 0) + goto out; + + /* create in setup so errors cause mount to fail */ + ret = create_socket(sb); + if (ret < 0) + goto out; ret = scoutfs_sysfs_create_attrs(sb, &qinf->ssa, quorum_attrs, "quorum"); + if (ret < 0) + goto out; + + schedule_work(&qinf->work); + out: if (ret) scoutfs_quorum_destroy(sb); @@ -766,13 +1056,30 @@ out: return 0; } +/* + * Shutdown the quorum worker and destroy all our resources. + * + * This is called after client destruction which only completes once + * farewell requests are resolved. That only happens for a quorum member + * once it isn't needed for quorum. + * + * The work is the only place that starts the server, and it stops the + * server as it exits, so we can wait for it to finish and know that no + * server can be running to call back into us as it shuts down. + */ void scoutfs_quorum_destroy(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct quorum_info *qinf = SCOUTFS_SB(sb)->quorum_info; if (qinf) { + qinf->shutdown = true; + flush_work(&qinf->work); + scoutfs_sysfs_destroy_attrs(sb, &qinf->ssa); + if (qinf->sock) + sock_release(qinf->sock); + kfree(qinf); sbi->quorum_info = NULL; } diff --git a/kmod/src/quorum.h b/kmod/src/quorum.h index 96eac0e4..f0994871 100644 --- a/kmod/src/quorum.h +++ b/kmod/src/quorum.h @@ -1,10 +1,15 @@ #ifndef _SCOUTFS_QUORUM_H_ #define _SCOUTFS_QUORUM_H_ -int scoutfs_quorum_election(struct super_block *sb, ktime_t timeout_abs, - u64 prev_term, u64 *elected_term); -void scoutfs_quorum_clear_leader(struct super_block *sb); +int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin); +void scoutfs_quorum_server_shutdown(struct super_block *sb); + +u8 scoutfs_quorum_votes_needed(struct super_block *sb); +void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i, + struct sockaddr_in *sin); int scoutfs_quorum_setup(struct super_block *sb); +void scoutfs_quorum_shutdown(struct super_block *sb); void scoutfs_quorum_destroy(struct super_block *sb); + #endif diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 5b02781e..d440a228 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -1797,118 +1797,69 @@ TRACE_EVENT(scoutfs_lock_message, __entry->old_mode, __entry->new_mode) ); +DECLARE_EVENT_CLASS(scoutfs_quorum_message_class, + TP_PROTO(struct super_block *sb, u64 term, u8 type, int nr), -TRACE_EVENT(scoutfs_quorum_election, - TP_PROTO(struct super_block *sb, u64 prev_term), - - TP_ARGS(sb, prev_term), + TP_ARGS(sb, term, type, nr), TP_STRUCT__entry( SCSB_TRACE_FIELDS - __field(__u64, prev_term) - ), - - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - __entry->prev_term = prev_term; - ), - - TP_printk(SCSBF" prev_term %llu", - SCSB_TRACE_ARGS, __entry->prev_term) -); - -TRACE_EVENT(scoutfs_quorum_election_ret, - TP_PROTO(struct super_block *sb, int ret, u64 elected_term), - - TP_ARGS(sb, ret, elected_term), - - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - __field(int, ret) - __field(__u64, elected_term) - ), - - TP_fast_assign( - SCSB_TRACE_ASSIGN(sb); - __entry->ret = ret; - __entry->elected_term = elected_term; - ), - - TP_printk(SCSBF" ret %d elected_term %llu", - SCSB_TRACE_ARGS, __entry->ret, __entry->elected_term) -); - -TRACE_EVENT(scoutfs_quorum_election_vote, - TP_PROTO(struct super_block *sb, int role, u64 term, u64 vote_for_rid, - int votes, int log_cycles, int quorum_count), - - TP_ARGS(sb, role, term, vote_for_rid, votes, log_cycles, quorum_count), - - TP_STRUCT__entry( - SCSB_TRACE_FIELDS - __field(int, role) __field(__u64, term) - __field(__u64, vote_for_rid) - __field(int, votes) - __field(int, log_cycles) - __field(int, quorum_count) + __field(__u8, type) + __field(int, nr) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); - __entry->role = role; __entry->term = term; - __entry->vote_for_rid = vote_for_rid; - __entry->votes = votes; - __entry->log_cycles = log_cycles; - __entry->quorum_count = quorum_count; + __entry->type = type; + __entry->nr = nr; ), - TP_printk(SCSBF" role %d term %llu vote_for_rid %016llx votes %d log_cycles %d quorum_count %d", - SCSB_TRACE_ARGS, __entry->role, __entry->term, - __entry->vote_for_rid, __entry->votes, __entry->log_cycles, - __entry->quorum_count) + TP_printk(SCSBF" term %llu type %u nr %d", + SCSB_TRACE_ARGS, __entry->term, __entry->type, __entry->nr) +); +DEFINE_EVENT(scoutfs_quorum_message_class, scoutfs_quorum_send_message, + TP_PROTO(struct super_block *sb, u64 term, u8 type, int nr), + TP_ARGS(sb, term, type, nr) +); +DEFINE_EVENT(scoutfs_quorum_message_class, scoutfs_quorum_recv_message, + TP_PROTO(struct super_block *sb, u64 term, u8 type, int nr), + TP_ARGS(sb, term, type, nr) ); -DECLARE_EVENT_CLASS(scoutfs_quorum_block_class, - TP_PROTO(struct super_block *sb, struct scoutfs_quorum_block *blk), +TRACE_EVENT(scoutfs_quorum_loop, + TP_PROTO(struct super_block *sb, int role, u64 term, int vote_for, + unsigned long vote_bits, struct timespec64 timeout), - TP_ARGS(sb, blk), + TP_ARGS(sb, role, term, vote_for, vote_bits, timeout), TP_STRUCT__entry( SCSB_TRACE_FIELDS - __field(__u64, blkno) __field(__u64, term) - __field(__u64, write_nr) - __field(__u64, voter_rid) - __field(__u64, vote_for_rid) - __field(__u32, crc) - __field(__u8, log_nr) + __field(int, role) + __field(int, vote_for) + __field(unsigned long, vote_bits) + __field(unsigned long, vote_count) + __field(unsigned long long, timeout_sec) + __field(int, timeout_nsec) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); - __entry->blkno = le64_to_cpu(blk->blkno); - __entry->term = le64_to_cpu(blk->term); - __entry->write_nr = le64_to_cpu(blk->write_nr); - __entry->voter_rid = le64_to_cpu(blk->voter_rid); - __entry->vote_for_rid = le64_to_cpu(blk->vote_for_rid); - __entry->crc = le32_to_cpu(blk->crc); - __entry->log_nr = blk->log_nr; + __entry->term = term; + __entry->role = role; + __entry->vote_for = vote_for; + __entry->vote_bits = vote_bits; + __entry->vote_count = hweight_long(vote_bits); + __entry->timeout_sec = timeout.tv_sec; + __entry->timeout_nsec = timeout.tv_nsec; ), - TP_printk(SCSBF" blkno %llu term %llu write_nr %llu voter_rid %016llx vote_for_rid %016llx crc 0x%08x log_nr %u", - SCSB_TRACE_ARGS, __entry->blkno, __entry->term, - __entry->write_nr, __entry->voter_rid, __entry->vote_for_rid, - __entry->crc, __entry->log_nr) -); -DEFINE_EVENT(scoutfs_quorum_block_class, scoutfs_quorum_read_block, - TP_PROTO(struct super_block *sb, struct scoutfs_quorum_block *blk), - TP_ARGS(sb, blk) -); -DEFINE_EVENT(scoutfs_quorum_block_class, scoutfs_quorum_write_block, - TP_PROTO(struct super_block *sb, struct scoutfs_quorum_block *blk), - TP_ARGS(sb, blk) + TP_printk(SCSBF" term %llu role %d vote_for %d vote_bits 0x%lx vote_count %lu timeout %llu.%u", + SCSB_TRACE_ARGS, __entry->term, __entry->role, + __entry->vote_for, __entry->vote_bits, __entry->vote_count, + __entry->timeout_sec, __entry->timeout_nsec) ); /* diff --git a/kmod/src/server.c b/kmod/src/server.c index d8118101..df5c0a67 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -59,7 +59,6 @@ struct server_info { int err; bool shutting_down; struct completion start_comp; - struct sockaddr_in listen_sin; u64 term; struct scoutfs_net_connection *conn; @@ -1362,7 +1361,7 @@ static void farewell_worker(struct work_struct *work) /* send as many responses as we can to maintain quorum */ while ((fw = list_first_entry_or_null(&reqs, struct farewell_request, entry)) && - (nr_mounted > super->quorum_count || + (nr_mounted > scoutfs_quorum_votes_needed(sb) || nr_unmounting >= nr_mounted)) { list_move_tail(&fw->entry, &send); @@ -1544,18 +1543,17 @@ static void scoutfs_server_worker(struct work_struct *work) struct super_block *sb = server->sb; struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; + struct mount_options *opts = &sbi->opts; struct scoutfs_net_connection *conn = NULL; DECLARE_WAIT_QUEUE_HEAD(waitq); struct sockaddr_in sin; LIST_HEAD(conn_list); u64 max_vers; int ret; - int err; trace_scoutfs_server_work_enter(sb, 0, 0); - sin = server->listen_sin; - + scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin); scoutfs_info(sb, "server setting up at "SIN_FMT, SIN_ARG(&sin)); conn = scoutfs_net_alloc_conn(sb, server_notify_up, server_notify_down, @@ -1575,9 +1573,6 @@ static void scoutfs_server_worker(struct work_struct *work) goto out; } - if (ret) - goto out; - /* start up the server subsystems before accepting */ ret = scoutfs_read_super(sb, super); if (ret < 0) @@ -1617,19 +1612,6 @@ static void scoutfs_server_worker(struct work_struct *work) if (ret) goto shutdown; - /* - * Write our address in the super before it's possible for net - * processing to start writing the super as part of - * transactions. In theory clients could be trying to connect - * to our address without having seen it in the super (maybe - * they saw it a long time ago). - */ - scoutfs_addr_from_sin(&super->server_addr, &sin); - super->quorum_server_term = cpu_to_le64(server->term); - ret = scoutfs_write_super(sb, super); - if (ret < 0) - goto shutdown; - /* start accepting connections and processing work */ server->conn = conn; scoutfs_net_listen(sb, conn); @@ -1656,30 +1638,14 @@ shutdown: scoutfs_lock_server_destroy(sb); out: - scoutfs_quorum_clear_leader(sb); scoutfs_net_free_conn(sb, conn); + /* let quorum know that we've shutdown */ + scoutfs_quorum_server_shutdown(sb); + scoutfs_info(sb, "server stopped at "SIN_FMT, SIN_ARG(&sin)); trace_scoutfs_server_work_exit(sb, 0, ret); - /* - * Always try to clear our presence in the super so that we're - * not fenced. We do this last because other mounts will try to - * reach quorum the moment they see zero here. The later we do - * this the longer we have to finish shutdown while clients - * timeout. - */ - err = scoutfs_read_super(sb, super); - if (err == 0) { - super->quorum_fenced_term = cpu_to_le64(server->term); - memset(&super->server_addr, 0, sizeof(super->server_addr)); - err = scoutfs_write_super(sb, super); - } - if (err < 0) { - scoutfs_err(sb, "failed to clear election term %llu at "SIN_FMT", this mount could be fenced", - server->term, SIN_ARG(&sin)); - } - server->err = ret; complete(&server->start_comp); } @@ -1689,14 +1655,12 @@ out: * the super block's fence_term has been set to the new server's term so * that it won't be fenced. */ -int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin, - u64 term) +int scoutfs_server_start(struct super_block *sb, u64 term) { DECLARE_SERVER_INFO(sb, server); server->err = 0; server->shutting_down = false; - server->listen_sin = *sin; server->term = term; init_completion(&server->start_comp); diff --git a/kmod/src/server.h b/kmod/src/server.h index 274a66ea..f528789e 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -69,8 +69,7 @@ int scoutfs_server_apply_commit(struct super_block *sb, int err); struct sockaddr_in; struct scoutfs_quorum_elected_info; -int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin, - u64 term); +int scoutfs_server_start(struct super_block *sb, u64 term); void scoutfs_server_abort(struct super_block *sb); void scoutfs_server_stop(struct super_block *sb); diff --git a/kmod/src/super.c b/kmod/src/super.c index d239cd06..1a795a69 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -176,7 +176,8 @@ static int scoutfs_show_options(struct seq_file *seq, struct dentry *root) struct super_block *sb = root->d_sb; struct mount_options *opts = &SCOUTFS_SB(sb)->opts; - seq_printf(seq, ",server_addr="SIN_FMT, SIN_ARG(&opts->server_addr)); + if (opts->quorum_slot_nr >= 0) + seq_printf(seq, ",quorum_slot_nr=%d", opts->quorum_slot_nr); seq_printf(seq, ",metadev_path=%s", opts->metadev_path); return 0; @@ -192,20 +193,19 @@ static ssize_t metadev_path_show(struct kobject *kobj, } SCOUTFS_ATTR_RO(metadev_path); -static ssize_t server_addr_show(struct kobject *kobj, +static ssize_t quorum_server_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); struct mount_options *opts = &SCOUTFS_SB(sb)->opts; - return snprintf(buf, PAGE_SIZE, SIN_FMT"\n", - SIN_ARG(&opts->server_addr)); + return snprintf(buf, PAGE_SIZE, "%d\n", opts->quorum_slot_nr); } -SCOUTFS_ATTR_RO(server_addr); +SCOUTFS_ATTR_RO(quorum_server_nr); static struct attribute *mount_options_attrs[] = { SCOUTFS_ATTR_PTR(metadev_path), - SCOUTFS_ATTR_PTR(server_addr), + SCOUTFS_ATTR_PTR(quorum_server_nr), NULL, }; @@ -257,15 +257,12 @@ static void scoutfs_put_super(struct super_block *sb) scoutfs_item_destroy(sb); scoutfs_forest_destroy(sb); - /* the server locks the listen address and compacts */ + scoutfs_quorum_destroy(sb); scoutfs_lock_shutdown(sb); scoutfs_server_destroy(sb); scoutfs_net_destroy(sb); scoutfs_lock_destroy(sb); - /* server clears quorum leader flag during shutdown */ - scoutfs_quorum_destroy(sb); - scoutfs_block_destroy(sb); scoutfs_destroy_triggers(sb); scoutfs_options_destroy(sb); @@ -390,17 +387,8 @@ static int scoutfs_read_super_from_bdev(struct super_block *sb, /* XXX do we want more rigorous invalid super checking? */ - if (super->quorum_count == 0 || - super->quorum_count > SCOUTFS_QUORUM_MAX_COUNT) { - scoutfs_err(sb, "super block has invalid quorum count %u, must be > 0 and <= %u", - super->quorum_count, SCOUTFS_QUORUM_MAX_COUNT); - ret = -EINVAL; - goto out; - } - if (invalid_blkno_limits(sb, "meta", - (SCOUTFS_QUORUM_BLKNO + SCOUTFS_QUORUM_BLOCKS) - << SCOUTFS_BLOCK_SM_LG_SHIFT, + SCOUTFS_META_DEV_START_BLKNO, super->first_meta_blkno, super->last_meta_blkno, sbi->meta_bdev, SCOUTFS_BLOCK_LG_SHIFT) || @@ -605,8 +593,8 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) scoutfs_setup_trans(sb) ?: scoutfs_lock_setup(sb) ?: scoutfs_net_setup(sb) ?: - scoutfs_quorum_setup(sb) ?: scoutfs_server_setup(sb) ?: + scoutfs_quorum_setup(sb) ?: scoutfs_client_setup(sb) ?: scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid, &sbi->rid_lock) ?: