diff --git a/kmod/src/counters.h b/kmod/src/counters.h index bb53e7a4..a7eccb60 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -157,6 +157,7 @@ EXPAND_COUNTER(orphan_scan_error) \ EXPAND_COUNTER(orphan_scan_item) \ EXPAND_COUNTER(orphan_scan_omap_set) \ + EXPAND_COUNTER(quorum_candidate_server_stopping) \ EXPAND_COUNTER(quorum_elected) \ EXPAND_COUNTER(quorum_fence_error) \ EXPAND_COUNTER(quorum_fence_leader) \ diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c index 9623f866..56d20338 100644 --- a/kmod/src/lock_server.c +++ b/kmod/src/lock_server.c @@ -749,7 +749,7 @@ out: if (ret < 0) { scoutfs_err(sb, "lock server err %d during client rid %016llx farewell, shutting down", ret, rid); - scoutfs_server_abort(sb); + scoutfs_server_stop(sb); } return ret; diff --git a/kmod/src/net.c b/kmod/src/net.c index 7d8316b5..7a104c4f 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -1292,7 +1292,7 @@ restart: if (ret) { scoutfs_err(sb, "client fence returned err %d, shutting down server", ret); - scoutfs_server_abort(sb); + scoutfs_server_stop(sb); } } destroy_conn(acc); diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c index bf87a98e..34daa105 100644 --- a/kmod/src/quorum.c +++ b/kmod/src/quorum.c @@ -105,6 +105,8 @@ enum quorum_role { FOLLOWER, CANDIDATE, LEADER }; struct quorum_status { enum quorum_role role; u64 term; + u64 server_start_term; + int server_event; int vote_for; unsigned long vote_bits; ktime_t timeout; @@ -117,7 +119,6 @@ struct quorum_info { bool shutdown; int our_quorum_slot_nr; - unsigned long flags; int votes_needed; spinlock_t show_lock; @@ -128,8 +129,6 @@ struct quorum_info { struct scoutfs_sysfs_attrs ssa; }; -#define QINF_FLAG_SERVER 0 - #define DECLARE_QUORUM_INFO(sb, name) \ struct quorum_info *name = SCOUTFS_SB(sb)->quorum_info #define DECLARE_QUORUM_INFO_KOBJ(kobj, name) \ @@ -494,16 +493,6 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool return ret; } -/* - * The calling server has 
fenced previous leaders and reclaimed their - * resources. We can now update our fence event with a greater term to - * stop future leaders from doing the same. - */ -int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term) -{ - return update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, term, true); -} - /* * The calling server has been elected and has started running but can't * yet assume that it has exclusive access to the metadata device. We @@ -593,15 +582,9 @@ int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term) } out: - if (fence_started) { - err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS)); - if (ret == 0) - ret = err; - } else { - err = scoutfs_quorum_fence_complete(sb, term); - if (ret == 0) - ret = err; - } + err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS)); + if (ret == 0) + ret = err; if (ret < 0) scoutfs_inc_counter(sb, quorum_fence_error); @@ -627,9 +610,8 @@ static void update_show_status(struct quorum_info *qinf, struct quorum_status *q /* * The quorum work always runs in the background of quorum member * mounts. It's responsible for starting and stopping the server if - * it's elected leader, and the server can call back into it to let it - * know that it has shut itself down (perhaps due to error) so that the - * work should stop sending heartbeats. + * it's elected leader. While it's leader it sends heartbeats to + * suppress other quorum work from standing for election. 
*/ static void scoutfs_quorum_worker(struct work_struct *work) { @@ -637,7 +619,7 @@ static void scoutfs_quorum_worker(struct work_struct *work) struct super_block *sb = qinf->sb; struct sockaddr_in unused; struct quorum_host_msg msg; - struct quorum_status qst; + struct quorum_status qst = {0,}; int ret; int err; @@ -646,9 +628,7 @@ static void scoutfs_quorum_worker(struct work_struct *work) /* start out as a follower */ qst.role = FOLLOWER; - qst.term = 0; qst.vote_for = -1; - qst.vote_bits = 0; /* read our starting term from greatest in all events in all slots */ read_greatest_term(sb, &qst.term); @@ -684,20 +664,6 @@ static void scoutfs_quorum_worker(struct work_struct *work) msg.term < qst.term) msg.type = SCOUTFS_QUORUM_MSG_INVALID; - /* if the server has shutdown we become follower */ - if (!test_bit(QINF_FLAG_SERVER, &qinf->flags) && - qst.role == LEADER) { - qst.role = FOLLOWER; - qst.vote_for = -1; - qst.vote_bits = 0; - qst.timeout = election_timeout(); - scoutfs_inc_counter(sb, quorum_server_shutdown); - - send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, - qst.term); - scoutfs_inc_counter(sb, quorum_send_resignation); - } - trace_scoutfs_quorum_loop(sb, qst.role, qst.term, qst.vote_for, qst.vote_bits, ktime_to_timespec64(qst.timeout)); @@ -708,8 +674,6 @@ static void scoutfs_quorum_worker(struct work_struct *work) if (qst.role == LEADER) { scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.", msg.type, msg.from, msg.term, qst.term); - update_show_status(qinf, &qst); - scoutfs_server_stop(sb); } qst.role = FOLLOWER; qst.term = msg.term; @@ -731,6 +695,13 @@ static void scoutfs_quorum_worker(struct work_struct *work) /* followers and candidates start new election on timeout */ if (qst.role != LEADER && ktime_after(ktime_get(), qst.timeout)) { + /* .. 
but only if their server has stopped */ + if (!scoutfs_server_is_down(sb)) { + qst.timeout = election_timeout(); + scoutfs_inc_counter(sb, quorum_candidate_server_stopping); + continue; + } + qst.role = CANDIDATE; qst.term++; qst.vote_for = -1; @@ -779,24 +750,62 @@ static void scoutfs_quorum_worker(struct work_struct *work) if (ret < 0) goto out; - /* make very sure server is fully shut down */ - scoutfs_server_stop(sb); - /* set server bit before server shutdown could clear */ - set_bit(QINF_FLAG_SERVER, &qinf->flags); + qst.server_start_term = qst.term; + qst.server_event = SCOUTFS_QUORUM_EVENT_ELECT; + scoutfs_server_start(sb, qst.term); + } - ret = scoutfs_server_start(sb, qst.term); - if (ret < 0) { - clear_bit(QINF_FLAG_SERVER, &qinf->flags); - /* store our increased term */ - err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term, - true); - if (err < 0) { - ret = err; - goto out; - } - ret = 0; - continue; + /* + * This leader's server is up, having finished fencing + * previous leaders. We update the fence event with the + * current term to let future leaders know that previous + * servers have been fenced. + */ + if (qst.role == LEADER && qst.server_event != SCOUTFS_QUORUM_EVENT_FENCE && + scoutfs_server_is_up(sb)) { + ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, qst.term, true); + if (ret < 0) + goto out; + qst.server_event = SCOUTFS_QUORUM_EVENT_FENCE; + } + + /* + * Stop a running server if we're no longer leader in + * its term. + */ + if (!(qst.role == LEADER && qst.term == qst.server_start_term) && + scoutfs_server_is_running(sb)) { + scoutfs_server_stop(sb); + } + + /* + * A previously running server has stopped. The quorum + * protocol might have shut it down by changing roles or + * it might have stopped on its own, perhaps on errors. + * If we're still a leader then we become a follower and + * send resignations to encourage the next election. + * Always update the _STOP event to stop connections and + * fencing. 
+ */ + if (qst.server_start_term > 0 && scoutfs_server_is_down(sb)) { + if (qst.role == LEADER) { + qst.role = FOLLOWER; + qst.vote_for = -1; + qst.vote_bits = 0; + qst.timeout = election_timeout(); + scoutfs_inc_counter(sb, quorum_server_shutdown); + + send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, + qst.server_start_term); + scoutfs_inc_counter(sb, quorum_send_resignation); } + + ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, + qst.server_start_term, true); + if (ret < 0) + goto out; + + qst.server_start_term = 0; } /* leaders regularly send heartbeats to delay elections */ @@ -836,11 +845,16 @@ static void scoutfs_quorum_worker(struct work_struct *work) update_show_status(qinf, &qst); /* always try to stop a running server as we stop */ - if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) { - scoutfs_server_stop(sb); - scoutfs_fence_stop(sb); - send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, - qst.term); + if (scoutfs_server_is_running(sb)) { + scoutfs_server_stop_wait(sb); + send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, qst.term); + + if (qst.server_start_term > 0) { + err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, + qst.server_start_term, true); + if (err < 0 && ret == 0) + ret = err; + } } /* record that this slot no longer has an active quorum */ @@ -852,21 +866,6 @@ out: } } -/* - * The calling server has shutdown and is no longer using shared - * resources. Clear the bit so that we stop sending heartbeats and - * allow the next server to be elected. Update the stop event so that - * it won't be considered available by clients or fenced by the next - * leader. - */ -void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term) -{ - DECLARE_QUORUM_INFO(sb, qinf); - - clear_bit(QINF_FLAG_SERVER, &qinf->flags); - update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, term, true); -} - /* * Clients read quorum blocks looking for the leader with a server whose * address it can try and connect to. 
@@ -988,6 +987,8 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr, qinf->our_quorum_slot_nr); snprintf_ret(buf, size, &ret, "term %llu\n", qst.term); + snprintf_ret(buf, size, &ret, "server_start_term %llu\n", qst.server_start_term); + snprintf_ret(buf, size, &ret, "server_event %d\n", qst.server_event); snprintf_ret(buf, size, &ret, "role %d (%s)\n", qst.role, role_str(qst.role)); snprintf_ret(buf, size, &ret, "vote_for %d\n", diff --git a/kmod/src/quorum.h b/kmod/src/quorum.h index 1c2b6315..11959ab2 100644 --- a/kmod/src/quorum.h +++ b/kmod/src/quorum.h @@ -2,14 +2,12 @@ #define _SCOUTFS_QUORUM_H_ int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin); -void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term); u8 scoutfs_quorum_votes_needed(struct super_block *sb); void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i, struct sockaddr_in *sin); int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term); -int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term); int scoutfs_quorum_setup(struct super_block *sb); void scoutfs_quorum_shutdown(struct super_block *sb); diff --git a/kmod/src/server.c b/kmod/src/server.c index 2fcd316b..12b03a9d 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -59,9 +59,7 @@ struct server_info { struct workqueue_struct *wq; struct work_struct work; - int err; - bool shutting_down; - struct completion start_comp; + int status; u64 term; struct scoutfs_net_connection *conn; @@ -155,6 +153,62 @@ static bool get_volopt_val(struct server_info *server, int nr, u64 *val) return is_set; } +enum { + SERVER_NOP = 0, + SERVER_STARTING, + SERVER_UP, + SERVER_STOPPING, + SERVER_DOWN, +}; + +bool scoutfs_server_is_running(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + long was = cmpxchg(&server->status, SERVER_NOP, SERVER_NOP); + + return was == SERVER_STARTING || was == SERVER_UP; +} + +bool scoutfs_server_is_up(struct 
super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + + return cmpxchg(&server->status, SERVER_NOP, SERVER_NOP) == SERVER_UP; +} + +bool scoutfs_server_is_down(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + + return cmpxchg(&server->status, SERVER_NOP, SERVER_NOP) == SERVER_DOWN; +} + +static bool server_is_stopping(struct server_info *server) +{ + return cmpxchg(&server->status, SERVER_NOP, SERVER_NOP) == SERVER_STOPPING; +} + +static void stop_server(struct server_info *server) +{ + long was = cmpxchg(&server->status, SERVER_NOP, SERVER_NOP); + + if ((was == SERVER_STARTING || was == SERVER_UP) && + cmpxchg(&server->status, was, SERVER_STOPPING) == was) + wake_up(&server->waitq); +} + +static void server_up(struct server_info *server) +{ + cmpxchg(&server->status, SERVER_STARTING, SERVER_UP); +} + +static void server_down(struct server_info *server) +{ + long was = cmpxchg(&server->status, SERVER_NOP, SERVER_NOP); + + if (was != SERVER_DOWN) + cmpxchg(&server->status, was, SERVER_DOWN); +} struct commit_waiter { struct completion comp; @@ -162,24 +216,6 @@ struct commit_waiter { int ret; }; -static bool test_shutting_down(struct server_info *server) -{ - smp_rmb(); - return server->shutting_down; -} - -static void set_shutting_down(struct server_info *server, bool val) -{ - server->shutting_down = val; - smp_wmb(); -} - -static void stop_server(struct server_info *server) -{ - set_shutting_down(server, true); - wake_up(&server->waitq); -} - /* * Hold the shared rwsem that lets multiple holders modify blocks in the * current commit and prevents the commit worker from acquiring the @@ -2051,8 +2087,8 @@ static void server_log_merge_free_work(struct work_struct *work) bool commit = false; int ret = 0; - /* shutdown waits for us, we'll eventually load set shutting_down */ - while (!server->shutting_down) { + while (!server_is_stopping(server)) { + scoutfs_server_hold_commit(sb); mutex_lock(&server->logs_mutex); commit = true; @@ -3180,7 +3216,7 @@ 
out: */ static void queue_farewell_work(struct server_info *server) { - if (!test_shutting_down(server)) + if (!server_is_stopping(server)) queue_work(server->wq, &server->farewell_work); } @@ -3693,14 +3729,14 @@ static void fence_pending_recov_worker(struct work_struct *work) } if (ret < 0) - scoutfs_server_abort(sb); + stop_server(server); } static void recovery_timeout(struct super_block *sb) { DECLARE_SERVER_INFO(sb, server); - if (!test_shutting_down(server)) + if (!server_is_stopping(server)) queue_work(server->wq, &server->fence_pending_recov_work); } @@ -3765,7 +3801,7 @@ out: static void queue_reclaim_work(struct server_info *server, unsigned long delay) { - if (!test_shutting_down(server)) + if (!server_is_stopping(server)) queue_delayed_work(server->wq, &server->reclaim_dwork, delay); } @@ -3800,7 +3836,7 @@ static void reclaim_worker(struct work_struct *work) if (error == true) { scoutfs_err(sb, "saw error indicator on fence request for rid %016llx, shutting down server", rid); - scoutfs_server_abort(sb); + stop_server(server); ret = -ESHUTDOWN; goto out; } @@ -3809,7 +3845,7 @@ static void reclaim_worker(struct work_struct *work) if (ret < 0) { scoutfs_err(sb, "failure to reclaim fenced rid %016llx: err %d, shutting down server", rid, ret); - scoutfs_server_abort(sb); + stop_server(server); goto out; } @@ -3817,16 +3853,7 @@ static void reclaim_worker(struct work_struct *work) scoutfs_fence_free(sb, rid); scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_ALL); - /* tell quorum we've finished fencing all previous leaders */ - if (reason == SCOUTFS_FENCE_QUORUM_BLOCK_LEADER && - !scoutfs_fence_reason_pending(sb, reason)) { - ret = scoutfs_quorum_fence_complete(sb, server->term); - if (ret < 0) - goto out; - } - ret = 0; - out: /* queue next reclaim immediately if we're making progress */ if (ret == 0) @@ -3942,12 +3969,12 @@ static void scoutfs_server_worker(struct work_struct *work) scoutfs_net_listen(sb, conn); scoutfs_info(sb, "server ready at 
"SIN_FMT, SIN_ARG(&sin)); - complete(&server->start_comp); + server_up(server); queue_reclaim_work(server, 0); /* interruptible mostly to avoid stuck messages */ - wait_event_interruptible(server->waitq, test_shutting_down(server)); + wait_event_interruptible(server->waitq, server_is_stopping(server)); shutdown: scoutfs_info(sb, "server shutting down at "SIN_FMT, SIN_ARG(&sin)); @@ -3981,60 +4008,44 @@ out: scoutfs_fence_stop(sb); scoutfs_net_free_conn(sb, conn); - /* let quorum know that we've shutdown */ - scoutfs_quorum_server_shutdown(sb, server->term); + server_down(server); scoutfs_info(sb, "server stopped at "SIN_FMT, SIN_ARG(&sin)); trace_scoutfs_server_work_exit(sb, 0, ret); - - server->err = ret; - complete(&server->start_comp); } /* - * Wait for the server to successfully start. If this returns error then - * the super block's fence_term has been set to the new server's term so - * that it won't be fenced. + * Start the server but don't wait for it to complete. */ -int scoutfs_server_start(struct super_block *sb, u64 term) +void scoutfs_server_start(struct super_block *sb, u64 term) { DECLARE_SERVER_INFO(sb, server); - server->err = 0; - set_shutting_down(server, false); - server->term = term; - init_completion(&server->start_comp); - - queue_work(server->wq, &server->work); - - wait_for_completion(&server->start_comp); - return server->err; + if (cmpxchg(&server->status, SERVER_DOWN, SERVER_STARTING) == SERVER_DOWN) { + server->term = term; + queue_work(server->wq, &server->work); + } } /* * Start shutdown on the server but don't want for it to finish. */ -void scoutfs_server_abort(struct super_block *sb) -{ - DECLARE_SERVER_INFO(sb, server); - - stop_server(server); -} - -/* - * Once the server is stopped we give the caller our election info - * which might have been modified while we were running. 
- */ void scoutfs_server_stop(struct super_block *sb) { DECLARE_SERVER_INFO(sb, server); stop_server(server); +} - cancel_work_sync(&server->work); - cancel_work_sync(&server->farewell_work); - cancel_work_sync(&server->commit_work); - cancel_work_sync(&server->log_merge_free_work); +/* + * Start shutdown on the server and wait for it to finish. + */ +void scoutfs_server_stop_wait(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + + stop_server(server); + flush_work(&server->work); } int scoutfs_server_setup(struct super_block *sb) @@ -4050,6 +4061,7 @@ int scoutfs_server_setup(struct super_block *sb) spin_lock_init(&server->lock); init_waitqueue_head(&server->waitq); INIT_WORK(&server->work, scoutfs_server_worker); + server->status = SERVER_DOWN; init_rwsem(&server->commit_rwsem); init_llist_head(&server->commit_waiters); INIT_WORK(&server->commit_work, scoutfs_server_commit_func); diff --git a/kmod/src/server.h b/kmod/src/server.h index d5829abe..ad215045 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -77,9 +77,12 @@ u64 scoutfs_server_seq(struct super_block *sb); u64 scoutfs_server_next_seq(struct super_block *sb); void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq); -int scoutfs_server_start(struct super_block *sb, u64 term); -void scoutfs_server_abort(struct super_block *sb); +void scoutfs_server_start(struct super_block *sb, u64 term); void scoutfs_server_stop(struct super_block *sb); +void scoutfs_server_stop_wait(struct super_block *sb); +bool scoutfs_server_is_running(struct super_block *sb); +bool scoutfs_server_is_up(struct super_block *sb); +bool scoutfs_server_is_down(struct super_block *sb); int scoutfs_server_setup(struct super_block *sb); void scoutfs_server_destroy(struct super_block *sb);