mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-04 03:14:02 +00:00
Don't shut down quorum if server startup fails
The quorum service shuts down if it sees errors that mean that it can't do its job. These are mostly fatal errors gathering resources at startup or runtime IO errors, but it was also shutting down if server startup failed. That's not quite right. A server startup failure should be treated like the server shutting down on errors: quorum needs to stay around to participate in electing the next server. Fence timeouts could trigger this. A quorum mount could crash, the next server without a fence script could have a fence request time out and shut down, and now the third remaining server is left to indefinitely send vote requests into the void. With this fixed, continuing that example, the quorum service in the second mount remains to elect the third server, which has a working fence script, after the second server shuts down when its fence request times out. Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
@@ -556,10 +556,8 @@ out:
|
||||
ret = err;
|
||||
}
|
||||
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "error %d attempting to find and fence previous leaders", ret);
|
||||
if (ret < 0)
|
||||
scoutfs_inc_counter(sb, quorum_fence_error);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -733,13 +731,15 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
ret = scoutfs_server_start(sb, qst.term);
|
||||
if (ret < 0) {
|
||||
clear_bit(QINF_FLAG_SERVER, &qinf->flags);
|
||||
scoutfs_err(sb, "server startup failed with %d", ret);
|
||||
/* store our increased term */
|
||||
err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
|
||||
true);
|
||||
if (err < 0 && ret == 0)
|
||||
if (err < 0) {
|
||||
ret = err;
|
||||
goto out;
|
||||
goto out;
|
||||
}
|
||||
ret = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -789,7 +789,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
|
||||
out:
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "quorum service saw error %d, shutting down. Cluster will be degraded until this slot is remounted to restart the quorum service",
|
||||
scoutfs_err(sb, "quorum service saw error %d, shutting down. This mount is no longer participating in quorum. It should be remounted to restore service.",
|
||||
ret);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3463,13 +3463,15 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
|
||||
trace_scoutfs_server_work_enter(sb, 0, 0);
|
||||
|
||||
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
|
||||
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
|
||||
|
||||
/* first make sure no other servers are still running */
|
||||
ret = scoutfs_quorum_fence_leaders(sb, server->term);
|
||||
if (ret < 0)
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "server error %d attempting to fence previous leaders", ret);
|
||||
goto out;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
|
||||
scoutfs_info(sb, "server setting up at "SIN_FMT, SIN_ARG(&sin));
|
||||
}
|
||||
|
||||
conn = scoutfs_net_alloc_conn(sb, server_notify_up, server_notify_down,
|
||||
sizeof(struct server_client_info),
|
||||
@@ -3490,8 +3492,10 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
|
||||
/* start up the server subsystems before accepting */
|
||||
ret = scoutfs_read_super(sb, super);
|
||||
if (ret < 0)
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "server error %d reading super block", ret);
|
||||
goto shutdown;
|
||||
}
|
||||
|
||||
/* update volume options early, possibly for use during startup */
|
||||
write_seqcount_begin(&server->volopt_seqcount);
|
||||
@@ -3529,10 +3533,17 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
}
|
||||
scoutfs_server_set_seq_if_greater(sb, max_seq);
|
||||
|
||||
ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri) ?:
|
||||
start_recovery(sb);
|
||||
if (ret)
|
||||
ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri);
|
||||
if (ret) {
|
||||
scoutfs_err(sb, "server error %d starting lock server", ret);
|
||||
goto shutdown;
|
||||
}
|
||||
|
||||
ret = start_recovery(sb);
|
||||
if (ret) {
|
||||
scoutfs_err(sb, "server error %d starting client recovery", ret);
|
||||
goto shutdown;
|
||||
}
|
||||
|
||||
/* start accepting connections and processing work */
|
||||
server->conn = conn;
|
||||
|
||||
@@ -40,7 +40,7 @@ t_filter_dmesg()
|
||||
# mount and unmount spew a bunch
|
||||
re="$re|scoutfs.*client connected"
|
||||
re="$re|scoutfs.*client disconnected"
|
||||
re="$re|scoutfs.*server setting up"
|
||||
re="$re|scoutfs.*server starting"
|
||||
re="$re|scoutfs.*server ready"
|
||||
re="$re|scoutfs.*server accepted"
|
||||
re="$re|scoutfs.*server closing"
|
||||
|
||||
Reference in New Issue
Block a user