Don't shut down quorum if server startup fails

The quorum service shuts down if it sees errors that mean that it can't
do its job.

These are mostly fatal errors gathering resources at startup or runtime
IO errors, but it was also shutting down if server startup failed.
That's not quite right.  A failed server startup should be treated like
the server shutting down on errors: quorum needs to stay around to
participate in electing the next server.

Fence timeouts could trigger this.  A quorum mount could crash, the
next server without a fence script could have a fence request time out
and shut down, and now the third remaining server is left to
indefinitely send vote requests into the void.

With this fixed, continuing that example, the quorum service in the
second mount remains to elect the third server with a working fence
script after the second server shuts down after its fence request times
out.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2021-07-12 13:07:18 -07:00
parent 011b7d52e5
commit e4dca8ddcc
3 changed files with 27 additions and 16 deletions

View File

@@ -556,10 +556,8 @@ out:
ret = err;
}
if (ret < 0) {
scoutfs_err(sb, "error %d attempting to find and fence previous leaders", ret);
if (ret < 0)
scoutfs_inc_counter(sb, quorum_fence_error);
}
return ret;
}
@@ -733,13 +731,15 @@ static void scoutfs_quorum_worker(struct work_struct *work)
ret = scoutfs_server_start(sb, qst.term);
if (ret < 0) {
clear_bit(QINF_FLAG_SERVER, &qinf->flags);
scoutfs_err(sb, "server startup failed with %d", ret);
/* store our increased term */
err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
true);
if (err < 0 && ret == 0)
if (err < 0) {
ret = err;
goto out;
goto out;
}
ret = 0;
continue;
}
}
@@ -789,7 +789,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
out:
if (ret < 0) {
scoutfs_err(sb, "quorum service saw error %d, shutting down. Cluster will be degraded until this slot is remounted to restart the quorum service",
scoutfs_err(sb, "quorum service saw error %d, shutting down. This mount is no longer participating in quorum. It should be remounted to restore service.",
ret);
}
}

View File

@@ -3463,13 +3463,15 @@ static void scoutfs_server_worker(struct work_struct *work)
trace_scoutfs_server_work_enter(sb, 0, 0);
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, server->term);
if (ret < 0)
if (ret < 0) {
scoutfs_err(sb, "server error %d attempting to fence previous leaders", ret);
goto out;
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
scoutfs_info(sb, "server setting up at "SIN_FMT, SIN_ARG(&sin));
}
conn = scoutfs_net_alloc_conn(sb, server_notify_up, server_notify_down,
sizeof(struct server_client_info),
@@ -3490,8 +3492,10 @@ static void scoutfs_server_worker(struct work_struct *work)
/* start up the server subsystems before accepting */
ret = scoutfs_read_super(sb, super);
if (ret < 0)
if (ret < 0) {
scoutfs_err(sb, "server error %d reading super block", ret);
goto shutdown;
}
/* update volume options early, possibly for use during startup */
write_seqcount_begin(&server->volopt_seqcount);
@@ -3529,10 +3533,17 @@ static void scoutfs_server_worker(struct work_struct *work)
}
scoutfs_server_set_seq_if_greater(sb, max_seq);
ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri) ?:
start_recovery(sb);
if (ret)
ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri);
if (ret) {
scoutfs_err(sb, "server error %d starting lock server", ret);
goto shutdown;
}
ret = start_recovery(sb);
if (ret) {
scoutfs_err(sb, "server error %d starting client recovery", ret);
goto shutdown;
}
/* start accepting connections and processing work */
server->conn = conn;

View File

@@ -40,7 +40,7 @@ t_filter_dmesg()
# mount and unmount spew a bunch
re="$re|scoutfs.*client connected"
re="$re|scoutfs.*client disconnected"
re="$re|scoutfs.*server setting up"
re="$re|scoutfs.*server starting"
re="$re|scoutfs.*server ready"
re="$re|scoutfs.*server accepted"
re="$re|scoutfs.*server closing"