Merge pull request #121 from versity/zab/heartbeat_fencing_tweaks

Zab/heartbeat fencing tweaks
This commit is contained in:
Zach Brown
2023-05-18 17:10:40 -07:00
committed by GitHub
11 changed files with 376 additions and 35 deletions

View File

@@ -166,6 +166,7 @@
EXPAND_COUNTER(quorum_recv_resignation) \
EXPAND_COUNTER(quorum_recv_vote) \
EXPAND_COUNTER(quorum_send_heartbeat) \
EXPAND_COUNTER(quorum_send_heartbeat_dropped) \
EXPAND_COUNTER(quorum_send_resignation) \
EXPAND_COUNTER(quorum_send_request) \
EXPAND_COUNTER(quorum_send_vote) \

View File

@@ -683,16 +683,19 @@ struct scoutfs_xattr_totl_val {
#define SCOUTFS_QUORUM_ELECT_VAR_MS 100
/*
* Once a leader is elected they send out heartbeats at regular
* intervals to force members to wait the much longer heartbeat timeout.
* Once heartbeat timeout expires without receiving a heartbeat they'll
* switch over the performing elections.
* Once a leader is elected they send heartbeat messages to all quorum
* members at regular intervals to force members to wait the much longer
* heartbeat timeout. Once the heartbeat timeout expires without
* receiving a heartbeat message a member will start an election.
*
* These determine how long it could take members to notice that a
* leader has gone silent and start to elect a new leader.
* leader has gone silent and start to elect a new leader. The
* heartbeat timeout can be changed at run time by options.
*/
#define SCOUTFS_QUORUM_HB_IVAL_MS 100
#define SCOUTFS_QUORUM_HB_TIMEO_MS (5 * MSEC_PER_SEC)
#define SCOUTFS_QUORUM_MIN_HB_TIMEO_MS (2 * MSEC_PER_SEC)
#define SCOUTFS_QUORUM_DEF_HB_TIMEO_MS (10 * MSEC_PER_SEC)
#define SCOUTFS_QUORUM_MAX_HB_TIMEO_MS (60 * MSEC_PER_SEC)
/*
* A newly elected leader will give fencing some time before giving up and

View File

@@ -36,6 +36,7 @@ enum {
Opt_metadev_path,
Opt_noacl,
Opt_orphan_scan_delay_ms,
Opt_quorum_heartbeat_timeout_ms,
Opt_quorum_slot_nr,
Opt_err,
};
@@ -47,6 +48,7 @@ static const match_table_t tokens = {
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
{Opt_err, NULL}
};
@@ -124,8 +126,30 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->quorum_slot_nr = -1;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
}
/*
 * Validate and store a new quorum heartbeat timeout in ms.  This is
 * shared by mount option parsing and the sysfs options file store so
 * both paths apply identical validation.
 *
 * @sb: the mount's super block, used to find the options info and to
 *      log errors
 * @ret: the caller's string parsing result; negative means the value
 *       string couldn't be parsed at all
 * @val: the parsed timeout value in milliseconds
 *
 * Returns 0 on success or -EINVAL if parsing failed or the value falls
 * outside the fixed [MIN, MAX] bounds.  The new value is published
 * under the options seqlock so concurrent readers always see a
 * consistent options struct.
 */
static int set_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
	DECLARE_OPTIONS_INFO(sb, optinf);

	if (ret < 0) {
		scoutfs_err(sb, "failed to parse quorum_heartbeat_timeout_ms value");
		return -EINVAL;
	}

	if (val < SCOUTFS_QUORUM_MIN_HB_TIMEO_MS || val > SCOUTFS_QUORUM_MAX_HB_TIMEO_MS) {
		scoutfs_err(sb, "invalid quorum_heartbeat_timeout_ms value %llu, must be between %lu and %lu",
			    val, SCOUTFS_QUORUM_MIN_HB_TIMEO_MS, SCOUTFS_QUORUM_MAX_HB_TIMEO_MS);
		return -EINVAL;
	}

	write_seqlock(&optinf->seqlock);
	optinf->opts.quorum_heartbeat_timeout_ms = val;
	write_sequnlock(&optinf->seqlock);

	return 0;
}
/*
@@ -206,6 +230,13 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->orphan_scan_delay_ms = nr;
break;
case Opt_quorum_heartbeat_timeout_ms:
ret = match_u64(args, &nr64);
ret = set_quorum_heartbeat_timeout_ms(sb, ret, nr64);
if (ret < 0)
return ret;
break;
case Opt_quorum_slot_nr:
if (opts->quorum_slot_nr != -1) {
scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
@@ -448,6 +479,38 @@ static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attr
}
SCOUTFS_ATTR_RW(orphan_scan_delay_ms);
/*
 * Show the mount's current quorum heartbeat timeout, in milliseconds,
 * through the sysfs options directory.
 */
static ssize_t quorum_heartbeat_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
						char *buf)
{
	struct scoutfs_mount_options opts;
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);

	/* read a consistent snapshot of the live options */
	scoutfs_options_read(sb, &opts);

	return snprintf(buf, PAGE_SIZE, "%llu", opts.quorum_heartbeat_timeout_ms);
}
/*
 * Store a new quorum heartbeat timeout written to the sysfs options
 * file.  The incoming buffer isn't null terminated so we copy it into
 * a bounded terminated buffer before parsing.  Validation and the
 * locked assignment are done by the shared helper so this behaves
 * identically to the mount option.
 *
 * The value is parsed as a signed long long so that kstrtoll's pointer
 * argument type matches (the previous u64 *val was an incompatible
 * pointer type).  Negative inputs convert to huge u64 values and are
 * rejected by the helper's max bound check, preserving the existing
 * "-1 is invalid" behavior.
 *
 * Returns the consumed byte count on success or -EINVAL on a parse or
 * range failure.
 */
static ssize_t quorum_heartbeat_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
						 const char *buf, size_t count)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	char nullterm[30]; /* more than enough for octal -U64_MAX */
	long long val;
	size_t len;
	int ret;

	len = min(count, sizeof(nullterm) - 1);
	memcpy(nullterm, buf, len);
	nullterm[len] = '\0';

	ret = kstrtoll(nullterm, 0, &val);
	ret = set_quorum_heartbeat_timeout_ms(sb, ret, val);
	if (ret == 0)
		ret = count;
	return ret;
}
SCOUTFS_ATTR_RW(quorum_heartbeat_timeout_ms);
static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -464,6 +527,7 @@ static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
SCOUTFS_ATTR_PTR(quorum_slot_nr),
NULL,
};

View File

@@ -11,7 +11,7 @@ struct scoutfs_mount_options {
char *metadev_path;
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
u64 quorum_heartbeat_timeout_ms;
};
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);

View File

@@ -100,6 +100,11 @@ struct last_msg {
ktime_t ts;
};
struct count_recent {
u64 count;
ktime_t recent;
};
enum quorum_role { FOLLOWER, CANDIDATE, LEADER };
struct quorum_status {
@@ -112,9 +117,12 @@ struct quorum_status {
ktime_t timeout;
};
#define HB_DELAY_NR (SCOUTFS_QUORUM_MAX_HB_TIMEO_MS / MSEC_PER_SEC)
struct quorum_info {
struct super_block *sb;
struct scoutfs_quorum_config qconf;
struct workqueue_struct *workq;
struct work_struct work;
struct socket *sock;
bool shutdown;
@@ -126,6 +134,8 @@ struct quorum_info {
struct quorum_status show_status;
struct last_msg last_send[SCOUTFS_QUORUM_MAX_SLOTS];
struct last_msg last_recv[SCOUTFS_QUORUM_MAX_SLOTS];
struct count_recent *hb_delay;
unsigned long max_hb_delay;
struct scoutfs_sysfs_attrs ssa;
};
@@ -160,9 +170,9 @@ static ktime_t heartbeat_interval(void)
return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_IVAL_MS);
}
static ktime_t heartbeat_timeout(void)
static ktime_t heartbeat_timeout(struct scoutfs_mount_options *opts)
{
return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_TIMEO_MS);
return ktime_add_ms(ktime_get(), opts->quorum_heartbeat_timeout_ms);
}
static int create_socket(struct super_block *sb)
@@ -179,7 +189,8 @@ static int create_socket(struct super_block *sb)
goto out;
}
sock->sk->sk_allocation = GFP_NOFS;
/* rather fail and retry than block waiting for free */
sock->sk->sk_allocation = GFP_ATOMIC;
quorum_slot_sin(&qinf->qconf, qinf->our_quorum_slot_nr, &sin);
@@ -208,12 +219,16 @@ static __le32 quorum_message_crc(struct scoutfs_quorum_message *qmes)
return cpu_to_le32(crc32c(~0, qmes, len));
}
static void send_msg_members(struct super_block *sb, int type, u64 term,
int only)
/*
* Returns the number of failures from sendmsg.
*/
static int send_msg_members(struct super_block *sb, int type, u64 term, int only)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_QUORUM_INFO(sb, qinf);
int failed = 0;
ktime_t now;
int ret;
int i;
struct scoutfs_quorum_message qmes = {
@@ -239,15 +254,21 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
qmes.crc = quorum_message_crc(&qmes);
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (!quorum_slot_present(&qinf->qconf, i) ||
(only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
continue;
if (scoutfs_forcing_unmount(sb)) {
failed = 0;
break;
}
scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
now = ktime_get();
kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
if (ret != kv.iov_len)
failed++;
spin_lock(&qinf->show_lock);
qinf->last_send[i].msg.term = term;
@@ -258,6 +279,8 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
if (i == only)
break;
}
return failed;
}
#define send_msg_to(sb, type, term, nr) send_msg_members(sb, type, term, nr)
@@ -312,6 +335,9 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
if (ret < 0)
return ret;
if (scoutfs_forcing_unmount(sb))
return 0;
now = ktime_get();
if (ret != sizeof(qmes) ||
@@ -599,6 +625,71 @@ out:
return ret;
}
/*
 * Reset the recorded heartbeat delay histogram.  Done under the show
 * lock so status file readers always see a consistent view.
 */
static void clear_hb_delay(struct quorum_info *qinf)
{
	int nr;

	spin_lock(&qinf->show_lock);
	for (nr = 0; nr < HB_DELAY_NR; nr++) {
		qinf->hb_delay[nr].count = 0;
		qinf->hb_delay[nr].recent = ns_to_ktime(0);
	}
	qinf->max_hb_delay = 0;
	spin_unlock(&qinf->show_lock);
}
/*
 * Private tracking state for measuring the delay between consecutive
 * heartbeat iterations in the quorum worker loop.
 */
struct hb_recording {
	ktime_t prev;	/* timestamp of the previous heartbeat iteration */
	int count;	/* consecutive heartbeats seen; a delay needs two */
};
/*
 * Record long heartbeat delays. We only record the delay between back
 * to back send attempts in the leader or back to back recv messages in
 * the followers. The worker caller sets record_hb when their iteration
 * sent or received a heartbeat. An iteration that does anything else
 * resets the tracking.
 */
static void record_hb_delay(struct super_block *sb, struct quorum_info *qinf,
			    struct hb_recording *hbr, bool record_hb, int role)
{
	bool log = false;
	ktime_t now;
	s64 s;

	/* any non-heartbeat iteration resets the consecutive tracking */
	if (!record_hb) {
		hbr->count = 0;
		return;
	}

	now = ktime_get();

	/* the first heartbeat after a reset only records its timestamp */
	if (hbr->count < 2 && ++hbr->count < 2) {
		hbr->prev = now;
		return;
	}

	/* whole seconds elapsed between consecutive heartbeats */
	s = ktime_ms_delta(now, hbr->prev) / MSEC_PER_SEC;
	hbr->prev = now;

	/* sub-second delays are normal; out of range delays are dropped */
	if (s <= 0 || s >= HB_DELAY_NR)
		return;

	spin_lock(&qinf->show_lock);
	if (qinf->max_hb_delay < s) {
		qinf->max_hb_delay = s;
		/* only log new maximums that are notably long */
		if (s >= 3)
			log = true;
	}
	qinf->hb_delay[s].recent = now;
	qinf->hb_delay[s].count++;
	spin_unlock(&qinf->show_lock);

	if (log)
		scoutfs_info(sb, "longest quorum heartbeat %s delay of %lld sec",
			     role == LEADER ? "send" : "recv", s);
}
/*
* The main quorum task maintains its private status. It seemed cleaner
* to occasionally copy the status for showing in sysfs/debugfs files
@@ -623,16 +714,21 @@ static void update_show_status(struct quorum_info *qinf, struct quorum_status *q
static void scoutfs_quorum_worker(struct work_struct *work)
{
struct quorum_info *qinf = container_of(work, struct quorum_info, work);
struct scoutfs_mount_options opts;
struct super_block *sb = qinf->sb;
struct sockaddr_in unused;
struct quorum_host_msg msg;
struct quorum_status qst = {0,};
struct hb_recording hbr = {{0,},};
bool record_hb;
int ret;
int err;
/* recording votes from slots as native single word bitmap */
BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);
scoutfs_options_read(sb, &opts);
/* start out as a follower */
qst.role = FOLLOWER;
qst.vote_for = -1;
@@ -642,7 +738,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
/* see if there's a server to chose heartbeat or election timeout */
if (scoutfs_quorum_server_sin(sb, &unused) == 0)
qst.timeout = heartbeat_timeout();
qst.timeout = heartbeat_timeout(&opts);
else
qst.timeout = election_timeout();
@@ -666,6 +762,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
ret = 0;
}
scoutfs_options_read(sb, &opts);
record_hb = false;
/* ignore messages from older terms */
if (msg.type != SCOUTFS_QUORUM_MSG_INVALID &&
msg.term < qst.term)
@@ -681,6 +780,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
if (qst.role == LEADER) {
scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
msg.type, msg.from, msg.term, qst.term);
clear_hb_delay(qinf);
}
qst.role = FOLLOWER;
qst.term = msg.term;
@@ -689,7 +789,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
scoutfs_inc_counter(sb, quorum_term_follower);
if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT)
qst.timeout = heartbeat_timeout();
qst.timeout = heartbeat_timeout(&opts);
else
qst.timeout = election_timeout();
@@ -699,6 +799,21 @@ static void scoutfs_quorum_worker(struct work_struct *work)
goto out;
}
/* receiving heartbeats extends timeout, delaying elections */
if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
qst.timeout = heartbeat_timeout(&opts);
scoutfs_inc_counter(sb, quorum_recv_heartbeat);
record_hb = true;
}
/* receiving a resignation from server starts election */
if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
qst.role == FOLLOWER &&
msg.term == qst.term) {
qst.timeout = election_timeout();
scoutfs_inc_counter(sb, quorum_recv_resignation);
}
/* followers and candidates start new election on timeout */
if (qst.role != LEADER &&
ktime_after(ktime_get(), qst.timeout)) {
@@ -751,6 +866,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
qst.timeout = heartbeat_interval();
update_show_status(qinf, &qst);
clear_hb_delay(qinf);
/* record that we've been elected before starting up server */
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
@@ -805,6 +921,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
qst.server_start_term);
scoutfs_inc_counter(sb, quorum_send_resignation);
clear_hb_delay(qinf);
}
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
@@ -818,24 +935,16 @@ static void scoutfs_quorum_worker(struct work_struct *work)
/* leaders regularly send heartbeats to delay elections */
if (qst.role == LEADER &&
ktime_after(ktime_get(), qst.timeout)) {
send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT,
qst.term);
ret = send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT, qst.term);
if (ret > 0) {
scoutfs_add_counter(sb, quorum_send_heartbeat_dropped, ret);
ret = 0;
}
qst.timeout = heartbeat_interval();
scoutfs_inc_counter(sb, quorum_send_heartbeat);
}
record_hb = true;
/* receiving heartbeats extends timeout, delaying elections */
if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
qst.timeout = heartbeat_timeout();
scoutfs_inc_counter(sb, quorum_recv_heartbeat);
}
/* receiving a resignation from server starts election */
if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
qst.role == FOLLOWER &&
msg.term == qst.term) {
qst.timeout = election_timeout();
scoutfs_inc_counter(sb, quorum_recv_resignation);
}
/* followers vote once per term */
@@ -847,6 +956,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
msg.from);
scoutfs_inc_counter(sb, quorum_send_vote);
}
record_hb_delay(sb, qinf, &hbr, record_hb, qst.role);
}
update_show_status(qinf, &qst);
@@ -983,9 +1094,11 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
{
DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
struct quorum_status qst;
struct count_recent cr;
struct last_msg last;
struct timespec64 ts;
const ktime_t now = ktime_get();
unsigned long ul;
size_t size;
int ret;
int i;
@@ -1043,6 +1156,26 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
(s64)ts.tv_sec, (int)ts.tv_nsec);
}
spin_lock(&qinf->show_lock);
ul = qinf->max_hb_delay;
spin_unlock(&qinf->show_lock);
if (ul)
snprintf_ret(buf, size, &ret, "HB Delay(s) Count Secs Since\n");
for (i = 1; i <= ul && i < HB_DELAY_NR; i++) {
spin_lock(&qinf->show_lock);
cr = qinf->hb_delay[i];
spin_unlock(&qinf->show_lock);
if (cr.count == 0)
continue;
ts = ktime_to_timespec64(ktime_sub(now, cr.recent));
snprintf_ret(buf, size, &ret,
"%11u %9llu %lld.%09u\n",
i, cr.count, (s64)ts.tv_sec, (int)ts.tv_nsec);
}
return ret;
}
SCOUTFS_ATTR_RO(status);
@@ -1180,7 +1313,12 @@ int scoutfs_quorum_setup(struct super_block *sb)
qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
super = kmalloc(sizeof(struct scoutfs_super_block), GFP_KERNEL);
if (!qinf || !super) {
if (qinf)
qinf->hb_delay = __vmalloc(HB_DELAY_NR * sizeof(struct count_recent),
GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
if (!qinf || !super || !qinf->hb_delay) {
if (qinf)
vfree(qinf->hb_delay);
kfree(qinf);
ret = -ENOMEM;
goto out;
@@ -1195,6 +1333,15 @@ int scoutfs_quorum_setup(struct super_block *sb)
sbi->quorum_info = qinf;
qinf->sb = sb;
/* a high priority single threaded context without mem reclaim */
qinf->workq = alloc_workqueue("scoutfs_quorum_work",
WQ_NON_REENTRANT | WQ_UNBOUND |
WQ_HIGHPRI, 1);
if (!qinf->workq) {
ret = -ENOMEM;
goto out;
}
ret = scoutfs_read_super(sb, super);
if (ret < 0)
goto out;
@@ -1213,7 +1360,7 @@ int scoutfs_quorum_setup(struct super_block *sb)
if (ret < 0)
goto out;
schedule_work(&qinf->work);
queue_work(qinf->workq, &qinf->work);
out:
if (ret)
@@ -1243,10 +1390,14 @@ void scoutfs_quorum_destroy(struct super_block *sb)
qinf->shutdown = true;
flush_work(&qinf->work);
if (qinf->workq)
destroy_workqueue(qinf->workq);
scoutfs_sysfs_destroy_attrs(sb, &qinf->ssa);
if (qinf->sock)
sock_release(qinf->sock);
vfree(qinf->hb_delay);
kfree(qinf);
sbi->quorum_info = NULL;
}

View File

@@ -61,6 +61,7 @@ t_filter_dmesg()
re="$re|scoutfs .* error: meta_super META flag not set"
re="$re|scoutfs .* error: could not open metadev:.*"
re="$re|scoutfs .* error: Unknown or malformed option,.*"
re="$re|scoutfs .* error: invalid quorum_heartbeat_timeout_ms value"
# in debugging kernels we can slow things down a bit
re="$re|hrtimer: interrupt took .*"
@@ -81,6 +82,7 @@ t_filter_dmesg()
re="$re|scoutfs .* error .* freeing merged btree blocks.*.final commit del.upd freeing item"
re="$re|scoutfs .* error .*reading quorum block.*to update event.*"
re="$re|scoutfs .* error.*server failed to bind to.*"
re="$re|scoutfs .* critical transaction commit failure.*"
egrep -v "($re)"
}

View File

@@ -75,6 +75,15 @@ t_fs_nrs()
seq 0 $((T_NR_MOUNTS - 1))
}
#
# output the fs nrs of quorum nodes, we "know" that
# the quorum nrs are the first consecutive nrs
#
t_quorum_nrs()
{
	# quorum members are assigned the first T_QUORUM fs numbers
	seq 0 $((T_QUORUM - 1))
}
#
# outputs "1" if the fs number has "1" in its quorum/is_leader file.
# All other cases output 0, including the fs nr being a client which
@@ -391,7 +400,7 @@ t_set_sysfs_mount_option() {
local val="$3"
local opt="$(t_sysfs_path $nr)/mount_options/$name"
echo "$val" > "$opt"
echo "$val" > "$opt" 2>/dev/null
}
t_set_all_sysfs_mount_options() {

View File

@@ -0,0 +1,2 @@
== bad timeout values fail
== test different timeouts

View File

@@ -38,6 +38,7 @@ setup-error-teardown.sh
resize-devices.sh
change-devices.sh
fence-and-reclaim.sh
quorum-heartbeat-timeout.sh
orphan-inodes.sh
mount-unmount-race.sh
client-unmount-recovery.sh

View File

@@ -0,0 +1,89 @@
#
# test that the quorum_heartbeat_timeout_ms option affects how long it
# takes to recover from a failed mount.
#
t_require_mounts 2
# Output the current time in integer milliseconds: time_t seconds
# followed by the nanoseconds truncated to their 3 most significant
# digits.
time_ms()
{
	date +%s%3N
}
# Try to set an invalid quorum heartbeat timeout on mount 0 and fail
# the test if the sysfs store unexpectedly succeeds.
set_bad_timeout() {
	local to="$1"

	t_set_sysfs_mount_option 0 quorum_heartbeat_timeout_ms $to && \
		t_fail "set bad q hb to $to"
}
# Set the quorum heartbeat timeout, in ms, on all quorum mounts and
# fail the test if reading any mount's option back doesn't show the
# new value.  (The old "was" and "mnt" locals were assigned but never
# read, so they've been removed.)
set_quorum_timeouts()
{
	local to="$1"
	local is

	for nr in $(t_quorum_nrs); do
		t_set_sysfs_mount_option $nr quorum_heartbeat_timeout_ms $to
		is=$(t_get_sysfs_mount_option $nr quorum_heartbeat_timeout_ms)
		if [ "$is" != "$to" ]; then
			t_fail "tried to set qhbto on $nr to $to but got $is"
		fi
	done
}
# Set the given heartbeat timeout, in ms, on all quorum members, force
# unmount the current leader, and verify that the time to elect a new
# leader is bounded below by the timeout and above by the timeout plus
# five seconds of slop.  Restores the torn-down mount and the original
# timeout before checking the measured delay.
test_timeout()
{
	local to="$1"
	local orig_to
	local start
	local nr
	local delay

	# set new timeouts, saving original
	orig_to=$(t_get_sysfs_mount_option 0 quorum_heartbeat_timeout_ms)
	set_quorum_timeouts $to

	# give followers time to recv heartbeats and reset timeouts
	sleep 1

	# tear down the current server/leader
	nr=$(t_server_nr)
	t_force_umount $nr

	# see how long it takes for the next leader to start
	start=$(time_ms)
	t_wait_for_leader
	delay=$(($(time_ms) - start))

	# kind of fun to have these logged
	echo "to $to delay $delay" >> $T_TMP.delay

	# restore the mount that we tore down
	t_mount $nr

	# reset the original timeouts
	set_quorum_timeouts $orig_to

	# make sure the new leader delay was reasonable
	test "$delay" -gt "$to" || t_fail "delay $delay < to $to"
	# allow 5 seconds of slop
	test "$delay" -lt $(($to + 5000)) || t_fail "delay $delay > to $to + 5sec"
}
echo "== bad timeout values fail"
set_bad_timeout 0
set_bad_timeout -1
set_bad_timeout 1000000
echo "== test different timeouts"
def=$(t_get_sysfs_mount_option 0 quorum_heartbeat_timeout_ms)
test_timeout $def
test_timeout 3000
test_timeout $((def + 19000))
t_pass

View File

@@ -85,6 +85,25 @@ the options directory in the mount's sysfs directory. Writing a new
value will cause the next pending orphan scan to be rescheduled
with the newly written delay time.
.TP
.B quorum_heartbeat_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that a quorum
member will wait without receiving heartbeat messages from the current
leader before trying to take over as leader. This setting is per-mount
and only changes the behavior of that mount.
.sp
This determines how long it may take before a failed leader is replaced
by a waiting quorum member. Setting it too low may lead to spurious
fencing as active leaders are prematurely replaced due to task or
network delays that prevent the quorum members from promptly sending and
receiving messages. The ideal setting is the longest acceptable
downtime during server failover. The default is 10000 (10s) and it can
not be less than 2000 or greater than 60000.
.sp
This option can be changed in an active mount by writing to its file in
the options directory in the mount's sysfs directory. Writing a new
value will take effect the next time the quorum agent receives a
heartbeat message and sets the next timeout.
.TP
.B quorum_slot_nr=<number>
The quorum_slot_nr option assigns a quorum member slot to the mount.
The mount will use the slot assignment to claim exclusive ownership of