mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-03 10:55:20 +00:00
Make quorum heartbeat timeout tunable
Add mount and sysfs options for changing the quorum heartbeat timeout. This allows setting a longer delay in taking over for failed hosts that has a greater chance of surviving temporary non-fatal delays. We also double the existing default timeout to 10s which is still reasonably responsive. Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
@@ -683,16 +683,19 @@ struct scoutfs_xattr_totl_val {
|
||||
#define SCOUTFS_QUORUM_ELECT_VAR_MS 100
|
||||
|
||||
/*
|
||||
* Once a leader is elected they send out heartbeats at regular
|
||||
* intervals to force members to wait the much longer heartbeat timeout.
|
||||
* Once heartbeat timeout expires without receiving a heartbeat they'll
|
||||
* switch over the performing elections.
|
||||
* Once a leader is elected they send heartbeat messages to all quorum
|
||||
* members at regular intervals to force members to wait the much longer
|
||||
* heartbeat timeout. Once the heartbeat timeout expires without
|
||||
* receiving a heartbeat message a member will start an election.
|
||||
*
|
||||
* These determine how long it could take members to notice that a
|
||||
* leader has gone silent and start to elect a new leader.
|
||||
* leader has gone silent and start to elect a new leader. The
|
||||
* heartbeat timeout can be changed at run time by options.
|
||||
*/
|
||||
#define SCOUTFS_QUORUM_HB_IVAL_MS 100
|
||||
#define SCOUTFS_QUORUM_HB_TIMEO_MS (5 * MSEC_PER_SEC)
|
||||
#define SCOUTFS_QUORUM_MIN_HB_TIMEO_MS (2 * MSEC_PER_SEC)
|
||||
#define SCOUTFS_QUORUM_DEF_HB_TIMEO_MS (10 * MSEC_PER_SEC)
|
||||
#define SCOUTFS_QUORUM_MAX_HB_TIMEO_MS (60 * MSEC_PER_SEC)
|
||||
|
||||
/*
|
||||
* A newly elected leader will give fencing some time before giving up and
|
||||
|
||||
@@ -36,6 +36,7 @@ enum {
|
||||
Opt_metadev_path,
|
||||
Opt_noacl,
|
||||
Opt_orphan_scan_delay_ms,
|
||||
Opt_quorum_heartbeat_timeout_ms,
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_err,
|
||||
};
|
||||
@@ -47,6 +48,7 @@ static const match_table_t tokens = {
|
||||
{Opt_metadev_path, "metadev_path=%s"},
|
||||
{Opt_noacl, "noacl"},
|
||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
|
||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||
{Opt_err, NULL}
|
||||
};
|
||||
@@ -124,8 +126,30 @@ static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
|
||||
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
|
||||
opts->data_prealloc_contig_only = 1;
|
||||
opts->quorum_slot_nr = -1;
|
||||
opts->orphan_scan_delay_ms = -1;
|
||||
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
||||
opts->quorum_slot_nr = -1;
|
||||
}
|
||||
|
||||
/*
 * Validate and apply a new quorum heartbeat timeout, in milliseconds.
 *
 * @ret is the result of the caller's attempt to parse the value; any
 * negative @ret is reported as a parse failure.  @val is accepted only
 * if it lies within SCOUTFS_QUORUM_MIN_HB_TIMEO_MS and
 * SCOUTFS_QUORUM_MAX_HB_TIMEO_MS, inclusive.  An accepted value is
 * stored in the mount's options while holding the write side of the
 * options seqlock.
 *
 * Returns 0 once the value is stored, or -EINVAL if parsing failed or
 * the value was out of range.
 */
static int set_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
	DECLARE_OPTIONS_INFO(sb, optinf);

	/* the caller hands us its parse result so errors are logged in one place */
	if (ret < 0) {
		scoutfs_err(sb, "failed to parse quorum_heartbeat_timeout_ms value");
		return -EINVAL;
	}

	if (!(val >= SCOUTFS_QUORUM_MIN_HB_TIMEO_MS && val <= SCOUTFS_QUORUM_MAX_HB_TIMEO_MS)) {
		scoutfs_err(sb, "invalid quorum_heartbeat_timeout_ms value %llu, must be between %lu and %lu",
			    val, SCOUTFS_QUORUM_MIN_HB_TIMEO_MS, SCOUTFS_QUORUM_MAX_HB_TIMEO_MS);
		return -EINVAL;
	}

	/* publish the new timeout under the seqlock's write side */
	write_seqlock(&optinf->seqlock);
	optinf->opts.quorum_heartbeat_timeout_ms = val;
	write_sequnlock(&optinf->seqlock);

	return 0;
}
|
||||
|
||||
/*
|
||||
@@ -206,6 +230,13 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
opts->orphan_scan_delay_ms = nr;
|
||||
break;
|
||||
|
||||
case Opt_quorum_heartbeat_timeout_ms:
|
||||
ret = match_u64(args, &nr64);
|
||||
ret = set_quorum_heartbeat_timeout_ms(sb, ret, nr64);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
break;
|
||||
|
||||
case Opt_quorum_slot_nr:
|
||||
if (opts->quorum_slot_nr != -1) {
|
||||
scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
|
||||
@@ -448,6 +479,38 @@ static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attr
|
||||
}
|
||||
SCOUTFS_ATTR_RW(orphan_scan_delay_ms);
|
||||
|
||||
static ssize_t quorum_heartbeat_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%llu", opts.quorum_heartbeat_timeout_ms);
|
||||
}
|
||||
/*
 * sysfs store handler for the quorum_heartbeat_timeout_ms option.
 *
 * The written bytes are copied into a null terminated stack buffer
 * (truncated to fit), parsed as an unsigned 64bit integer with base
 * auto-detection, and handed to set_quorum_heartbeat_timeout_ms()
 * along with the parse result so that it can log and range check.
 *
 * Returns @count when the value was accepted, otherwise the negative
 * errno from set_quorum_heartbeat_timeout_ms().
 */
static ssize_t quorum_heartbeat_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
						 const char *buf, size_t count)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	char nullterm[30]; /* more than enough for octal -U64_MAX */
	size_t len;
	u64 val;
	int ret;

	/* buf isn't guaranteed to be terminated within count bytes, make our own copy */
	len = min(count, sizeof(nullterm) - 1);
	memcpy(nullterm, buf, len);
	nullterm[len] = '\0';

	/*
	 * val is unsigned so use the unsigned parser; kstrtoll() takes a
	 * long long pointer which doesn't match u64 and would accept
	 * negative input only to have it reinterpreted as a huge value.
	 */
	ret = kstrtoull(nullterm, 0, &val);
	ret = set_quorum_heartbeat_timeout_ms(sb, ret, val);
	if (ret == 0)
		ret = count;

	return ret;
}
|
||||
SCOUTFS_ATTR_RW(quorum_heartbeat_timeout_ms);
|
||||
|
||||
static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
@@ -464,6 +527,7 @@ static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_slot_nr),
|
||||
NULL,
|
||||
};
|
||||
|
||||
@@ -11,7 +11,7 @@ struct scoutfs_mount_options {
|
||||
char *metadev_path;
|
||||
unsigned int orphan_scan_delay_ms;
|
||||
int quorum_slot_nr;
|
||||
|
||||
u64 quorum_heartbeat_timeout_ms;
|
||||
};
|
||||
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
||||
|
||||
@@ -161,9 +161,9 @@ static ktime_t heartbeat_interval(void)
|
||||
return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_IVAL_MS);
|
||||
}
|
||||
|
||||
static ktime_t heartbeat_timeout(void)
|
||||
static ktime_t heartbeat_timeout(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_TIMEO_MS);
|
||||
return ktime_add_ms(ktime_get(), opts->quorum_heartbeat_timeout_ms);
|
||||
}
|
||||
|
||||
static int create_socket(struct super_block *sb)
|
||||
@@ -625,6 +625,7 @@ static void update_show_status(struct quorum_info *qinf, struct quorum_status *q
|
||||
static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
{
|
||||
struct quorum_info *qinf = container_of(work, struct quorum_info, work);
|
||||
struct scoutfs_mount_options opts;
|
||||
struct super_block *sb = qinf->sb;
|
||||
struct sockaddr_in unused;
|
||||
struct quorum_host_msg msg;
|
||||
@@ -635,6 +636,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
/* recording votes from slots as native single word bitmap */
|
||||
BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
/* start out as a follower */
|
||||
qst.role = FOLLOWER;
|
||||
qst.vote_for = -1;
|
||||
@@ -644,7 +647,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
|
||||
/* see if there's a server to chose heartbeat or election timeout */
|
||||
if (scoutfs_quorum_server_sin(sb, &unused) == 0)
|
||||
qst.timeout = heartbeat_timeout();
|
||||
qst.timeout = heartbeat_timeout(&opts);
|
||||
else
|
||||
qst.timeout = election_timeout();
|
||||
|
||||
@@ -668,6 +671,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
/* ignore messages from older terms */
|
||||
if (msg.type != SCOUTFS_QUORUM_MSG_INVALID &&
|
||||
msg.term < qst.term)
|
||||
@@ -691,7 +696,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
scoutfs_inc_counter(sb, quorum_term_follower);
|
||||
|
||||
if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT)
|
||||
qst.timeout = heartbeat_timeout();
|
||||
qst.timeout = heartbeat_timeout(&opts);
|
||||
else
|
||||
qst.timeout = election_timeout();
|
||||
|
||||
@@ -703,7 +708,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
|
||||
/* receiving heartbeats extends timeout, delaying elections */
|
||||
if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
|
||||
qst.timeout = heartbeat_timeout();
|
||||
qst.timeout = heartbeat_timeout(&opts);
|
||||
scoutfs_inc_counter(sb, quorum_recv_heartbeat);
|
||||
}
|
||||
|
||||
|
||||
@@ -85,6 +85,25 @@ the options directory in the mount's sysfs directory. Writing a new
|
||||
value will cause the next pending orphan scan to be rescheduled
|
||||
with the newly written delay time.
|
||||
.TP
|
||||
.B quorum_heartbeat_timeout_ms=<number>
|
||||
This option sets the amount of time, in milliseconds, that a quorum
|
||||
member will wait without receiving heartbeat messages from the current
|
||||
leader before trying to take over as leader. This setting is per-mount
|
||||
and only changes the behavior of that mount.
|
||||
.sp
|
||||
This determines how long it may take before a failed leader is replaced
|
||||
by a waiting quorum member. Setting it too low may lead to spurious
|
||||
fencing as active leaders are prematurely replaced due to task or
|
||||
network delays that prevent the quorum members from promptly sending and
|
||||
receiving messages. The ideal setting is the longest acceptable
|
||||
downtime during server failover. The default is 10000 (10s) and it can
|
||||
not be less than 2000 or greater than 60000.
|
||||
.sp
|
||||
This option can be changed in an active mount by writing to its file in
|
||||
the options directory in the mount's sysfs directory. Writing a new
|
||||
value will take effect the next time the quorum agent receives a
|
||||
heartbeat message and sets the next timeout.
|
||||
.TP
|
||||
.B quorum_slot_nr=<number>
|
||||
The quorum_slot_nr option assigns a quorum member slot to the mount.
|
||||
The mount will use the slot assignment to claim exclusive ownership of
|
||||
|
||||
Reference in New Issue
Block a user