diff --git a/kmod/src/format.h b/kmod/src/format.h index 66bbf1c1..da1683dd 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -683,16 +683,19 @@ struct scoutfs_xattr_totl_val { #define SCOUTFS_QUORUM_ELECT_VAR_MS 100 /* - * Once a leader is elected they send out heartbeats at regular - * intervals to force members to wait the much longer heartbeat timeout. - * Once heartbeat timeout expires without receiving a heartbeat they'll - * switch over the performing elections. + * Once a leader is elected they send heartbeat messages to all quorum + * members at regular intervals to force members to wait the much longer + * heartbeat timeout. Once the heartbeat timeout expires without + * receiving a heartbeat message a member will start an election. * * These determine how long it could take members to notice that a - * leader has gone silent and start to elect a new leader. + * leader has gone silent and start to elect a new leader. The + * heartbeat timeout can be changed at run time by options. 
*/ #define SCOUTFS_QUORUM_HB_IVAL_MS 100 -#define SCOUTFS_QUORUM_HB_TIMEO_MS (5 * MSEC_PER_SEC) +#define SCOUTFS_QUORUM_MIN_HB_TIMEO_MS (2 * MSEC_PER_SEC) +#define SCOUTFS_QUORUM_DEF_HB_TIMEO_MS (10 * MSEC_PER_SEC) +#define SCOUTFS_QUORUM_MAX_HB_TIMEO_MS (60 * MSEC_PER_SEC) /* * A newly elected leader will give fencing some time before giving up and diff --git a/kmod/src/options.c b/kmod/src/options.c index e28bf2a6..9a3d3471 100644 --- a/kmod/src/options.c +++ b/kmod/src/options.c @@ -36,6 +36,7 @@ enum { Opt_metadev_path, Opt_noacl, Opt_orphan_scan_delay_ms, + Opt_quorum_heartbeat_timeout_ms, Opt_quorum_slot_nr, Opt_err, }; @@ -47,6 +48,7 @@ static const match_table_t tokens = { {Opt_metadev_path, "metadev_path=%s"}, {Opt_noacl, "noacl"}, {Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"}, + {Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"}, {Opt_quorum_slot_nr, "quorum_slot_nr=%s"}, {Opt_err, NULL} }; @@ -124,8 +126,30 @@ static void init_default_options(struct scoutfs_mount_options *opts) opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS; opts->data_prealloc_contig_only = 1; - opts->quorum_slot_nr = -1; opts->orphan_scan_delay_ms = -1; + opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS; + opts->quorum_slot_nr = -1; +} + +static int set_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val) +{ + DECLARE_OPTIONS_INFO(sb, optinf); + + if (ret < 0) { + scoutfs_err(sb, "failed to parse quorum_heartbeat_timeout_ms value"); + return -EINVAL; + } + if (val < SCOUTFS_QUORUM_MIN_HB_TIMEO_MS || val > SCOUTFS_QUORUM_MAX_HB_TIMEO_MS) { + scoutfs_err(sb, "invalid quorum_heartbeat_timeout_ms value %llu, must be between %lu and %lu", + val, SCOUTFS_QUORUM_MIN_HB_TIMEO_MS, SCOUTFS_QUORUM_MAX_HB_TIMEO_MS); + return -EINVAL; + } + + write_seqlock(&optinf->seqlock); + optinf->opts.quorum_heartbeat_timeout_ms = val; + write_sequnlock(&optinf->seqlock); + + return 0; } /* @@ -206,6 +230,13 @@ static int 
parse_options(struct super_block *sb, char *options, struct scoutfs_m opts->orphan_scan_delay_ms = nr; break; + case Opt_quorum_heartbeat_timeout_ms: + ret = match_u64(args, &nr64); + ret = set_quorum_heartbeat_timeout_ms(sb, ret, nr64); + if (ret < 0) + return ret; + break; + case Opt_quorum_slot_nr: if (opts->quorum_slot_nr != -1) { scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one."); @@ -448,6 +479,38 @@ static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attr } SCOUTFS_ATTR_RW(orphan_scan_delay_ms); +static ssize_t quorum_heartbeat_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + struct scoutfs_mount_options opts; + + scoutfs_options_read(sb, &opts); + + return snprintf(buf, PAGE_SIZE, "%llu", opts.quorum_heartbeat_timeout_ms); +} +static ssize_t quorum_heartbeat_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + char nullterm[30]; /* more than enough for octal -U64_MAX */ + u64 val; + int len; + int ret; + + len = min(count, sizeof(nullterm) - 1); + memcpy(nullterm, buf, len); + nullterm[len] = '\0'; + + ret = kstrtoull(nullterm, 0, &val); + ret = set_quorum_heartbeat_timeout_ms(sb, ret, val); + if (ret == 0) + ret = count; + + return ret; +} +SCOUTFS_ATTR_RW(quorum_heartbeat_timeout_ms); + static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); @@ -464,6 +527,7 @@ static struct attribute *options_attrs[] = { SCOUTFS_ATTR_PTR(data_prealloc_contig_only), SCOUTFS_ATTR_PTR(metadev_path), SCOUTFS_ATTR_PTR(orphan_scan_delay_ms), + SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms), SCOUTFS_ATTR_PTR(quorum_slot_nr), NULL, }; diff --git a/kmod/src/options.h b/kmod/src/options.h index d2e4ad74..639f3882 100644 ---
a/kmod/src/options.h +++ b/kmod/src/options.h @@ -11,7 +11,7 @@ struct scoutfs_mount_options { char *metadev_path; unsigned int orphan_scan_delay_ms; int quorum_slot_nr; - + u64 quorum_heartbeat_timeout_ms; }; void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts); diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c index d93e8396..e15f4d77 100644 --- a/kmod/src/quorum.c +++ b/kmod/src/quorum.c @@ -161,9 +161,9 @@ static ktime_t heartbeat_interval(void) return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_IVAL_MS); } -static ktime_t heartbeat_timeout(void) +static ktime_t heartbeat_timeout(struct scoutfs_mount_options *opts) { - return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_TIMEO_MS); + return ktime_add_ms(ktime_get(), opts->quorum_heartbeat_timeout_ms); } static int create_socket(struct super_block *sb) @@ -625,6 +625,7 @@ static void update_show_status(struct quorum_info *qinf, struct quorum_status *q static void scoutfs_quorum_worker(struct work_struct *work) { struct quorum_info *qinf = container_of(work, struct quorum_info, work); + struct scoutfs_mount_options opts; struct super_block *sb = qinf->sb; struct sockaddr_in unused; struct quorum_host_msg msg; @@ -635,6 +636,8 @@ static void scoutfs_quorum_worker(struct work_struct *work) /* recording votes from slots as native single word bitmap */ BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG); + scoutfs_options_read(sb, &opts); + /* start out as a follower */ qst.role = FOLLOWER; qst.vote_for = -1; @@ -644,7 +647,7 @@ static void scoutfs_quorum_worker(struct work_struct *work) /* see if there's a server to chose heartbeat or election timeout */ if (scoutfs_quorum_server_sin(sb, &unused) == 0) - qst.timeout = heartbeat_timeout(); + qst.timeout = heartbeat_timeout(&opts); else qst.timeout = election_timeout(); @@ -668,6 +671,8 @@ static void scoutfs_quorum_worker(struct work_struct *work) ret = 0; } + scoutfs_options_read(sb, &opts); + /* ignore messages from older terms */ 
if (msg.type != SCOUTFS_QUORUM_MSG_INVALID && msg.term < qst.term) @@ -691,7 +696,7 @@ static void scoutfs_quorum_worker(struct work_struct *work) scoutfs_inc_counter(sb, quorum_term_follower); if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) - qst.timeout = heartbeat_timeout(); + qst.timeout = heartbeat_timeout(&opts); else qst.timeout = election_timeout(); @@ -703,7 +708,7 @@ static void scoutfs_quorum_worker(struct work_struct *work) /* receiving heartbeats extends timeout, delaying elections */ if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) { - qst.timeout = heartbeat_timeout(); + qst.timeout = heartbeat_timeout(&opts); scoutfs_inc_counter(sb, quorum_recv_heartbeat); } diff --git a/utils/man/scoutfs.5 b/utils/man/scoutfs.5 index 8f619e4b..6078bb74 100644 --- a/utils/man/scoutfs.5 +++ b/utils/man/scoutfs.5 @@ -85,6 +85,25 @@ the options directory in the mount's sysfs directory. Writing a new value will cause the next pending orphan scan to be rescheduled with the newly written delay time. .TP +.B quorum_heartbeat_timeout_ms= +This option sets the amount of time, in milliseconds, that a quorum +member will wait without receiving heartbeat messages from the current +leader before trying to take over as leader. This setting is per-mount +and only changes the behavior of that mount. +.sp +This determines how long it may take before a failed leader is replaced +by a waiting quorum member. Setting it too low may lead to spurious +fencing as active leaders are prematurely replaced due to task or +network delays that prevent the quorum members from promptly sending and +receiving messages. The ideal setting is the longest acceptable +downtime during server failover. The default is 10000 (10s) and it can +not be less than 2000 or greater than 60000. +.sp +This option can be changed in an active mount by writing to its file in +the options directory in the mount's sysfs directory.
Writing a new +value will take effect the next time the quorum agent receives a +heartbeat message and sets the next timeout. +.TP .B quorum_slot_nr= The quorum_slot_nr option assigns a quorum member slot to the mount. The mount will use the slot assignment to claim exclusive ownership of