Merge pull request #121 from versity/zab/heartbeat_fencing_tweaks
Zab/heartbeat fencing tweaks
@@ -166,6 +166,7 @@
 EXPAND_COUNTER(quorum_recv_resignation) \
 EXPAND_COUNTER(quorum_recv_vote) \
 EXPAND_COUNTER(quorum_send_heartbeat) \
+EXPAND_COUNTER(quorum_send_heartbeat_dropped) \
 EXPAND_COUNTER(quorum_send_resignation) \
 EXPAND_COUNTER(quorum_send_request) \
 EXPAND_COUNTER(quorum_send_vote) \
@@ -683,16 +683,19 @@ struct scoutfs_xattr_totl_val {
 #define SCOUTFS_QUORUM_ELECT_VAR_MS	100
 
 /*
- * Once a leader is elected they send out heartbeats at regular
- * intervals to force members to wait the much longer heartbeat timeout.
- * Once heartbeat timeout expires without receiving a heartbeat they'll
- * switch over the performing elections.
+ * Once a leader is elected they send heartbeat messages to all quorum
+ * members at regular intervals to force members to wait the much longer
+ * heartbeat timeout.  Once the heartbeat timeout expires without
+ * receiving a heartbeat message a member will start an election.
  *
  * These determine how long it could take members to notice that a
- * leader has gone silent and start to elect a new leader.
+ * leader has gone silent and start to elect a new leader.  The
+ * heartbeat timeout can be changed at run time by options.
  */
 #define SCOUTFS_QUORUM_HB_IVAL_MS	100
-#define SCOUTFS_QUORUM_HB_TIMEO_MS	(5 * MSEC_PER_SEC)
+#define SCOUTFS_QUORUM_MIN_HB_TIMEO_MS	(2 * MSEC_PER_SEC)
+#define SCOUTFS_QUORUM_DEF_HB_TIMEO_MS	(10 * MSEC_PER_SEC)
+#define SCOUTFS_QUORUM_MAX_HB_TIMEO_MS	(60 * MSEC_PER_SEC)
 
 /*
  * A newly elected leader will give fencing some time before giving up and
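To get a feel for the ratio these constants encode, a small userspace sketch (not part of the patch; the values are copied from the defines above):

    #include <stdio.h>

    int main(void)
    {
            const unsigned long ival_ms = 100;        /* SCOUTFS_QUORUM_HB_IVAL_MS */
            const unsigned long def_timeo_ms = 10000; /* SCOUTFS_QUORUM_DEF_HB_TIMEO_MS */

            /* with the defaults, roughly 100 back to back heartbeats must
             * be lost before a follower gives up on a live leader */
            printf("missed heartbeats tolerated: %lu\n", def_timeo_ms / ival_ms);
            return 0;
    }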
@@ -36,6 +36,7 @@ enum {
 	Opt_metadev_path,
 	Opt_noacl,
 	Opt_orphan_scan_delay_ms,
+	Opt_quorum_heartbeat_timeout_ms,
 	Opt_quorum_slot_nr,
 	Opt_err,
 };
@@ -47,6 +48,7 @@ static const match_table_t tokens = {
 	{Opt_metadev_path, "metadev_path=%s"},
 	{Opt_noacl, "noacl"},
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
+	{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
 	{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
 	{Opt_err, NULL}
 };
@@ -124,8 +126,30 @@ static void init_default_options(struct scoutfs_mount_options *opts)
 
 	opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
 	opts->data_prealloc_contig_only = 1;
-	opts->quorum_slot_nr = -1;
 	opts->orphan_scan_delay_ms = -1;
+	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
+	opts->quorum_slot_nr = -1;
 }
 
+static int set_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
+{
+	DECLARE_OPTIONS_INFO(sb, optinf);
+
+	if (ret < 0) {
+		scoutfs_err(sb, "failed to parse quorum_heartbeat_timeout_ms value");
+		return -EINVAL;
+	}
+	if (val < SCOUTFS_QUORUM_MIN_HB_TIMEO_MS || val > SCOUTFS_QUORUM_MAX_HB_TIMEO_MS) {
+		scoutfs_err(sb, "invalid quorum_heartbeat_timeout_ms value %llu, must be between %lu and %lu",
+			    val, SCOUTFS_QUORUM_MIN_HB_TIMEO_MS, SCOUTFS_QUORUM_MAX_HB_TIMEO_MS);
+		return -EINVAL;
+	}
+
+	write_seqlock(&optinf->seqlock);
+	optinf->opts.quorum_heartbeat_timeout_ms = val;
+	write_sequnlock(&optinf->seqlock);
+
+	return 0;
+}
+
 /*
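The store path above publishes the new timeout under optinf->seqlock so a concurrent reader can never see a torn 64-bit value. A minimal sketch of the matching read side, using a simplified stand-in for the scoutfs options structures (the real scoutfs_options_read() is not part of this diff):

    #include <linux/seqlock.h>
    #include <linux/types.h>

    /* simplified stand-in for the real options info */
    struct options_info_sketch {
            seqlock_t seqlock;
            struct {
                    u64 quorum_heartbeat_timeout_ms;
            } opts;
    };

    static u64 read_hb_timeout_ms(struct options_info_sketch *optinf)
    {
            unsigned int seq;
            u64 val;

            /* retry the read if a write_seqlock() writer raced with it */
            do {
                    seq = read_seqbegin(&optinf->seqlock);
                    val = optinf->opts.quorum_heartbeat_timeout_ms;
            } while (read_seqretry(&optinf->seqlock, seq));

            return val;
    }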
@@ -206,6 +230,13 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 		opts->orphan_scan_delay_ms = nr;
 		break;
 
+	case Opt_quorum_heartbeat_timeout_ms:
+		ret = match_u64(args, &nr64);
+		ret = set_quorum_heartbeat_timeout_ms(sb, ret, nr64);
+		if (ret < 0)
+			return ret;
+		break;
+
 	case Opt_quorum_slot_nr:
 		if (opts->quorum_slot_nr != -1) {
 			scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
@@ -448,6 +479,38 @@ static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attr
 }
 SCOUTFS_ATTR_RW(orphan_scan_delay_ms);
 
+static ssize_t quorum_heartbeat_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+						char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%llu", opts.quorum_heartbeat_timeout_ms);
+}
+static ssize_t quorum_heartbeat_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+						 const char *buf, size_t count)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	char nullterm[30]; /* more than enough for octal -U64_MAX */
+	u64 val;
+	int len;
+	int ret;
+
+	len = min(count, sizeof(nullterm) - 1);
+	memcpy(nullterm, buf, len);
+	nullterm[len] = '\0';
+
+	ret = kstrtoll(nullterm, 0, &val);
+	ret = set_quorum_heartbeat_timeout_ms(sb, ret, val);
+	if (ret == 0)
+		ret = count;
+
+	return ret;
+}
+SCOUTFS_ATTR_RW(quorum_heartbeat_timeout_ms);
+
 static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -464,6 +527,7 @@ static struct attribute *options_attrs[] = {
 	SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
 	SCOUTFS_ATTR_PTR(metadev_path),
 	SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
+	SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
 	SCOUTFS_ATTR_PTR(quorum_slot_nr),
 	NULL,
 };
@@ -11,7 +11,7 @@ struct scoutfs_mount_options {
 	char *metadev_path;
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
-
+	u64 quorum_heartbeat_timeout_ms;
 };
 
 void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
@@ -100,6 +100,11 @@ struct last_msg {
 	ktime_t ts;
 };
 
+struct count_recent {
+	u64 count;
+	ktime_t recent;
+};
+
 enum quorum_role { FOLLOWER, CANDIDATE, LEADER };
 
 struct quorum_status {
@@ -112,9 +117,12 @@ struct quorum_status {
 	ktime_t timeout;
 };
 
+#define HB_DELAY_NR	(SCOUTFS_QUORUM_MAX_HB_TIMEO_MS / MSEC_PER_SEC)
+
 struct quorum_info {
 	struct super_block *sb;
 	struct scoutfs_quorum_config qconf;
+	struct workqueue_struct *workq;
 	struct work_struct work;
 	struct socket *sock;
 	bool shutdown;
@@ -126,6 +134,8 @@ struct quorum_info {
 	struct quorum_status show_status;
 	struct last_msg last_send[SCOUTFS_QUORUM_MAX_SLOTS];
 	struct last_msg last_recv[SCOUTFS_QUORUM_MAX_SLOTS];
+	struct count_recent *hb_delay;
+	unsigned long max_hb_delay;
 
 	struct scoutfs_sysfs_attrs ssa;
 };
@@ -160,9 +170,9 @@ static ktime_t heartbeat_interval(void)
 	return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_IVAL_MS);
 }
 
-static ktime_t heartbeat_timeout(void)
+static ktime_t heartbeat_timeout(struct scoutfs_mount_options *opts)
 {
-	return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_TIMEO_MS);
+	return ktime_add_ms(ktime_get(), opts->quorum_heartbeat_timeout_ms);
 }
 
 static int create_socket(struct super_block *sb)
@@ -179,7 +189,8 @@ static int create_socket(struct super_block *sb)
 		goto out;
 	}
 
-	sock->sk->sk_allocation = GFP_NOFS;
+	/* rather fail and retry than block waiting for free */
+	sock->sk->sk_allocation = GFP_ATOMIC;
 
 	quorum_slot_sin(&qinf->qconf, qinf->our_quorum_slot_nr, &sin);
 
@@ -208,12 +219,16 @@ static __le32 quorum_message_crc(struct scoutfs_quorum_message *qmes)
 	return cpu_to_le32(crc32c(~0, qmes, len));
 }
 
-static void send_msg_members(struct super_block *sb, int type, u64 term,
-			     int only)
+/*
+ * Returns the number of failures from sendmsg.
+ */
+static int send_msg_members(struct super_block *sb, int type, u64 term, int only)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	DECLARE_QUORUM_INFO(sb, qinf);
+	int failed = 0;
 	ktime_t now;
+	int ret;
 	int i;
 
 	struct scoutfs_quorum_message qmes = {
@@ -239,15 +254,21 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 
 	qmes.crc = quorum_message_crc(&qmes);
 
 
 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
 		if (!quorum_slot_present(&qinf->qconf, i) ||
 		    (only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
 			continue;
 
+		if (scoutfs_forcing_unmount(sb)) {
+			failed = 0;
+			break;
+		}
+
 		scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
 		now = ktime_get();
-		kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
+		ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
+		if (ret != kv.iov_len)
+			failed++;
 
 		spin_lock(&qinf->show_lock);
 		qinf->last_send[i].msg.term = term;
@@ -258,6 +279,8 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 		if (i == only)
 			break;
 	}
+
+	return failed;
 }
 
 #define send_msg_to(sb, type, term, nr)	send_msg_members(sb, type, term, nr)
@@ -312,6 +335,9 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 	if (ret < 0)
 		return ret;
 
+	if (scoutfs_forcing_unmount(sb))
+		return 0;
+
 	now = ktime_get();
 
 	if (ret != sizeof(qmes) ||
@@ -599,6 +625,71 @@ out:
 	return ret;
 }
 
+static void clear_hb_delay(struct quorum_info *qinf)
+{
+	int i;
+
+	spin_lock(&qinf->show_lock);
+	qinf->max_hb_delay = 0;
+	for (i = 0; i < HB_DELAY_NR; i++) {
+		qinf->hb_delay[i].recent = ns_to_ktime(0);
+		qinf->hb_delay[i].count = 0;
+	}
+	spin_unlock(&qinf->show_lock);
+}
+
+struct hb_recording {
+	ktime_t prev;
+	int count;
+};
+
+/*
+ * Record long heartbeat delays.  We only record the delay between back
+ * to back send attempts in the leader or back to back recv messages in
+ * the followers.  The worker caller sets record_hb when their iteration
+ * sent or received a heartbeat.  An iteration that does anything else
+ * resets the tracking.
+ */
+static void record_hb_delay(struct super_block *sb, struct quorum_info *qinf,
+			    struct hb_recording *hbr, bool record_hb, int role)
+{
+	bool log = false;
+	ktime_t now;
+	s64 s;
+
+	if (!record_hb) {
+		hbr->count = 0;
+		return;
+	}
+
+	now = ktime_get();
+
+	if (hbr->count < 2 && ++hbr->count < 2) {
+		hbr->prev = now;
+		return;
+	}
+
+	s = ktime_ms_delta(now, hbr->prev) / MSEC_PER_SEC;
+	hbr->prev = now;
+
+	if (s <= 0 || s >= HB_DELAY_NR)
+		return;
+
+	spin_lock(&qinf->show_lock);
+	if (qinf->max_hb_delay < s) {
+		qinf->max_hb_delay = s;
+		if (s >= 3)
+			log = true;
+	}
+	qinf->hb_delay[s].recent = now;
+	qinf->hb_delay[s].count++;
+	spin_unlock(&qinf->show_lock);
+
+	if (log)
+		scoutfs_info(sb, "longest quorum heartbeat %s delay of %lld sec",
+			     role == LEADER ? "send" : "recv", s);
+}
+
 /*
  * The main quorum task maintains its private status.  It seemed cleaner
  * to occasionally copy the status for showing in sysfs/debugfs files
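The delay recording above bins the gap between consecutive heartbeats into whole seconds (ktime_ms_delta() / MSEC_PER_SEC), dropping sub-second gaps entirely. A quick userspace illustration of that bucketing arithmetic (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            const long gaps_ms[] = { 150, 999, 1000, 3500, 59999 };
            size_t i;

            for (i = 0; i < sizeof(gaps_ms) / sizeof(gaps_ms[0]); i++) {
                    /* mirrors ktime_ms_delta() / MSEC_PER_SEC */
                    long s = gaps_ms[i] / 1000;

                    if (s <= 0)
                            printf("%5ld ms gap: not recorded\n", gaps_ms[i]);
                    else
                            printf("%5ld ms gap: hb_delay bucket %ld\n", gaps_ms[i], s);
            }
            return 0;
    }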
@@ -623,16 +714,21 @@ static void update_show_status(struct quorum_info *qinf, struct quorum_status *q
 static void scoutfs_quorum_worker(struct work_struct *work)
 {
 	struct quorum_info *qinf = container_of(work, struct quorum_info, work);
+	struct scoutfs_mount_options opts;
 	struct super_block *sb = qinf->sb;
 	struct sockaddr_in unused;
 	struct quorum_host_msg msg;
 	struct quorum_status qst = {0,};
+	struct hb_recording hbr = {{0,},};
+	bool record_hb;
 	int ret;
 	int err;
 
 	/* recording votes from slots as native single word bitmap */
 	BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);
 
+	scoutfs_options_read(sb, &opts);
+
 	/* start out as a follower */
 	qst.role = FOLLOWER;
 	qst.vote_for = -1;
@@ -642,7 +738,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 
 	/* see if there's a server to chose heartbeat or election timeout */
 	if (scoutfs_quorum_server_sin(sb, &unused) == 0)
-		qst.timeout = heartbeat_timeout();
+		qst.timeout = heartbeat_timeout(&opts);
 	else
 		qst.timeout = election_timeout();
 
@@ -666,6 +762,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			ret = 0;
 		}
 
+		scoutfs_options_read(sb, &opts);
+		record_hb = false;
+
 		/* ignore messages from older terms */
 		if (msg.type != SCOUTFS_QUORUM_MSG_INVALID &&
 		    msg.term < qst.term)
@@ -681,6 +780,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			if (qst.role == LEADER) {
 				scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
 					     msg.type, msg.from, msg.term, qst.term);
+				clear_hb_delay(qinf);
 			}
 			qst.role = FOLLOWER;
 			qst.term = msg.term;
@@ -689,7 +789,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			scoutfs_inc_counter(sb, quorum_term_follower);
 
 			if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT)
-				qst.timeout = heartbeat_timeout();
+				qst.timeout = heartbeat_timeout(&opts);
 			else
 				qst.timeout = election_timeout();
 
@@ -699,6 +799,21 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			goto out;
 		}
 
+		/* receiving heartbeats extends timeout, delaying elections */
+		if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
+			qst.timeout = heartbeat_timeout(&opts);
+			scoutfs_inc_counter(sb, quorum_recv_heartbeat);
+			record_hb = true;
+		}
+
+		/* receiving a resignation from server starts election */
+		if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
+		    qst.role == FOLLOWER &&
+		    msg.term == qst.term) {
+			qst.timeout = election_timeout();
+			scoutfs_inc_counter(sb, quorum_recv_resignation);
+		}
+
 		/* followers and candidates start new election on timeout */
 		if (qst.role != LEADER &&
 		    ktime_after(ktime_get(), qst.timeout)) {
@@ -751,6 +866,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			qst.timeout = heartbeat_interval();
 
 			update_show_status(qinf, &qst);
+			clear_hb_delay(qinf);
 
 			/* record that we've been elected before starting up server */
 			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
@@ -805,6 +921,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
 						qst.server_start_term);
 				scoutfs_inc_counter(sb, quorum_send_resignation);
+				clear_hb_delay(qinf);
 			}
 
 			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
@@ -818,24 +935,16 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 		/* leaders regularly send heartbeats to delay elections */
 		if (qst.role == LEADER &&
 		    ktime_after(ktime_get(), qst.timeout)) {
-			send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT,
-					qst.term);
+			ret = send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT, qst.term);
+			if (ret > 0) {
+				scoutfs_add_counter(sb, quorum_send_heartbeat_dropped, ret);
+				ret = 0;
+			}
+
 			qst.timeout = heartbeat_interval();
 			scoutfs_inc_counter(sb, quorum_send_heartbeat);
+			record_hb = true;
 		}
 
-		/* receiving heartbeats extends timeout, delaying elections */
-		if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
-			qst.timeout = heartbeat_timeout();
-			scoutfs_inc_counter(sb, quorum_recv_heartbeat);
-		}
-
-		/* receiving a resignation from server starts election */
-		if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
-		    qst.role == FOLLOWER &&
-		    msg.term == qst.term) {
-			qst.timeout = election_timeout();
-			scoutfs_inc_counter(sb, quorum_recv_resignation);
-		}
-
 		/* followers vote once per term */
@@ -847,6 +956,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				    msg.from);
 			scoutfs_inc_counter(sb, quorum_send_vote);
 		}
+
+		record_hb_delay(sb, qinf, &hbr, record_hb, qst.role);
 	}
 
 	update_show_status(qinf, &qst);
@@ -983,9 +1094,11 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 {
 	DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
 	struct quorum_status qst;
+	struct count_recent cr;
 	struct last_msg last;
 	struct timespec64 ts;
 	const ktime_t now = ktime_get();
+	unsigned long ul;
 	size_t size;
 	int ret;
 	int i;
@@ -1043,6 +1156,26 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 			     (s64)ts.tv_sec, (int)ts.tv_nsec);
 	}
 
+	spin_lock(&qinf->show_lock);
+	ul = qinf->max_hb_delay;
+	spin_unlock(&qinf->show_lock);
+	if (ul)
+		snprintf_ret(buf, size, &ret, "HB Delay(s)     Count   Secs Since\n");
+
+	for (i = 1; i <= ul && i < HB_DELAY_NR; i++) {
+		spin_lock(&qinf->show_lock);
+		cr = qinf->hb_delay[i];
+		spin_unlock(&qinf->show_lock);
+
+		if (cr.count == 0)
+			continue;
+
+		ts = ktime_to_timespec64(ktime_sub(now, cr.recent));
+		snprintf_ret(buf, size, &ret,
+			     "%11u %9llu %lld.%09u\n",
+			     i, cr.count, (s64)ts.tv_sec, (int)ts.tv_nsec);
+	}
+
 	return ret;
 }
 SCOUTFS_ATTR_RO(status);
@@ -1180,7 +1313,12 @@ int scoutfs_quorum_setup(struct super_block *sb)
 
 	qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_KERNEL);
-	if (!qinf || !super) {
+	if (qinf)
+		qinf->hb_delay = __vmalloc(HB_DELAY_NR * sizeof(struct count_recent),
+					   GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+	if (!qinf || !super || !qinf->hb_delay) {
+		if (qinf)
+			vfree(qinf->hb_delay);
 		kfree(qinf);
 		ret = -ENOMEM;
 		goto out;
@@ -1195,6 +1333,15 @@ int scoutfs_quorum_setup(struct super_block *sb)
 	sbi->quorum_info = qinf;
 	qinf->sb = sb;
 
+	/* a high priority single threaded context without mem reclaim */
+	qinf->workq = alloc_workqueue("scoutfs_quorum_work",
+				      WQ_NON_REENTRANT | WQ_UNBOUND |
+				      WQ_HIGHPRI, 1);
+	if (!qinf->workq) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = scoutfs_read_super(sb, super);
 	if (ret < 0)
 		goto out;
@@ -1213,7 +1360,7 @@ int scoutfs_quorum_setup(struct super_block *sb)
 	if (ret < 0)
 		goto out;
 
-	schedule_work(&qinf->work);
+	queue_work(qinf->workq, &qinf->work);
 
 out:
 	if (ret)
@@ -1243,10 +1390,14 @@ void scoutfs_quorum_destroy(struct super_block *sb)
 		qinf->shutdown = true;
 		flush_work(&qinf->work);
 
+		if (qinf->workq)
+			destroy_workqueue(qinf->workq);
+
 		scoutfs_sysfs_destroy_attrs(sb, &qinf->ssa);
 		if (qinf->sock)
 			sock_release(qinf->sock);
 
+		vfree(qinf->hb_delay);
 		kfree(qinf);
 		sbi->quorum_info = NULL;
 	}
@@ -61,6 +61,7 @@ t_filter_dmesg()
 	re="$re|scoutfs .* error: meta_super META flag not set"
 	re="$re|scoutfs .* error: could not open metadev:.*"
 	re="$re|scoutfs .* error: Unknown or malformed option,.*"
+	re="$re|scoutfs .* error: invalid quorum_heartbeat_timeout_ms value"
 
 	# in debugging kernels we can slow things down a bit
 	re="$re|hrtimer: interrupt took .*"
@@ -81,6 +82,7 @@ t_filter_dmesg()
 	re="$re|scoutfs .* error .* freeing merged btree blocks.*.final commit del.upd freeing item"
 	re="$re|scoutfs .* error .*reading quorum block.*to update event.*"
 	re="$re|scoutfs .* error.*server failed to bind to.*"
+	re="$re|scoutfs .* critical transaction commit failure.*"
 
 	egrep -v "($re)"
 }
@@ -75,6 +75,15 @@ t_fs_nrs()
 	seq 0 $((T_NR_MOUNTS - 1))
 }
 
+#
+# output the fs nrs of quorum nodes, we "know" that
+# the quorum nrs are the first consecutive nrs
+#
+t_quorum_nrs()
+{
+	seq 0 $((T_QUORUM - 1))
+}
+
 #
 # outputs "1" if the fs number has "1" in its quorum/is_leader file.
 # All other cases output 0, including the fs nr being a client which
@@ -391,7 +400,7 @@ t_set_sysfs_mount_option() {
 	local val="$3"
 	local opt="$(t_sysfs_path $nr)/mount_options/$name"
 
-	echo "$val" > "$opt"
+	echo "$val" > "$opt" 2>/dev/null
 }
 
 t_set_all_sysfs_mount_options() {
new file: tests/golden/quorum-heartbeat-timeout (2 lines)
@@ -0,0 +1,2 @@
+== bad timeout values fail
+== test different timeouts
@@ -38,6 +38,7 @@ setup-error-teardown.sh
 resize-devices.sh
 change-devices.sh
 fence-and-reclaim.sh
+quorum-heartbeat-timeout.sh
 orphan-inodes.sh
 mount-unmount-race.sh
 client-unmount-recovery.sh
new file: tests/tests/quorum-heartbeat-timeout.sh (89 lines)
@@ -0,0 +1,89 @@
+#
+# test that the quorum_heartbeat_timeout_ms option affects how long it
+# takes to recover from a failed mount.
+#
+
+t_require_mounts 2
+
+time_ms()
+{
+	# time_t in seconds, then truncate nanoseconds to 3 most significant digits
+	date +%s%3N
+}
+
+set_bad_timeout() {
+	local to="$1"
+	t_set_sysfs_mount_option 0 quorum_heartbeat_timeout_ms $to && \
+		t_fail "set bad q hb to $to"
+}
+
+set_quorum_timeouts()
+{
+	local to="$1"
+	local was
+	local is
+
+	for nr in $(t_quorum_nrs); do
+		local mnt="$(eval echo \$T_M$nr)"
+
+		was=$(t_get_sysfs_mount_option $nr quorum_heartbeat_timeout_ms)
+		t_set_sysfs_mount_option $nr quorum_heartbeat_timeout_ms $to
+		is=$(t_get_sysfs_mount_option $nr quorum_heartbeat_timeout_ms)
+
+		if [ "$is" != "$to" ]; then
+			t_fail "tried to set qhbto on $nr to $to but got $is"
+		fi
+	done
+}
+
+test_timeout()
+{
+	local to="$1"
+	local orig_to
+	local start
+	local nr
+	local delay
+
+	# set new timeouts, saving original
+	orig_to=$(t_get_sysfs_mount_option 0 quorum_heartbeat_timeout_ms)
+	set_quorum_timeouts $to
+
+	# give followers time to recv heartbeats and reset timeouts
+	sleep 1
+
+	# tear down the current server/leader
+	nr=$(t_server_nr)
+	t_force_umount $nr
+
+	# see how long it takes for the next leader to start
+	start=$(time_ms)
+	t_wait_for_leader
+	delay=$(($(time_ms) - start))
+
+	# kind of fun to have these logged
+	echo "to $to delay $delay" >> $T_TMP.delay
+
+	# restore the mount that we tore down
+	t_mount $nr
+
+	# reset the original timeouts
+	set_quorum_timeouts $orig_to
+
+	# make sure the new leader delay was reasonable
+	test "$delay" -gt "$to" || t_fail "delay $delay < to $to"
+	# allow 5 seconds of slop
+	test "$delay" -lt $(($to + 5000)) || t_fail "delay $delay > to $to + 5sec"
+}
+
+echo "== bad timeout values fail"
+set_bad_timeout 0
+set_bad_timeout -1
+set_bad_timeout 1000000
+
+echo "== test different timeouts"
+def=$(t_get_sysfs_mount_option 0 quorum_heartbeat_timeout_ms)
+test_timeout $def
+test_timeout 3000
+test_timeout $((def + 19000))
+
+t_pass
@@ -85,6 +85,25 @@ the options directory in the mount's sysfs directory. Writing a new
 value will cause the next pending orphan scan to be rescheduled
 with the newly written delay time.
 .TP
+.B quorum_heartbeat_timeout_ms=<number>
+This option sets the amount of time, in milliseconds, that a quorum
+member will wait without receiving heartbeat messages from the current
+leader before trying to take over as leader.  This setting is per-mount
+and only changes the behavior of that mount.
+.sp
+This determines how long it may take before a failed leader is replaced
+by a waiting quorum member.  Setting it too low may lead to spurious
+fencing as active leaders are prematurely replaced due to task or
+network delays that prevent the quorum members from promptly sending and
+receiving messages.  The ideal setting is the longest acceptable
+downtime during server failover.  The default is 10000 (10s) and it can
+not be less than 2000 or greater than 60000.
+.sp
+This option can be changed in an active mount by writing to its file in
+the options directory in the mount's sysfs directory.  Writing a new
+value will take effect the next time the quorum agent receives a
+heartbeat message and sets the next timeout.
+.TP
 .B quorum_slot_nr=<number>
 The quorum_slot_nr option assigns a quorum member slot to the mount.
 The mount will use the slot assignment to claim exclusive ownership of
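The runtime change described in the man page hunk above is just a write to the option's file under the mount's sysfs options directory; for example (the sysfs base path varies by mount and is shown here as a placeholder):

    # set a 15 second quorum heartbeat timeout on a live mount
    echo 15000 > /sys/fs/scoutfs/<mount>/mount_options/quorum_heartbeat_timeout_ms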