Add --force to prepare empty data dev cli command

Add an option to skip the empty allocator checks when writing a new super block to a data device. This probably won't see the light of day. Signed-off-by: Zach Brown <zab@versity.com>
Merge pull request #126 from versity/zab/rht_block_shrink_deadlock
2026-04-30 01:46:54 +00:00 · 2023-06-21 14:53:29 -07:00 · 2023-06-16 10:30:16 -07:00 · 2023-06-16 09:38:58 -07:00 · 2023-06-16 09:37:37 -07:00 · 2023-06-15 14:45:26 -07:00
14 changed files with 471 additions and 53 deletions
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -1,6 +1,17 @@
 Versity ScoutFS Release Notes
 =============================

+---
+v1.13
+\
+*May 19, 2023*
+
+Add the quorum\_heartbeat\_timeout\_ms mount option to set the quorum
+heartbeat timeout.
+
+Change some task prioritization and allocation behavior of the quorum
+agent to help reduce delays in sending and receiving heartbeat messages.
+
 ---
 v1.12
 \
--- a/kmod/src/block.c
+++ b/kmod/src/block.c
@@ -1096,6 +1096,7 @@ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	struct super_block *sb = binf->sb;
 	struct rhashtable_iter iter;
 	struct block_private *bp;
+	bool stop = false;
 	unsigned long nr;
 	u64 recently;

@@ -1107,7 +1108,6 @@ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)

 	nr = DIV_ROUND_UP(nr, SCOUTFS_BLOCK_LG_PAGES_PER);

-restart:
 	recently = accessed_recently(binf);
 	rhashtable_walk_enter(&binf->ht, &iter);
 	rhashtable_walk_start(&iter);
@@ -1129,12 +1129,15 @@ restart:
 		if (bp == NULL)
 			break;
 		if (bp == ERR_PTR(-EAGAIN)) {
-			/* hard exit to wait for rcu rebalance to finish */
-			rhashtable_walk_stop(&iter);
-			rhashtable_walk_exit(&iter);
-			scoutfs_inc_counter(sb, block_cache_shrink_restart);
-			synchronize_rcu();
-			goto restart;
+			/*
+			 * We can be called from reclaim in the allocation
+			 * to resize the hash table itself.  We have to
+			 * return so that the caller can proceed and
+			 * enable hash table iteration again.
+			 */
+			scoutfs_inc_counter(sb, block_cache_shrink_stop);
+			stop = true;
+			break;
 		}

 		scoutfs_inc_counter(sb, block_cache_shrink_next);
@@ -1157,8 +1160,11 @@ restart:
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 out:
-	return min_t(u64, (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER,
-		     INT_MAX);
+	if (stop)
+		return -1;
+	else
+		return min_t(u64, INT_MAX,
+			     (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER);
 }

 struct sm_block_completion {
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -34,7 +34,7 @@
 	EXPAND_COUNTER(block_cache_shrink_next)			\
 	EXPAND_COUNTER(block_cache_shrink_recent)		\
 	EXPAND_COUNTER(block_cache_shrink_remove)		\
-	EXPAND_COUNTER(block_cache_shrink_restart)		\
+	EXPAND_COUNTER(block_cache_shrink_stop)			\
 	EXPAND_COUNTER(btree_compact_values)			\
 	EXPAND_COUNTER(btree_compact_values_enomem)		\
 	EXPAND_COUNTER(btree_delete)				\
@@ -166,6 +166,7 @@
 	EXPAND_COUNTER(quorum_recv_resignation)			\
 	EXPAND_COUNTER(quorum_recv_vote)			\
 	EXPAND_COUNTER(quorum_send_heartbeat)			\
+	EXPAND_COUNTER(quorum_send_heartbeat_dropped)		\
 	EXPAND_COUNTER(quorum_send_resignation)			\
 	EXPAND_COUNTER(quorum_send_request)			\
 	EXPAND_COUNTER(quorum_send_vote)			\
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -683,16 +683,19 @@ struct scoutfs_xattr_totl_val {
 #define SCOUTFS_QUORUM_ELECT_VAR_MS	100

 /*
- * Once a leader is elected they send out heartbeats at regular
- * intervals to force members to wait the much longer heartbeat timeout.
- * Once heartbeat timeout expires without receiving a heartbeat they'll
- * switch over the performing elections.
+ * Once a leader is elected they send heartbeat messages to all quorum
+ * members at regular intervals to force members to wait the much longer
+ * heartbeat timeout.  Once the heartbeat timeout expires without
+ * receiving a heartbeat message a member will start an election.
 *
 * These determine how long it could take members to notice that a
- * leader has gone silent and start to elect a new leader.
+ * leader has gone silent and start to elect a new leader.  The
+ * heartbeat timeout can be changed at run time by options.
 */
 #define SCOUTFS_QUORUM_HB_IVAL_MS	100
-#define SCOUTFS_QUORUM_HB_TIMEO_MS	(5 * MSEC_PER_SEC)
+#define SCOUTFS_QUORUM_MIN_HB_TIMEO_MS	(2 * MSEC_PER_SEC)
+#define SCOUTFS_QUORUM_DEF_HB_TIMEO_MS	(10 * MSEC_PER_SEC)
+#define SCOUTFS_QUORUM_MAX_HB_TIMEO_MS	(60 * MSEC_PER_SEC)

 /*
 * A newly elected leader will give fencing some time before giving up and
--- a/kmod/src/options.c
+++ b/kmod/src/options.c
@@ -36,6 +36,7 @@ enum {
 	Opt_metadev_path,
 	Opt_noacl,
 	Opt_orphan_scan_delay_ms,
+	Opt_quorum_heartbeat_timeout_ms,
 	Opt_quorum_slot_nr,
 	Opt_err,
 };
@@ -47,6 +48,7 @@ static const match_table_t tokens = {
 	{Opt_metadev_path, "metadev_path=%s"},
 	{Opt_noacl, "noacl"},
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
+	{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
 	{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
 	{Opt_err, NULL}
 };
@@ -124,8 +126,24 @@ static void init_default_options(struct scoutfs_mount_options *opts)

 	opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
 	opts->data_prealloc_contig_only = 1;
-	opts->quorum_slot_nr = -1;
 	opts->orphan_scan_delay_ms = -1;
+	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
+	opts->quorum_slot_nr = -1;
+}
+
+static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
+{
+	if (ret < 0) {
+		scoutfs_err(sb, "failed to parse quorum_heartbeat_timeout_ms value");
+		return -EINVAL;
+	}
+	if (val < SCOUTFS_QUORUM_MIN_HB_TIMEO_MS || val > SCOUTFS_QUORUM_MAX_HB_TIMEO_MS) {
+		scoutfs_err(sb, "invalid quorum_heartbeat_timeout_ms value %llu, must be between %lu and %lu",
+			    val, SCOUTFS_QUORUM_MIN_HB_TIMEO_MS, SCOUTFS_QUORUM_MAX_HB_TIMEO_MS);
+		return -EINVAL;
+	}
+
+	return 0;
 }

 /*
@@ -206,6 +224,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 			opts->orphan_scan_delay_ms = nr;
 			break;

+		case Opt_quorum_heartbeat_timeout_ms:
+			ret = match_u64(args, &nr64);
+			ret = verify_quorum_heartbeat_timeout_ms(sb, ret, nr64);
+			if (ret < 0)
+				return ret;
+			opts->quorum_heartbeat_timeout_ms = nr64;
+			break;
+
 		case Opt_quorum_slot_nr:
 			if (opts->quorum_slot_nr != -1) {
 				scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
@@ -448,6 +474,43 @@ static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attr
 }
 SCOUTFS_ATTR_RW(orphan_scan_delay_ms);

+static ssize_t quorum_heartbeat_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+						char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%llu", opts.quorum_heartbeat_timeout_ms);
+}
+static ssize_t quorum_heartbeat_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+						 const char *buf, size_t count)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	DECLARE_OPTIONS_INFO(sb, optinf);
+	char nullterm[30]; /* more than enough for octal -U64_MAX */
+	u64 val;
+	int len;
+	int ret;
+
+	len = min(count, sizeof(nullterm) - 1);
+	memcpy(nullterm, buf, len);
+	nullterm[len] = '\0';
+
+	ret = kstrtoll(nullterm, 0, &val);
+	ret = verify_quorum_heartbeat_timeout_ms(sb, ret, val);
+	if (ret == 0) {
+		write_seqlock(&optinf->seqlock);
+		optinf->opts.quorum_heartbeat_timeout_ms = val;
+		write_sequnlock(&optinf->seqlock);
+		ret = count;
+	}
+
+	return ret;
+}
+SCOUTFS_ATTR_RW(quorum_heartbeat_timeout_ms);
+
 static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -464,6 +527,7 @@ static struct attribute *options_attrs[] = {
 	SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
 	SCOUTFS_ATTR_PTR(metadev_path),
 	SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
+	SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
 	SCOUTFS_ATTR_PTR(quorum_slot_nr),
 	NULL,
 };
--- a/kmod/src/options.h
+++ b/kmod/src/options.h
@@ -11,7 +11,7 @@ struct scoutfs_mount_options {
 	char *metadev_path;
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
-
+	u64 quorum_heartbeat_timeout_ms;
 };

 void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -100,6 +100,11 @@ struct last_msg {
 	ktime_t ts;
 };

+struct count_recent {
+	u64 count;
+	ktime_t recent;
+};
+
 enum quorum_role { FOLLOWER, CANDIDATE, LEADER };

 struct quorum_status {
@@ -112,9 +117,12 @@ struct quorum_status {
 	ktime_t timeout;
 };

+#define HB_DELAY_NR		(SCOUTFS_QUORUM_MAX_HB_TIMEO_MS / MSEC_PER_SEC)
+
 struct quorum_info {
 	struct super_block *sb;
 	struct scoutfs_quorum_config qconf;
+	struct workqueue_struct *workq;
 	struct work_struct work;
 	struct socket *sock;
 	bool shutdown;
@@ -126,6 +134,8 @@ struct quorum_info {
 	struct quorum_status show_status;
 	struct last_msg last_send[SCOUTFS_QUORUM_MAX_SLOTS];
 	struct last_msg last_recv[SCOUTFS_QUORUM_MAX_SLOTS];
+	struct count_recent *hb_delay;
+	unsigned long max_hb_delay;

 	struct scoutfs_sysfs_attrs ssa;
 };
@@ -160,9 +170,9 @@ static ktime_t heartbeat_interval(void)
 	return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_IVAL_MS);
 }

-static ktime_t heartbeat_timeout(void)
+static ktime_t heartbeat_timeout(struct scoutfs_mount_options *opts)
 {
-	return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_TIMEO_MS);
+	return ktime_add_ms(ktime_get(), opts->quorum_heartbeat_timeout_ms);
 }

 static int create_socket(struct super_block *sb)
@@ -179,7 +189,8 @@ static int create_socket(struct super_block *sb)
 		goto out;
 	}

-	sock->sk->sk_allocation = GFP_NOFS;
+	/* rather fail and retry than block waiting for free */
+	sock->sk->sk_allocation = GFP_ATOMIC;

 	quorum_slot_sin(&qinf->qconf, qinf->our_quorum_slot_nr, &sin);

@@ -208,12 +219,16 @@ static __le32 quorum_message_crc(struct scoutfs_quorum_message *qmes)
 	return cpu_to_le32(crc32c(~0, qmes, len));
 }

-static void send_msg_members(struct super_block *sb, int type, u64 term,
-			     int only)
+/*
+ * Returns the number of failures from sendmsg.
+ */
+static int send_msg_members(struct super_block *sb, int type, u64 term, int only)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	DECLARE_QUORUM_INFO(sb, qinf);
+	int failed = 0;
 	ktime_t now;
+	int ret;
 	int i;

 	struct scoutfs_quorum_message qmes = {
@@ -239,15 +254,21 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,

 	qmes.crc = quorum_message_crc(&qmes);

-
 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
 		if (!quorum_slot_present(&qinf->qconf, i) ||
 		    (only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
 			continue;

+		if (scoutfs_forcing_unmount(sb)) {
+			failed = 0;
+			break;
+		}
+
 		scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
 		now = ktime_get();
-		kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
+		ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
+		if (ret != kv.iov_len)
+			failed++;

 		spin_lock(&qinf->show_lock);
 		qinf->last_send[i].msg.term = term;
@@ -258,6 +279,8 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 		if (i == only)
 			break;
 	}
+
+	return failed;
 }

 #define send_msg_to(sb, type, term, nr)  send_msg_members(sb, type, term, nr)
@@ -312,6 +335,9 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 	if (ret < 0)
 		return ret;

+	if (scoutfs_forcing_unmount(sb))
+		return 0;
+
 	now = ktime_get();

 	if (ret != sizeof(qmes) ||
@@ -599,6 +625,71 @@ out:
 	return ret;
 }

+static void clear_hb_delay(struct quorum_info *qinf)
+{
+	int i;
+
+	spin_lock(&qinf->show_lock);
+	qinf->max_hb_delay = 0;
+	for (i = 0; i < HB_DELAY_NR; i++) {
+		qinf->hb_delay[i].recent = ns_to_ktime(0);
+		qinf->hb_delay[i].count = 0;
+	}
+	spin_unlock(&qinf->show_lock);
+}
+
+struct hb_recording {
+	ktime_t prev;
+	int count;
+};
+
+/*
+ * Record long heartbeat delays.  We only record the delay between back
+ * to back send attempts in the leader or back to back recv messages in
+ * the followers.  The worker caller sets record_hb when their iteration
+ * sent or received a heartbeat.  An iteration that does anything else
+ * resets the tracking.
+ */
+static void record_hb_delay(struct super_block *sb, struct quorum_info *qinf,
+			    struct hb_recording *hbr, bool record_hb, int role)
+{
+	bool log = false;
+	ktime_t now;
+	s64 s;
+
+	if (!record_hb) {
+		hbr->count = 0;
+		return;
+	}
+
+	now = ktime_get();
+
+	if (hbr->count < 2 && ++hbr->count < 2) {
+		hbr->prev = now;
+		return;
+	}
+
+	s = ktime_ms_delta(now, hbr->prev) / MSEC_PER_SEC;
+	hbr->prev = now;
+
+	if (s <= 0 || s >= HB_DELAY_NR)
+		return;
+
+	spin_lock(&qinf->show_lock);
+	if (qinf->max_hb_delay < s) {
+		qinf->max_hb_delay = s;
+		if (s >= 3)
+			log = true;
+	}
+	qinf->hb_delay[s].recent = now;
+	qinf->hb_delay[s].count++;
+	spin_unlock(&qinf->show_lock);
+
+	if (log)
+		scoutfs_info(sb, "longest quorum heartbeat %s delay of %lld sec",
+			     role == LEADER ? "send" : "recv", s);
+}
+
 /*
 * The main quorum task maintains its private status.  It seemed cleaner
 * to occasionally copy the status for showing in sysfs/debugfs files
@@ -623,16 +714,21 @@ static void update_show_status(struct quorum_info *qinf, struct quorum_status *q
 static void scoutfs_quorum_worker(struct work_struct *work)
 {
 	struct quorum_info *qinf = container_of(work, struct quorum_info, work);
+	struct scoutfs_mount_options opts;
 	struct super_block *sb = qinf->sb;
 	struct sockaddr_in unused;
 	struct quorum_host_msg msg;
 	struct quorum_status qst = {0,};
+	struct hb_recording hbr = {{0,},};
+	bool record_hb;
 	int ret;
 	int err;

 	/* recording votes from slots as native single word bitmap */
 	BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);

+	scoutfs_options_read(sb, &opts);
+
 	/* start out as a follower */
 	qst.role = FOLLOWER;
 	qst.vote_for = -1;
@@ -642,7 +738,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)

 	/* see if there's a server to chose heartbeat or election timeout */
 	if (scoutfs_quorum_server_sin(sb, &unused) == 0)
-		qst.timeout = heartbeat_timeout();
+		qst.timeout = heartbeat_timeout(&opts);
 	else
 		qst.timeout = election_timeout();

@@ -666,6 +762,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			ret = 0;
 		}

+		scoutfs_options_read(sb, &opts);
+		record_hb = false;
+
 		/* ignore messages from older terms */
 		if (msg.type != SCOUTFS_QUORUM_MSG_INVALID &&
 		    msg.term < qst.term)
@@ -681,6 +780,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			if (qst.role == LEADER) {
 				scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
 					     msg.type, msg.from, msg.term, qst.term);
+				clear_hb_delay(qinf);
 			}
 			qst.role = FOLLOWER;
 			qst.term = msg.term;
@@ -689,7 +789,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			scoutfs_inc_counter(sb, quorum_term_follower);

 			if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT)
-				qst.timeout = heartbeat_timeout();
+				qst.timeout = heartbeat_timeout(&opts);
 			else
 				qst.timeout = election_timeout();

@@ -699,6 +799,21 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				goto out;
 		}

+		/* receiving heartbeats extends timeout, delaying elections */
+		if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
+			qst.timeout = heartbeat_timeout(&opts);
+			scoutfs_inc_counter(sb, quorum_recv_heartbeat);
+			record_hb = true;
+		}
+
+		/* receiving a resignation from server starts election */
+		if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
+		    qst.role == FOLLOWER &&
+		    msg.term == qst.term) {
+			qst.timeout = election_timeout();
+			scoutfs_inc_counter(sb, quorum_recv_resignation);
+		}
+
 		/* followers and candidates start new election on timeout */
 		if (qst.role != LEADER &&
 		    ktime_after(ktime_get(), qst.timeout)) {
@@ -751,6 +866,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			qst.timeout = heartbeat_interval();

 			update_show_status(qinf, &qst);
+			clear_hb_delay(qinf);

 			/* record that we've been elected before starting up server */
 			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
@@ -805,6 +921,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
 						qst.server_start_term);
 				scoutfs_inc_counter(sb, quorum_send_resignation);
+				clear_hb_delay(qinf);
 			}

 			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
@@ -818,24 +935,16 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 		/* leaders regularly send heartbeats to delay elections */
 		if (qst.role == LEADER &&
 		    ktime_after(ktime_get(), qst.timeout)) {
-			send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT,
-					qst.term);
+			ret = send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT, qst.term);
+			if (ret > 0) {
+				scoutfs_add_counter(sb, quorum_send_heartbeat_dropped, ret);
+				ret = 0;
+			}
+
 			qst.timeout = heartbeat_interval();
 			scoutfs_inc_counter(sb, quorum_send_heartbeat);
-		}
+			record_hb = true;

-		/* receiving heartbeats extends timeout, delaying elections */
-		if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
-			qst.timeout = heartbeat_timeout();
-			scoutfs_inc_counter(sb, quorum_recv_heartbeat);
-		}
-
-		/* receiving a resignation from server starts election */
-		if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
-		    qst.role == FOLLOWER &&
-		    msg.term == qst.term) {
-			qst.timeout = election_timeout();
-			scoutfs_inc_counter(sb, quorum_recv_resignation);
 		}

 		/* followers vote once per term */
@@ -847,6 +956,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				    msg.from);
 			scoutfs_inc_counter(sb, quorum_send_vote);
 		}
+
+		record_hb_delay(sb, qinf, &hbr, record_hb, qst.role);
 	}

 	update_show_status(qinf, &qst);
@@ -983,9 +1094,11 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 {
 	DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
 	struct quorum_status qst;
+	struct count_recent cr;
 	struct last_msg last;
 	struct timespec64 ts;
 	const ktime_t now = ktime_get();
+	unsigned long ul;
 	size_t size;
 	int ret;
 	int i;
@@ -1043,6 +1156,26 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 			     (s64)ts.tv_sec, (int)ts.tv_nsec);
 	}

+	spin_lock(&qinf->show_lock);
+	ul = qinf->max_hb_delay;
+	spin_unlock(&qinf->show_lock);
+	if (ul)
+		snprintf_ret(buf, size, &ret, "HB Delay(s)      Count  Secs Since\n");
+
+	for (i = 1; i <= ul && i < HB_DELAY_NR; i++) {
+		spin_lock(&qinf->show_lock);
+		cr = qinf->hb_delay[i];
+		spin_unlock(&qinf->show_lock);
+
+		if (cr.count == 0)
+			continue;
+
+		ts = ktime_to_timespec64(ktime_sub(now, cr.recent));
+		snprintf_ret(buf, size, &ret,
+			     "%11u  %9llu  %lld.%09u\n",
+			     i, cr.count, (s64)ts.tv_sec, (int)ts.tv_nsec);
+	}
+
 	return ret;
 }
 SCOUTFS_ATTR_RO(status);
@@ -1180,7 +1313,12 @@ int scoutfs_quorum_setup(struct super_block *sb)

 	qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_KERNEL);
-	if (!qinf || !super) {
+	if (qinf)
+		qinf->hb_delay = __vmalloc(HB_DELAY_NR * sizeof(struct count_recent),
+					   GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+	if (!qinf || !super || !qinf->hb_delay) {
+		if (qinf)
+			vfree(qinf->hb_delay);
 		kfree(qinf);
 		ret = -ENOMEM;
 		goto out;
@@ -1195,6 +1333,15 @@ int scoutfs_quorum_setup(struct super_block *sb)
 	sbi->quorum_info = qinf;
 	qinf->sb = sb;

+	/* a high priority single threaded context without mem reclaim */
+	qinf->workq = alloc_workqueue("scoutfs_quorum_work",
+				       WQ_NON_REENTRANT | WQ_UNBOUND |
+				       WQ_HIGHPRI, 1);
+	if (!qinf->workq) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = scoutfs_read_super(sb, super);
 	if (ret < 0)
 		goto out;
@@ -1213,7 +1360,7 @@ int scoutfs_quorum_setup(struct super_block *sb)
 	if (ret < 0)
 		goto out;

-	schedule_work(&qinf->work);
+	queue_work(qinf->workq, &qinf->work);

 out:
 	if (ret)
@@ -1243,10 +1390,14 @@ void scoutfs_quorum_destroy(struct super_block *sb)
 		qinf->shutdown = true;
 		flush_work(&qinf->work);

+		if (qinf->workq)
+			destroy_workqueue(qinf->workq);
+
 		scoutfs_sysfs_destroy_attrs(sb, &qinf->ssa);
 		if (qinf->sock)
 			sock_release(qinf->sock);

+		vfree(qinf->hb_delay);
 		kfree(qinf);
 		sbi->quorum_info = NULL;
 	}
--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -18,6 +18,7 @@ t_filter_dmesg()

 	# the kernel can just be noisy
 	re=" used greatest stack depth: "
+	re="$re|sched: RT throttling activated"

 	# mkfs/mount checks partition tables
 	re="$re|unknown partition table"
@@ -61,6 +62,7 @@ t_filter_dmesg()
 	re="$re|scoutfs .* error: meta_super META flag not set"
 	re="$re|scoutfs .* error: could not open metadev:.*"
 	re="$re|scoutfs .* error: Unknown or malformed option,.*"
+	re="$re|scoutfs .* error: invalid quorum_heartbeat_timeout_ms value"

 	# in debugging kernels we can slow things down a bit
 	re="$re|hrtimer: interrupt took .*"
@@ -81,6 +83,7 @@ t_filter_dmesg()
 	re="$re|scoutfs .* error .* freeing merged btree blocks.*.final commit del.upd freeing item"
 	re="$re|scoutfs .* error .*reading quorum block.*to update event.*"
 	re="$re|scoutfs .* error.*server failed to bind to.*"
+	re="$re|scoutfs .* critical transaction commit failure.*"

 	egrep -v "($re)" 
 }
--- a/tests/funcs/fs.sh
+++ b/tests/funcs/fs.sh
@@ -75,6 +75,15 @@ t_fs_nrs()
 	seq 0 $((T_NR_MOUNTS - 1))
 }

+#
+# output the fs nrs of quorum nodes, we "know" that
+# the quorum nrs are the first consecutive nrs
+#
+t_quorum_nrs()
+{
+	seq 0 $((T_QUORUM - 1))
+}
+
 #
 # outputs "1" if the fs number has "1" in its quorum/is_leader file.
 # All other cases output 0, including the fs nr being a client which
@@ -144,7 +153,27 @@ t_mount()
 	test "$nr" -lt "$T_NR_MOUNTS" || \
 		t_fail "fs nr $nr invalid"

-	eval t_quiet mount -t scoutfs \$T_O$nr \$T_DB$nr \$T_M$nr
+	eval t_quiet mount -t scoutfs \$T_O$nr\$opt \$T_DB$nr \$T_M$nr
+}
+
+#
+# Mount with an optional mount option string.  If the string is empty
+# then the saved mount options are used.  If the string has contents
+# then it is appended to the end of the saved options with a separating
+# comma.
+#
+# Unlike t_mount this won't inherently fail in t_quiet, errors are
+# returned so bad options can be tested.
+#
+t_mount_opt()
+{
+	local nr="$1"
+	local opt="${2:+,$2}"
+
+	test "$nr" -lt "$T_NR_MOUNTS" || \
+		t_fail "fs nr $nr invalid"
+
+	eval mount -t scoutfs \$T_O$nr\$opt \$T_DB$nr \$T_M$nr
 }

 t_umount()
@@ -391,7 +420,7 @@ t_set_sysfs_mount_option() {
 	local val="$3"
 	local opt="$(t_sysfs_path $nr)/mount_options/$name"

-	echo "$val" > "$opt"
+	echo "$val" > "$opt" 2>/dev/null
 }

 t_set_all_sysfs_mount_options() {
--- a/tests/golden/quorum-heartbeat-timeout
+++ b/tests/golden/quorum-heartbeat-timeout
@@ -0,0 +1,5 @@
+== bad timeout values fail
+== bad mount option fails
+== mount option
+== sysfs
+== reset all options
--- a/tests/sequence
+++ b/tests/sequence
@@ -38,6 +38,7 @@ setup-error-teardown.sh
 resize-devices.sh
 change-devices.sh
 fence-and-reclaim.sh
+quorum-heartbeat-timeout.sh
 orphan-inodes.sh
 mount-unmount-race.sh
 client-unmount-recovery.sh
--- a/tests/tests/quorum-heartbeat-timeout.sh
+++ b/tests/tests/quorum-heartbeat-timeout.sh
@@ -0,0 +1,117 @@
+#
+# test that the quorum_heartbeat_timeout_ms option affects how long it
+# takes to recover from a failed mount.
+#
+
+t_require_mounts 2
+
+time_ms()
+{
+	# time_t in seconds, then truncate nanoseconds to the 3 most significant digits
+	date +%s%3N
+}
+
+set_bad_timeout() {
+	local to="$1"
+	t_set_sysfs_mount_option 0 quorum_heartbeat_timeout_ms $to && \
+		t_fail "set bad q hb to $to"
+}
+
+set_timeout()
+{
+	local nr="$1"
+	local how="$2"
+	local to="$3"
+	local is
+
+	if [ $how == "sysfs" ]; then
+		t_set_sysfs_mount_option $nr quorum_heartbeat_timeout_ms $to
+	fi
+	if [ $how == "mount" ]; then
+		t_umount $nr
+		t_mount_opt $nr "quorum_heartbeat_timeout_ms=$to"
+	fi
+
+	is=$(t_get_sysfs_mount_option $nr quorum_heartbeat_timeout_ms)
+
+	if [ "$is" != "$to" ]; then
+		t_fail "tried to set qhbto on $nr via $how to $to but got $is"
+	fi
+}
+
+test_timeout()
+{
+	local how="$1"
+	local to="$2"
+	local start
+	local nr
+	local sv
+	local delay
+	local low
+	local high
+
+	# set timeout on non-server quorum mounts
+	sv=$(t_server_nr)
+	for nr in $(t_quorum_nrs); do
+		if [ $nr -ne $sv ]; then
+			set_timeout $nr $how $to
+		fi
+	done
+
+	# give followers time to recv heartbeats and reset timeouts
+	sleep 1
+
+	# tear down the current server/leader
+	t_force_umount $sv
+
+	# see how long it takes for the next leader to start
+	start=$(time_ms)
+	t_wait_for_leader
+	delay=$(($(time_ms) - start))
+
+	# kind of fun to have these logged
+	echo "to $to delay $delay" >> $T_TMP.delay
+
+	# restore the mount that we tore down
+	t_mount $sv
+
+	# make sure the new leader delay was reasonable, allowing for some slack
+	low=$((to - 1000))
+	high=$((to + 5000))
+
+	# make sure the new leader delay was reasonable
+	test "$delay" -lt "$low" && t_fail "delay $delay < low $low (to $to)"
+	test "$delay" -gt "$high" && t_fail "delay $delay > high $high (to $to)"
+}
+
+echo "== bad timeout values fail"
+set_bad_timeout 0
+set_bad_timeout -1
+set_bad_timeout 1000000
+
+echo "== bad mount option fails"
+if [ "$(t_server_nr)" == 0 ]; then
+	nr=1
+else
+	nr=0
+fi
+t_umount $nr
+t_mount_opt $nr "quorum_heartbeat_timeout_ms=1000000" 2>/dev/null && \
+	t_fail "bad mount option succeeded"
+t_mount $nr
+
+echo "== mount option"
+def=$(t_get_sysfs_mount_option 0 quorum_heartbeat_timeout_ms)
+test_timeout mount $def
+test_timeout mount 3000
+test_timeout mount $((def + 19000))
+
+echo "== sysfs"
+test_timeout sysfs $def
+test_timeout sysfs 3000
+test_timeout sysfs $((def + 19000))
+
+echo "== reset all options"
+t_remount_all
+
+t_pass
--- a/utils/man/scoutfs.5
+++ b/utils/man/scoutfs.5
@@ -85,6 +85,25 @@ the options directory in the mount's sysfs directory.  Writing a new
 value will cause the next pending orphan scan to be rescheduled
 with the newly written delay time.
 .TP
+.B quorum_heartbeat_timeout_ms=<number>
+This option sets the amount of time, in milliseconds, that a quorum
+member will wait without receiving heartbeat messages from the current
+leader before trying to take over as leader.  This setting is per-mount
+and only changes the behavior of that mount.
+.sp
+This determines how long it may take before a failed leader is replaced
+by a waiting quorum member.  Setting it too low may lead to spurious
+fencing as active leaders are prematurely replaced due to task or
+network delays that prevent the quorum members from promptly sending and
+receiving messages.  The ideal setting is the longest acceptable
+downtime during server failover.  The default is 10000 (10s) and it can
+not be less than 2000 or greater than 60000.
+.sp
+This option can be changed in an active mount by writing to its file in
+the options directory in the mount's sysfs directory.  Writing a new
+value will take effect the next time the quorum agent receives a
+heartbeat message and sets the next timeout.
+.TP
 .B quorum_slot_nr=<number>
 The quorum_slot_nr option assigns a quorum member slot to the mount.
 The mount will use the slot assignment to claim exclusive ownership of
--- a/utils/src/prepare_empty_data_device.c
+++ b/utils/src/prepare_empty_data_device.c
@@ -38,6 +38,7 @@ struct prepare_empty_data_dev_args {
 	char *meta_device;
 	char *data_device;
 	bool check;
+	bool force;
 };

 static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
@@ -84,13 +85,15 @@ static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
 		goto out;
 	}

-	in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
-		 le64_to_cpu(meta_super->data_alloc.total_len);
-	if (in_use) {
-		fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files.  They must be removed, truncated, or released before a new empty data device can be used.\n",
-		       SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
-		ret = -EINVAL;
-		goto out;
+	if (!args->force) {
+		in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
+			 le64_to_cpu(meta_super->data_alloc.total_len);
+		if (in_use) {
+			fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files.  They must be removed, truncated, or released before a new empty data device can be used.\n",
+			       SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
+			ret = -EINVAL;
+			goto out;
+		}
 	}

 	if (args->data_device) {
@@ -193,6 +196,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
 	case 'c':
 		args->check = true;
 		break;
+	case 'f':
+		args->force = true;
+		break;
 	case ARGP_KEY_ARG:
 		if (!args->meta_device)
 			args->meta_device = strdup_or_error(state, arg);
@@ -216,6 +222,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)

 static struct argp_option options[] = {
 	{ "check", 'c', NULL, 0, "Only check for errors and do not write", },
+	{ "force", 'f', NULL, 0, "Force writing, don't check meta allocators", },
 	{ NULL }
 };

@@ -230,6 +237,7 @@ static int prepare_empty_data_dev_cmd(int argc, char *argv[])
 {
 	struct prepare_empty_data_dev_args prepare_empty_data_dev_args = { 
 		.check = false,
+		.force = false,
 	};
 	int ret;
Author	SHA1	Message	Date
Zach Brown	03c8dea413	Add --force to prepare empty data dev cli command Add an option to skip the empty allocator checks when writing a new super block to a data device. This probably won't see the light of day. Signed-off-by: Zach Brown <zab@versity.com>	2023-06-21 14:53:29 -07:00
Zach Brown	6c0ab75477	Merge pull request #126 from versity/zab/rht_block_shrink_deadlock Avoid deadlock from block reclaim in rht resize	2023-06-16 10:30:16 -07:00
Zach Brown	89b238a5c4	Add more acceptable quorum delay during testing Loaded VMs can see a few more seconds delay. Signed-off-by: Zach Brown <zab@versity.com>	2023-06-16 09:38:58 -07:00
Zach Brown	05371b83f0	Update expected console messages during testing Signed-off-by: Zach Brown <zab@versity.com>	2023-06-16 09:37:37 -07:00
Zach Brown	acafb869e7	Avoid deadlock from block reclaim in rht resize The RCU hash table uses deferred work to resize the hash table. There's a time during resize when hash table iteration will return EAGAIN until resize makes more progress. During this time resize can perform GFP_KERNEL allocations. Our shrinker tries to iterate over its RCU hash table to find blocks to reclaim. It tries to restart iteration if it gets EAGAIN on the assumption that it will be usable again soon. Combine the two and our shrinker can get stuck retrying iteration indefinitely because it's shrinking on behalf of the hash table resizing that is trying to allocate the next table before making iteration work again. We have to stop shrinking in this case so that the resizing caller can proceed. Signed-off-by: Zach Brown <zab@versity.com>	2023-06-15 14:45:26 -07:00
Zach Brown	5a1e5639c2	Merge pull request #124 from versity/zab/fix_quo_hb_mount_option Zab/fix quo hb mount option	2023-06-07 10:50:32 -07:00
Zach Brown	950963375b	Update quorum heartbeat test for mount option Update the quorum_heartbeat_timeout_ms test to also test the mount option, not just updating the timeout via sysfs. This takes some reworking as we have to avoid the active leader/server when setting the timeout via the mount option. We also allow for a bit more slack around comparing kernel sleeps and userspace wall clocks. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-23 09:57:13 -07:00
Zach Brown	e52435b993	Add t_mount_opt Add a test helper that mounts with a mount option. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-22 16:30:01 -07:00
Zach Brown	2b72c57cb0	Fix crash in quorum_heartbeat_timeout_ms parsing Mount option parsing runs early enough that the rest of the option read/write serialization infrastructure isn't set up yet. The quorum_heartbeat_timeout_ms mount option tried to use a helper that updated the stored option but it wasn't initialized yet so it crashed. The helper was really only to have the option validity test in one place. It's reworked to only verify the option and the actual setting is left to the callers. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-22 16:29:56 -07:00
Zach Brown	9c67b2a42d	Merge pull request #122 from versity/zab/v1.13 v1.13 Release	2023-05-19 11:38:48 -07:00
Zach Brown	0b38aeb5a4	v1.13 Release Finish the release notes for the 1.13 release. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-19 10:38:40 -07:00
Zach Brown	2daf873983	Merge pull request #121 from versity/zab/heartbeat_fencing_tweaks Zab/heartbeat fencing tweaks	2023-05-18 17:10:40 -07:00
Zach Brown	904c5dce90	Filter forced unmount transaction commit error Add a transaction commit error message to the set of errors we ignore when triggering forced unmount. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-18 15:50:34 -07:00
Zach Brown	57c6d78df8	Add test of quorum heartbeat timeout setting Signed-off-by: Zach Brown <zab@versity.com>	2023-05-18 15:50:33 -07:00
Zach Brown	74e9d0f764	Silence test sysfs option failure If setting a sysfs option fails the bash write error is output. It contains the script line number which can fail over time, leading to mismatched golden output failures if we used the output as an expected indication of failure. Callers should test its rc and output accordingly if they want the failure logged and compared. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-18 11:15:28 -07:00
Zach Brown	98eb0eb649	Add t_quorum_nrs test helper Add a quick function that outputs the fs numbers of the quorum mounts. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-18 11:15:28 -07:00
Zach Brown	15de0c21c1	Have quorum drop messages on force unmount Forced unmount is supposed to isolate the mount from the world. The net.c TCP messaging returns errors when sending during forced unmount. The quorum code has its own UDP messaging and wasn't taking forced unmount into account. This led to quorum still being able to send resignation messages to other quorum peers during forced unmount, making it hard to test heartbeat timeouts with forced unmount. The quorum messaging is already unreliable so we can easily make it drop messages during forced unmount. Now forced unmount more fully isolates the quorum code and it becomes easier to test. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-18 10:01:19 -07:00
Zach Brown	7b65767803	Track and log quorum heartbeat delays Add tracking and reporting of delays in sending or receiving quorum heartbeat messages. We measure the time between back to back sends or receives of heartbeat messages. We record these delays truncated down to second granularity in the quorum sysfs status file. We log messages to the console for each longest measured delay up to the maximum configurable heartbeat timeout. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-17 14:44:27 -07:00
Zach Brown	46640e4ff9	Add counter for quorum heartbeat send failures Add a counter which tracks the number of heartbeat message send attempts which fail. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-17 14:44:27 -07:00
Zach Brown	912906f050	Make quorum heartbeat timeout tunable Add mount and sysfs options for changing the quorum heartbeat timeout. This allows setting a longer delay in taking over for failed hosts that has a greater chance of surviving temporary non-fatal delays. We also double the existing default timeout to 10s which is still reasonably responsive. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-17 14:44:27 -07:00
Zach Brown	ec02cf442b	Use lower latency allocation in quorum socket The quorum udp socket allocation still allowed starting io which can trigger longer latencies trying to free memory. We change the flags to prefer dipping into emergency pools and then failing rather than blocking trying to satisfy an allocation. We'd much rather have a given heartbeat attempt fail and have the opportunity to succeed at the next interval rather than running the risk of blocking across multiple intervals. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-17 14:44:27 -07:00
Zach Brown	0e9cd1eea5	Use specific work queue for quorum work The quorum work was using the system workq. While that's mostly fine, we can create a dedicated workqueue with the specific flags that we need. The quorum work needs to run promptly to avoid fencing so we set it to high priority. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-17 14:44:27 -07:00
Zach Brown	e18ea24561	Move quorum recv that sets timeout before check In the quorum work loop some message receive actions extend the timeout after the timeout expiration is checked. This is usually fine when the work runs soon after the messages are received and before the timeout expires. But under load the work might not schedule until long after both the message has been received and the timeout has expired. If the message was a heartbeat message then the wakeup delay would be mistaken for lack of activity on the server and it would try to take over for an otherwise active server. This moves the extension of the heartbeat on message receive to before the timeout is checked. In our case of a delayed heartbeat message it would still find it in the recv queue and extend the timeout, avoiding fencing an active server. Signed-off-by: Zach Brown <zab@versity.com>	2023-05-17 09:56:53 -07:00
Zach Brown	723309ff75	Merge pull request #120 from versity/zab/v1.12 v1.12 Release	2023-04-17 15:33:36 -07:00