scoutfs: start and stop server with quorum

Currently all mounts try to acquire a dlm lock that grants exclusive
access to become the server for the filesystem.  That won't work once
locking is provided by the server itself.

This uses quorum election to determine who should run the server.  We
switch from long-running server work that blocks trying to acquire a
lock to explicit calls which start and stop the server.

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date: 2018-10-11 15:09:12 -07:00
Parent: 08a140c8b0
Commit: 288d781645
6 changed files with 158 additions and 173 deletions
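The shape of the new flow, condensed from the client.c changes below, is worth spelling out up front. This is a reading aid assembled from the hunks in this commit (error paths and the greeting exchange trimmed), not code copied verbatim from it:

	/* stop any server this mount was previously elected to run */
	stop_our_server(sb, qei);

	/* read the quorum blocks; we may be elected to run the server */
	ret = scoutfs_quorum_election(sb, opts->uniq_name,
				      client->old_elected_nr, timeout_abs, qei);
	if (ret == 0 && qei->run_server)
		ret = scoutfs_server_start(sb, &qei->sin);

	/* every mount, elected or not, then connects as a client */
	if (ret == 0)
		ret = scoutfs_net_connect(sb, client->conn, &qei->sin,
					  CLIENT_CONNECT_TIMEOUT_MS);

	/* any failure requeues the work to elect and connect again */
	if (ret && !atomic_read(&client->shutting_down))
		queue_work(client->workq, &client->connect_work);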

View File

@@ -33,19 +33,17 @@
#include "client.h"
#include "net.h"
#include "endian_swap.h"
#include "quorum.h"
/*
* The client always maintains a connection to the server. It reads the
* super to get the address it should try and connect to.
* The client is responsible for maintaining a connection to the server.
* This includes managing quorum elections that determine which client
* should run the server that all the clients connect to.
*/
/*
* Connection timeouts have to allow for enough time for servers to
* reboot; figure on the order of minutes at the outside.
*/
#define CONN_RETRY_MIN_MS 10UL
#define CONN_RETRY_MAX_MS (5UL * MSEC_PER_SEC)
#define CONN_RETRY_LIMIT_J (5 * 60 * HZ)
#define CLIENT_CONNECT_DELAY_MS (MSEC_PER_SEC / 10)
#define CLIENT_CONNECT_TIMEOUT_MS (1 * MSEC_PER_SEC)
#define CLIENT_QUORUM_TIMEOUT_MS (5 * MSEC_PER_SEC)
struct client_info {
struct super_block *sb;
@@ -55,23 +53,12 @@ struct client_info {
atomic_t shutting_down;
struct workqueue_struct *workq;
struct delayed_work connect_dwork;
struct work_struct connect_work;
/* connection timeouts are tracked across attempts */
unsigned long conn_retry_ms;
struct scoutfs_quorum_elected_info qei;
u64 old_elected_nr;
};
static void reset_connect_timeout(struct client_info *client)
{
client->conn_retry_ms = CONN_RETRY_MIN_MS;
}
static void grow_connect_timeout(struct client_info *client)
{
client->conn_retry_ms = min(client->conn_retry_ms * 2,
CONN_RETRY_MAX_MS);
}
/*
* Ask for a new run of allocated inode numbers. The server can return
* fewer than @count. It will succeed with nr == 0 if we've run out.
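A hypothetical caller-side sketch of that contract (the function name and error code here are illustrative assumptions, not taken from this diff): a zero-length run is a clean way to report inode-number exhaustion without overloading the return value.

	/* illustrative only: ask for a run, treat an empty run as exhaustion */
	ret = scoutfs_client_alloc_inodes(sb, count, &ino, &nr);
	if (ret == 0 && nr == 0)
		ret = -ENOSPC;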
@@ -346,49 +333,96 @@ out:
}
/*
* Attempt to connect to the listening address that the server wrote in
* the super block. We keep trying indefinitely with an increasing
* delay if we fail to either read the address or connect to it.
* If the previous election told us to start the server then stop it
* and wipe the old election info. If we're not fast enough to clear
* the election block then the next server might fence us. Should
* be very unlikely as election requires multiple RMW cycles.
*/
static void stop_our_server(struct super_block *sb,
struct scoutfs_quorum_elected_info *qei)
{
if (qei->run_server) {
scoutfs_server_stop(sb);
scoutfs_quorum_clear_elected(sb, qei);
memset(qei, 0, sizeof(*qei));
}
}
/*
* This work is responsible for managing leader elections, running the
* server, and connecting clients to the server.
*
* We're careful to only ever have one connection attempt in flight. We
* only queue this work on mount, on error, or from the notify_down
* callback.
* In the typical case a mount reads the quorum blocks and finds the
* address of the currently running server and connects to it.
*
* More rarely clients who aren't connected and are configured to
* participate in quorum need to elect the new leader. The elected info
* filled by quorum tells us if we were elected to run the server.
*
* This leads to the possibility that the mount who is running the
* server had its mount disconnect. This is only weirdly different from
* other clients disconnecting and trying to reconnect because of the
* way quorum slots are reconfigured and reclaimed. If we connect to a
* server with the new quorum config then we can't have any old servers
* running in the stale old quorum slot. The simplest way to do this is
* to *always* stop the server if we're running it and we got
* disconnected. It's a big hammer, but it's reliable, and arguably if
*we* couldn't use *our* server then something bad is happening and
* someone else should be the server.
*
* This only executes on mount, error, or as a connection disconnects
* and there's only ever one executing.
*/
static void scoutfs_client_connect_worker(struct work_struct *work)
{
struct client_info *client = container_of(work, struct client_info,
connect_dwork.work);
connect_work);
struct super_block *sb = client->sb;
struct scoutfs_quorum_elected_info *qei = &client->qei;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct mount_options *opts = &sbi->opts;
struct scoutfs_net_greeting greet;
struct scoutfs_super_block super;
struct sockaddr_in sin;
ktime_t timeout_abs;
int ret;
ret = scoutfs_read_super(sb, &super);
/* don't try quorum and connecting while our mount runs a server */
stop_our_server(sb, qei);
timeout_abs = ktime_add_ms(ktime_get(), CLIENT_QUORUM_TIMEOUT_MS);
ret = scoutfs_quorum_election(sb, opts->uniq_name,
client->old_elected_nr,
timeout_abs, qei);
if (ret)
goto out;
if (super.server_addr.addr == cpu_to_le32(INADDR_ANY)) {
ret = -EADDRNOTAVAIL;
if (qei->run_server) {
ret = scoutfs_server_start(sb, &qei->sin);
if (ret) {
/* forget that we tried to start the server */
memset(qei, 0, sizeof(*qei));
goto out;
}
}
/* always give the server some time before connecting */
msleep(CLIENT_CONNECT_DELAY_MS);
ret = scoutfs_net_connect(sb, client->conn, &qei->sin,
CLIENT_CONNECT_TIMEOUT_MS);
if (ret) {
/* we couldn't connect, try electing a new server */
client->old_elected_nr = qei->elected_nr;
goto out;
}
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = le32_to_be32(super.server_addr.addr);
sin.sin_port = le16_to_be16(super.server_addr.port);
ret = scoutfs_net_connect(sb, client->conn, &sin,
client->conn_retry_ms);
if (ret)
goto out;
reset_connect_timeout(client);
/* trust this server again if it's still around after we disconnect */
client->old_elected_nr = 0;
/* send a greeting to verify endpoints of each connection */
greet.fsid = super.id;
greet.format_hash = super.format_hash;
greet.fsid = super->id;
greet.format_hash = super->format_hash;
greet.node_id = cpu_to_le64(sbi->node_id);
ret = scoutfs_net_submit_request(sb, client->conn,
@@ -397,13 +431,9 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
client_greeting, NULL, NULL);
if (ret)
scoutfs_net_shutdown(sb, client->conn);
out:
if (ret && !atomic_read(&client->shutting_down)) {
queue_delayed_work(client->workq, &client->connect_dwork,
msecs_to_jiffies(client->conn_retry_ms));
grow_connect_timeout(client);
}
if (ret && !atomic_read(&client->shutting_down))
queue_work(client->workq, &client->connect_work);
}
/*
@@ -474,11 +504,8 @@ static void client_notify_down(struct super_block *sb,
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
if (!atomic_read(&client->shutting_down)) {
queue_delayed_work(client->workq, &client->connect_dwork,
msecs_to_jiffies(client->conn_retry_ms));
grow_connect_timeout(client);
}
if (!atomic_read(&client->shutting_down))
queue_work(client->workq, &client->connect_work);
}
/*
@@ -509,8 +536,7 @@ int scoutfs_client_setup(struct super_block *sb)
client->sb = sb;
init_completion(&client->node_id_comp);
atomic_set(&client->shutting_down, 0);
INIT_DELAYED_WORK(&client->connect_dwork,
scoutfs_client_connect_worker);
INIT_WORK(&client->connect_work, scoutfs_client_connect_worker);
client->conn = scoutfs_net_alloc_conn(sb, NULL, client_notify_down, 0,
client_req_funcs, "client");
@@ -525,10 +551,7 @@ int scoutfs_client_setup(struct super_block *sb)
goto out;
}
reset_connect_timeout(client);
/* delay initial connect to give a local server some time to setup */
queue_delayed_work(client->workq, &client->connect_dwork,
msecs_to_jiffies(client->conn_retry_ms));
queue_work(client->workq, &client->connect_work);
ret = 0;
out:
@@ -552,13 +575,16 @@ void scoutfs_client_destroy(struct super_block *sb)
atomic_set(&client->shutting_down, 1);
/* make sure worker isn't using the conn */
cancel_delayed_work_sync(&client->connect_dwork);
cancel_work_sync(&client->connect_work);
/* make racing conn use explode */
conn = client->conn;
client->conn = NULL;
scoutfs_net_free_conn(sb, conn);
/* stop running the server if we were, harmless otherwise */
stop_our_server(sb, &client->qei);
if (client->workq)
destroy_workqueue(client->workq);
kfree(client);

View File

@@ -453,7 +453,6 @@ struct scoutfs_super_block {
__le64 next_compact_id;
struct scoutfs_btree_root alloc_root;
struct scoutfs_manifest manifest;
struct scoutfs_inet_addr server_addr;
struct scoutfs_quorum_config quorum_config;
} __packed;

View File

@@ -27,8 +27,6 @@
#include "super.h"
static const match_table_t tokens = {
{Opt_listen, "listen=%s"},
{Opt_cluster, "cluster=%s"},
{Opt_uniq_name, "uniq_name=%s"},
{Opt_err, NULL}
};
@@ -55,15 +53,12 @@ u32 scoutfs_option_u32(struct super_block *sb, int token)
int scoutfs_parse_options(struct super_block *sb, char *options,
struct mount_options *parsed)
{
char ipstr[INET_ADDRSTRLEN + 1];
substring_t args[MAX_OPT_ARGS];
int token, len;
__be32 addr;
char *p;
/* Set defaults */
memset(parsed, 0, sizeof(*parsed));
strcpy(parsed->cluster_name, "scoutfs");
while ((p = strsep(&options, ",")) != NULL) {
if (!*p)
@@ -71,22 +66,6 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
token = match_token(p, tokens, args);
switch (token) {
case Opt_listen:
match_strlcpy(ipstr, args, ARRAY_SIZE(ipstr));
addr = in_aton(ipstr);
if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
ipv4_is_zeronet(addr) || ipv4_is_local_multicast(addr))
return -EINVAL;
parsed->listen_addr.addr =
cpu_to_le32(be32_to_cpu(addr));
break;
case Opt_cluster:
len = args[0].to - args[0].from;
if (len == 0 || len > (MAX_CLUSTER_NAME_LEN - 1))
return -EINVAL;
match_strlcpy(parsed->cluster_name, args,
MAX_CLUSTER_NAME_LEN);
break;
case Opt_uniq_name:
len = match_strlcpy(parsed->uniq_name, args,
SCOUTFS_UNIQUE_NAME_MAX_BYTES);

View File

@@ -5,8 +5,6 @@
#include "format.h"
enum {
Opt_listen = 0,
Opt_cluster,
/*
* For debugging we can quickly create huge trees by limiting
* the number of items in each block as though the blocks were tiny.
@@ -16,11 +14,7 @@ enum {
Opt_err,
};
#define MAX_CLUSTER_NAME_LEN 17
struct mount_options
{
struct scoutfs_inet_addr listen_addr;
char cluster_name[MAX_CLUSTER_NAME_LEN];
struct mount_options {
char uniq_name[SCOUTFS_UNIQUE_NAME_MAX_BYTES];
};

View File

@@ -35,20 +35,14 @@
#include "lock_server.h"
#include "endian_swap.h"
/*
* XXX pre commit:
* - comments
*/
/*
* Every active mount can act as the server that listens on a net
* connection and accepts connections from all the other mounts acting
* as clients.
*
* It queues long-lived work that blocks trying to acquire a lock. If
* it acquires the lock it listens on a socket and serves requests. If
* The server is started when raft elects the mount as the leader. If
* it sees errors it shuts down the server in the hopes that another
* mount will have less trouble.
* mount will become the leader and have less trouble.
*/
struct server_info {
@@ -57,9 +51,11 @@ struct server_info {
wait_queue_head_t waitq;
struct workqueue_struct *wq;
struct delayed_work dwork;
struct completion shutdown_comp;
bool bind_warned;
struct work_struct work;
int err;
bool shutting_down;
struct completion start_comp;
struct sockaddr_in listen_sin;
struct scoutfs_net_connection *conn;
/* request processing coordinates committing manifest and alloc */
@@ -495,9 +491,11 @@ static int remove_segno(struct super_block *sb, u64 segno)
return ret;
}
static void shutdown_server(struct server_info *server)
static void stop_server(struct server_info *server)
{
complete(&server->shutdown_comp);
/* wait_event/wake_up provide barriers */
server->shutting_down = true;
wake_up(&server->waitq);
}
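The matching wait side appears in the worker later in this file; paired up, the condensed pattern is (both halves taken from the hunks in this commit):

	/* stop side: set the flag, then wake; the wake_up orders the store */
	server->shutting_down = true;
	wake_up(&server->waitq);

	/* worker side: sleeps until it observes shutting_down */
	wait_event_interruptible(server->waitq, server->shutting_down);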
/*
@@ -1733,21 +1731,6 @@ out:
trace_scoutfs_server_compact_work_exit(sb, 0, ret);
}
/*
* This relies on the caller having read the current super and advanced
* its seq so that it's dirty. This will go away when we communicate
* the server address in a lock lvb.
*/
static int write_server_addr(struct super_block *sb, struct sockaddr_in *sin)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
super->server_addr.addr = be32_to_le32(sin->sin_addr.s_addr);
super->server_addr.port = be16_to_le16(sin->sin_port);
return scoutfs_write_dirty_super(sb);
}
static scoutfs_net_request_t server_req_funcs[] = {
[SCOUTFS_NET_CMD_GREETING] = server_greeting,
[SCOUTFS_NET_CMD_ALLOC_INODES] = server_alloc_inodes,
@@ -1800,27 +1783,18 @@ static void server_notify_down(struct super_block *sb,
forget_client_compacts(sb, sci);
try_queue_compact(server);
} else {
shutdown_server(server);
stop_server(server);
}
}
/*
* This work is always running or has a delayed timer set while a super
* is mounted. It tries to grab the lock to become the server. If it
* succeeds it publishes its address and accepts connections. If
* anything goes wrong it releases the lock and sets a timer to try to
* become the server all over again.
*/
static void scoutfs_server_worker(struct work_struct *work)
{
struct server_info *server = container_of(work, struct server_info,
dwork.work);
work);
struct super_block *sb = server->sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_net_connection *conn = NULL;
static struct sockaddr_in zeros = {0,};
struct scoutfs_lock *lock = NULL;
struct pending_seq *ps;
struct pending_seq *ps_tmp;
DECLARE_WAIT_QUEUE_HEAD(waitq);
@@ -1830,13 +1804,6 @@ static void scoutfs_server_worker(struct work_struct *work)
trace_scoutfs_server_work_enter(sb, 0, 0);
init_completion(&server->shutdown_comp);
ret = scoutfs_lock_global(sb, DLM_LOCK_EX, 0,
SCOUTFS_LOCK_TYPE_GLOBAL_SERVER, &lock);
if (ret)
goto out;
conn = scoutfs_net_alloc_conn(sb, server_notify_up, server_notify_down,
sizeof(struct server_client_info),
server_req_funcs, "server");
@@ -1845,33 +1812,21 @@ static void scoutfs_server_worker(struct work_struct *work)
goto out;
}
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = le32_to_be32(sbi->opts.listen_addr.addr);
sin.sin_port = le16_to_be16(sbi->opts.listen_addr.port);
sin = server->listen_sin;
/* get the address of our listening socket */
ret = scoutfs_net_bind(sb, conn, &sin);
if (ret) {
if (!server->bind_warned) {
scoutfs_err(sb, "server failed to bind to "SIN_FMT", errno %d%s. Retrying indefinitely..",
SIN_ARG(&sin), ret,
ret == -EADDRNOTAVAIL ? " (Bad address?)"
: "");
server->bind_warned = true;
}
scoutfs_err(sb, "server failed to bind to "SIN_FMT", err %d%s",
SIN_ARG(&sin), ret,
ret == -EADDRNOTAVAIL ? " (Bad address?)"
: "");
goto out;
}
/* publish the address for clients to connect to */
ret = scoutfs_read_super(sb, super);
if (ret)
goto out;
scoutfs_advance_dirty_super(sb);
ret = write_server_addr(sb, &sin);
if (ret)
goto out;
/* start up the server subsystems before accepting */
ret = scoutfs_btree_setup(sb) ?:
scoutfs_manifest_setup(sb) ?:
@@ -1879,6 +1834,8 @@ static void scoutfs_server_worker(struct work_struct *work)
if (ret)
goto shutdown;
complete(&server->start_comp);
scoutfs_advance_dirty_super(sb);
server->stable_manifest_root = super->manifest.root;
@@ -1888,8 +1845,8 @@ static void scoutfs_server_worker(struct work_struct *work)
server->conn = conn;
scoutfs_net_listen(sb, conn);
/* wait for listening down or umount, conn can still be live */
wait_for_completion_interruptible(&server->shutdown_comp);
/* wait_event/wake_up provide barriers */
wait_event_interruptible(server->waitq, server->shutting_down);
scoutfs_info(sb, "server shutting down on "SIN_FMT, SIN_ARG(&sin));
@@ -1913,16 +1870,39 @@ shutdown:
kfree(ps);
}
write_server_addr(sb, &zeros);
out:
scoutfs_net_free_conn(sb, conn);
scoutfs_unlock(sb, lock, DLM_LOCK_EX);
/* always requeues, cancel_delayed_work_sync cancels on shutdown */
queue_delayed_work(server->wq, &server->dwork, HZ / 2);
trace_scoutfs_server_work_exit(sb, 0, ret);
server->err = ret;
complete(&server->start_comp);
}
/* XXX can we call start multiple times? */
int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin)
{
DECLARE_SERVER_INFO(sb, server);
server->err = 0;
server->shutting_down = false;
server->listen_sin = *sin;
init_completion(&server->start_comp);
queue_work(server->wq, &server->work);
wait_for_completion(&server->start_comp);
return server->err;
}
void scoutfs_server_stop(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
stop_server(server);
/* XXX not sure both are needed */
cancel_work_sync(&server->work);
cancel_work_sync(&server->commit_work);
}
int scoutfs_server_setup(struct super_block *sb)
@@ -1937,9 +1917,7 @@ int scoutfs_server_setup(struct super_block *sb)
server->sb = sb;
spin_lock_init(&server->lock);
init_waitqueue_head(&server->waitq);
init_completion(&server->shutdown_comp);
server->bind_warned = false;
INIT_DELAYED_WORK(&server->dwork, scoutfs_server_worker);
INIT_WORK(&server->work, scoutfs_server_worker);
init_rwsem(&server->commit_rwsem);
init_llist_head(&server->commit_waiters);
INIT_WORK(&server->commit_work, scoutfs_server_commit_func);
@@ -1960,22 +1938,24 @@ int scoutfs_server_setup(struct super_block *sb)
return -ENOMEM;
}
queue_delayed_work(server->wq, &server->dwork, 0);
sbi->server_info = server;
return 0;
}
/*
* The caller should have already stopped the server, but we stop it
* again here just in case.
*/
void scoutfs_server_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct server_info *server = sbi->server_info;
if (server) {
shutdown_server(server);
stop_server(server);
/* wait for server work to wait for everything to shut down */
cancel_delayed_work_sync(&server->dwork);
cancel_work_sync(&server->work);
/* recv work/compaction could have left commit_work queued */
cancel_work_sync(&server->commit_work);

View File

@@ -48,6 +48,9 @@ do { \
__entry->name##_id, __entry->name##_data_len, __entry->name##_cmd, \
__entry->name##_flags, __entry->name##_error
struct scoutfs_net_manifest_entry;
struct scoutfs_manifest_entry;
void scoutfs_init_ment_to_net(struct scoutfs_net_manifest_entry *net_ment,
struct scoutfs_manifest_entry *ment);
void scoutfs_init_ment_from_net(struct scoutfs_manifest_entry *ment,
@@ -58,6 +61,10 @@ int scoutfs_server_lock_request(struct super_block *sb, u64 node_id,
int scoutfs_server_lock_response(struct super_block *sb, u64 node_id,
u64 id, struct scoutfs_net_lock *nl);
struct sockaddr_in;
int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin);
void scoutfs_server_stop(struct super_block *sb);
int scoutfs_server_setup(struct super_block *sb);
void scoutfs_server_destroy(struct super_block *sb);
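Putting the new interface together, the intended usage is what the client connect worker in this commit does: the quorum-elected mount starts the server with the address from the elected info, and stops it again when it disconnects or unmounts. A minimal sketch, reusing names from the diff above:

	/* elected by quorum: run the server at the elected address */
	if (qei->run_server) {
		ret = scoutfs_server_start(sb, &qei->sin);
		if (ret)
			memset(qei, 0, sizeof(*qei));	/* forget the attempt */
	}

	/* on disconnect or unmount: stop it and clear the election block */
	if (qei->run_server) {
		scoutfs_server_stop(sb);
		scoutfs_quorum_clear_elected(sb, qei);
		memset(qei, 0, sizeof(*qei));
	}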