diff --git a/kmod/src/client.c b/kmod/src/client.c index 9ef4c247..81603385 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -33,19 +33,17 @@ #include "client.h" #include "net.h" #include "endian_swap.h" +#include "quorum.h" /* - * The client always maintains a connection to the server. It reads the - * super to get the address it should try and connect to. + * The client is responsible for maintaining a connection to the server. + * This includes managing quorum elections that determine which client + * should run the server that all the clients connect to. */ -/* - * Connection timeouts have to allow for enough time for servers to - * reboot. Figure order minutes at the outside. - */ -#define CONN_RETRY_MIN_MS 10UL -#define CONN_RETRY_MAX_MS (5UL * MSEC_PER_SEC) -#define CONN_RETRY_LIMIT_J (5 * 60 * HZ) +#define CLIENT_CONNECT_DELAY_MS (MSEC_PER_SEC / 10) +#define CLIENT_CONNECT_TIMEOUT_MS (1 * MSEC_PER_SEC) +#define CLIENT_QUORUM_TIMEOUT_MS (5 * MSEC_PER_SEC) struct client_info { struct super_block *sb; @@ -55,23 +53,12 @@ struct client_info { atomic_t shutting_down; struct workqueue_struct *workq; - struct delayed_work connect_dwork; + struct work_struct connect_work; - /* connection timeouts are tracked across attempts */ - unsigned long conn_retry_ms; + struct scoutfs_quorum_elected_info qei; + u64 old_elected_nr; }; -static void reset_connect_timeout(struct client_info *client) -{ - client->conn_retry_ms = CONN_RETRY_MIN_MS; -} - -static void grow_connect_timeout(struct client_info *client) -{ - client->conn_retry_ms = min(client->conn_retry_ms * 2, - CONN_RETRY_MAX_MS); -} - /* * Ask for a new run of allocated inode numbers. The server can return * fewer than @count. It will success with nr == 0 if we've run out. @@ -346,49 +333,96 @@ out: } /* - * Attempt to connect to the listening address that the server wrote in - * the super block. We keep trying indefinitely with an increasing - * delay if we fail to either read the address or connect to it. 
+ * If the previous election told us to start the server then stop it + * and wipe the old election info. If we're not fast enough to clear + * the election block then the next server might fence us. Should + * be very unlikely as election requires multiple RMW cycles. + */ +static void stop_our_server(struct super_block *sb, + struct scoutfs_quorum_elected_info *qei) +{ + if (qei->run_server) { + scoutfs_server_stop(sb); + scoutfs_quorum_clear_elected(sb, qei); + memset(qei, 0, sizeof(*qei)); + } +} + +/* + * This work is responsible for managing leader elections, running the + * server, and connecting clients to the server. * - * We're careful to only ever have one connection attempt in flight. We - * only queue this work on mount, on error, or from the notify_down - * callback. + * In the typical case a mount reads the quorum blocks and finds the + * address of the currently running server and connects to it. + * + * More rarely clients who aren't connected and are configured to + * participate in quorum need to elect the new leader. The elected info + * filled by quorum tells us if we were elected to run the server. + * + * This leads to the possibility that the mount who is running the + * server had its mount disconnect. This is only weirdly different from + * other clients disconnecting and trying to reconnect because of the + * way quorum slots are reconfigured and reclaimed. If we connect to a + * server with the new quorum config then we can't have any old servers + * running in the stale old quorum slot. The simplest way to do this is + * to *always* stop the server if we're running it and we got + * disconnected. It's a big hammer, but it's reliable, and arguably if + * *we* couldn't use *our* server then something bad is happening and + * someone else should be the server. + * + * This only executes on mount, error, or as a connection disconnects + * and there's only ever one executing. 
*/ static void scoutfs_client_connect_worker(struct work_struct *work) { struct client_info *client = container_of(work, struct client_info, - connect_dwork.work); + connect_work); struct super_block *sb = client->sb; + struct scoutfs_quorum_elected_info *qei = &client->qei; struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct mount_options *opts = &sbi->opts; struct scoutfs_net_greeting greet; - struct scoutfs_super_block super; - struct sockaddr_in sin; + ktime_t timeout_abs; int ret; - ret = scoutfs_read_super(sb, &super); + /* don't try quorum and connecting while our mount runs a server */ + stop_our_server(sb, qei); + + timeout_abs = ktime_add_ms(ktime_get(), CLIENT_QUORUM_TIMEOUT_MS); + + ret = scoutfs_quorum_election(sb, opts->uniq_name, + client->old_elected_nr, + timeout_abs, qei); if (ret) goto out; - if (super.server_addr.addr == cpu_to_le32(INADDR_ANY)) { - ret = -EADDRNOTAVAIL; + if (qei->run_server) { + ret = scoutfs_server_start(sb, &qei->sin); + if (ret) { + /* forget that we tried to start the server */ + memset(qei, 0, sizeof(*qei)); + goto out; + } + } + + /* always give the server some time before connecting */ + msleep(CLIENT_CONNECT_DELAY_MS); + + ret = scoutfs_net_connect(sb, client->conn, &qei->sin, + CLIENT_CONNECT_TIMEOUT_MS); + if (ret) { + /* we couldn't connect, try electing a new server */ + client->old_elected_nr = qei->elected_nr; goto out; } - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = le32_to_be32(super.server_addr.addr); - sin.sin_port = le16_to_be16(super.server_addr.port); - - ret = scoutfs_net_connect(sb, client->conn, &sin, - client->conn_retry_ms); - if (ret) - goto out; - - reset_connect_timeout(client); + /* trust this server again if it's still around after we disconnect */ + client->old_elected_nr = 0; /* send a greeting to verify endpoints of each connection */ - greet.fsid = super.id; - greet.format_hash = super.format_hash; + 
greet.fsid = super->id; + greet.format_hash = super->format_hash; greet.node_id = cpu_to_le64(sbi->node_id); ret = scoutfs_net_submit_request(sb, client->conn, @@ -397,13 +431,9 @@ static void scoutfs_client_connect_worker(struct work_struct *work) client_greeting, NULL, NULL); if (ret) scoutfs_net_shutdown(sb, client->conn); - out: - if (ret && !atomic_read(&client->shutting_down)) { - queue_delayed_work(client->workq, &client->connect_dwork, - msecs_to_jiffies(client->conn_retry_ms)); - grow_connect_timeout(client); - } + if (ret && !atomic_read(&client->shutting_down)) + queue_work(client->workq, &client->connect_work); } /* @@ -474,11 +504,8 @@ static void client_notify_down(struct super_block *sb, { struct client_info *client = SCOUTFS_SB(sb)->client_info; - if (!atomic_read(&client->shutting_down)) { - queue_delayed_work(client->workq, &client->connect_dwork, - msecs_to_jiffies(client->conn_retry_ms)); - grow_connect_timeout(client); - } + if (!atomic_read(&client->shutting_down)) + queue_work(client->workq, &client->connect_work); } /* @@ -509,8 +536,7 @@ int scoutfs_client_setup(struct super_block *sb) client->sb = sb; init_completion(&client->node_id_comp); atomic_set(&client->shutting_down, 0); - INIT_DELAYED_WORK(&client->connect_dwork, - scoutfs_client_connect_worker); + INIT_WORK(&client->connect_work, scoutfs_client_connect_worker); client->conn = scoutfs_net_alloc_conn(sb, NULL, client_notify_down, 0, client_req_funcs, "client"); @@ -525,10 +551,7 @@ int scoutfs_client_setup(struct super_block *sb) goto out; } - reset_connect_timeout(client); - /* delay initial connect to give a local server some time to setup */ - queue_delayed_work(client->workq, &client->connect_dwork, - msecs_to_jiffies(client->conn_retry_ms)); + queue_work(client->workq, &client->connect_work); ret = 0; out: @@ -552,13 +575,16 @@ void scoutfs_client_destroy(struct super_block *sb) atomic_set(&client->shutting_down, 1); /* make sure worker isn't using the conn */ - 
cancel_delayed_work_sync(&client->connect_dwork); + cancel_work_sync(&client->connect_work); /* make racing conn use explode */ conn = client->conn; client->conn = NULL; scoutfs_net_free_conn(sb, conn); + /* stop running the server if we were, harmless otherwise */ + stop_our_server(sb, &client->qei); + if (client->workq) destroy_workqueue(client->workq); kfree(client); diff --git a/kmod/src/format.h b/kmod/src/format.h index 86cfe48d..a8ebb465 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -453,7 +453,6 @@ struct scoutfs_super_block { __le64 next_compact_id; struct scoutfs_btree_root alloc_root; struct scoutfs_manifest manifest; - struct scoutfs_inet_addr server_addr; struct scoutfs_quorum_config quorum_config; } __packed; diff --git a/kmod/src/options.c b/kmod/src/options.c index 2f1bef8d..8fa7ab50 100644 --- a/kmod/src/options.c +++ b/kmod/src/options.c @@ -27,8 +27,6 @@ #include "super.h" static const match_table_t tokens = { - {Opt_listen, "listen=%s"}, - {Opt_cluster, "cluster=%s"}, {Opt_uniq_name, "uniq_name=%s"}, {Opt_err, NULL} }; @@ -55,15 +53,12 @@ u32 scoutfs_option_u32(struct super_block *sb, int token) int scoutfs_parse_options(struct super_block *sb, char *options, struct mount_options *parsed) { - char ipstr[INET_ADDRSTRLEN + 1]; substring_t args[MAX_OPT_ARGS]; int token, len; - __be32 addr; char *p; /* Set defaults */ memset(parsed, 0, sizeof(*parsed)); - strcpy(parsed->cluster_name, "scoutfs"); while ((p = strsep(&options, ",")) != NULL) { if (!*p) @@ -71,22 +66,6 @@ int scoutfs_parse_options(struct super_block *sb, char *options, token = match_token(p, tokens, args); switch (token) { - case Opt_listen: - match_strlcpy(ipstr, args, ARRAY_SIZE(ipstr)); - addr = in_aton(ipstr); - if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) || - ipv4_is_zeronet(addr) || ipv4_is_local_multicast(addr)) - return -EINVAL; - parsed->listen_addr.addr = - cpu_to_le32(be32_to_cpu(addr)); - break; - case Opt_cluster: - len = args[0].to - args[0].from; - if 
(len == 0 || len > (MAX_CLUSTER_NAME_LEN - 1)) - return -EINVAL; - match_strlcpy(parsed->cluster_name, args, - MAX_CLUSTER_NAME_LEN); - break; case Opt_uniq_name: len = match_strlcpy(parsed->uniq_name, args, SCOUTFS_UNIQUE_NAME_MAX_BYTES); diff --git a/kmod/src/options.h b/kmod/src/options.h index a1fffacf..a26df0e9 100644 --- a/kmod/src/options.h +++ b/kmod/src/options.h @@ -5,8 +5,6 @@ #include "format.h" enum { - Opt_listen = 0, - Opt_cluster, /* * For debugging we can quickly create huge trees by limiting * the number of items in each block as though the blocks were tiny. @@ -16,11 +14,7 @@ enum { Opt_err, }; -#define MAX_CLUSTER_NAME_LEN 17 -struct mount_options -{ - struct scoutfs_inet_addr listen_addr; - char cluster_name[MAX_CLUSTER_NAME_LEN]; +struct mount_options { char uniq_name[SCOUTFS_UNIQUE_NAME_MAX_BYTES]; }; diff --git a/kmod/src/server.c b/kmod/src/server.c index bf4047be..fba4f9eb 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -35,20 +35,14 @@ #include "lock_server.h" #include "endian_swap.h" -/* - * XXX pre commit: - * - comments - */ - /* * Every active mount can act as the server that listens on a net * connection and accepts connections from all the other mounts acting * as clients. * - * It queues long-lived work that blocks trying to acquire a lock. If - * it acquires the lock it listens on a socket and serves requests. If + * The server is started when raft elects the mount as the leader. If * it sees errors it shuts down the server in the hopes that another - * mount will have less trouble. + * mount will become the leader and have less trouble. 
*/ struct server_info { @@ -57,9 +51,11 @@ struct server_info { wait_queue_head_t waitq; struct workqueue_struct *wq; - struct delayed_work dwork; - struct completion shutdown_comp; - bool bind_warned; + struct work_struct work; + int err; + bool shutting_down; + struct completion start_comp; + struct sockaddr_in listen_sin; struct scoutfs_net_connection *conn; /* request processing coordinates committing manifest and alloc */ @@ -495,9 +491,11 @@ static int remove_segno(struct super_block *sb, u64 segno) return ret; } -static void shutdown_server(struct server_info *server) +static void stop_server(struct server_info *server) { - complete(&server->shutdown_comp); + /* wait_event/wake_up provide barriers */ + server->shutting_down = true; + wake_up(&server->waitq); } /* @@ -1733,21 +1731,6 @@ out: trace_scoutfs_server_compact_work_exit(sb, 0, ret); } -/* - * This relies on the caller having read the current super and advanced - * its seq so that it's dirty. This will go away when we communicate - * the server address in a lock lvb. - */ -static int write_server_addr(struct super_block *sb, struct sockaddr_in *sin) -{ - struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - - super->server_addr.addr = be32_to_le32(sin->sin_addr.s_addr); - super->server_addr.port = be16_to_le16(sin->sin_port); - - return scoutfs_write_dirty_super(sb); -} - static scoutfs_net_request_t server_req_funcs[] = { [SCOUTFS_NET_CMD_GREETING] = server_greeting, [SCOUTFS_NET_CMD_ALLOC_INODES] = server_alloc_inodes, @@ -1800,27 +1783,18 @@ static void server_notify_down(struct super_block *sb, forget_client_compacts(sb, sci); try_queue_compact(server); } else { - shutdown_server(server); + stop_server(server); } } -/* - * This work is always running or has a delayed timer set while a super - * is mounted. It tries to grab the lock to become the server. If it - * succeeds it publishes its address and accepts connections. 
If - * anything goes wrong it releases the lock and sets a timer to try to - * become the server all over again. - */ static void scoutfs_server_worker(struct work_struct *work) { struct server_info *server = container_of(work, struct server_info, - dwork.work); + work); struct super_block *sb = server->sb; struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; struct scoutfs_net_connection *conn = NULL; - static struct sockaddr_in zeros = {0,}; - struct scoutfs_lock *lock = NULL; struct pending_seq *ps; struct pending_seq *ps_tmp; DECLARE_WAIT_QUEUE_HEAD(waitq); @@ -1830,13 +1804,6 @@ static void scoutfs_server_worker(struct work_struct *work) trace_scoutfs_server_work_enter(sb, 0, 0); - init_completion(&server->shutdown_comp); - - ret = scoutfs_lock_global(sb, DLM_LOCK_EX, 0, - SCOUTFS_LOCK_TYPE_GLOBAL_SERVER, &lock); - if (ret) - goto out; - conn = scoutfs_net_alloc_conn(sb, server_notify_up, server_notify_down, sizeof(struct server_client_info), server_req_funcs, "server"); @@ -1845,33 +1812,21 @@ static void scoutfs_server_worker(struct work_struct *work) goto out; } - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = le32_to_be32(sbi->opts.listen_addr.addr); - sin.sin_port = le16_to_be16(sbi->opts.listen_addr.port); + sin = server->listen_sin; - /* get the address of our listening socket */ ret = scoutfs_net_bind(sb, conn, &sin); if (ret) { - if (!server->bind_warned) { - scoutfs_err(sb, "server failed to bind to "SIN_FMT", errno %d%s. Retrying indefinitely..", - SIN_ARG(&sin), ret, - ret == -EADDRNOTAVAIL ? " (Bad address?)" - : ""); - server->bind_warned = true; - } + scoutfs_err(sb, "server failed to bind to "SIN_FMT", err %d%s", + SIN_ARG(&sin), ret, + ret == -EADDRNOTAVAIL ? 
" (Bad address?)" + : ""); goto out; } - /* publish the address for clients to connect to */ ret = scoutfs_read_super(sb, super); if (ret) goto out; - scoutfs_advance_dirty_super(sb); - ret = write_server_addr(sb, &sin); - if (ret) - goto out; - /* start up the server subsystems before accepting */ ret = scoutfs_btree_setup(sb) ?: scoutfs_manifest_setup(sb) ?: @@ -1879,6 +1834,8 @@ static void scoutfs_server_worker(struct work_struct *work) if (ret) goto shutdown; + complete(&server->start_comp); + scoutfs_advance_dirty_super(sb); server->stable_manifest_root = super->manifest.root; @@ -1888,8 +1845,8 @@ static void scoutfs_server_worker(struct work_struct *work) server->conn = conn; scoutfs_net_listen(sb, conn); - /* wait for listening down or umount, conn can still be live */ - wait_for_completion_interruptible(&server->shutdown_comp); + /* wait_event/wake_up provide barriers */ + wait_event_interruptible(server->waitq, server->shutting_down); scoutfs_info(sb, "server shutting down on "SIN_FMT, SIN_ARG(&sin)); @@ -1913,16 +1870,39 @@ shutdown: kfree(ps); } - write_server_addr(sb, &zeros); - out: scoutfs_net_free_conn(sb, conn); - scoutfs_unlock(sb, lock, DLM_LOCK_EX); - - /* always requeues, cancel_delayed_work_sync cancels on shutdown */ - queue_delayed_work(server->wq, &server->dwork, HZ / 2); trace_scoutfs_server_work_exit(sb, 0, ret); + + server->err = ret; + complete(&server->start_comp); +} + +/* XXX can we call start multiple times? 
*/ +int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin) +{ + DECLARE_SERVER_INFO(sb, server); + + server->err = 0; + server->shutting_down = false; + server->listen_sin = *sin; + init_completion(&server->start_comp); + + queue_work(server->wq, &server->work); + + wait_for_completion(&server->start_comp); + return server->err; +} + +void scoutfs_server_stop(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + + stop_server(server); + /* XXX not sure both are needed */ + cancel_work_sync(&server->work); + cancel_work_sync(&server->commit_work); } int scoutfs_server_setup(struct super_block *sb) @@ -1937,9 +1917,7 @@ int scoutfs_server_setup(struct super_block *sb) server->sb = sb; spin_lock_init(&server->lock); init_waitqueue_head(&server->waitq); - init_completion(&server->shutdown_comp); - server->bind_warned = false; - INIT_DELAYED_WORK(&server->dwork, scoutfs_server_worker); + INIT_WORK(&server->work, scoutfs_server_worker); init_rwsem(&server->commit_rwsem); init_llist_head(&server->commit_waiters); INIT_WORK(&server->commit_work, scoutfs_server_commit_func); @@ -1960,22 +1938,24 @@ int scoutfs_server_setup(struct super_block *sb) return -ENOMEM; } - queue_delayed_work(server->wq, &server->dwork, 0); - sbi->server_info = server; return 0; } +/* + * The caller should have already stopped but we do the same just in + * case. 
+ */ void scoutfs_server_destroy(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct server_info *server = sbi->server_info; if (server) { - shutdown_server(server); + stop_server(server); /* wait for server work to wait for everything to shut down */ - cancel_delayed_work_sync(&server->dwork); + cancel_work_sync(&server->work); /* recv work/compaction could have left commit_work queued */ cancel_work_sync(&server->commit_work); diff --git a/kmod/src/server.h b/kmod/src/server.h index f2c9b8e7..01aed49b 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -48,6 +48,9 @@ do { \ __entry->name##_id, __entry->name##_data_len, __entry->name##_cmd, \ __entry->name##_flags, __entry->name##_error +struct scoutfs_net_manifest_entry; +struct scoutfs_manifest_entry; + void scoutfs_init_ment_to_net(struct scoutfs_net_manifest_entry *net_ment, struct scoutfs_manifest_entry *ment); void scoutfs_init_ment_from_net(struct scoutfs_manifest_entry *ment, @@ -58,6 +61,10 @@ int scoutfs_server_lock_request(struct super_block *sb, u64 node_id, int scoutfs_server_lock_response(struct super_block *sb, u64 node_id, u64 id, struct scoutfs_net_lock *nl); +struct sockaddr_in; +int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin); +void scoutfs_server_stop(struct super_block *sb); + int scoutfs_server_setup(struct super_block *sb); void scoutfs_server_destroy(struct super_block *sb);