From 57f34e90e919ea8e056cb9a55ae337da6b828dfc Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 4 Feb 2021 16:25:15 -0800 Subject: [PATCH] Use mounted_client item as sign of farewell As clients unmount they send a farewell request that cleans up persistent state associated with the mount. The client needs to be sure that it gets processed, and we must maintain a majority of quorum members mounted to be able to elect a server to process farewell requests. We had a mechanism using the unmount_barrier fields in the greeting and super_block to let the final unmounting quorum majority know that their farewells have been processed and that they didn't need to keep trying to reconnect. But we missed that we also need this out of band farewell handling signal for non-quorum member clients as well. The server can send farewells to a non-member client as well as the final majority and then tear down all the connections before the non-quorum client can see its farewell response. It also needs to be able to know that its farewell has been processed before the server let the final majority unmount. We can remove the custom unmount_barrier method and instead have all unmounting clients check for their mounted_client item in the server's btree. This item is removed as the last step of farewell processing so if the client sees that it has been removed it knows that it doesn't need to resend the farewell and can finish unmounting. This fixes a bug where a non-quorum unmount could hang if it raced with the final majority unmounting. I was able to trigger this hang in our tests with 5 mounts and 3 quorum members. Signed-off-by: Zach Brown --- kmod/src/client.c | 65 ++++++++++++++++++++++++++++++++++------------- kmod/src/format.h | 8 ------ kmod/src/server.c | 41 +++++++----------------------- utils/src/print.c | 2 -- 4 files changed, 57 insertions(+), 59 deletions(-) diff --git a/kmod/src/client.c b/kmod/src/client.c index 12c21399..50adc22e 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -289,12 +289,55 @@ static int client_greeting(struct super_block *sb, scoutfs_net_client_greeting(sb, conn, new_server); client->server_term = le64_to_cpu(gr->server_term); - client->greeting_umb = le64_to_cpu(gr->unmount_barrier); ret = 0; out: return ret; } +/* + * The client is deciding if it needs to keep trying to reconnect to + * have its farewell request processed. The server removes our mounted + * client item last so that if we don't see it we know the server has + * processed our farewell and we don't need to reconnect, we can unmount + * safely. + * + * This is peeking at btree blocks that the server could be actively + * freeing with cow updates so it can see stale blocks, we just return + * the error and we'll retry eventually as the connection times out. + */ +static int lookup_mounted_client_item(struct super_block *sb, u64 rid) +{ + struct scoutfs_key key = { + .sk_zone = SCOUTFS_MOUNTED_CLIENT_ZONE, + .skmc_rid = cpu_to_le64(rid), + }; + struct scoutfs_super_block *super; + SCOUTFS_BTREE_ITEM_REF(iref); + int ret; + + super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS); + if (!super) { + ret = -ENOMEM; + goto out; + } + + ret = scoutfs_read_super(sb, super); + if (ret) + goto out; + + ret = scoutfs_btree_lookup(sb, &super->mounted_clients, &key, &iref); + if (ret == 0) { + scoutfs_btree_put_iref(&iref); + ret = 1; + } + if (ret == -ENOENT) + ret = 0; + + kfree(super); +out: + return ret; +} + /* * This work is responsible for maintaining a connection from the client * to the server. It's queued on mount and disconnect and we requeue @@ -318,26 +361,16 @@ static void scoutfs_client_connect_worker(struct work_struct *work) connect_dwork.work); struct super_block *sb = client->sb; struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = NULL; + struct scoutfs_super_block *super = &sbi->super; struct mount_options *opts = &sbi->opts; const bool am_quorum = opts->quorum_slot_nr >= 0; struct scoutfs_net_greeting greet; struct sockaddr_in sin; int ret; - super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS); - if (!super) { - ret = -ENOMEM; - goto out; - } - - ret = scoutfs_read_super(sb, super); - if (ret) - goto out; - - /* can safely unmount if we see that server processed our farewell */ - if (am_quorum && client->sending_farewell && - (le64_to_cpu(super->unmount_barrier) > client->greeting_umb)) { + /* can unmount once server farewell handling removes our item */ + if (client->sending_farewell && + lookup_mounted_client_item(sb, sbi->rid) == 0) { client->farewell_error = 0; complete(&client->farewell_comp); ret = 0; @@ -357,7 +390,6 @@ static void scoutfs_client_connect_worker(struct work_struct *work) greet.fsid = super->hdr.fsid; greet.version = super->version; greet.server_term = cpu_to_le64(client->server_term); - greet.unmount_barrier = cpu_to_le64(client->greeting_umb); greet.rid = cpu_to_le64(sbi->rid); greet.flags = 0; if (client->sending_farewell) @@ -372,7 +404,6 @@ static void scoutfs_client_connect_worker(struct work_struct *work) if (ret) scoutfs_net_shutdown(sb, client->conn); out: - kfree(super); /* always have a small delay before retrying to avoid storms */ if (ret && !atomic_read(&client->shutting_down)) diff --git a/kmod/src/format.h b/kmod/src/format.h index 5f0f24d5..9578586f 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -630,7 +630,6 @@ struct scoutfs_super_block { __le64 total_data_blocks; __le64 first_data_blkno; __le64 last_data_blkno; - __le64 unmount_barrier; struct scoutfs_quorum_config qconf; struct scoutfs_alloc_root meta_alloc[2]; struct scoutfs_alloc_root data_alloc; @@ -765,12 +764,6 @@ enum scoutfs_dentry_type { * the same serer after receiving a greeting response and to a new * server after failover. * - * @unmount_barrier: Incremented every time the remaining majority of - * quorum members all agree to leave. The server tells a quorum member - * the value that it's connecting under so that if the client sees the - * value increase in the super block then it knows that the server has - * processed its farewell and can safely unmount. - * * @rid: The client's random id that was generated once as the mount * started up. This identifies a specific remote mount across * connections and servers. It's set to the client's rid in both the @@ -780,7 +773,6 @@ struct scoutfs_net_greeting { __le64 fsid; __le64 version; __le64 server_term; - __le64 unmount_barrier; __le64 rid; __le64 flags; }; diff --git a/kmod/src/server.c b/kmod/src/server.c index df5c0a67..785be5ff 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1145,7 +1145,6 @@ static int server_greeting(struct super_block *sb, struct scoutfs_net_greeting *gr = arg; struct scoutfs_net_greeting greet; DECLARE_SERVER_INFO(sb, server); - __le64 umb = 0; bool reconnecting; bool first_contact; bool farewell; @@ -1178,10 +1177,6 @@ static int server_greeting(struct super_block *sb, if (ret < 0) goto send_err; - spin_lock(&server->lock); - umb = super->unmount_barrier; - spin_unlock(&server->lock); - mutex_lock(&server->farewell_mutex); ret = insert_mounted_client(sb, le64_to_cpu(gr->rid), le64_to_cpu(gr->flags)); @@ -1189,8 +1184,6 @@ static int server_greeting(struct super_block *sb, ret = scoutfs_server_apply_commit(sb, ret); queue_work(server->wq, &server->farewell_work); - } else { - umb = gr->unmount_barrier; } send_err: @@ -1199,7 +1192,6 @@ send_err: greet.fsid = super->hdr.fsid; greet.version = super->version; greet.server_term = cpu_to_le64(server->term); - greet.unmount_barrier = umb; greet.rid = gr->rid; greet.flags = 0; @@ -1255,16 +1247,15 @@ static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref) /* * This work processes farewell requests asynchronously. Requests from - * quorum members can be held until only the final quorum remains and + * quorum members can be held until only the final majority remains and * they've all sent farewell requests. * - * When we remove the last mounted client record for the last quorum - * member then we increase the unmount_barrier and write it to the super - * block. If members don't get their farewell response they'll see the - * greater umount_barrier in the super and will know that their farewell - * has been processed and that they can exit. + * A client can be disconnected before receiving our farewell response. + * Before reconnecting they check for their mounted client item, if it's + * been removed then they know that their farewell has been processed + * and that they finish unmounting without reconnecting. * - * Responses for clients who aren't members are immediately sent. + * Responses for clients who aren't quorum members are immediately sent. * Clients that don't have a mounted client record have already had * their farewell processed by another server and can proceed. * @@ -1293,7 +1284,6 @@ static void farewell_worker(struct work_struct *work) struct scoutfs_key key; LIST_HEAD(reqs); LIST_HEAD(send); - bool deleted = false; bool is_quorum; bool more_reqs; int ret; @@ -1367,7 +1357,6 @@ static void farewell_worker(struct work_struct *work) list_move_tail(&fw->entry, &send); nr_mounted--; nr_unmounting--; - deleted = true; } /* process and send farewell responses */ @@ -1376,24 +1365,12 @@ static void farewell_worker(struct work_struct *work) if (ret) goto out; + /* delete mounted client last, client reconnect looks for it */ ret = scoutfs_lock_server_farewell(sb, fw->rid) ?: remove_trans_seq(sb, fw->rid) ?: reclaim_log_trees(sb, fw->rid) ?: - delete_mounted_client(sb, fw->rid) ?: - cancel_srch_compact(sb, fw->rid); - - ret = scoutfs_server_apply_commit(sb, ret); - if (ret) - goto out; - } - - /* update the unmount barrier if we deleted all quorum members */ - if (deleted && nr_mounted == 0) { - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; - - le64_add_cpu(&super->unmount_barrier, 1); + cancel_srch_compact(sb, fw->rid) ?: + delete_mounted_client(sb, fw->rid); ret = scoutfs_server_apply_commit(sb, ret); if (ret) diff --git a/utils/src/print.c b/utils/src/print.c index d6df9d29..ce89c12c 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -870,7 +870,6 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno) printf(" next_ino %llu next_trans_seq %llu\n" " total_meta_blocks %llu first_meta_blkno %llu last_meta_blkno %llu\n" " total_data_blocks %llu first_data_blkno %llu last_data_blkno %llu\n" - " unmount_barrier %llu\n" " meta_alloc[0]: "ALCROOT_F"\n" " meta_alloc[1]: "ALCROOT_F"\n" " data_alloc: "ALCROOT_F"\n" @@ -891,7 +890,6 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno) le64_to_cpu(super->total_data_blocks), le64_to_cpu(super->first_data_blkno), le64_to_cpu(super->last_data_blkno), - le64_to_cpu(super->unmount_barrier), ALCROOT_A(&super->meta_alloc[0]), ALCROOT_A(&super->meta_alloc[1]), ALCROOT_A(&super->data_alloc),