diff --git a/kmod/src/btree.c b/kmod/src/btree.c index bf6675e9..006eacd3 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -350,6 +350,7 @@ static size_t super_root_offsets[] = { offsetof(struct scoutfs_super_block, manifest.root), offsetof(struct scoutfs_super_block, lock_clients), offsetof(struct scoutfs_super_block, trans_seqs), + offsetof(struct scoutfs_super_block, mounted_clients), }; #define for_each_super_root(super, i, root) \ diff --git a/kmod/src/client.c b/kmod/src/client.c index 8149213d..8d3bcee6 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -59,6 +59,7 @@ struct client_info { u64 old_elected_nr; u64 server_term; + u64 greeting_umb; bool sending_farewell; int farewell_error; @@ -365,22 +366,27 @@ static int client_greeting(struct super_block *sb, scoutfs_net_client_greeting(sb, conn, new_server); client->server_term = le64_to_cpu(gr->server_term); + client->greeting_umb = le64_to_cpu(gr->unmount_barrier); ret = 0; out: return ret; } /* - * If the previous election told us to start the server then stop it - * and wipe the old election info. If we're not fast enough to clear - * the election block then the next server might fence us. Should - * be very unlikely as election requires multiple RMW cycles. + * If the previous election told us to start the server then stop it and + * clear the indication that we were elected. We get the current + * version of the election info from the server because they might have + * modified it while they were running. + * + * If we're not fast enough to clear the election from the quorum block + * then the next server might fence us. Should be very unlikely as + * election requires multiple RMW cycles. 
*/ static void stop_our_server(struct super_block *sb, struct scoutfs_quorum_elected_info *qei) { if (qei->run_server) { - scoutfs_server_stop(sb); + scoutfs_server_stop(sb, qei); scoutfs_quorum_clear_elected(sb, qei); memset(qei, 0, sizeof(*qei)); } @@ -431,12 +437,22 @@ static void scoutfs_client_connect_worker(struct work_struct *work) ret = scoutfs_quorum_election(sb, opts->uniq_name, client->old_elected_nr, - timeout_abs, qei); + timeout_abs, client->sending_farewell, + client->greeting_umb, qei); if (ret) goto out; + /* we saw that the server wrote a new unmount barrier */ + if (client->sending_farewell && qei->elected_nr == 0 && + qei->unmount_barrier > client->greeting_umb) { + client->farewell_error = 0; + complete(&client->farewell_comp); + ret = 0; + goto out; + } + if (qei->run_server) { - ret = scoutfs_server_start(sb, &qei->sin, qei->elected_nr); + ret = scoutfs_server_start(sb, &qei->sin, qei->elected_nr, qei); if (ret) { /* forget that we tried to start the server */ memset(qei, 0, sizeof(*qei)); @@ -459,9 +475,11 @@ static void scoutfs_client_connect_worker(struct work_struct *work) client->old_elected_nr = 0; /* send a greeting to verify endpoints of each connection */ + memcpy(greet.name, opts->uniq_name, sizeof(greet.name)); greet.fsid = super->hdr.fsid; greet.format_hash = super->format_hash; greet.server_term = cpu_to_le64(client->server_term); + greet.unmount_barrier = 0; greet.node_id = cpu_to_le64(sbi->node_id); greet.flags = 0; if (client->sending_farewell) diff --git a/kmod/src/format.h b/kmod/src/format.h index 6918bd82..b5ced5d4 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -48,6 +48,8 @@ #define SCOUTFS_QUORUM_BLOCKS ((128ULL * 1024) >> SCOUTFS_BLOCK_SHIFT) #define SCOUTFS_QUORUM_MAX_SLOTS SCOUTFS_QUORUM_BLOCKS +#define SCOUTFS_UNIQUE_NAME_MAX_BYTES 64 /* includes null */ + /* * Base types used by other structures. 
*/ @@ -289,6 +291,18 @@ struct scoutfs_trans_seq_btree_key { __be64 node_id; } __packed; +/* + * The server keeps a persistent record of mounted clients. + */ +struct scoutfs_mounted_client_btree_key { + __be64 node_id; +} __packed; + +struct scoutfs_mounted_client_btree_val { + __u8 name[SCOUTFS_UNIQUE_NAME_MAX_BYTES]; +} __packed; + + /* * The max number of links defines the max number of entries that we can * index in o(log n) and the static list head storage size in the @@ -395,7 +409,6 @@ struct scoutfs_xattr { #define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER)) #define SCOUTFS_UUID_BYTES 16 -#define SCOUTFS_UNIQUE_NAME_MAX_BYTES 64 /* includes null */ /* * During each quorum voting interval the fabric has to process 2 reads @@ -418,6 +431,7 @@ struct scoutfs_xattr { * @config_gen: references the config gen in the super block * @write_nr: incremented for every write, only 0 when never written * @elected_nr: incremented when elected, 0 otherwise + * @unmount_barrier: incremented by servers when all members have unmounted * @vote_slot: the active config slot that the writer is voting for */ struct scoutfs_quorum_block { @@ -426,6 +440,7 @@ struct scoutfs_quorum_block { __le64 config_gen; __le64 write_nr; __le64 elected_nr; + __le64 unmount_barrier; __le32 crc; __u8 vote_slot; } __packed; @@ -475,6 +490,7 @@ struct scoutfs_super_block { struct scoutfs_quorum_config quorum_config; struct scoutfs_btree_root lock_clients; struct scoutfs_btree_root trans_seqs; + struct scoutfs_btree_root mounted_clients; } __packed; #define SCOUTFS_ROOT_INO 1 @@ -593,20 +609,30 @@ enum { * Greetings verify identity of communicating nodes. The sender sends * their credentials and the receiver verifies them. * + * @name: The client sends its unique name to the server. + * * @server_term: The raft term that elected the server. Initially 0 * from the client, sent by the server, then sent by the client as it * tries to reconnect. 
Used to identify a client reconnecting to a * server that has timed out its connection. * + * @unmount_barrier: Incremented every time the remaining majority of + * quorum members all agree to leave. The server tells a quorum member + * the value that it's connecting under so that if the client sees the + * value increase in a quorum block it knows that the server has + * processed its farewell and can safely unmount. + * * @node_id: The id of the client. Initially 0 from the client, * assigned by the server, and sent by the client as it reconnects. * Used by the server to identify reconnecting clients whose existing * state must be dealt with. */ struct scoutfs_net_greeting { + __u8 name[SCOUTFS_UNIQUE_NAME_MAX_BYTES]; __le64 fsid; __le64 format_hash; __le64 server_term; + __le64 unmount_barrier; __le64 node_id; __le64 flags; } __packed; diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c index 5c55362c..da7e1062 100644 --- a/kmod/src/lock_server.c +++ b/kmod/src/lock_server.c @@ -800,7 +800,7 @@ static void scoutfs_lock_server_recovery_timeout(struct work_struct *work) if (ret < 0) { scoutfs_err(sb, "lock server saw err %d while timing out clients, shutting down", ret); - scoutfs_server_stop(sb); + scoutfs_server_abort(sb); } } @@ -870,7 +870,7 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 node_id) out: if (ret < 0) { scoutfs_err(sb, "lock server err %d during node %llu farewell, shutting down", ret, node_id); - scoutfs_server_stop(sb); + scoutfs_server_abort(sb); } return ret; diff --git a/kmod/src/net.c b/kmod/src/net.c index fae167f8..8449ce56 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -100,7 +100,7 @@ struct scoutfs_net_connection { established:1, /* added sends queue send work */ shutting_down:1, /* shutdown work has been queued */ saw_greeting:1, /* saw greeting on this sock */ - saw_farewell:1, /* saw farewell request from client */ + saw_farewell:1, /* saw farewell response to client */ reconn_wait:1, /* shutdown, 
waiting for reconnect */ reconn_freeing:1; /* waiting done, setter frees */ unsigned long reconn_deadline; @@ -788,6 +788,11 @@ static void scoutfs_net_send_worker(struct work_struct *work) continue; } + if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) && + nh_is_response(&msend->nh)) { + conn->saw_farewell = 1; + } + msend->nh.recv_seq = cpu_to_le64(atomic64_read(&conn->recv_seq)); @@ -1629,22 +1634,6 @@ restart: conn->notify_up(sb, conn, conn->info, node_id); } -/* - * The server has received a farewell message and is sending a response. - * All we do is mark the connection so that it is freed the next time it - * is shutdown, presumably as the client disconnects after receiving the - * response. The server caller has cleaned up all the state it had - * associated with the client. - */ -void scoutfs_net_server_farewell(struct super_block *sb, - struct scoutfs_net_connection *conn) -{ - spin_lock(&conn->lock); - conn->saw_farewell = 1; - spin_unlock(&conn->lock); -} - - /* * Submit a request down the connection. It's up to the caller to * ensure that the conn is allocated. 
Sends submitted when the diff --git a/kmod/src/net.h b/kmod/src/net.h index 82da64bf..c144c69e 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -75,8 +75,6 @@ void scoutfs_net_server_greeting(struct super_block *sb, u64 node_id, u64 greeting_id, bool sent_node_id, bool first_contact, bool farewell); -void scoutfs_net_server_farewell(struct super_block *sb, - struct scoutfs_net_connection *conn); void scoutfs_net_farewell(struct super_block *sb, struct scoutfs_net_connection *conn); diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c index 112e2da9..7f90e407 100644 --- a/kmod/src/quorum.c +++ b/kmod/src/quorum.c @@ -392,7 +392,8 @@ static inline int first_slot_flags(struct scoutfs_quorum_config *conf, */ static int write_quorum_block(struct super_block *sb, __le64 fsid, __le64 config_gen, u8 our_slot, __le64 write_nr, - u64 elected_nr, u8 vote_slot) + u64 elected_nr, u64 unmount_barrier, + u8 vote_slot) { struct scoutfs_quorum_block *blk; struct buffer_head *bh; @@ -416,6 +417,7 @@ static int write_quorum_block(struct super_block *sb, __le64 fsid, blk->config_gen = config_gen; blk->write_nr = write_nr; blk->elected_nr = cpu_to_le64(elected_nr); + blk->unmount_barrier = cpu_to_le64(unmount_barrier); blk->vote_slot = vote_slot; blk->crc = quorum_block_crc(blk); @@ -473,8 +475,8 @@ static int fence_other_elected(struct super_block *sb, scoutfs_inc_counter(sb, quorum_fenced); ret = write_quorum_block(sb, super->hdr.fsid, - conf->gen, i, blk.write_nr, - 0, i); + conf->gen, i, blk.write_nr, 0, + le64_to_cpu(blk.unmount_barrier), i); if (ret) break; } @@ -509,9 +511,13 @@ struct quorum_block_history { * When we return success we update the caller's elected info with the * most recent elected leader we found, which may well be long gone. We * return -ENOENT if we didn't find any elected leaders. + * + * If we return success because we saw a larger unmount barrier we set + * elected_nr to 0 and fill the unmount_barrier. 
*/ int scoutfs_quorum_election(struct super_block *sb, char *our_name, u64 old_elected_nr, ktime_t timeout_abs, + bool unmounting, u64 our_umb, struct scoutfs_quorum_elected_info *qei) { struct scoutfs_super_block *super = NULL; @@ -524,6 +530,7 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name, ktime_t now; __le64 write_nr = 0; u64 elected_nr = 0; + u64 unmount_barrier = 0; int vote_streak = 0; int vote_slot; int our_slot; @@ -551,13 +558,7 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name, goto out; conf = &super->quorum_config; - /* allow a single vote majority when 1 or 2 active */ - if (nr_active <= 2) - majority = 1; - else if (nr_active & 1) - majority = (nr_active + 1) / 2; - else - majority = (nr_active / 2) + 1; + majority = scoutfs_quorum_majority(sb, conf); readahead_quorum_blocks(sb); @@ -590,6 +591,8 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name, qei->config_gen = blk.config_gen; qei->write_nr = blk.write_nr; qei->elected_nr = le64_to_cpu(blk.elected_nr); + qei->unmount_barrier = + le64_to_cpu(blk.unmount_barrier); qei->config_slot = i; } } @@ -639,12 +642,23 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name, nr_votes = 0; write_nr = cpu_to_le64(1); elected_nr = 0; + unmount_barrier = 0; for_each_active_block(sb, super, conf, hist, hi, &blk, slot, i){ /* count our votes (maybe including from us) */ if (hi->writing >= 2 && blk.vote_slot == our_slot) nr_votes++; + /* can finish unmounting if members all left */ + if (unmounting && + le64_to_cpu(blk.unmount_barrier) > our_umb) { + qei->elected_nr = 0; + qei->unmount_barrier = + le64_to_cpu(blk.unmount_barrier); + ret = 0; + goto out; + } + /* sample existing fields for our write */ if (i == our_slot) { write_nr = blk.write_nr; @@ -652,6 +666,8 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name, } elected_nr = max(elected_nr, le64_to_cpu(blk.elected_nr)); + unmount_barrier = max(unmount_barrier, + 
le64_to_cpu(blk.unmount_barrier)); } @@ -667,7 +683,8 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name, elected_nr = 0; write_quorum_block(sb, super->hdr.fsid, conf->gen, our_slot, - write_nr, elected_nr, vote_slot); + write_nr, elected_nr, unmount_barrier, + vote_slot); set_current_state(TASK_UNINTERRUPTIBLE); schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); @@ -714,5 +731,65 @@ int scoutfs_quorum_clear_elected(struct super_block *sb, return write_quorum_block(sb, super->hdr.fsid, qei->config_gen, qei->config_slot, qei->write_nr, 0, + qei->unmount_barrier, qei->config_slot); +} + +int scoutfs_quorum_update_barrier(struct super_block *sb, + struct scoutfs_quorum_elected_info *qei, + u64 unmount_barrier) +{ + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + + qei->unmount_barrier = unmount_barrier; + + return write_quorum_block(sb, super->hdr.fsid, qei->config_gen, + qei->config_slot, qei->write_nr, + qei->elected_nr, qei->unmount_barrier, qei->config_slot); } + +/* + * If there's only one or two active slots then a single vote is sufficient + * for a majority. 
+ */ +int scoutfs_quorum_majority(struct super_block *sb, + struct scoutfs_quorum_config *conf) +{ + struct scoutfs_quorum_slot *slot; + int nr_active = 0; + int majority; + int i; + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + slot = &conf->slots[i]; + + if (slot->flags & SCOUTFS_QUORUM_SLOT_ACTIVE) + nr_active++; + } + + if (nr_active <= 2) + majority = 1; + else if (nr_active & 1) + majority = (nr_active + 1) / 2; + else + majority = (nr_active / 2) + 1; + + return majority; +} + +bool scoutfs_quorum_voting_member(struct super_block *sb, + struct scoutfs_quorum_config *conf, + char *name) +{ + struct scoutfs_quorum_slot *slot; + int i; + + for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { + slot = &conf->slots[i]; + + if (strcmp(slot->name, name) == 0) + return true; + } + + return false; +} diff --git a/kmod/src/quorum.h b/kmod/src/quorum.h index 82e6708e..c40e9d48 100644 --- a/kmod/src/quorum.h +++ b/kmod/src/quorum.h @@ -6,14 +6,24 @@ struct scoutfs_quorum_elected_info { __le64 config_gen; __le64 write_nr; u64 elected_nr; + u64 unmount_barrier; unsigned int config_slot; bool run_server; }; int scoutfs_quorum_election(struct super_block *sb, char *our_name, u64 old_elected_nr, ktime_t timeout_abs, + bool unmounting, u64 our_umb, struct scoutfs_quorum_elected_info *qei); int scoutfs_quorum_clear_elected(struct super_block *sb, struct scoutfs_quorum_elected_info *qei); +int scoutfs_quorum_update_barrier(struct super_block *sb, + struct scoutfs_quorum_elected_info *qei, + u64 unmount_barrier); +int scoutfs_quorum_majority(struct super_block *sb, + struct scoutfs_quorum_config *conf); +bool scoutfs_quorum_voting_member(struct super_block *sb, + struct scoutfs_quorum_config *conf, + char *name); #endif diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 3991ece4..48040606 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2482,6 +2482,7 @@ DECLARE_EVENT_CLASS(scoutfs_quorum_block_class, __field(__u64, config_gen) 
__field(__u64, write_nr) __field(__u64, elected_nr) + __field(__u64, unmount_barrier) __field(__u32, crc) __field(__u8, vote_slot) ), @@ -2493,14 +2494,15 @@ DECLARE_EVENT_CLASS(scoutfs_quorum_block_class, __entry->config_gen = le64_to_cpu(blk->config_gen); __entry->write_nr = le64_to_cpu(blk->write_nr); __entry->elected_nr = le64_to_cpu(blk->elected_nr); + __entry->unmount_barrier = le64_to_cpu(blk->unmount_barrier); __entry->crc = le32_to_cpu(blk->crc); __entry->vote_slot = blk->vote_slot; ), - TP_printk("fsid "FSID_FMT" io_blkno %llu hdr_blkno %llu config_gen %llu write_nr %llu elected_nr %llu crc 0x%08x vote_slot %u", + TP_printk("fsid "FSID_FMT" io_blkno %llu hdr_blkno %llu config_gen %llu write_nr %llu elected_nr %llu umb %llu crc 0x%08x vote_slot %u", __entry->fsid, __entry->io_blkno, __entry->hdr_blkno, __entry->config_gen, __entry->write_nr, __entry->elected_nr, - __entry->crc, __entry->vote_slot) + __entry->unmount_barrier, __entry->crc, __entry->vote_slot) ); DEFINE_EVENT(scoutfs_quorum_block_class, scoutfs_quorum_read_block, TP_PROTO(struct super_block *sb, u64 io_blkno, diff --git a/kmod/src/server.c b/kmod/src/server.c index f18c1d81..fd7cda3d 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -35,6 +35,7 @@ #include "net.h" #include "lock_server.h" #include "endian_swap.h" +#include "quorum.h" /* * Every active mount can act as the server that listens on a net @@ -60,6 +61,8 @@ struct server_info { u64 term; struct scoutfs_net_connection *conn; + struct scoutfs_quorum_elected_info qei; + /* request processing coordinates committing manifest and alloc */ struct rw_semaphore commit_rwsem; struct llist_head commit_waiters; @@ -84,6 +87,11 @@ struct server_info { unsigned long nr_compacts; struct list_head compacts; struct work_struct compact_work; + + /* track clients waiting in unmount for farewell response */ + struct mutex farewell_mutex; + struct list_head farewell_requests; + struct work_struct farewell_work; }; #define 
DECLARE_SERVER_INFO(sb, name) \ @@ -1176,6 +1184,46 @@ int scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id, NULL, NULL); } +static int insert_mounted_client(struct super_block *sb, u64 node_id, + char *name) +{ + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_mounted_client_btree_key mck; + struct scoutfs_mounted_client_btree_val mcv; + + mck.node_id = cpu_to_be64(node_id); + strncpy(mcv.name, name, sizeof(mcv.name)); + + return scoutfs_btree_insert(sb, &super->mounted_clients, + &mck, sizeof(mck), &mcv, sizeof(mcv)); +} + +/* + * Remove the record of a mounted client. The record can already be + * removed if we're processing a farewell on behalf of a client that + * already had a previous server process its farewell. + * + * When we remove the last mounted client that's voting we write a new + * quorum block with the updated unmount_barrier. + * + * The caller has to serialize with farewell processing. + */ +static int delete_mounted_client(struct super_block *sb, u64 node_id) +{ + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_mounted_client_btree_key mck; + int ret; + + mck.node_id = cpu_to_be64(node_id); + + ret = scoutfs_btree_delete(sb, &super->mounted_clients, + &mck, sizeof(mck)); + if (ret == -ENOENT) + ret = 0; + + return ret; +} + /* * Process an incoming greeting request in the server from the client. 
* We try to send responses to failed greetings so that the sender can @@ -1247,9 +1295,17 @@ static int server_greeting(struct super_block *sb, le64_add_cpu(&super->next_node_id, 1); spin_unlock(&server->lock); - queue_commit_work(server, &cw); + mutex_lock(&server->farewell_mutex); + ret = insert_mounted_client(sb, le64_to_cpu(node_id), gr->name); + mutex_unlock(&server->farewell_mutex); + + if (ret == 0) + queue_commit_work(server, &cw); up_read(&server->commit_rwsem); - ret = wait_for_commit(&cw); + if (ret == 0) { + ret = wait_for_commit(&cw); + queue_work(server->wq, &server->farewell_work); + } } else { node_id = gr->node_id; } @@ -1259,9 +1315,11 @@ send_err: if (err) node_id = 0; + memset(greet.name, 0, sizeof(greet.name)); greet.fsid = super->hdr.fsid; greet.format_hash = super->format_hash; greet.server_term = cpu_to_le64(server->term); + greet.unmount_barrier = cpu_to_le64(server->qei.unmount_barrier); greet.node_id = node_id; greet.flags = 0; @@ -1305,6 +1363,223 @@ out: return ret; } +struct farewell_request { + struct list_head entry; + u64 net_id; + u64 node_id; +}; + +static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref) +{ + return (iref->key_len != + sizeof(struct scoutfs_mounted_client_btree_key)) || + (iref->val_len != + sizeof(struct scoutfs_mounted_client_btree_val)); +} + +/* + * This work processes farewell requests asynchronously. Requests from + * voting quorum members can be held until they're no longer needed to + * vote for quorum and elect a server to process farewell requests. + * + * This will hold farewell requests from voting clients until either it + * isn't needed for quorum because a majority remains without it, or it + * won't be needed for quorum because all the remaining mounted clients + * are voting and waiting for farewell. + * + * When we remove the last mounted client record for the last voting + * client then we increase the unmount_barrier and write it to the + * server's quorum block. 
If voting clients don't get their farewell + * response they'll attempt to form quorum again to start the server for + * their farewell response but will find the increased unmount_barrier. + * They'll know that their farewell has been processed and they can exit + * without forming quorum. + * + * Responses that are waiting for clients who aren't voting are + * immediately sent. Clients that don't have a mounted client record + * have already had their farewell processed by another server and can + * proceed. + * + * This can trust the quorum config found in the super that was read + * when the server started. Only the current server can rewrite the + * working config. + * + * Farewell responses are unique in that sending them causes the server + * to shutdown the connection to the client next time the socket + * disconnects. If the socket is destroyed before the client gets the + * response they'll reconnect and we'll see them as a brand new client + * who immediately sends a farewell. It'll be processed and it all + * works out. + * + * If this worker sees an error it assumes that this server is done for + * and that another had better take its place. 
+ */ +static void farewell_worker(struct work_struct *work) +{ + struct server_info *server = container_of(work, struct server_info, + farewell_work); + struct super_block *sb = server->sb; + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_quorum_config *conf = &super->quorum_config; + struct scoutfs_mounted_client_btree_key mck; + struct scoutfs_mounted_client_btree_val *mcv; + struct farewell_request *tmp; + struct farewell_request *fw; + SCOUTFS_BTREE_ITEM_REF(iref); + struct commit_waiter cw; + unsigned int nr_unmounting = 0; + unsigned int nr_mounted = 0; + unsigned int majority; + LIST_HEAD(reqs); + LIST_HEAD(send); + bool deleted = false; + bool voting; + bool more_reqs; + int ret; + + majority = scoutfs_quorum_majority(sb, conf); + + /* grab all the requests that are waiting */ + mutex_lock(&server->farewell_mutex); + list_splice_init(&server->farewell_requests, &reqs); + mutex_unlock(&server->farewell_mutex); + + /* count how many reqs requests are from voting clients */ + nr_unmounting = 0; + list_for_each_entry_safe(fw, tmp, &reqs, entry) { + mck.node_id = cpu_to_be64(fw->node_id); + ret = scoutfs_btree_lookup(sb, &super->mounted_clients, + &mck, sizeof(mck), &iref); + if (ret == 0 && invalid_mounted_client_item(&iref)) { + scoutfs_btree_put_iref(&iref); + ret = -EIO; + } + if (ret < 0) { + if (ret == -ENOENT) { + list_move_tail(&fw->entry, &send); + continue; + } + goto out; + } + + mcv = iref.val; + voting = scoutfs_quorum_voting_member(sb, conf, mcv->name); + scoutfs_btree_put_iref(&iref); + + if (!voting) { + list_move_tail(&fw->entry, &send); + continue; + } + + nr_unmounting++; + } + + /* see how many mounted clients could vote for quorum */ + memset(&mck, 0, sizeof(mck)); + for (;;) { + ret = scoutfs_btree_next(sb, &super->mounted_clients, + &mck, sizeof(mck), &iref); + if (ret == 0 && invalid_mounted_client_item(&iref)) { + scoutfs_btree_put_iref(&iref); + ret = -EIO; + } + if (ret != 0) { + if (ret == -ENOENT) + 
break; + goto out; + } + + memcpy(&mck, iref.key, sizeof(mck)); + mcv = iref.val; + + if (scoutfs_quorum_voting_member(sb, conf, mcv->name)) + nr_mounted++; + + scoutfs_btree_put_iref(&iref); + be64_add_cpu(&mck.node_id, 1); + + } + + /* send as many responses as we can to maintain quorum */ + while ((fw = list_first_entry_or_null(&reqs, struct farewell_request, + entry)) && + (nr_mounted > majority || nr_unmounting >= nr_mounted)) { + + list_move_tail(&fw->entry, &send); + nr_mounted--; + nr_unmounting--; + deleted = true; + } + + /* process and send farewell responses */ + list_for_each_entry_safe(fw, tmp, &send, entry) { + + down_read(&server->commit_rwsem); + + ret = scoutfs_lock_server_farewell(sb, fw->node_id) ?: + remove_trans_seq(sb, fw->node_id) ?: + delete_mounted_client(sb, fw->node_id); + if (ret == 0) + queue_commit_work(server, &cw); + + up_read(&server->commit_rwsem); + if (ret == 0) + ret = wait_for_commit(&cw); + if (ret) + goto out; + } + + /* update the unmount barrier the first time we delete all mounted */ + if (deleted && nr_mounted == 0) { + ret = scoutfs_quorum_update_barrier(sb, &server->qei, + server->qei.unmount_barrier + 1); + if (ret) + goto out; + } + + /* and finally send all the responses */ + list_for_each_entry_safe(fw, tmp, &send, entry) { + + ret = scoutfs_net_response_node(sb, server->conn, fw->node_id, + SCOUTFS_NET_CMD_FAREWELL, + fw->net_id, 0, NULL, 0); + if (ret) + break; + + list_del_init(&fw->entry); + kfree(fw); + } + + ret = 0; +out: + mutex_lock(&server->farewell_mutex); + more_reqs = !list_empty(&server->farewell_requests); + list_splice_init(&reqs, &server->farewell_requests); + list_splice_init(&send, &server->farewell_requests); + mutex_unlock(&server->farewell_mutex); + + if (ret < 0) + stop_server(server); + else if (more_reqs && !server->shutting_down) + queue_work(server->wq, &server->farewell_work); +} + +static void free_farewell_requests(struct super_block *sb, u64 node_id) +{ + struct server_info *server = 
SCOUTFS_SB(sb)->server_info; + struct farewell_request *tmp; + struct farewell_request *fw; + + mutex_lock(&server->farewell_mutex); + list_for_each_entry_safe(fw, tmp, &server->farewell_requests, entry) { + if (node_id == 0 || fw->node_id == node_id) { + list_del_init(&fw->entry); + kfree(fw); + } + } + mutex_unlock(&server->farewell_mutex); +} + /* * The server is receiving a farewell message from a client that is * unmounting. It won't send any more requests and once it receives our @@ -1322,26 +1597,28 @@ static int server_farewell(struct super_block *sb, { struct server_info *server = SCOUTFS_SB(sb)->server_info; u64 node_id = scoutfs_net_client_node_id(conn); - struct commit_waiter cw; - int ret; + struct farewell_request *fw; if (arg_len != 0) return -EINVAL; - scoutfs_net_server_farewell(sb, conn); + /* XXX tear down if we fence, or if we shut down */ - down_read(&server->commit_rwsem); + fw = kmalloc(sizeof(struct farewell_request), GFP_NOFS); + if (fw == NULL) + return -ENOMEM; - ret = scoutfs_lock_server_farewell(sb, node_id) ?: - remove_trans_seq(sb, node_id); - if (ret == 0) - queue_commit_work(server, &cw); + fw->node_id = node_id; + fw->net_id = id; - up_read(&server->commit_rwsem); - if (ret == 0) - ret = wait_for_commit(&cw); + mutex_lock(&server->farewell_mutex); + list_add_tail(&fw->entry, &server->farewell_requests); + mutex_unlock(&server->farewell_mutex); - return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0); + queue_work(server->wq, &server->farewell_work); + + /* response will be sent later */ + return 0; } /* requests sent to clients are tracked so we can free resources */ @@ -1992,6 +2269,8 @@ static void server_notify_down(struct super_block *sb, server->nr_clients); spin_unlock(&server->lock); + free_farewell_requests(sb, node_id); + forget_client_compacts(sb, sci); try_queue_compact(server); } else { @@ -2085,7 +2364,7 @@ out: /* XXX can we call start multiple times? 
*/ int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin, - u64 term) + u64 term, struct scoutfs_quorum_elected_info *qei) { DECLARE_SERVER_INFO(sb, server); @@ -2093,6 +2372,7 @@ int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin, server->shutting_down = false; server->listen_sin = *sin; server->term = term; + server->qei = *qei; init_completion(&server->start_comp); queue_work(server->wq, &server->work); @@ -2101,7 +2381,22 @@ return server->err; } -void scoutfs_server_stop(struct super_block *sb) +/* + * Start shutdown on the server but don't wait for it to finish. + */ +void scoutfs_server_abort(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + + stop_server(server); +} + +/* + * Once the server is stopped we give the caller our election info + * which might have been modified while we were running. + */ +void scoutfs_server_stop(struct super_block *sb, + struct scoutfs_quorum_elected_info *qei) { DECLARE_SERVER_INFO(sb, server); @@ -2109,6 +2404,8 @@ void scoutfs_server_stop(struct super_block *sb) /* XXX not sure both are needed */ cancel_work_sync(&server->work); cancel_work_sync(&server->commit_work); + + *qei = server->qei; } int scoutfs_server_setup(struct super_block *sb) @@ -2135,6 +2432,9 @@ int scoutfs_server_setup(struct super_block *sb) server->compacts_per_client = 2; INIT_LIST_HEAD(&server->compacts); INIT_WORK(&server->compact_work, scoutfs_server_compact_worker); + mutex_init(&server->farewell_mutex); + INIT_LIST_HEAD(&server->farewell_requests); + INIT_WORK(&server->farewell_work, farewell_worker); server->wq = alloc_workqueue("scoutfs_server", WQ_UNBOUND | WQ_NON_REENTRANT, 0); @@ -2164,6 +2464,10 @@ void scoutfs_server_destroy(struct super_block *sb) /* recv work/compaction could have left commit_work queued */ cancel_work_sync(&server->commit_work); + /* pending farewell requests are another server's problem */ + 
cancel_work_sync(&server->farewell_work); + free_farewell_requests(sb, 0); + trace_scoutfs_server_workqueue_destroy(sb, 0, 0); destroy_workqueue(server->wq); diff --git a/kmod/src/server.h b/kmod/src/server.h index 2162816d..fee6ac0e 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -72,9 +72,12 @@ int scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id, struct scoutfs_key *key); struct sockaddr_in; +struct scoutfs_quorum_elected_info; int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin, - u64 term); -void scoutfs_server_stop(struct super_block *sb); + u64 term, struct scoutfs_quorum_elected_info *qei); +void scoutfs_server_abort(struct super_block *sb); +void scoutfs_server_stop(struct super_block *sb, + struct scoutfs_quorum_elected_info *qei); int scoutfs_server_setup(struct super_block *sb); void scoutfs_server_destroy(struct super_block *sb);