diff --git a/kmod/src/fence.c b/kmod/src/fence.c
index cf5da8bd..c7bb5aa4 100644
--- a/kmod/src/fence.c
+++ b/kmod/src/fence.c
@@ -187,7 +187,9 @@ static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr,
 	unsigned r = fence->reason;
 	char *str = "unknown";
 	static char *reasons[] = {
-		[SCOUTFS_FENCE_TEST] = "test",
+		[SCOUTFS_FENCE_CLIENT_RECOVERY] = "client_recovery",
+		[SCOUTFS_FENCE_CLIENT_RECONNECT] = "client_reconnect",
+		[SCOUTFS_FENCE_QUORUM_BLOCK_LEADER] = "quorum_block_leader",
 	};
 
 	if (r < ARRAY_SIZE(reasons) && reasons[r])
diff --git a/kmod/src/fence.h b/kmod/src/fence.h
index 0cd8db26..96263d7d 100644
--- a/kmod/src/fence.h
+++ b/kmod/src/fence.h
@@ -2,6 +2,9 @@
 #define _SCOUTFS_FENCE_H_
 
 enum {
+	SCOUTFS_FENCE_CLIENT_RECOVERY,
+	SCOUTFS_FENCE_CLIENT_RECONNECT,
+	SCOUTFS_FENCE_QUORUM_BLOCK_LEADER,
 };
 
 int scoutfs_fence_start(struct super_block *sb, u64 rid, __be32 ipv4_addr, int reason);
diff --git a/kmod/src/format.h b/kmod/src/format.h
index 3c0d789f..c2b938d7 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -586,6 +586,12 @@ struct scoutfs_xattr {
 #define SCOUTFS_QUORUM_HB_IVAL_MS	100
 #define SCOUTFS_QUORUM_HB_TIMEO_MS	(5 * MSEC_PER_SEC)
 
+/*
+ * A newly elected leader will give fencing some time before giving up and
+ * shutting down.
+ */
+#define SCOUTFS_QUORUM_FENCE_TO_MS	(15 * MSEC_PER_SEC)
+
 struct scoutfs_quorum_message {
 	__le64 fsid;
 	__le64 version;
diff --git a/kmod/src/net.c b/kmod/src/net.c
index 2837307c..0199b8e9 100644
--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -30,6 +30,7 @@
 #include "net.h"
 #include "endian_swap.h"
 #include "tseq.h"
+#include "fence.h"
 
 /*
  * scoutfs networking delivers requests and responses between nodes.
@@ -1220,6 +1221,7 @@ static void scoutfs_net_reconn_free_worker(struct work_struct *work)
 	unsigned long now = jiffies;
 	unsigned long deadline = 0;
 	bool requeue = false;
+	int ret;
 
 	trace_scoutfs_net_reconn_free_work_enter(sb, 0, 0);
 
@@ -1233,10 +1235,18 @@ restart:
 		     time_after_eq(now, acc->reconn_deadline))) {
			set_conn_fl(acc, reconn_freeing);
			spin_unlock(&conn->lock);
-			if (!test_conn_fl(conn, shutting_down))
-				scoutfs_info(sb, "client timed out "SIN_FMT" -> "SIN_FMT", can not reconnect",
-					     SIN_ARG(&acc->sockname),
+			if (!test_conn_fl(conn, shutting_down)) {
+				scoutfs_info(sb, "client "SIN_FMT" reconnect timed out, fencing",
 					     SIN_ARG(&acc->peername));
+				ret = scoutfs_fence_start(sb, acc->rid,
+							  acc->peername.sin_addr.s_addr,
+							  SCOUTFS_FENCE_CLIENT_RECONNECT);
+				if (ret) {
+					scoutfs_err(sb, "client fence returned err %d, shutting down server",
+						    ret);
+					scoutfs_server_abort(sb);
+				}
+			}
 			destroy_conn(acc);
 			goto restart;
 		}
diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c
index c7d50cab..2a6d877e 100644
--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -32,6 +32,7 @@
 #include "block.h"
 #include "net.h"
 #include "sysfs.h"
+#include "fence.h"
 #include "scoutfs_trace.h"
 
 /*
@@ -461,28 +462,78 @@ static int update_quorum_block(struct super_block *sb, u64 blkno,
 	return ret;
 }
 
+/*
+ * The calling server fenced previous leaders before starting up.  Now
+ * that it's up it has reclaimed their resources and can clear their
+ * leader flags.
+ */
+int scoutfs_quorum_clear_rid_leader(struct super_block *sb, u64 rid)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	struct mount_options *opts = &sbi->opts;
+	struct scoutfs_quorum_block blk;
+	int ret = 0;
+	u64 blkno;
+	int i;
+
+	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
+		if (i == opts->quorum_slot_nr || !quorum_slot_present(super, i))
+			continue;
+
+		blkno = SCOUTFS_QUORUM_BLKNO + i;
+		ret = read_quorum_block(sb, blkno, &blk, NULL);
+		if (ret < 0)
+			break;
+
+		if (le64_to_cpu(blk.set_leader.rid) == rid) {
+			blk.flags &= ~cpu_to_le64(SCOUTFS_QUORUM_BLOCK_LEADER);
+			set_quorum_block_event(sb, &blk, &blk.fenced);
+
+			ret = write_quorum_block(sb, blkno, &blk, NULL);
+			break;
+		}
+	}
+
+	if (ret < 0)
+		scoutfs_err(sb, "error %d clearing leader block for rid %016llx", ret, rid);
+
+	return ret;
+}
 /*
- * The calling server has been elected and updated their block, but
- * can't yet assume that it has exclusive access to the metadata device.
- * We read all the quorum blocks looking for previously elected leaders
- * to fence so that we're the only leader running.
+ * The calling server has been elected, had its block updated, and has
+ * started running but can't yet assume that it has exclusive access to
+ * the metadata device.  We read all the quorum blocks looking for
+ * previously elected leaders to fence so that we're the only leader
+ * running.
+ *
+ * We only wait for the previous leaders to be fenced.  We don't clear
+ * the leader bits because the server is going to reclaim their
+ * resources once it's up and running.  Only then will the leader bits be
+ * cleared.
+ *
+ * Quorum will be sending heartbeats while we wait for fencing.  That
+ * keeps us from being fenced while we allow userspace fencing to take a
+ * reasonably long time.  We still want to time out eventually.
 */
-static int fence_leader_blocks(struct super_block *sb)
+int scoutfs_quorum_fence_leader_blocks(struct super_block *sb, u64 term)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
 	struct mount_options *opts = &sbi->opts;
 	struct scoutfs_quorum_block blk;
 	struct sockaddr_in sin;
+	bool fence_started = false;
 	u64 blkno;
 	int ret = 0;
+	int err;
 	int i;
 
 	BUILD_BUG_ON(SCOUTFS_QUORUM_BLOCKS < SCOUTFS_QUORUM_MAX_SLOTS);
 
 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
-		if (i == opts->quorum_slot_nr)
+		if (i == opts->quorum_slot_nr || !quorum_slot_present(super, i))
 			continue;
 
 		blkno = SCOUTFS_QUORUM_BLKNO + i;
@@ -490,27 +541,31 @@ static int fence_leader_blocks(struct super_block *sb)
 		if (ret < 0)
 			goto out;
 
-		if (!(le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER))
+		if (!(le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER) ||
+		    le64_to_cpu(blk.term) > term)
 			continue;
 
 		scoutfs_inc_counter(sb, quorum_fence_leader);
 		scoutfs_quorum_slot_sin(super, i, &sin);
-		scoutfs_err(sb, "fencing "SCSBF" at "SIN_FMT,
-			    SCSB_LEFR_ARGS(super->hdr.fsid, blk.set_leader.rid),
-			    SIN_ARG(&sin));
-
-		blk.flags &= ~cpu_to_le64(SCOUTFS_QUORUM_BLOCK_LEADER);
-		set_quorum_block_event(sb, &blk, &blk.fenced);
-
-		ret = write_quorum_block(sb, blkno, &blk, NULL);
+		scoutfs_info(sb, "fencing previous leader "SCSBF" in slot %u with address "SIN_FMT,
+			     SCSB_LEFR_ARGS(super->hdr.fsid, blk.set_leader.rid), i, SIN_ARG(&sin));
+		ret = scoutfs_fence_start(sb, le64_to_cpu(blk.set_leader.rid), sin.sin_addr.s_addr,
+					  SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
 		if (ret < 0)
 			goto out;
+		fence_started = true;
+
 	}
 
out:
+	if (fence_started) {
+		err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
+		if (ret == 0)
+			ret = err;
+	}
 	if (ret < 0) {
-		scoutfs_err(sb, "error %d fencing active", ret);
+		scoutfs_err(sb, "error %d fencing leader blocks", ret);
 		scoutfs_inc_counter(sb, quorum_fence_error);
 	}
 
@@ -670,10 +725,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				  qst.term);
 			qst.timeout = heartbeat_interval();
 
-			/* set our leader flag and fence */
-			ret = update_quorum_block(sb, blkno, &mark,
-						  qst.role, qst.term) ?:
-			      fence_leader_blocks(sb);
+			/* set our leader flag before starting server */
+			ret = update_quorum_block(sb, blkno, &mark, qst.role, qst.term);
 			if (ret < 0)
 				goto out;
 
@@ -727,6 +780,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	/* always try to stop a running server as we stop */
 	if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) {
 		scoutfs_server_stop(sb);
+		scoutfs_fence_stop(sb);
 		send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, qst.term);
 	}
 
diff --git a/kmod/src/quorum.h b/kmod/src/quorum.h
index f0994871..3f41fd65 100644
--- a/kmod/src/quorum.h
+++ b/kmod/src/quorum.h
@@ -8,6 +8,9 @@ u8 scoutfs_quorum_votes_needed(struct super_block *sb);
 void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
 			     struct sockaddr_in *sin);
 
+int scoutfs_quorum_fence_leader_blocks(struct super_block *sb, u64 term);
+int scoutfs_quorum_clear_rid_leader(struct super_block *sb, u64 rid);
+
 int scoutfs_quorum_setup(struct super_block *sb);
 void scoutfs_quorum_shutdown(struct super_block *sb);
 void scoutfs_quorum_destroy(struct super_block *sb);
diff --git a/kmod/src/server.c b/kmod/src/server.c
index fb353b25..fbe602a9 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -40,6 +40,7 @@
 #include "forest.h"
 #include "recov.h"
 #include "omap.h"
+#include "fence.h"
 
 /*
  * Every active mount can act as the server that listens on a net
@@ -106,6 +107,8 @@ struct server_info {
 
 	/* recovery timeout fences from work */
 	struct work_struct fence_pending_recov_work;
+	/* while running we check for fenced mounts to reclaim */
+	struct delayed_work reclaim_dwork;
 };
 
 #define DECLARE_SERVER_INFO(sb, name) \
@@ -1672,7 +1675,15 @@ static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref)
 		     sizeof(struct scoutfs_mounted_client_btree_val));
 }
 
-static int reclaim_rid(struct super_block *sb, u64 rid)
+/*
+ * Reclaim all the resources for a mount which has gone away.  It has
+ * sent us a farewell promising to leave, or we actively fenced it.
+ *
+ * It's safe to call this multiple times for a given rid.  Each
+ * individual action recognizes when it has already been performed and
+ * returns success.
+ */
+static int reclaim_rid(struct super_block *sb, u64 rid, bool clear_leader)
 {
 	int ret;
 
@@ -1680,13 +1691,14 @@ static int reclaim_rid(struct super_block *sb, u64 rid)
 	if (ret < 0)
 		return ret;
 
-	/* delete mounted client last, client reconnect looks for it */
+	/* delete mounted client last, recovery looks for it */
 	ret = scoutfs_lock_server_farewell(sb, rid) ?:
 	      remove_trans_seq(sb, rid) ?:
 	      reclaim_log_trees(sb, rid) ?:
 	      cancel_srch_compact(sb, rid) ?:
-	      delete_mounted_client(sb, rid) ?:
-	      scoutfs_omap_remove_rid(sb, rid);
+	      scoutfs_omap_remove_rid(sb, rid) ?:
+	      (clear_leader ? scoutfs_quorum_clear_rid_leader(sb, rid) : 0) ?:
+	      delete_mounted_client(sb, rid);
 
 	return scoutfs_server_apply_commit(sb, ret);
 }
@@ -1816,9 +1828,9 @@ static void farewell_worker(struct work_struct *work)
 		}
 	}
 
-	/* process and send farewell responses */
+	/* clean up resources for mounts before sending responses */
 	list_for_each_entry_safe(fw, tmp, &send, entry) {
-		ret = reclaim_rid(sb, fw->rid);
+		ret = reclaim_rid(sb, fw->rid, false);
 		if (ret)
 			goto out;
 	}
@@ -2017,21 +2029,19 @@ static void fence_pending_recov_worker(struct work_struct *work)
 	struct server_info *server = container_of(work, struct server_info,
 						  fence_pending_recov_work);
 	struct super_block *sb = server->sb;
-	u64 rid;
+	u64 rid = 0;
 	int ret;
 
-	while ((rid = scoutfs_recov_next_pending(sb, SCOUTFS_RECOV_ALL)) > 0) {
+	while ((rid = scoutfs_recov_next_pending(sb, rid, SCOUTFS_RECOV_ALL)) > 0) {
 		scoutfs_err(sb, "%lu ms recovery timeout expired for client rid %016llx, fencing",
 			    SERVER_RECOV_TIMEOUT_MS, rid);
 
-		ret = reclaim_rid(sb, rid);
-		if (ret < 0) {
-			scoutfs_err(sb, "error %d reclaiming rid %016llx, shutting down", ret, rid);
-			stop_server(server);
+		ret = scoutfs_fence_start(sb, rid, 0, SCOUTFS_FENCE_CLIENT_RECOVERY);
+		if (ret) {
+			scoutfs_err(sb, "fence returned err %d, shutting down server", ret);
+			scoutfs_server_abort(sb);
 			break;
 		}
-
-		scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_ALL);
 	}
 }
 
@@ -2102,6 +2112,69 @@ out:
 	return ret;
 }
 
+static void queue_reclaim_work(struct server_info *server, unsigned long delay)
+{
+	if (!server->shutting_down)
+		queue_delayed_work(server->wq, &server->reclaim_dwork, delay);
+}
+
+#define RECLAIM_WORK_DELAY_MS	MSEC_PER_SEC
+
+/*
+ * Fencing is performed by userspace and can happen as we're elected
+ * leader before the server is running.  Once we're running we want to
+ * reclaim resources from any mounts that may have been fenced.
+ *
+ * The reclaim worker runs regularly in the background and reclaims the
+ * resources for mounts that have been fenced.  Once the fenced rid has
+ * been reclaimed, the fence request can be removed.
+ *
+ * This is queued by the server work as it starts up, requeues itself
+ * until shutdown, and is then canceled by the server work as it shuts
+ * down.
+ */
+static void reclaim_worker(struct work_struct *work)
+{
+	struct server_info *server = container_of(work, struct server_info, reclaim_dwork.work);
+	struct super_block *sb = server->sb;
+	bool error;
+	int reason;
+	u64 rid;
+	int ret;
+
+	ret = scoutfs_fence_next(sb, &rid, &reason, &error);
+	if (ret < 0)
+		goto out;
+
+	if (error) {
+		scoutfs_err(sb, "saw error indicator on fence request for rid %016llx, shutting down server",
+			    rid);
+		scoutfs_server_abort(sb);
+		ret = -ESHUTDOWN;
+		goto out;
+	}
+
+	ret = reclaim_rid(sb, rid, reason == SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
+	if (ret < 0) {
+		scoutfs_err(sb, "failure to reclaim fenced rid %016llx: err %d, shutting down server",
+			    rid, ret);
+		scoutfs_server_abort(sb);
+		goto out;
+	}
+
+	scoutfs_info(sb, "successfully reclaimed resources for fenced rid %016llx", rid);
+	scoutfs_fence_free(sb, rid);
+	scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_ALL);
+	ret = 0;
+
+out:
+	/* queue next reclaim immediately if we're making progress */
+	if (ret == 0)
+		queue_reclaim_work(server, 0);
+	else
+		queue_reclaim_work(server, msecs_to_jiffies(RECLAIM_WORK_DELAY_MS));
+}
+
 static void scoutfs_server_worker(struct work_struct *work)
 {
 	struct server_info *server = container_of(work, struct server_info,
@@ -2118,6 +2191,11 @@ static void scoutfs_server_worker(struct work_struct *work)
 
 	trace_scoutfs_server_work_enter(sb, 0, 0);
 
+	/* first make sure no other servers are still running */
+	ret = scoutfs_quorum_fence_leader_blocks(sb, server->term);
+	if (ret < 0)
+		goto out;
+
 	scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
 
 	scoutfs_info(sb, "server setting up at "SIN_FMT, SIN_ARG(&sin));
@@ -2189,6 +2267,8 @@ static void scoutfs_server_worker(struct work_struct *work)
 	scoutfs_info(sb, "server ready at "SIN_FMT, SIN_ARG(&sin));
 	complete(&server->start_comp);
 
+	queue_reclaim_work(server, 0);
+
 	/* wait_event/wake_up provide barriers */
 	wait_event_interruptible(server->waitq, server->shutting_down);
 
@@ -2197,6 +2277,7 @@ shutdown:
 	/* wait for farewell to finish sending messages */
 	flush_work(&server->farewell_work);
+	cancel_delayed_work_sync(&server->reclaim_dwork);
 
 	/* wait for requests to finish, no more requests */
 	scoutfs_net_shutdown(sb, conn);
@@ -2209,6 +2290,7 @@ shutdown:
 	/* wait for extra queues by requests, won't find waiters */
 	flush_work(&server->commit_work);
 
+	scoutfs_fence_stop(sb);
 	scoutfs_lock_server_destroy(sb);
 	scoutfs_omap_server_shutdown(sb);
 
@@ -2299,6 +2381,7 @@ int scoutfs_server_setup(struct super_block *sb)
 	seqcount_init(&server->volopt_seqcount);
 	mutex_init(&server->volopt_mutex);
 	INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);
+	INIT_DELAYED_WORK(&server->reclaim_dwork, reclaim_worker);
 
 	server->wq = alloc_workqueue("scoutfs_server", WQ_UNBOUND |
 				     WQ_NON_REENTRANT, 0);
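
Reviewer sketch, not part of the patch: the outline below pulls the fence lifecycle this patch wires up into one place, using only calls that appear in the diff (scoutfs_fence_start(), scoutfs_fence_wait_fenced(), scoutfs_fence_next(), scoutfs_fence_free(), reclaim_rid(), scoutfs_server_recov_finish()). The function name and the single-routine control flow are hypothetical; in the patch the first half runs from scoutfs_quorum_fence_leader_blocks() before the server starts and the second half runs from the server's reclaim_worker().

/*
 * Illustrative sketch only, not code from the tree: how a newly
 * elected leader is expected to use the fence API added by this patch.
 */
static int example_fence_and_reclaim(struct super_block *sb,
				     u64 stale_rid, __be32 stale_addr)
{
	bool error;
	int reason;
	u64 rid;
	int ret;

	/* ask userspace to fence the previous leader, then wait for it */
	ret = scoutfs_fence_start(sb, stale_rid, stale_addr,
				  SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
	if (ret == 0)
		ret = scoutfs_fence_wait_fenced(sb,
				msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
	if (ret < 0)
		return ret;

	/* once the server is running, reclaim each fenced rid and drop its request */
	while (scoutfs_fence_next(sb, &rid, &reason, &error) == 0 && !error) {
		ret = reclaim_rid(sb, rid,
				  reason == SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
		if (ret < 0)
			return ret;

		scoutfs_fence_free(sb, rid);
		scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_ALL);
	}

	return 0;
}

The design choice worth noting is that fencing itself is delegated to userspace, while quorum keeps sending heartbeats so the new leader is not itself fenced during the SCOUTFS_QUORUM_FENCE_TO_MS window it allows for that to complete.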