diff --git a/kmod/src/btree.c b/kmod/src/btree.c index a7b0f130..c4ef6921 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -348,6 +348,7 @@ static void advance_to_next_half(struct scoutfs_btree_ring *bring) static size_t super_root_offsets[] = { offsetof(struct scoutfs_super_block, alloc_root), offsetof(struct scoutfs_super_block, manifest.root), + offsetof(struct scoutfs_super_block, lock_clients), }; #define for_each_super_root(super, i, root) \ diff --git a/kmod/src/client.c b/kmod/src/client.c index 85a5a729..d0d0325f 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -260,6 +260,19 @@ int scoutfs_client_lock_response(struct super_block *sb, u64 net_id, net_id, 0, nl, sizeof(*nl)); } +/* Send a lock recover response to the server. */ +int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id, + struct scoutfs_net_lock_recover *nlr) +{ + struct client_info *client = SCOUTFS_SB(sb)->client_info; + u16 bytes = offsetof(struct scoutfs_net_lock_recover, + locks[le16_to_cpu(nlr->nr)]); + + return scoutfs_net_response(sb, client->conn, + SCOUTFS_NET_CMD_LOCK_RECOVER, + net_id, 0, nlr, bytes); +} + /* The client is receiving a invalidation request from the server */ static int client_lock(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, @@ -273,6 +286,19 @@ static int client_lock(struct super_block *sb, return scoutfs_lock_invalidate_request(sb, id, arg); } +/* The server is asking us for the client's locks starting with the given key */ +static int client_lock_recover(struct super_block *sb, + struct scoutfs_net_connection *conn, + u8 cmd, u64 id, void *arg, u16 arg_len) +{ + if (arg_len != sizeof(struct scoutfs_key)) + return -EINVAL; + + /* XXX error? */ + + return scoutfs_lock_recover_request(sb, id, arg); +} + /* * Process a greeting response in the client from the server. This is * called for every connected socket on the connection. 
The first @@ -508,6 +534,7 @@ out: static scoutfs_net_request_t client_req_funcs[] = { [SCOUTFS_NET_CMD_COMPACT] = client_compact, [SCOUTFS_NET_CMD_LOCK] = client_lock, + [SCOUTFS_NET_CMD_LOCK_RECOVER] = client_lock_recover, }; /* diff --git a/kmod/src/client.h b/kmod/src/client.h index ca244f9d..dd0c2eae 100644 --- a/kmod/src/client.h +++ b/kmod/src/client.h @@ -21,6 +21,8 @@ int scoutfs_client_lock_request(struct super_block *sb, struct scoutfs_net_lock *nl); int scoutfs_client_lock_response(struct super_block *sb, u64 net_id, struct scoutfs_net_lock *nl); +int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id, + struct scoutfs_net_lock_recover *nlr); int scoutfs_client_wait_node_id(struct super_block *sb); int scoutfs_client_setup(struct super_block *sb); diff --git a/kmod/src/counters.h b/kmod/src/counters.h index e997fe31..a35df40b 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -98,6 +98,7 @@ EXPAND_COUNTER(lock_lock) \ EXPAND_COUNTER(lock_lock_error) \ EXPAND_COUNTER(lock_nonblock_eagain) \ + EXPAND_COUNTER(lock_recover_request) \ EXPAND_COUNTER(lock_shrink_queued) \ EXPAND_COUNTER(lock_shrink_request_aborted) \ EXPAND_COUNTER(lock_unlock) \ diff --git a/kmod/src/format.h b/kmod/src/format.h index d46b2195..be2931dc 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -272,6 +272,14 @@ struct scoutfs_extent_btree_key { __be64 minor; } __packed; +/* + * The lock server keeps a persistent record of connected clients so that + * server failover knows who to wait for before resuming operations. 
+ */ +struct scoutfs_lock_client_btree_key { + __be64 node_id; +} __packed; + /* * The max number of links defines the max number of entries that we can * index in o(log n) and the static list head storage size in the @@ -456,6 +464,7 @@ struct scoutfs_super_block { struct scoutfs_btree_root alloc_root; struct scoutfs_manifest manifest; struct scoutfs_quorum_config quorum_config; + struct scoutfs_btree_root lock_clients; } __packed; #define SCOUTFS_ROOT_INO 1 @@ -642,6 +651,7 @@ enum { SCOUTFS_NET_CMD_STATFS, SCOUTFS_NET_CMD_COMPACT, SCOUTFS_NET_CMD_LOCK, + SCOUTFS_NET_CMD_LOCK_RECOVER, SCOUTFS_NET_CMD_FAREWELL, SCOUTFS_NET_CMD_UNKNOWN, }; @@ -768,6 +778,15 @@ struct scoutfs_net_lock { __u8 new_mode; } __packed; +struct scoutfs_net_lock_recover { + __le16 nr; + struct scoutfs_net_lock locks[0]; +} __packed; + +#define SCOUTFS_NET_LOCK_MAX_RECOVER_NR \ ((SCOUTFS_NET_MAX_DATA_LEN - sizeof(struct scoutfs_net_lock_recover)) /\ sizeof(struct scoutfs_net_lock)) + /* some enums for tracing */ enum { SLT_CLIENT, diff --git a/kmod/src/lock.c b/kmod/src/lock.c index c33c43bd..fa490b7c 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -54,6 +54,13 @@ * lock attempt can't immediately match an existing granted lock. This * is fine for the only rare user which can back out of its lock * inversion and retry with a full blocking lock. + * + * Lock recovery is initiated by the server when it recognizes that + * we're reconnecting to it while a previous server left a persistent + * record of us. We resend all our pending requests which are deferred + * until recovery finishes. The server sends us a recovery request and + * we respond with all our locks. Our resent requests are processed + * relative to that lock state we resend. 
*/ #define GRACE_PERIOD_KT ms_to_ktime(2) @@ -407,7 +414,8 @@ static void lock_remove(struct lock_info *linfo, struct scoutfs_lock *lock) } static struct scoutfs_lock *lock_lookup(struct super_block *sb, - struct scoutfs_key *start) + struct scoutfs_key *start, + struct scoutfs_lock **next) { DECLARE_LOCK_INFO(sb, linfo); struct rb_node *node = linfo->lock_tree.rb_node; @@ -416,16 +424,22 @@ static struct scoutfs_lock *lock_lookup(struct super_block *sb, assert_spin_locked(&linfo->lock); + if (next) + *next = NULL; + while (node) { lock = container_of(node, struct scoutfs_lock, node); cmp = scoutfs_key_compare(start, &lock->start); - if (cmp < 0) + if (cmp < 0) { + if (next) + *next = lock; node = node->rb_left; - else if (cmp > 0) + } else if (cmp > 0) { node = node->rb_right; - else + } else { return lock; + } } return NULL; @@ -454,7 +468,7 @@ static struct scoutfs_lock *get_lock(struct super_block *sb, assert_spin_locked(&linfo->lock); - lock = lock_lookup(sb, start); + lock = lock_lookup(sb, start, NULL); if (lock) __lock_del_lru(linfo, lock); @@ -599,7 +613,7 @@ int scoutfs_lock_grant_response(struct super_block *sb, spin_lock(&linfo->lock); /* lock must already be busy with request_pending */ - lock = lock_lookup(sb, &nl->key); + lock = lock_lookup(sb, &nl->key, NULL); BUG_ON(!lock); BUG_ON(!lock->request_pending); @@ -750,6 +764,58 @@ int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id, return 0; } +/* + * The server is asking us to send them as many locks as we can starting + * with the given key. We'll send a response with 0 locks to indicate + * that we've sent all our locks. This is called in client processing + * so the client won't try to reconnect to another server until we + * return. 
+ */ +int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id, + struct scoutfs_key *key) +{ + DECLARE_LOCK_INFO(sb, linfo); + struct scoutfs_net_lock_recover *nlr; + struct scoutfs_lock *lock; + struct scoutfs_lock *next; + struct rb_node *node; + int ret; + int i; + + scoutfs_inc_counter(sb, lock_recover_request); + + nlr = kmalloc(offsetof(struct scoutfs_net_lock_recover, + locks[SCOUTFS_NET_LOCK_MAX_RECOVER_NR]), + GFP_NOFS); + if (!nlr) + return -ENOMEM; + + spin_lock(&linfo->lock); + + lock = lock_lookup(sb, key, &next) ?: next; + + for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) { + + nlr->locks[i].key = lock->start; + nlr->locks[i].old_mode = lock->mode; + nlr->locks[i].new_mode = lock->mode; + + node = rb_next(&lock->node); + if (node) + lock = rb_entry(node, struct scoutfs_lock, node); + else + lock = NULL; + } + + nlr->nr = cpu_to_le16(i); + + spin_unlock(&linfo->lock); + + ret = scoutfs_client_lock_recover_response(sb, net_id, nlr); + kfree(nlr); + return ret; +} + static bool lock_wait_cond(struct super_block *sb, struct scoutfs_lock *lock, int mode) { diff --git a/kmod/src/lock.h b/kmod/src/lock.h index b3920ed3..b6cbf767 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -48,6 +48,8 @@ int scoutfs_lock_grant_response(struct super_block *sb, struct scoutfs_net_lock *nl); int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id, struct scoutfs_net_lock *nl); +int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id, + struct scoutfs_key *key); int scoutfs_lock_inode(struct super_block *sb, int mode, int flags, struct inode *inode, struct scoutfs_lock **ret_lock); diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c index bb5f4ed3..5c55362c 100644 --- a/kmod/src/lock_server.c +++ b/kmod/src/lock_server.c @@ -18,6 +18,9 @@ #include "counters.h" #include "net.h" #include "tseq.h" +#include "spbm.h" +#include "btree.h" +#include "msg.h" #include "scoutfs_trace.h" #include "lock_server.h" @@ 
-67,10 +70,18 @@ * server shuts down. */ +#define LOCK_SERVER_RECOVERY_MS (10 * MSEC_PER_SEC) + struct lock_server_info { + struct super_block *sb; + spinlock_t lock; + struct mutex mutex; struct rb_root locks_root; + struct scoutfs_spbm recovery_pending; + struct delayed_work recovery_dwork; + struct scoutfs_tseq_tree tseq_tree; struct dentry *tseq_dentry; }; @@ -222,10 +233,12 @@ static bool client_entries_compatible(struct client_lock_entry *granted, */ static struct server_lock_node *get_server_lock(struct lock_server_info *inf, struct scoutfs_key *key, - struct server_lock_node *ins) + struct server_lock_node *ins, + bool or_next) { struct rb_root *root = &inf->locks_root; struct server_lock_node *ret = NULL; + struct server_lock_node *next = NULL; struct server_lock_node *snode; struct rb_node *parent = NULL; struct rb_node **node; @@ -240,6 +253,8 @@ static struct server_lock_node *get_server_lock(struct lock_server_info *inf, cmp = scoutfs_key_compare(key, &snode->key); if (cmp < 0) { + if (or_next) + next = snode; node = &(*node)->rb_left; } else if (cmp > 0) { node = &(*node)->rb_right; @@ -255,6 +270,9 @@ static struct server_lock_node *get_server_lock(struct lock_server_info *inf, ret = ins; } + if (ret == NULL && or_next && next) + ret = next; + if (ret) atomic_inc(&ret->refcount); @@ -266,6 +284,33 @@ static struct server_lock_node *get_server_lock(struct lock_server_info *inf, return ret; } +/* Get a server lock node, allocating if one doesn't exist. Caller must put. 
*/ +static struct server_lock_node *alloc_server_lock(struct lock_server_info *inf, + struct scoutfs_key *key) +{ + struct server_lock_node *snode; + struct server_lock_node *ins; + + snode = get_server_lock(inf, key, NULL, false); + if (snode == NULL) { + ins = kzalloc(sizeof(struct server_lock_node), GFP_NOFS); + if (ins) { + atomic_set(&ins->refcount, 0); + mutex_init(&ins->mutex); + ins->key = *key; + INIT_LIST_HEAD(&ins->granted); + INIT_LIST_HEAD(&ins->requested); + INIT_LIST_HEAD(&ins->invalidated); + + snode = get_server_lock(inf, key, ins, false); + if (snode != ins) + kfree(ins); + } + } + + return snode; +} + /* * Finish with a server lock which has the mutex held, freeing it if * it's empty and unused. @@ -324,7 +369,6 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 node_id, DECLARE_LOCK_SERVER_INFO(sb, inf); struct client_lock_entry *clent; struct server_lock_node *snode; - struct server_lock_node *ins; int ret; trace_scoutfs_lock_message(sb, SLT_SERVER, SLT_GRANT, SLT_REQUEST, @@ -346,25 +390,11 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 node_id, clent->net_id = net_id; clent->mode = nl->new_mode; - snode = get_server_lock(inf, &nl->key, NULL); + snode = alloc_server_lock(inf, &nl->key); if (snode == NULL) { - ins = kzalloc(sizeof(struct server_lock_node), GFP_NOFS); - if (ins == NULL) { - kfree(clent); - ret = -ENOMEM; - goto out; - } - - atomic_set(&ins->refcount, 0); - mutex_init(&ins->mutex); - ins->key = nl->key; - INIT_LIST_HEAD(&ins->granted); - INIT_LIST_HEAD(&ins->requested); - INIT_LIST_HEAD(&ins->invalidated); - - snode = get_server_lock(inf, &nl->key, ins); - if (snode != ins) - kfree(ins); + kfree(clent); + ret = -ENOMEM; + goto out; } clent->snode = snode; @@ -401,7 +431,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 node_id, } /* XXX should always have a server lock here? recovery? 
*/ - snode = get_server_lock(inf, &nl->key, NULL); + snode = get_server_lock(inf, &nl->key, NULL, false); if (!snode) { ret = -EINVAL; goto out; } @@ -441,6 +471,14 @@ out: * This is called with the snode mutex held. This can free the snode if * it's empty. The caller can't reference the snode once this returns * so we unlock the snode mutex. + * + * All progress must wait for all clients to finish with recovery + * because we don't know which locks they'll hold. The unlocked + * recovery_pending test here is OK. It's filled by setup before + * anything runs. It's emptied by recovery completion. We can get a + * false nonempty result if we race with recovery completion, but that's + * OK because recovery completion processes all the locks that have + * requests after emptying, including the unlikely loser of that race. */ static int process_waiting_requests(struct super_block *sb, struct server_lock_node *snode) @@ -455,8 +493,9 @@ static int process_waiting_requests(struct super_block *sb, BUG_ON(!mutex_is_locked(&snode->mutex)); - /* request processing waits for all invalidation responses */ - if (!list_empty(&snode->invalidated)) { + /* processing waits for all invalidation responses or recovery */ + if (!list_empty(&snode->invalidated) || + !scoutfs_spbm_empty(&inf->recovery_pending)) { ret = 0; goto out; } @@ -523,6 +562,320 @@ out: return ret; } +/* + * The server received a greeting from a client for the first time. If + * the client had already talked to the server then we must find an + * existing record for it and should begin recovery. If it doesn't have + * a record then it has timed out and we can't allow it to reconnect. If + * it's connecting for the first time then we insert a new record. + * + * This is running in concurrent client greeting processing contexts. 
+ */ +int scoutfs_lock_server_greeting(struct super_block *sb, u64 node_id, + bool should_exist) +{ + DECLARE_LOCK_SERVER_INFO(sb, inf); + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_lock_client_btree_key cbk; + SCOUTFS_BTREE_ITEM_REF(iref); + struct scoutfs_key key; + int ret; + + cbk.node_id = cpu_to_be64(node_id); + + mutex_lock(&inf->mutex); + if (should_exist) { + ret = scoutfs_btree_lookup(sb, &super->lock_clients, + &cbk, sizeof(cbk), &iref); + if (ret == 0) + scoutfs_btree_put_iref(&iref); + } else { + ret = scoutfs_btree_insert(sb, &super->lock_clients, + &cbk, sizeof(cbk), NULL, 0); + } + mutex_unlock(&inf->mutex); + + if (should_exist && ret == 0) { + scoutfs_key_set_zeros(&key); + ret = scoutfs_server_lock_recover_request(sb, node_id, &key); + if (ret) + goto out; + } + +out: + return ret; +} + +/* + * A client sent their last recovery response and can exit recovery. If + * they were the last client in recovery then we can process all the + * server locks that had requests. + */ +static int finished_recovery(struct super_block *sb, u64 node_id, bool cancel) +{ + DECLARE_LOCK_SERVER_INFO(sb, inf); + struct server_lock_node *snode; + struct scoutfs_key key; + bool still_pending; + int ret = 0; + + spin_lock(&inf->lock); + scoutfs_spbm_clear(&inf->recovery_pending, node_id); + still_pending = !scoutfs_spbm_empty(&inf->recovery_pending); + spin_unlock(&inf->lock); + if (still_pending) + return 0; + + if (cancel) + cancel_delayed_work_sync(&inf->recovery_dwork); + + scoutfs_key_set_zeros(&key); + + while ((snode = get_server_lock(inf, &key, NULL, true))) { + + key = snode->key; + scoutfs_key_inc(&key); + + if (!list_empty(&snode->requested)) { + ret = process_waiting_requests(sb, snode); + if (ret) + break; + } else { + put_server_lock(inf, snode); + } + } + + return ret; +} + +/* + * We sent a lock recover request to the client when we received its + * greeting while in recovery. 
Here we instantiate all the locks it + * gave us in response and send another request from the next key. + * We're done once we receive an empty response. + */ +int scoutfs_lock_server_recover_response(struct super_block *sb, u64 node_id, + struct scoutfs_net_lock_recover *nlr) +{ + DECLARE_LOCK_SERVER_INFO(sb, inf); + struct client_lock_entry *existing; + struct client_lock_entry *clent; + struct server_lock_node *snode; + struct scoutfs_key key; + int ret = 0; + int i; + + /* client must be in recovery */ + spin_lock(&inf->lock); + if (!scoutfs_spbm_test(&inf->recovery_pending, node_id)) + ret = -EINVAL; + spin_unlock(&inf->lock); + if (ret) + goto out; + + /* client has sent us all their locks */ + if (nlr->nr == 0) { + ret = finished_recovery(sb, node_id, true); + goto out; + } + + for (i = 0; i < le16_to_cpu(nlr->nr); i++) { + clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS); + if (!clent) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&clent->head); + clent->node_id = node_id; + clent->net_id = 0; + clent->mode = nlr->locks[i].new_mode; + + snode = alloc_server_lock(inf, &nlr->locks[i].key); + if (snode == NULL) { + kfree(clent); + ret = -ENOMEM; + goto out; + } + + existing = find_entry(snode, &snode->granted, node_id); + if (existing) { + kfree(clent); + put_server_lock(inf, snode); + ret = -EEXIST; + goto out; + } + + clent->snode = snode; + add_client_entry(snode, &snode->granted, clent); + scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry); + + put_server_lock(inf, snode); + } + + /* send request for next batch of keys */ + key = nlr->locks[le16_to_cpu(nlr->nr) - 1].key; + scoutfs_key_inc(&key); + + ret = scoutfs_server_lock_recover_request(sb, node_id, &key); +out: + return ret; +} + +static int node_id_and_put_iref(struct scoutfs_btree_item_ref *iref, + u64 *node_id) +{ + struct scoutfs_lock_client_btree_key *cbk; + int ret; + + if (iref->key_len == sizeof(*cbk) && iref->val_len == 0) { + cbk = iref->key; + *node_id = 
be64_to_cpu(cbk->node_id); + ret = 0; + } else { + ret = -EIO; + } + scoutfs_btree_put_iref(iref); + return ret; +} + +/* + * This work executes if enough time passes without all of the clients + * finishing with recovery and canceling the work. We walk through the + * client records and find any that still have their recovery pending. + */ +static void scoutfs_lock_server_recovery_timeout(struct work_struct *work) +{ + struct lock_server_info *inf = container_of(work, + struct lock_server_info, + recovery_dwork.work); + struct super_block *sb = inf->sb; + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_lock_client_btree_key cbk; + SCOUTFS_BTREE_ITEM_REF(iref); + bool timed_out; + u64 node_id; + int ret; + + /* we enter recovery if there are any client records */ + for (node_id = 0; ; node_id++) { + cbk.node_id = cpu_to_be64(node_id); + ret = scoutfs_btree_next(sb, &super->lock_clients, + &cbk, sizeof(cbk), &iref); + if (ret == -ENOENT) { + ret = 0; + break; + } + if (ret == 0) + ret = node_id_and_put_iref(&iref, &node_id); + if (ret < 0) + break; + + spin_lock(&inf->lock); + if (scoutfs_spbm_test(&inf->recovery_pending, node_id)) { + scoutfs_spbm_clear(&inf->recovery_pending, node_id); + timed_out = true; + } else { + timed_out = false; + } + spin_unlock(&inf->lock); + + if (!timed_out) + continue; + + scoutfs_err(sb, "client node_id %llu lock recovery timed out", + node_id); + + /* XXX these aren't immediately committed */ + cbk.node_id = cpu_to_be64(node_id); + ret = scoutfs_btree_delete(sb, &super->lock_clients, + &cbk, sizeof(cbk)); + if (ret) + break; + } + + /* force processing all pending lock requests */ + if (ret == 0) + ret = finished_recovery(sb, 0, false); + + if (ret < 0) { + scoutfs_err(sb, "lock server saw err %d while timing out clients, shutting down", ret); + scoutfs_server_stop(sb); + } +} + +/* + * A client is leaving the lock service. They aren't using locks and + * won't send any more requests. 
We tear down all the state we had for + * them. This can be called multiple times for a given client as their + * farewell is resent to new servers. It's OK to not find any state. + * If we fail to delete a persistent entry then we have to shut down and + * hope that the next server has more luck. + */ +int scoutfs_lock_server_farewell(struct super_block *sb, u64 node_id) +{ + DECLARE_LOCK_SERVER_INFO(sb, inf); + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_lock_client_btree_key cli; + struct client_lock_entry *clent; + struct client_lock_entry *tmp; + struct server_lock_node *snode; + struct scoutfs_key key; + struct list_head *list; + bool freed; + int ret = 0; + + cli.node_id = cpu_to_be64(node_id); + mutex_lock(&inf->mutex); + ret = scoutfs_btree_delete(sb, &super->lock_clients, &cli, sizeof(cli)); + mutex_unlock(&inf->mutex); + if (ret == -ENOENT) { + ret = 0; + goto out; + } + if (ret < 0) + goto out; + + scoutfs_key_set_zeros(&key); + + while ((snode = get_server_lock(inf, &key, NULL, true))) { + + freed = false; + for (list = &snode->granted; list != NULL; + list = (list == &snode->granted) ? &snode->requested : + (list == &snode->requested) ? &snode->invalidated : + NULL) { + + list_for_each_entry_safe(clent, tmp, list, head) { + if (clent->node_id == node_id) { + free_client_entry(inf, snode, clent); + freed = true; + } + } + } + + key = snode->key; + scoutfs_key_inc(&key); + + if (freed) { + ret = process_waiting_requests(sb, snode); + if (ret) + goto out; + } else { + put_server_lock(inf, snode); + } + } + ret = 0; + +out: + if (ret < 0) { + scoutfs_err(sb, "lock server err %d during node %llu farewell, shutting down", ret, node_id); + scoutfs_server_stop(sb); + } + + return ret; +} + static char *lock_mode_string(u8 mode) { static char *mode_strings[] = { @@ -566,17 +919,35 @@ static void lock_server_tseq_show(struct seq_file *m, clent->net_id); } +/* + * Setup the lock server. 
This is called before networking can deliver + * requests. If we find existing client records then we enter recovery. + * Lock request processing is deferred until recovery is resolved for + * all the existing clients, either they reconnect and replay locks or + * we time them out. + */ int scoutfs_lock_server_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; struct lock_server_info *inf; + SCOUTFS_BTREE_ITEM_REF(iref); + struct scoutfs_lock_client_btree_key cbk; + unsigned int nr; + u64 node_id; + int ret; inf = kzalloc(sizeof(struct lock_server_info), GFP_KERNEL); if (!inf) return -ENOMEM; + inf->sb = sb; spin_lock_init(&inf->lock); + mutex_init(&inf->mutex); inf->locks_root = RB_ROOT; + scoutfs_spbm_init(&inf->recovery_pending); + INIT_DELAYED_WORK(&inf->recovery_dwork, + scoutfs_lock_server_recovery_timeout); scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show); inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root, @@ -588,7 +959,37 @@ int scoutfs_lock_server_setup(struct super_block *sb) sbi->lock_server_info = inf; - return 0; + /* we enter recovery if there are any client records */ + nr = 0; + for (node_id = 0; ; node_id++) { + cbk.node_id = cpu_to_be64(node_id); + ret = scoutfs_btree_next(sb, &super->lock_clients, + &cbk, sizeof(cbk), &iref); + if (ret == -ENOENT) + break; + if (ret == 0) + ret = node_id_and_put_iref(&iref, &node_id); + if (ret < 0) + goto out; + + ret = scoutfs_spbm_set(&inf->recovery_pending, node_id); + if (ret) + goto out; + nr++; + + if (node_id == U64_MAX) + break; + } + ret = 0; + + if (nr) { + schedule_delayed_work(&inf->recovery_dwork, + msecs_to_jiffies(LOCK_SERVER_RECOVERY_MS)); + scoutfs_warn(sb, "waiting for %u lock clients to connect", nr); + } + +out: + return ret; } /* @@ -606,6 +1007,8 @@ void scoutfs_lock_server_destroy(struct super_block *sb) LIST_HEAD(list); if (inf) { + 
cancel_delayed_work_sync(&inf->recovery_dwork); + debugfs_remove(inf->tseq_dentry); rbtree_postorder_for_each_entry_safe(snode, stmp, @@ -624,6 +1027,8 @@ void scoutfs_lock_server_destroy(struct super_block *sb) kfree(snode); } + scoutfs_spbm_destroy(&inf->recovery_pending); + kfree(inf); sbi->lock_server_info = NULL; } diff --git a/kmod/src/lock_server.h b/kmod/src/lock_server.h index 2b6f4b1f..cc0606a8 100644 --- a/kmod/src/lock_server.h +++ b/kmod/src/lock_server.h @@ -1,10 +1,15 @@ #ifndef _SCOUTFS_LOCK_SERVER_H_ #define _SCOUTFS_LOCK_SERVER_H_ +int scoutfs_lock_server_recover_response(struct super_block *sb, u64 node_id, + struct scoutfs_net_lock_recover *nlr); int scoutfs_lock_server_request(struct super_block *sb, u64 node_id, u64 net_id, struct scoutfs_net_lock *nl); +int scoutfs_lock_server_greeting(struct super_block *sb, u64 node_id, + bool should_exist); int scoutfs_lock_server_response(struct super_block *sb, u64 node_id, struct scoutfs_net_lock *nl); +int scoutfs_lock_server_farewell(struct super_block *sb, u64 node_id); int scoutfs_lock_server_setup(struct super_block *sb); void scoutfs_lock_server_destroy(struct super_block *sb); diff --git a/kmod/src/server.c b/kmod/src/server.c index 1077afe5..4f76a737 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1066,6 +1066,39 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 node_id, nl, sizeof(*nl)); } +static bool invalid_recover(struct scoutfs_net_lock_recover *nlr, + unsigned long bytes) +{ + return ((bytes < sizeof(*nlr)) || + (bytes != offsetof(struct scoutfs_net_lock_recover, + locks[le16_to_cpu(nlr->nr)]))); +} + +static int lock_recover_response(struct super_block *sb, + struct scoutfs_net_connection *conn, + void *resp, unsigned int resp_len, + int error, void *data) +{ + u64 node_id = scoutfs_net_client_node_id(conn); + + if (invalid_recover(resp, resp_len)) + return -EINVAL; + + return scoutfs_lock_server_recover_response(sb, node_id, resp); +} + +int 
scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id, + struct scoutfs_key *key) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + + return scoutfs_net_submit_request_node(sb, server->conn, node_id, + SCOUTFS_NET_CMD_LOCK_RECOVER, + key, sizeof(*key), + lock_recover_response, + NULL, NULL); +} + /* * Process an incoming greeting request in the server from the client. * We try to send responses to failed greetings so that the sender can @@ -1083,6 +1116,14 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 node_id, * disconnect before they receive the response and resent and initial * blank greeting. We could use a client uuid to associate with * allocated node_ids. + * + * XXX The logic of this has gotten convoluted. The lock server can + * send a recovery request so it needs to be called after the core net + * greeting call enables messages. But we want the greeting reply to be + * sent first, so we currently queue it on the send queue before + * enabling messages. That means that a lot of errors that happen after + * the reply can't be sent to the client. They'll just see a disconnect + * and won't know what's happened. This all needs to be refactored. 
*/ static int server_greeting(struct super_block *sb, struct scoutfs_net_connection *conn, @@ -1098,10 +1139,11 @@ static int server_greeting(struct super_block *sb, bool first_contact; bool farewell; int ret = 0; + int err; if (arg_len != sizeof(struct scoutfs_net_greeting)) { ret = -EINVAL; - goto out; + goto send_err; } if (gr->fsid != super->hdr.fsid) { @@ -1109,7 +1151,7 @@ static int server_greeting(struct super_block *sb, le64_to_cpu(gr->fsid), le64_to_cpu(super->hdr.fsid)); ret = -EINVAL; - goto out; + goto send_err; } if (gr->format_hash != super->format_hash) { @@ -1117,7 +1159,7 @@ static int server_greeting(struct super_block *sb, le64_to_cpu(gr->format_hash), le64_to_cpu(super->format_hash)); ret = -EINVAL; - goto out; + goto send_err; } if (gr->node_id == 0) { @@ -1131,35 +1173,58 @@ static int server_greeting(struct super_block *sb, queue_commit_work(server, &cw); up_read(&server->commit_rwsem); ret = wait_for_commit(&cw); - if (ret) { - node_id = 0; - goto out; - } } else { node_id = gr->node_id; } +send_err: + err = ret; + if (err) + node_id = 0; + greet.fsid = super->hdr.fsid; greet.format_hash = super->format_hash; greet.server_term = cpu_to_le64(server->term); greet.node_id = node_id; greet.flags = 0; -out: - ret = scoutfs_net_response(sb, conn, cmd, id, ret, - &greet, sizeof(greet)); - if (node_id != 0 && ret == 0) { - sent_node_id = gr->node_id != 0; - first_contact = le64_to_cpu(gr->server_term) != server->term; - if (gr->flags & cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_FAREWELL)) - farewell = true; - else - farewell = false; - scoutfs_net_server_greeting(sb, conn, le64_to_cpu(node_id), id, - sent_node_id, first_contact, - farewell); + /* queue greeting response to be sent first once messaging enabled */ + ret = scoutfs_net_response(sb, conn, cmd, id, err, + &greet, sizeof(greet)); + if (ret == 0 && err) + ret = err; + if (ret) + goto out; + + /* have the net core enable messaging and resend */ + sent_node_id = gr->node_id != 0; + first_contact 
= le64_to_cpu(gr->server_term) != server->term; + if (gr->flags & cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_FAREWELL)) + farewell = true; + else + farewell = false; + + scoutfs_net_server_greeting(sb, conn, le64_to_cpu(node_id), id, + sent_node_id, first_contact, farewell); + + /* lock server might send recovery request */ + if (le64_to_cpu(gr->server_term) != server->term) { + + /* we're now doing two commits per greeting, not great */ + down_read(&server->commit_rwsem); + + ret = scoutfs_lock_server_greeting(sb, le64_to_cpu(node_id), + gr->server_term != 0); + if (ret == 0) + queue_commit_work(server, &cw); + up_read(&server->commit_rwsem); + if (ret == 0) + ret = wait_for_commit(&cw); + if (ret) + goto out; } +out: return ret; } @@ -1178,12 +1243,25 @@ static int server_farewell(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) { + struct server_info *server = SCOUTFS_SB(sb)->server_info; + u64 node_id = scoutfs_net_client_node_id(conn); + struct commit_waiter cw; + int ret; + if (arg_len != 0) return -EINVAL; scoutfs_net_server_farewell(sb, conn); - return scoutfs_net_response(sb, conn, cmd, id, 0, NULL, 0); + down_read(&server->commit_rwsem); + ret = scoutfs_lock_server_farewell(sb, node_id); + if (ret == 0) + queue_commit_work(server, &cw); + up_read(&server->commit_rwsem); + if (ret == 0) + ret = wait_for_commit(&cw); + + return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0); } /* requests sent to clients are tracked so we can free resources */ diff --git a/kmod/src/server.h b/kmod/src/server.h index a843e437..9d646813 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -68,6 +68,8 @@ int scoutfs_server_lock_request(struct super_block *sb, u64 node_id, struct scoutfs_net_lock *nl); int scoutfs_server_lock_response(struct super_block *sb, u64 node_id, u64 id, struct scoutfs_net_lock *nl); +int scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id, + struct scoutfs_key *key); struct 
sockaddr_in; int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin,