scoutfs/kmod/src/client.c
Zach Brown 288d781645 scoutfs: start and stop server with quorum
Currently all mounts try to get a dlm lock which gives them exclusive
access to become the server for the filesystem.  That isn't going to
work if we're moving to locking provided by the server.

This uses quorum election to determine who should run the server.  We
switch from long-running server work that blocked trying to get the
lock to explicit calls which start and stop the server.

Signed-off-by: Zach Brown <zab@versity.com>
2019-04-12 10:54:07 -07:00


/*
* Copyright (C) 2017 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/ioctls.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <asm/barrier.h>
#include "format.h"
#include "counters.h"
#include "inode.h"
#include "btree.h"
#include "manifest.h"
#include "seg.h"
#include "compact.h"
#include "scoutfs_trace.h"
#include "msg.h"
#include "client.h"
#include "net.h"
#include "endian_swap.h"
#include "quorum.h"
/*
* The client is responsible for maintaining a connection to the server.
* This includes managing quorum elections that determine which client
* should run the server that all the clients connect to.
*/
#define CLIENT_CONNECT_DELAY_MS (MSEC_PER_SEC / 10)
#define CLIENT_CONNECT_TIMEOUT_MS (1 * MSEC_PER_SEC)
#define CLIENT_QUORUM_TIMEOUT_MS (5 * MSEC_PER_SEC)
struct client_info {
struct super_block *sb;
struct scoutfs_net_connection *conn;
struct completion node_id_comp;
atomic_t shutting_down;
struct workqueue_struct *workq;
struct work_struct connect_work;
struct scoutfs_quorum_elected_info qei;
u64 old_elected_nr;
};
/*
* Ask for a new run of allocated inode numbers. The server can return
* fewer than @count. A response of nr == 0 means the server has run
* out, which we return as -ENOSPC.
*/
int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
u64 *ino, u64 *nr)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
struct scoutfs_net_inode_alloc ial;
__le64 lecount = cpu_to_le64(count);
int ret;
ret = scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_ALLOC_INODES,
&lecount, sizeof(lecount),
&ial, sizeof(ial));
if (ret == 0) {
*ino = le64_to_cpu(ial.ino);
*nr = le64_to_cpu(ial.nr);
if (*nr == 0)
ret = -ENOSPC;
else if (*ino + *nr < *ino)
ret = -EINVAL;
}
return ret;
}
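/*
* For illustration only, a hypothetical caller (fill_pool() here is
* made up) might refill a local run of inode numbers like so:
*
*	u64 ino, nr;
*	int err = scoutfs_client_alloc_inodes(sb, 64, &ino, &nr);
*
*	if (err == 0)
*		fill_pool(ino, nr);
*
* with -ENOSPC meaning the server has run out entirely.
*/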
/*
* Ask the server for an extent of at most @blocks blocks. It can return
* smaller extents.
*/
int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start,
u64 *len)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
__le64 leblocks = cpu_to_le64(blocks);
struct scoutfs_net_extent nex;
int ret;
ret = scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_ALLOC_EXTENT,
&leblocks, sizeof(leblocks),
&nex, sizeof(nex));
if (ret == 0) {
if (nex.len == 0) {
ret = -ENOSPC;
} else {
*start = le64_to_cpu(nex.start);
*len = le64_to_cpu(nex.len);
}
}
return ret;
}
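/*
* Send the server a list of extents to free. The request payload is
* variable length, so its size is computed from the number of extents
* in the list rather than a fixed sizeof().
*/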
int scoutfs_client_free_extents(struct super_block *sb,
struct scoutfs_net_extent_list *nexl)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
unsigned int bytes;
bytes = SCOUTFS_NET_EXTENT_LIST_BYTES(le64_to_cpu(nexl->nr));
return scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_FREE_EXTENTS,
nexl, bytes, NULL, 0);
}
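/*
* Ask the server for a free segment number. The server returns a
* segno of 0 when it has run out, which we map to -ENOSPC. (Zero is
* zero in either byte order, so testing the raw __le64 is safe.)
*/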
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
__le64 lesegno;
int ret;
ret = scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_ALLOC_SEGNO,
NULL, 0, &lesegno, sizeof(lesegno));
if (ret == 0) {
if (lesegno == 0)
ret = -ENOSPC;
else
*segno = le64_to_cpu(lesegno);
}
return ret;
}
int scoutfs_client_record_segment(struct super_block *sb,
struct scoutfs_segment *seg, u8 level)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
struct scoutfs_net_manifest_entry net_ment;
struct scoutfs_manifest_entry ment;
scoutfs_seg_init_ment(&ment, level, seg);
scoutfs_init_ment_to_net(&net_ment, &ment);
return scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_RECORD_SEGMENT,
&net_ment, sizeof(net_ment), NULL, 0);
}
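/*
* Advance our sequence number. We send the sequence we were last
* given and the server replies with the newly advanced sequence to
* use.
*/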
int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
__le64 before = cpu_to_le64p(seq);
__le64 after;
int ret;
ret = scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_ADVANCE_SEQ,
&before, sizeof(before),
&after, sizeof(after));
if (ret == 0)
*seq = le64_to_cpu(after);
return ret;
}
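/*
* Ask the server for the current last sequence number; see
* scoutfs_client_advance_seq() above for how ours moves forward.
*/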
int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
__le64 last_seq;
int ret;
ret = scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_GET_LAST_SEQ,
NULL, 0, &last_seq, sizeof(last_seq));
if (ret == 0)
*seq = le64_to_cpu(last_seq);
return ret;
}
int scoutfs_client_get_manifest_root(struct super_block *sb,
struct scoutfs_btree_root *root)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
return scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_GET_MANIFEST_ROOT,
NULL, 0, root,
sizeof(struct scoutfs_btree_root));
}
int scoutfs_client_statfs(struct super_block *sb,
struct scoutfs_net_statfs *nstatfs)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
return scoutfs_net_sync_request(sb, client->conn,
SCOUTFS_NET_CMD_STATFS, NULL, 0,
nstatfs,
sizeof(struct scoutfs_net_statfs));
}
/* process an incoming grant response from the server */
static int client_lock_response(struct super_block *sb,
struct scoutfs_net_connection *conn,
void *resp, unsigned int resp_len,
int error, void *data)
{
if (resp_len != sizeof(struct scoutfs_net_lock))
return -EINVAL;
/* XXX error? */
return scoutfs_lock_grant_response(sb, resp);
}
/* Send a lock request to the server. */
int scoutfs_client_lock_request(struct super_block *sb,
struct scoutfs_net_lock *nl)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
return scoutfs_net_submit_request(sb, client->conn,
SCOUTFS_NET_CMD_LOCK,
nl, sizeof(*nl),
client_lock_response, NULL, NULL);
}
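/*
* A sketch of the two lock flows through this file, with the server's
* side implied:
*
*	grant:	    scoutfs_client_lock_request() ->
*		    <- response: client_lock_response()
*				 -> scoutfs_lock_grant_response()
*
*	invalidate: <- request: client_lock()
*				-> scoutfs_lock_invalidate_request()
*		    scoutfs_client_lock_response() ->
*/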
/* Send a lock response to the server. */
int scoutfs_client_lock_response(struct super_block *sb, u64 net_id,
struct scoutfs_net_lock *nl)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
return scoutfs_net_response(sb, client->conn, SCOUTFS_NET_CMD_LOCK,
net_id, 0, nl, sizeof(*nl));
}
/* The client is receiving an invalidation request from the server */
static int client_lock(struct super_block *sb,
struct scoutfs_net_connection *conn, u8 cmd, u64 id,
void *arg, u16 arg_len)
{
if (arg_len != sizeof(struct scoutfs_net_lock))
return -EINVAL;
/* XXX error? */
return scoutfs_lock_invalidate_request(sb, id, arg);
}
/*
* Process a greeting response in the client from the server. This is
* called for every connected socket on the connection. The first
* response will have the node_id that the server assigned the client.
*/
static int client_greeting(struct super_block *sb,
struct scoutfs_net_connection *conn,
void *resp, unsigned int resp_len, int error,
void *data)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_net_greeting *gr = resp;
int ret = 0;
if (error) {
ret = error;
goto out;
}
if (resp_len != sizeof(struct scoutfs_net_greeting)) {
ret = -EINVAL;
goto out;
}
if (gr->fsid != super->id) {
scoutfs_warn(sb, "server sent fsid 0x%llx, client has 0x%llx",
le64_to_cpu(gr->fsid),
le64_to_cpu(super->id));
ret = -EINVAL;
goto out;
}
if (gr->format_hash != super->format_hash) {
scoutfs_warn(sb, "server sent format 0x%llx, client has 0x%llx",
le64_to_cpu(gr->format_hash),
le64_to_cpu(super->format_hash));
ret = -EINVAL;
goto out;
}
if (sbi->node_id != 0 && le64_to_cpu(gr->node_id) != sbi->node_id) {
scoutfs_warn(sb, "server sent node_id %llu, client has %llu",
le64_to_cpu(gr->node_id),
sbi->node_id);
ret = -EINVAL;
goto out;
}
if (sbi->node_id == 0 && gr->node_id == 0) {
scoutfs_warn(sb, "server sent node_id 0, client also has 0\n");
ret = -EINVAL;
goto out;
}
if (sbi->node_id == 0) {
sbi->node_id = le64_to_cpu(gr->node_id);
complete(&client->node_id_comp);
}
out:
return ret;
}
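/*
* To summarize the checks above: the greeting carries the fsid and
* format hash, which must match our super block, and the node_id. A
* node_id of 0 from the client asks the server to assign one; the
* first response carrying our assigned node_id completes node_id_comp
* for waiters in scoutfs_client_wait_node_id().
*/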
/*
* If the previous election told us to start the server then stop it
* and wipe the old election info. If we're not fast enough to clear
* the election block then the next server might fence us. That should
* be very unlikely, as election requires multiple RMW cycles.
*/
static void stop_our_server(struct super_block *sb,
struct scoutfs_quorum_elected_info *qei)
{
if (qei->run_server) {
scoutfs_server_stop(sb);
scoutfs_quorum_clear_elected(sb, qei);
memset(qei, 0, sizeof(*qei));
}
}
/*
* This work is responsible for managing leader elections, running the
* server, and connecting clients to the server.
*
* In the typical case a mount reads the quorum blocks, finds the
* address of the currently running server, and connects to it.
*
* More rarely, clients who aren't connected and are configured to
* participate in quorum need to elect a new leader. The elected info
* filled by quorum tells us if we were elected to run the server.
*
* This leads to the possibility that the mount which is running the
* server has its own client connection disconnect. This is only
* weirdly different from other clients disconnecting and trying to
* reconnect because of the way quorum slots are reconfigured and
* reclaimed. If we connect to a server with the new quorum config
* then we can't have any old servers running in the stale old quorum
* slot. The simplest way to guarantee this is to *always* stop the
* server if we're running it and we got disconnected. It's a big
* hammer, but it's reliable, and arguably if *we* couldn't use *our*
* server then something bad is happening and someone else should be
* the server.
*
* This only executes on mount, on error, or as a connection
* disconnects, and there's only ever one instance executing.
*/
static void scoutfs_client_connect_worker(struct work_struct *work)
{
struct client_info *client = container_of(work, struct client_info,
connect_work);
struct super_block *sb = client->sb;
struct scoutfs_quorum_elected_info *qei = &client->qei;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct mount_options *opts = &sbi->opts;
struct scoutfs_net_greeting greet;
ktime_t timeout_abs;
int ret;
/* don't try quorum and connecting while our mount runs a server */
stop_our_server(sb, qei);
timeout_abs = ktime_add_ms(ktime_get(), CLIENT_QUORUM_TIMEOUT_MS);
ret = scoutfs_quorum_election(sb, opts->uniq_name,
client->old_elected_nr,
timeout_abs, qei);
if (ret)
goto out;
if (qei->run_server) {
ret = scoutfs_server_start(sb, &qei->sin);
if (ret) {
/* forget that we tried to start the server */
memset(qei, 0, sizeof(*qei));
goto out;
}
}
/* always give the server some time before connecting */
msleep(CLIENT_CONNECT_DELAY_MS);
ret = scoutfs_net_connect(sb, client->conn, &qei->sin,
CLIENT_CONNECT_TIMEOUT_MS);
if (ret) {
/* we couldn't connect, try electing a new server */
client->old_elected_nr = qei->elected_nr;
goto out;
}
/* trust this server again if it's still around after we disconnect */
client->old_elected_nr = 0;
/* send a greeting to verify endpoints of each connection */
greet.fsid = super->id;
greet.format_hash = super->format_hash;
greet.node_id = cpu_to_le64(sbi->node_id);
ret = scoutfs_net_submit_request(sb, client->conn,
SCOUTFS_NET_CMD_GREETING,
&greet, sizeof(greet),
client_greeting, NULL, NULL);
if (ret)
scoutfs_net_shutdown(sb, client->conn);
out:
if (ret && !atomic_read(&client->shutting_down))
queue_work(client->workq, &client->connect_work);
}
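/*
* In short, each pass of the worker above is:
*
*	stop_our_server()		stop if we were elected last time
*	scoutfs_quorum_election()	find, or become, the server
*	scoutfs_server_start()		only if we were just elected
*	scoutfs_net_connect()		connect to the elected address
*	greeting request		verify fsid/format, get node_id
*
* and any failure requeues the work unless we're shutting down.
*/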
/*
* Perform a compaction in the client as requested by the server. The
* server has protected the input segments and allocated the output
* segnos for us. This executes in work queued by the client's net
* connection. It only reads and writes segments. The server will
* update the manifest and allocators while processing the response. An
* error response includes the compaction id so that the server can
* clean it up.
*
* If we get duplicate requests across a reconnected socket we can have
* two workers performing the same compaction simultaneously. This
* isn't particularly efficient but it's rare and won't corrupt the
* output. Our response can be lost if the socket is shut down while
* it's in flight; the server deals with this.
*/
static int client_compact(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct scoutfs_net_compact_response *resp = NULL;
struct scoutfs_net_compact_request *req;
int ret;
if (arg_len != sizeof(struct scoutfs_net_compact_request)) {
ret = -EINVAL;
goto out;
}
req = arg;
trace_scoutfs_client_compact_start(sb, le64_to_cpu(req->id),
req->last_level, req->flags);
resp = kzalloc(sizeof(struct scoutfs_net_compact_response), GFP_NOFS);
if (!resp) {
ret = -ENOMEM;
} else {
resp->id = req->id;
ret = scoutfs_compact(sb, req, resp);
}
trace_scoutfs_client_compact_stop(sb, le64_to_cpu(req->id), ret);
if (ret < 0)
ret = scoutfs_net_response(sb, conn, cmd, id, ret,
&req->id, sizeof(req->id));
else
ret = scoutfs_net_response(sb, conn, cmd, id, 0,
resp, sizeof(*resp));
kfree(resp);
out:
return ret;
}
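/*
* Requests that arrive from the server are dispatched through this
* table by the net layer.
*/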
static scoutfs_net_request_t client_req_funcs[] = {
[SCOUTFS_NET_CMD_COMPACT] = client_compact,
[SCOUTFS_NET_CMD_LOCK] = client_lock,
};
/*
* Called when either a connect attempt or established connection times
* out and fails.
*/
static void client_notify_down(struct super_block *sb,
struct scoutfs_net_connection *conn, void *info,
u64 node_id)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
if (!atomic_read(&client->shutting_down))
queue_work(client->workq, &client->connect_work);
}
/*
* Wait for the first connected socket on the connection that assigns
* the node_id that will be used for the rest of the lifetime of the
* mount.
*/
int scoutfs_client_wait_node_id(struct super_block *sb)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
return wait_for_completion_interruptible(&client->node_id_comp);
}
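/*
* Set up the client's state and connection and queue the first
* connect attempt. Partial failures are unwound by calling
* scoutfs_client_destroy(), which tolerates missing pieces.
*/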
int scoutfs_client_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct client_info *client;
int ret;
client = kzalloc(sizeof(struct client_info), GFP_KERNEL);
if (!client) {
ret = -ENOMEM;
goto out;
}
sbi->client_info = client;
client->sb = sb;
init_completion(&client->node_id_comp);
atomic_set(&client->shutting_down, 0);
INIT_WORK(&client->connect_work, scoutfs_client_connect_worker);
client->conn = scoutfs_net_alloc_conn(sb, NULL, client_notify_down, 0,
client_req_funcs, "client");
if (!client->conn) {
ret = -ENOMEM;
goto out;
}
client->workq = alloc_workqueue("scoutfs_client_workq", WQ_UNBOUND, 1);
if (!client->workq) {
ret = -ENOMEM;
goto out;
}
queue_work(client->workq, &client->connect_work);
ret = 0;
out:
if (ret)
scoutfs_client_destroy(sb);
return ret;
}
/*
* There must be no more callers to the client request functions by the
* time we get here.
*/
void scoutfs_client_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct client_info *client = SCOUTFS_SB(sb)->client_info;
struct scoutfs_net_connection *conn;
if (client) {
/* stop notify_down from queueing connect work */
atomic_set(&client->shutting_down, 1);
/* make sure worker isn't using the conn */
cancel_work_sync(&client->connect_work);
/* make racing conn use explode */
conn = client->conn;
client->conn = NULL;
scoutfs_net_free_conn(sb, conn);
/* stop running the server if we were, harmless otherwise */
stop_our_server(sb, &client->qei);
if (client->workq)
destroy_workqueue(client->workq);
kfree(client);
sbi->client_info = NULL;
}
}