scoutfs/kmod/src/client.c

/*
 * Copyright (C) 2017 Versity Software, Inc.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/ioctls.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <asm/barrier.h>

#include "format.h"
#include "counters.h"
#include "inode.h"
#include "btree.h"
#include "manifest.h"
#include "seg.h"
#include "compact.h"
#include "scoutfs_trace.h"
#include "msg.h"
#include "client.h"
#include "net.h"
#include "endian_swap.h"
#include "quorum.h"

/*
 * The client is responsible for maintaining a connection to the server.
 * This includes managing quorum elections that determine which client
 * should run the server that all the clients connect to.
 */

#define CLIENT_CONNECT_DELAY_MS		(MSEC_PER_SEC / 10)
#define CLIENT_CONNECT_TIMEOUT_MS	(1 * MSEC_PER_SEC)
#define CLIENT_QUORUM_TIMEOUT_MS	(5 * MSEC_PER_SEC)

/*
 * Per-mount client state.  Allocated zeroed in scoutfs_client_setup(),
 * published in the sb info's client_info pointer, and freed in
 * scoutfs_client_destroy().
 */
struct client_info {
	/* the mount this client belongs to */
	struct super_block *sb;

	/* the one connection that every client request is sent over */
	struct scoutfs_net_connection *conn;
	/* set during destroy so that connect work is no longer requeued */
	atomic_t shutting_down;

	/* workqueue that runs connect_dwork */
	struct workqueue_struct *workq;
	/* runs scoutfs_client_connect_worker() */
	struct delayed_work connect_dwork;

	/* server term from the last greeting response, 0 until the first */
	u64 server_term;
	/* unmount barrier from the last greeting response */
	u64 greeting_umb;

	/* set by destroy before it submits the farewell request */
	bool sending_farewell;
	/* farewell result, valid once farewell_comp has completed */
	int farewell_error;
	/* completed by the farewell response or by the connect worker */
	struct completion farewell_comp;
};

/*
 * Request a fresh run of inode numbers from the server.  The server is
 * allowed to grant fewer than @count.  Whenever the request itself
 * succeeds *ino and *nr are filled from the reply, even if the reply is
 * then rejected: a reply with nr == 0 means the server has run out and
 * becomes -ENOSPC, and a range whose end wraps the u64 becomes -EINVAL.
 */
int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
				u64 *ino, u64 *nr)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	struct scoutfs_net_inode_alloc reply;
	__le64 wire_count = cpu_to_le64(count);
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_ALLOC_INODES,
				       &wire_count, sizeof(wire_count),
				       &reply, sizeof(reply));
	if (ret != 0)
		return ret;

	*ino = le64_to_cpu(reply.ino);
	*nr = le64_to_cpu(reply.nr);

	if (*nr == 0)
		return -ENOSPC;

	/* unsigned wrap: the end of the range must not be below its start */
	if (*ino + *nr < *ino)
		return -EINVAL;

	return 0;
}

/*
 * Request an extent of no more than @blocks blocks from the server,
 * which may hand back a smaller one.  *start and *len are only written
 * when a non-empty extent was returned; an empty extent is reported as
 * -ENOSPC.
 */
int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start,
				u64 *len)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	__le64 wire_blocks = cpu_to_le64(blocks);
	struct scoutfs_net_extent reply;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_ALLOC_EXTENT,
				       &wire_blocks, sizeof(wire_blocks),
				       &reply, sizeof(reply));
	if (ret != 0)
		return ret;

	/* zero is zero in either byte order, no conversion needed to test */
	if (reply.len == 0)
		return -ENOSPC;

	*start = le64_to_cpu(reply.start);
	*len = le64_to_cpu(reply.len);

	return 0;
}

/*
 * Send the server a list of extents to free.  The size of the request
 * payload is derived from the list's own nr field and no reply payload
 * is requested.
 */
int scoutfs_client_free_extents(struct super_block *sb,
				struct scoutfs_net_extent_list *nexl)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	unsigned int list_bytes =
		SCOUTFS_NET_EXTENT_LIST_BYTES(le64_to_cpu(nexl->nr));
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_FREE_EXTENTS,
				       nexl, list_bytes, NULL, 0);
	return ret;
}

/*
 * Ask the server for a segment number.  A reply of 0 is reported as
 * -ENOSPC and leaves *segno untouched.
 */
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	__le64 wire_segno;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_ALLOC_SEGNO,
				       NULL, 0,
				       &wire_segno, sizeof(wire_segno));
	if (ret != 0)
		return ret;

	if (wire_segno == 0)
		return -ENOSPC;

	*segno = le64_to_cpu(wire_segno);
	return 0;
}

/*
 * Describe @seg at @level as a manifest entry, convert the entry to its
 * network form, and send it to the server in a record segment request.
 * No reply payload is requested.
 */
int scoutfs_client_record_segment(struct super_block *sb,
				  struct scoutfs_segment *seg, u8 level)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	struct scoutfs_manifest_entry entry;
	struct scoutfs_net_manifest_entry wire_entry;
	int ret;

	/* build the native entry first, the net form is derived from it */
	scoutfs_seg_init_ment(&entry, level, seg);
	scoutfs_init_ment_to_net(&wire_entry, &entry);

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_RECORD_SEGMENT,
				       &wire_entry, sizeof(wire_entry),
				       NULL, 0);
	return ret;
}

/*
 * Send a get log trees request with no payload and have the reply
 * written directly into @lt.
 */
int scoutfs_client_get_log_trees(struct super_block *sb,
				 struct scoutfs_log_trees *lt)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_GET_LOG_TREES,
				       NULL, 0,
				       lt, sizeof(struct scoutfs_log_trees));
	return ret;
}

/*
 * Send @lt to the server as the payload of a commit log trees request.
 * No reply payload is requested.
 */
int scoutfs_client_commit_log_trees(struct super_block *sb,
				    struct scoutfs_log_trees *lt)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_COMMIT_LOG_TREES,
				       lt, sizeof(struct scoutfs_log_trees),
				       NULL, 0);
	return ret;
}

/*
 * Send the caller's current *seq to the server in an advance seq
 * request.  On success *seq is overwritten with the value the server
 * replied with; on failure it is left as it was.
 */
int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	__le64 wire_old = cpu_to_le64p(seq);
	__le64 wire_new;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_ADVANCE_SEQ,
				       &wire_old, sizeof(wire_old),
				       &wire_new, sizeof(wire_new));
	if (ret != 0)
		return ret;

	*seq = le64_to_cpu(wire_new);
	return 0;
}

/*
 * Ask the server for its last seq.  *seq is only written when the
 * request succeeds.
 */
int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	__le64 wire_seq;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_GET_LAST_SEQ,
				       NULL, 0,
				       &wire_seq, sizeof(wire_seq));
	if (ret != 0)
		return ret;

	*seq = le64_to_cpu(wire_seq);
	return 0;
}

/*
 * Send a get manifest root request with no payload and have the reply
 * written directly into @root.
 */
int scoutfs_client_get_manifest_root(struct super_block *sb,
				     struct scoutfs_btree_root *root)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_GET_MANIFEST_ROOT,
				       NULL, 0, root, sizeof(*root));
	return ret;
}

/*
 * Send a statfs request with no payload and have the reply written
 * directly into @nstatfs.
 */
int scoutfs_client_statfs(struct super_block *sb,
			  struct scoutfs_net_statfs *nstatfs)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn,
				       SCOUTFS_NET_CMD_STATFS,
				       NULL, 0, nstatfs, sizeof(*nstatfs));
	return ret;
}

/*
 * Response callback for lock requests: hand a grant from the server to
 * the lock code.  Anything that isn't exactly one scoutfs_net_lock is
 * rejected with -EINVAL.  The @error argument is not examined.
 */
static int client_lock_response(struct super_block *sb,
				struct scoutfs_net_connection *conn,
				void *resp, unsigned int resp_len,
				int error, void *data)
{
	/* XXX error? */

	if (resp_len == sizeof(struct scoutfs_net_lock))
		return scoutfs_lock_grant_response(sb, resp);

	return -EINVAL;
}

/*
 * Submit a lock request carrying @nl to the server.  This does not wait
 * for the reply; client_lock_response() is registered as the response
 * callback.
 */
int scoutfs_client_lock_request(struct super_block *sb,
				struct scoutfs_net_lock *nl)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_submit_request(sb, client->conn,
					 SCOUTFS_NET_CMD_LOCK,
					 nl, sizeof(struct scoutfs_net_lock),
					 client_lock_response, NULL, NULL);
	return ret;
}

/*
 * Reply to the server's lock request identified by @net_id.  The reply
 * carries @nl and an error code of 0.
 */
int scoutfs_client_lock_response(struct super_block *sb, u64 net_id,
				struct scoutfs_net_lock *nl)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_response(sb, client->conn, SCOUTFS_NET_CMD_LOCK,
				   net_id, 0,
				   nl, sizeof(struct scoutfs_net_lock));
	return ret;
}

/*
 * Reply to the server's lock recover request identified by @net_id.
 * Only the populated part of @nlr is sent: the payload ends after the
 * first nlr->nr elements of its locks[] array.
 */
int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id,
					 struct scoutfs_net_lock_recover *nlr)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	u16 nr_locks = le16_to_cpu(nlr->nr);
	u16 payload_len = offsetof(struct scoutfs_net_lock_recover,
				   locks[nr_locks]);
	int ret;

	ret = scoutfs_net_response(sb, client->conn,
				   SCOUTFS_NET_CMD_LOCK_RECOVER,
				   net_id, 0, nlr, payload_len);
	return ret;
}

/*
 * Request handler for SCOUTFS_NET_CMD_LOCK: the client is receiving an
 * invalidation request from the server.  The argument must be exactly
 * one scoutfs_net_lock or the request fails with -EINVAL.
 */
static int client_lock(struct super_block *sb,
		       struct scoutfs_net_connection *conn, u8 cmd, u64 id,
		       void *arg, u16 arg_len)
{
	/* XXX error? */

	if (arg_len == sizeof(struct scoutfs_net_lock))
		return scoutfs_lock_invalidate_request(sb, id, arg);

	return -EINVAL;
}

/*
 * Request handler for SCOUTFS_NET_CMD_LOCK_RECOVER: the server is
 * asking for this client's locks starting with the given key.  The
 * argument must be exactly one scoutfs_key or the request fails with
 * -EINVAL.
 */
static int client_lock_recover(struct super_block *sb,
			       struct scoutfs_net_connection *conn,
			       u8 cmd, u64 id, void *arg, u16 arg_len)
{
	/* XXX error? */

	if (arg_len == sizeof(struct scoutfs_key))
		return scoutfs_lock_recover_request(sb, id, arg);

	return -EINVAL;
}

/*
 * Process a greeting response in the client from the server.  This is
 * called for every connected socket on the connection.  Each response
 * contains the remote server's elected term which can be used to
 * identify server failover.
 */
static int client_greeting(struct super_block *sb,
			   struct scoutfs_net_connection *conn,
			   void *resp, unsigned int resp_len, int error,
			   void *data)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_net_greeting *gr = resp;
	bool new_server;
	int ret;

	if (error) {
		ret = error;
		goto out;
	}

	if (resp_len != sizeof(struct scoutfs_net_greeting)) {
		ret = -EINVAL;
		goto out;
	}

	if (gr->fsid != super->hdr.fsid) {
		scoutfs_warn(sb, "server sent fsid 0x%llx, client has 0x%llx",
			     le64_to_cpu(gr->fsid),
			     le64_to_cpu(super->hdr.fsid));
		ret = -EINVAL;
		goto out;
	}

	if (gr->format_hash != super->format_hash) {
		scoutfs_warn(sb, "server sent format 0x%llx, client has 0x%llx",
			     le64_to_cpu(gr->format_hash),
			     le64_to_cpu(super->format_hash));
		ret = -EINVAL;
		goto out;
	}

	new_server = le64_to_cpu(gr->server_term) != client->server_term;
	scoutfs_net_client_greeting(sb, conn, new_server);

	client->server_term = le64_to_cpu(gr->server_term);
	client->greeting_umb = le64_to_cpu(gr->unmount_barrier);
	ret = 0;
out:
	return ret;
}

/*
 * This work is responsible for maintaining a connection from the client
 * to the server.  It's queued on mount and disconnect and we requeue
 * the work if the work fails and we're not shutting down.
 *
 * In the typical case a mount reads the super blocks and finds the
 * address of the currently running server and connects to it.
 * Non-voting clients who can't connect will keep trying, alternating
 * between reading the address and getting connect timeouts.
 *
 * Voting mounts will try to elect a leader if they can't connect to the
 * server.  When a quorum can't connect and is able to elect a leader
 * then a new server is started.  The new server will write its address
 * in the super and everyone will be able to connect.
 *
 * There's a tricky bit of coordination required to safely unmount.
 * Clients need to tell the server that they won't be coming back with a
 * farewell request.  Once a client receives its farewell response it
 * can exit.  But a majority of clients need to stick around to elect a
 * server to process all their farewell requests.  This is coordinated
 * by having the greeting tell the server that a client is a voter.  The
 * server then holds on to farewell requests from voters until only
 * requests from the final quorum remain.  These farewell responses are
 * only sent after updating an unmount barrier in the super to indicate
 * to the final quorum that they can safely exit without having received
 * a farewell response over the network.
 */
static void scoutfs_client_connect_worker(struct work_struct *work)
{
	struct client_info *client = container_of(work, struct client_info,
						  connect_dwork.work);
	struct super_block *sb = client->sb;
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super = NULL;
	struct mount_options *opts = &sbi->opts;
	/* a mount is a voter when it was given a server address option */
	const bool am_voter = opts->server_addr.sin_addr.s_addr != 0;
	struct scoutfs_net_greeting greet;
	struct sockaddr_in sin;
	ktime_t timeout_abs;
	/*
	 * Tested after the election regardless of its return code, so it
	 * must have a defined value even if a failed election never
	 * stores to it.  0 means "don't start a server".
	 */
	u64 elected_term = 0;
	int ret;

	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
	if (!super) {
		ret = -ENOMEM;
		goto out;
	}

	/* always work from a fresh copy of the super */
	ret = scoutfs_read_super(sb, super);
	if (ret)
		goto out;

	/* can safely unmount if we see that server processed our farewell */
	if (am_voter && client->sending_farewell &&
	    (le64_to_cpu(super->unmount_barrier) > client->greeting_umb)) {
		client->farewell_error = 0;
		complete(&client->farewell_comp);
		ret = 0;
		goto out;
	}

	/* try to connect to the super's server address */
	scoutfs_addr_to_sin(&sin, &super->server_addr);
	if (sin.sin_addr.s_addr != 0 && sin.sin_port != 0)
		ret = scoutfs_net_connect(sb, client->conn, &sin,
					  CLIENT_CONNECT_TIMEOUT_MS);
	else
		ret = -ENOTCONN;

	/* voters try to elect a leader if they couldn't connect */
	if (ret < 0) {
		/* non-voters will keep retrying */
		if (!am_voter)
			goto out;

		/* make sure local server isn't writing super during votes */
		scoutfs_server_stop(sb);

		timeout_abs = ktime_add_ms(ktime_get(),
					   CLIENT_QUORUM_TIMEOUT_MS);

		ret = scoutfs_quorum_election(sb, timeout_abs,
					le64_to_cpu(super->quorum_server_term),
					&elected_term);
		/* start the server if we were asked to */
		if (elected_term > 0)
			ret = scoutfs_server_start(sb, &opts->server_addr,
						   elected_term);
		/*
		 * Whatever the election and server start returned, we
		 * still aren't connected.  Report that so the work is
		 * requeued and the next pass rereads the super and tries
		 * to connect again.
		 */
		ret = -ENOTCONN;
		goto out;
	}

	/* send a greeting to verify endpoints of each connection */
	greet.fsid = super->hdr.fsid;
	greet.format_hash = super->format_hash;
	greet.server_term = cpu_to_le64(client->server_term);
	greet.unmount_barrier = cpu_to_le64(client->greeting_umb);
	greet.rid = cpu_to_le64(sbi->rid);
	greet.flags = 0;
	if (client->sending_farewell)
		greet.flags |= cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_FAREWELL);
	if (am_voter)
		greet.flags |= cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_VOTER);

	ret = scoutfs_net_submit_request(sb, client->conn,
					 SCOUTFS_NET_CMD_GREETING,
					 &greet, sizeof(greet),
					 client_greeting, NULL, NULL);
	if (ret)
		scoutfs_net_shutdown(sb, client->conn);
out:
	/* super is still NULL if the allocation failed, kfree accepts that */
	kfree(super);

	/* always have a small delay before retrying to avoid storms */
	if (ret && !atomic_read(&client->shutting_down))
		queue_delayed_work(client->workq, &client->connect_dwork,
				   msecs_to_jiffies(CLIENT_CONNECT_DELAY_MS));
}

/*
 * Request handler for SCOUTFS_NET_CMD_COMPACT: perform a compaction in
 * the client on behalf of the server.  The server has protected the
 * input segments and allocated the output segnos for us.  This executes
 * in work queued by the client's net connection.  It only reads and
 * writes segments.  The server will update the manifest and allocators
 * while processing the response.  An error response includes the
 * compaction id so that the server can clean it up.
 *
 * If we get duplicate requests across a reconnected socket we can have
 * two workers performing the same compaction simultaneously.  This
 * isn't particularly efficient but it's rare and won't corrupt the
 * output.  Our response can be lost if the socket is shut down while
 * it's in flight; the server deals with this.
 *
 * A request of the wrong size returns -EINVAL without sending any
 * response.
 */
static int client_compact(struct super_block *sb,
			  struct scoutfs_net_connection *conn,
			  u8 cmd, u64 id, void *arg, u16 arg_len)
{
	struct scoutfs_net_compact_request *req = arg;
	struct scoutfs_net_compact_response *resp;
	int ret;

	/* req isn't dereferenced until its length has been checked */
	if (arg_len != sizeof(struct scoutfs_net_compact_request))
		return -EINVAL;

	trace_scoutfs_client_compact_start(sb, le64_to_cpu(req->id),
					   req->last_level, req->flags);

	resp = kzalloc(sizeof(*resp), GFP_NOFS);
	if (resp) {
		resp->id = req->id;
		ret = scoutfs_compact(sb, req, resp);
	} else {
		ret = -ENOMEM;
	}

	trace_scoutfs_client_compact_stop(sb, le64_to_cpu(req->id), ret);

	/* success sends the whole response, failure only the id and errno */
	if (ret >= 0)
		ret = scoutfs_net_response(sb, conn, cmd, id, 0,
					   resp, sizeof(*resp));
	else
		ret = scoutfs_net_response(sb, conn, cmd, id, ret,
					   &req->id, sizeof(req->id));

	kfree(resp);
	return ret;
}

/*
 * Handlers for requests that arrive at the client, indexed by net
 * command.  The table is handed to scoutfs_net_alloc_conn() when the
 * client's connection is allocated.
 */
static scoutfs_net_request_t client_req_funcs[] = {
	[SCOUTFS_NET_CMD_COMPACT]		= client_compact,
	[SCOUTFS_NET_CMD_LOCK]			= client_lock,
	[SCOUTFS_NET_CMD_LOCK_RECOVER]		= client_lock_recover,
};

/*
 * Called when either a connect attempt or an established connection
 * times out and fails.  Unless the client is being torn down the
 * connect work is queued to run without delay.
 */
static void client_notify_down(struct super_block *sb,
			       struct scoutfs_net_connection *conn, void *info,
			       u64 rid)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;

	if (atomic_read(&client->shutting_down))
		return;

	queue_delayed_work(client->workq, &client->connect_dwork, 0);
}

/*
 * Allocate and publish the client state for a mount, allocate its
 * connection and workqueue, and queue the first connect attempt.
 *
 * Every failure here is an allocation failure, so -ENOMEM is returned
 * after scoutfs_client_destroy() has torn down whatever was set up.
 */
int scoutfs_client_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct client_info *client;
	int ret = -ENOMEM;

	client = kzalloc(sizeof(*client), GFP_KERNEL);
	if (client == NULL)
		goto fail;
	sbi->client_info = client;

	client->sb = sb;
	atomic_set(&client->shutting_down, 0);
	INIT_DELAYED_WORK(&client->connect_dwork,
			  scoutfs_client_connect_worker);
	init_completion(&client->farewell_comp);

	client->conn = scoutfs_net_alloc_conn(sb, NULL, client_notify_down, 0,
					      client_req_funcs, "client");
	if (client->conn == NULL)
		goto fail;

	client->workq = alloc_workqueue("scoutfs_client_workq", WQ_UNBOUND, 1);
	if (client->workq == NULL)
		goto fail;

	/* kick off the first connection attempt right away */
	queue_delayed_work(client->workq, &client->connect_dwork, 0);
	return 0;

fail:
	scoutfs_client_destroy(sb);
	return ret;
}

/*
 * Response callback for the farewell request.  Once we get a response
 * from the server we can shut down: the response's error is recorded
 * and the farewell completion is signalled.  A response that carries a
 * payload is rejected with -EINVAL and does not signal the completion.
 */
static int client_farewell_response(struct super_block *sb,
				    struct scoutfs_net_connection *conn,
				    void *resp, unsigned int resp_len,
				    int error, void *data)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;

	if (resp_len == 0) {
		client->farewell_error = error;
		complete(&client->farewell_comp);
		return 0;
	}

	return -EINVAL;
}

/*
 * There must be no more callers to the client request functions by the
 * time we get here.
 *
 * If we've connected to a server then we send them a farewell request
 * so that they don't wait for us to reconnect and trigger a timeout.
 *
 * This decision is a little racy.  The server considers us connected
 * when it records a persistent record of our rid as it processes our
 * greeting.  We can disconnect before receiving the greeting response
 * and leave without sending a farewell.  So given that awkward initial
 * race, we also have a bit of a race where we just test the server_term
 * to see if we've ever gotten a greeting reply from any server.  We
 * don't try to synchronize with pending connection attempts.
 *
 * The consequences of aborting a mount at just the wrong time and
 * disconnecting without the farewell handshake depend on what the
 * server does to timed out clients.  At best it'll spit out a warning
 * message that a client disconnected but it won't fence us if we didn't
 * have any persistent state.
 */
void scoutfs_client_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	struct scoutfs_net_connection *conn;
	int ret;

	if (client == NULL)
		return;

	if (client->server_term != 0) {
		client->sending_farewell = true;
		ret = scoutfs_net_submit_request(sb, client->conn,
						 SCOUTFS_NET_CMD_FAREWELL,
						 NULL, 0,
						 client_farewell_response,
						 NULL, NULL);
		if (ret == 0) {
			ret = wait_for_completion_interruptible(
							&client->farewell_comp);
			if (ret == 0)
				ret = client->farewell_error;
		}
		if (ret) {
			scoutfs_inc_counter(sb, client_farewell_error);
			scoutfs_warn(sb, "client saw farewell error %d, server might see client connection time out", ret);
		}
	}

	/* stop notify_down from queueing connect work */
	atomic_set(&client->shutting_down, 1);

	/* make sure worker isn't using the conn */
	cancel_delayed_work_sync(&client->connect_dwork);

	/* make racing conn use explode */
	conn = client->conn;
	client->conn = NULL;
	scoutfs_net_free_conn(sb, conn);

	if (client->workq)
		destroy_workqueue(client->workq);
	kfree(client);
	sbi->client_info = NULL;
}