/*
 * Copyright (C) 2017 Versity Software, Inc.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
/*
 * NOTE(review): the angle-bracket header names of the following system
 * includes were lost when this file was extracted; restore the
 * <linux/...> include list from upstream before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "format.h"
#include "counters.h"
#include "inode.h"
#include "btree.h"
#include "manifest.h"
#include "seg.h"
#include "compact.h"
#include "scoutfs_trace.h"
#include "msg.h"
#include "client.h"
#include "net.h"
#include "endian_swap.h"
#include "quorum.h"

/*
 * The client is responsible for maintaining a connection to the server.
 * This includes managing quorum elections that determine which client
 * should run the server that all the clients connect to.
 */

/* delay between connection attempts, to avoid reconnect storms */
#define CLIENT_CONNECT_DELAY_MS (MSEC_PER_SEC / 10)
/* how long a single connect attempt may take before failing */
#define CLIENT_CONNECT_TIMEOUT_MS (1 * MSEC_PER_SEC)
/* how long a voter participates in an election before retrying */
#define CLIENT_QUORUM_TIMEOUT_MS (5 * MSEC_PER_SEC)

/* per-super state for the client's connection to the server */
struct client_info {
	struct super_block *sb;
	/* connection to the currently elected server */
	struct scoutfs_net_connection *conn;
	/* set once at teardown; stops reconnect work from being queued */
	atomic_t shutting_down;
	struct workqueue_struct *workq;
	/* reconnect work, queued on mount and whenever the conn drops */
	struct delayed_work connect_dwork;
	/* term from the last greeting response; 0 until first greeting */
	u64 server_term;
	/* unmount barrier from the last greeting response */
	u64 greeting_umb;
	/* true once destroy has sent (or is sending) a farewell */
	bool sending_farewell;
	/* result of the farewell handshake, read after farewell_comp */
	int farewell_error;
	struct completion farewell_comp;
};

/*
 * Ask for a new run of allocated inode numbers.  The server can return
 * fewer than @count.  It will succeed with nr == 0 if we've run out.
*/ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count, u64 *ino, u64 *nr) { struct client_info *client = SCOUTFS_SB(sb)->client_info; struct scoutfs_net_inode_alloc ial; __le64 lecount = cpu_to_le64(count); int ret; ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_ALLOC_INODES, &lecount, sizeof(lecount), &ial, sizeof(ial)); if (ret == 0) { *ino = le64_to_cpu(ial.ino); *nr = le64_to_cpu(ial.nr); if (*nr == 0) ret = -ENOSPC; else if (*ino + *nr < *ino) ret = -EINVAL; } return ret; } /* * Ask the server for an extent of at most @blocks blocks. It can return * smaller extents. */ int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start, u64 *len) { struct client_info *client = SCOUTFS_SB(sb)->client_info; __le64 leblocks = cpu_to_le64(blocks); struct scoutfs_net_extent nex; int ret; ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_ALLOC_EXTENT, &leblocks, sizeof(leblocks), &nex, sizeof(nex)); if (ret == 0) { if (nex.len == 0) { ret = -ENOSPC; } else { *start = le64_to_cpu(nex.start); *len = le64_to_cpu(nex.len); } } return ret; } int scoutfs_client_free_extents(struct super_block *sb, struct scoutfs_net_extent_list *nexl) { struct client_info *client = SCOUTFS_SB(sb)->client_info; unsigned int bytes; bytes = SCOUTFS_NET_EXTENT_LIST_BYTES(le64_to_cpu(nexl->nr)); return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_FREE_EXTENTS, nexl, bytes, NULL, 0); } int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno) { struct client_info *client = SCOUTFS_SB(sb)->client_info; __le64 lesegno; int ret; ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_ALLOC_SEGNO, NULL, 0, &lesegno, sizeof(lesegno)); if (ret == 0) { if (lesegno == 0) ret = -ENOSPC; else *segno = le64_to_cpu(lesegno); } return ret; } int scoutfs_client_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level) { struct client_info *client = SCOUTFS_SB(sb)->client_info; struct 
scoutfs_net_manifest_entry net_ment; struct scoutfs_manifest_entry ment; scoutfs_seg_init_ment(&ment, level, seg); scoutfs_init_ment_to_net(&net_ment, &ment); return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_RECORD_SEGMENT, &net_ment, sizeof(net_ment), NULL, 0); } int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq) { struct client_info *client = SCOUTFS_SB(sb)->client_info; __le64 before = cpu_to_le64p(seq); __le64 after; int ret; ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_ADVANCE_SEQ, &before, sizeof(before), &after, sizeof(after)); if (ret == 0) *seq = le64_to_cpu(after); return ret; } int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq) { struct client_info *client = SCOUTFS_SB(sb)->client_info; __le64 last_seq; int ret; ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_LAST_SEQ, NULL, 0, &last_seq, sizeof(last_seq)); if (ret == 0) *seq = le64_to_cpu(last_seq); return ret; } int scoutfs_client_get_manifest_root(struct super_block *sb, struct scoutfs_btree_root *root) { struct client_info *client = SCOUTFS_SB(sb)->client_info; return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_MANIFEST_ROOT, NULL, 0, root, sizeof(struct scoutfs_btree_root)); } int scoutfs_client_statfs(struct super_block *sb, struct scoutfs_net_statfs *nstatfs) { struct client_info *client = SCOUTFS_SB(sb)->client_info; return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_STATFS, NULL, 0, nstatfs, sizeof(struct scoutfs_net_statfs)); } /* process an incoming grant response from the server */ static int client_lock_response(struct super_block *sb, struct scoutfs_net_connection *conn, void *resp, unsigned int resp_len, int error, void *data) { if (resp_len != sizeof(struct scoutfs_net_lock)) return -EINVAL; /* XXX error? */ return scoutfs_lock_grant_response(sb, resp); } /* Send a lock request to the server. 
*/ int scoutfs_client_lock_request(struct super_block *sb, struct scoutfs_net_lock *nl) { struct client_info *client = SCOUTFS_SB(sb)->client_info; return scoutfs_net_submit_request(sb, client->conn, SCOUTFS_NET_CMD_LOCK, nl, sizeof(*nl), client_lock_response, NULL, NULL); } /* Send a lock response to the server. */ int scoutfs_client_lock_response(struct super_block *sb, u64 net_id, struct scoutfs_net_lock *nl) { struct client_info *client = SCOUTFS_SB(sb)->client_info; return scoutfs_net_response(sb, client->conn, SCOUTFS_NET_CMD_LOCK, net_id, 0, nl, sizeof(*nl)); } /* Send a lock recover response to the server. */ int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id, struct scoutfs_net_lock_recover *nlr) { struct client_info *client = SCOUTFS_SB(sb)->client_info; u16 bytes = offsetof(struct scoutfs_net_lock_recover, locks[le16_to_cpu(nlr->nr)]); return scoutfs_net_response(sb, client->conn, SCOUTFS_NET_CMD_LOCK_RECOVER, net_id, 0, nlr, bytes); } /* The client is receiving a invalidation request from the server */ static int client_lock(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) { if (arg_len != sizeof(struct scoutfs_net_lock)) return -EINVAL; /* XXX error? */ return scoutfs_lock_invalidate_request(sb, id, arg); } /* The server is asking us for the client's locks starting with the given key */ static int client_lock_recover(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) { if (arg_len != sizeof(struct scoutfs_key)) return -EINVAL; /* XXX error? */ return scoutfs_lock_recover_request(sb, id, arg); } /* * Process a greeting response in the client from the server. This is * called for every connected socket on the connection. Each response * contains the remote server's elected term which can be used to * identify server failover. 
*/ static int client_greeting(struct super_block *sb, struct scoutfs_net_connection *conn, void *resp, unsigned int resp_len, int error, void *data) { struct client_info *client = SCOUTFS_SB(sb)->client_info; struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; struct scoutfs_net_greeting *gr = resp; bool new_server; int ret; if (error) { ret = error; goto out; } if (resp_len != sizeof(struct scoutfs_net_greeting)) { ret = -EINVAL; goto out; } if (gr->fsid != super->hdr.fsid) { scoutfs_warn(sb, "server sent fsid 0x%llx, client has 0x%llx", le64_to_cpu(gr->fsid), le64_to_cpu(super->hdr.fsid)); ret = -EINVAL; goto out; } if (gr->format_hash != super->format_hash) { scoutfs_warn(sb, "server sent format 0x%llx, client has 0x%llx", le64_to_cpu(gr->format_hash), le64_to_cpu(super->format_hash)); ret = -EINVAL; goto out; } new_server = le64_to_cpu(gr->server_term) != client->server_term; scoutfs_net_client_greeting(sb, conn, new_server); client->server_term = le64_to_cpu(gr->server_term); client->greeting_umb = le64_to_cpu(gr->unmount_barrier); ret = 0; out: return ret; } /* * This work is responsible for maintaining a connection from the client * to the server. It's queued on mount and disconnect and we requeue * the work if the work fails and we're not shutting down. * * In the typical case a mount reads the super blocks and finds the * address of the currently running server and connects to it. * Non-voting clients who can't connect will keep trying alternating * reading the address and getting connect timeouts. * * Voting mounts will try to elect a leader if they can't connect to the * server. When a quorum can't connect and are able to elect a leader * then a new server is started. The new server will write its address * in the super and everyone will be able to connect. * * There's a tricky bit of coordination required to safely unmount. * Clients need to tell the server that they won't be coming back with a * farewell request. 
Once a client receives its farewell response it * can exit. But a majority of clients need to stick around to elect a * server to process all their farewell requests. This is coordinated * by having the greeting tell the server that a client is a voter. The * server then holds on to farewell requests from voters until only * requests from the final quorum remain. These farewell responses are * only sent after updating an unmount barrier in the super to indicate * to the final quorum that they can safely exit without having received * a farewell response over the network. */ static void scoutfs_client_connect_worker(struct work_struct *work) { struct client_info *client = container_of(work, struct client_info, connect_dwork.work); struct super_block *sb = client->sb; struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = NULL; struct mount_options *opts = &sbi->opts; const bool am_voter = opts->server_addr.sin_addr.s_addr != 0; struct scoutfs_net_greeting greet; struct sockaddr_in sin; ktime_t timeout_abs; u64 elected_term; int ret; super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS); if (!super) { ret = -ENOMEM; goto out; } ret = scoutfs_read_super(sb, super); if (ret) goto out; /* can safely unmount if we see that server processed our farewell */ if (am_voter && client->sending_farewell && (le64_to_cpu(super->unmount_barrier) > client->greeting_umb)) { client->farewell_error = 0; complete(&client->farewell_comp); ret = 0; goto out; } /* try to connect to the super's server address */ scoutfs_addr_to_sin(&sin, &super->server_addr); if (sin.sin_addr.s_addr != 0 && sin.sin_port != 0) ret = scoutfs_net_connect(sb, client->conn, &sin, CLIENT_CONNECT_TIMEOUT_MS); else ret = -ENOTCONN; /* voters try to elect a leader if they couldn't connect */ if (ret < 0) { /* non-voters will keep retrying */ if (!am_voter) goto out; /* make sure local server isn't writing super during votes */ scoutfs_server_stop(sb); timeout_abs = 
ktime_add_ms(ktime_get(), CLIENT_QUORUM_TIMEOUT_MS); ret = scoutfs_quorum_election(sb, timeout_abs, le64_to_cpu(super->quorum_server_term), &elected_term); /* start the server if we were asked to */ if (elected_term > 0) ret = scoutfs_server_start(sb, &opts->server_addr, elected_term); ret = -ENOTCONN; goto out; } /* send a greeting to verify endpoints of each connection */ greet.fsid = super->hdr.fsid; greet.format_hash = super->format_hash; greet.server_term = cpu_to_le64(client->server_term); greet.unmount_barrier = cpu_to_le64(client->greeting_umb); greet.rid = cpu_to_le64(sbi->rid); greet.flags = 0; if (client->sending_farewell) greet.flags |= cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_FAREWELL); if (am_voter) greet.flags |= cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_VOTER); ret = scoutfs_net_submit_request(sb, client->conn, SCOUTFS_NET_CMD_GREETING, &greet, sizeof(greet), client_greeting, NULL, NULL); if (ret) scoutfs_net_shutdown(sb, client->conn); out: kfree(super); /* always have a small delay before retrying to avoid storms */ if (ret && !atomic_read(&client->shutting_down)) queue_delayed_work(client->workq, &client->connect_dwork, msecs_to_jiffies(CLIENT_CONNECT_DELAY_MS)); } /* * Perform a compaction in the client as requested by the server. The * server has protected the input segments and allocated the output * segnos for us. This executes in work queued by the client's net * connection. It only reads and write segments. The server will * update the manifest and allocators while processing the response. An * error response includes the compaction id so that the server can * clean it up. * * If we get duplicate requests across a reconnected socket we can have * two workers performing the same compaction simultaneously. This * isn't particularly efficient but it's rare and won't corrupt the * output. Our response can be lost if the socket is shutdown while * it's in flight, the server deals with this. 
*/ static int client_compact(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) { struct scoutfs_net_compact_response *resp = NULL; struct scoutfs_net_compact_request *req; int ret; if (arg_len != sizeof(struct scoutfs_net_compact_request)) { ret = -EINVAL; goto out; } req = arg; trace_scoutfs_client_compact_start(sb, le64_to_cpu(req->id), req->last_level, req->flags); resp = kzalloc(sizeof(struct scoutfs_net_compact_response), GFP_NOFS); if (!resp) { ret = -ENOMEM; } else { resp->id = req->id; ret = scoutfs_compact(sb, req, resp); } trace_scoutfs_client_compact_stop(sb, le64_to_cpu(req->id), ret); if (ret < 0) ret = scoutfs_net_response(sb, conn, cmd, id, ret, &req->id, sizeof(req->id)); else ret = scoutfs_net_response(sb, conn, cmd, id, 0, resp, sizeof(*resp)); kfree(resp); out: return ret; } static scoutfs_net_request_t client_req_funcs[] = { [SCOUTFS_NET_CMD_COMPACT] = client_compact, [SCOUTFS_NET_CMD_LOCK] = client_lock, [SCOUTFS_NET_CMD_LOCK_RECOVER] = client_lock_recover, }; /* * Called when either a connect attempt or established connection times * out and fails. 
*/ static void client_notify_down(struct super_block *sb, struct scoutfs_net_connection *conn, void *info, u64 rid) { struct client_info *client = SCOUTFS_SB(sb)->client_info; if (!atomic_read(&client->shutting_down)) queue_delayed_work(client->workq, &client->connect_dwork, 0); } int scoutfs_client_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct client_info *client; int ret; client = kzalloc(sizeof(struct client_info), GFP_KERNEL); if (!client) { ret = -ENOMEM; goto out; } sbi->client_info = client; client->sb = sb; atomic_set(&client->shutting_down, 0); INIT_DELAYED_WORK(&client->connect_dwork, scoutfs_client_connect_worker); init_completion(&client->farewell_comp); client->conn = scoutfs_net_alloc_conn(sb, NULL, client_notify_down, 0, client_req_funcs, "client"); if (!client->conn) { ret = -ENOMEM; goto out; } client->workq = alloc_workqueue("scoutfs_client_workq", WQ_UNBOUND, 1); if (!client->workq) { ret = -ENOMEM; goto out; } queue_delayed_work(client->workq, &client->connect_dwork, 0); ret = 0; out: if (ret) scoutfs_client_destroy(sb); return ret; } /* Once we get a response from the server we can shut down */ static int client_farewell_response(struct super_block *sb, struct scoutfs_net_connection *conn, void *resp, unsigned int resp_len, int error, void *data) { struct client_info *client = SCOUTFS_SB(sb)->client_info; if (resp_len != 0) return -EINVAL; client->farewell_error = error; complete(&client->farewell_comp); return 0; } /* * There must be no more callers to the client request functions by the * time we get here. * * If we've connected to a server then we send them a farewell request * so that they don't wait for us to reconnect and trigger a timeout. * * This decision is a little racy. The server considers us connected * when it records a persistent record of our rid as it processes our * greeting. We can disconnect before receiving the greeting response * and leave without sending a farewell. 
 * So given that awkward initial
 * race, we also have a bit of a race where we just test the server_term
 * to see if we've ever gotten a greeting reply from any server.  We
 * don't try to synchronize with pending connection attempts.
 *
 * The consequences of aborting a mount at just the wrong time and
 * disconnecting without the farewell handshake depend on what the
 * server does to timed out clients.  At best it'll spit out a warning
 * message that a client disconnected but it won't fence us if we didn't
 * have any persistent state.
 */
void scoutfs_client_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	struct scoutfs_net_connection *conn;
	int ret;

	/* setup may have failed before allocating anything */
	if (client == NULL)
		return;

	/* a non-zero term means we got a greeting reply from some server */
	if (client->server_term != 0) {
		client->sending_farewell = true;
		ret = scoutfs_net_submit_request(sb, client->conn,
						 SCOUTFS_NET_CMD_FAREWELL,
						 NULL, 0,
						 client_farewell_response,
						 NULL, NULL);
		if (ret == 0) {
			/* wait for the response (or the unmount barrier) */
			ret = wait_for_completion_interruptible(
							&client->farewell_comp);
			if (ret == 0)
				ret = client->farewell_error;
		}
		if (ret) {
			scoutfs_inc_counter(sb, client_farewell_error);
			scoutfs_warn(sb, "client saw farewell error %d, server might see client connection time out", ret);
		}
	}

	/* stop notify_down from queueing connect work */
	atomic_set(&client->shutting_down, 1);

	/* make sure worker isn't using the conn */
	cancel_delayed_work_sync(&client->connect_dwork);

	/* make racing conn use explode */
	conn = client->conn;
	client->conn = NULL;
	scoutfs_net_free_conn(sb, conn);

	if (client->workq)
		destroy_workqueue(client->workq);

	kfree(client);
	sbi->client_info = NULL;
}