Mirror of https://github.com/versity/scoutfs.git (synced 2026-02-08 03:30:46 +00:00)
Now that a mount's client is responsible for electing and starting a server we need to be careful about coordinating unmount. We can't let unmounting clients leave the remaining mounted clients without quorum.

The server carefully tracks who is mounted and who is unmounting while it is processing farewell requests. It only sends responses to voting mounts while quorum remains or once all of the voting clients are trying to unmount. We use a field in the quorum blocks to communicate to the final set of unmounting voters that their farewells have been processed and that they can finish unmounting without trying to re-establish quorum.

The commit introduces and maintains the unmount_barrier field in the quorum blocks. It is passed to the server from the election, the server sends it to the client and writes new versions, and the client compares what it received with what it sees in the quorum blocks.

The commit then has the clients send their unique name to the server, which stores it in persistent mounted client records and compares the names to the quorum config when deciding which farewell requests can be responded to.

Now that farewell response processing can block for a very long time it is moved off into async work so that it doesn't prevent net connections from being shut down and re-established. This also makes it easier to make global decisions based on the count of pending farewell requests.

Signed-off-by: Zach Brown <zab@versity.com>
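The gating rule described above shows up as a single check in farewell_worker() later in this file: a voting client's farewell can be answered while enough voters remain mounted to preserve quorum, or once every remaining mounted voter is itself waiting to unmount. A minimal sketch of that rule, with names simplified for illustration:

/* sketch only: may the server answer a voting client's farewell yet? */
static bool farewell_can_respond(unsigned int nr_mounted_voters,
				 unsigned int nr_unmounting_voters,
				 unsigned int majority)
{
	return nr_mounted_voters > majority ||
	       nr_unmounting_voters >= nr_mounted_voters;
}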
2478 lines
66 KiB
C
/*
 * Copyright (C) 2018 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/ioctls.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/log2.h>
#include <asm/unaligned.h>

#include "format.h"
#include "counters.h"
#include "inode.h"
#include "btree.h"
#include "manifest.h"
#include "seg.h"
#include "compact.h"
#include "scoutfs_trace.h"
#include "msg.h"
#include "server.h"
#include "net.h"
#include "lock_server.h"
#include "endian_swap.h"
#include "quorum.h"

/*
 * Every active mount can act as the server that listens on a net
 * connection and accepts connections from all the other mounts acting
 * as clients.
 *
 * The server is started when raft elects the mount as the leader. If
 * it sees errors it shuts down the server in the hopes that another
 * mount will become the leader and have less trouble.
 */
struct server_info {
	struct super_block *sb;
	spinlock_t lock;
	wait_queue_head_t waitq;

	struct workqueue_struct *wq;
	struct work_struct work;
	int err;
	bool shutting_down;
	struct completion start_comp;
	struct sockaddr_in listen_sin;
	u64 term;
	struct scoutfs_net_connection *conn;

	struct scoutfs_quorum_elected_info qei;

	/* request processing coordinates committing manifest and alloc */
	struct rw_semaphore commit_rwsem;
	struct llist_head commit_waiters;
	struct work_struct commit_work;

	/* server remembers the stable manifest root for clients */
	seqcount_t stable_seqcount;
	struct scoutfs_btree_root stable_manifest_root;

	/* server tracks seq use */
	struct rw_semaphore seq_rwsem;

	/* server tracks pending frees to be applied during commit */
	struct rw_semaphore alloc_rwsem;
	struct list_head pending_frees;

	struct list_head clients;
	unsigned long nr_clients;

	/* track compaction in flight */
	unsigned long compacts_per_client;
	unsigned long nr_compacts;
	struct list_head compacts;
	struct work_struct compact_work;

	/* track clients waiting in unmount for farewell response */
	struct mutex farewell_mutex;
	struct list_head farewell_requests;
	struct work_struct farewell_work;
};

#define DECLARE_SERVER_INFO(sb, name) \
	struct server_info *name = SCOUTFS_SB(sb)->server_info

/*
 * The server tracks each connected client.
 */
struct server_client_info {
	u64 node_id;
	struct list_head head;
	unsigned long nr_compacts;
};

struct commit_waiter {
	struct completion comp;
	struct llist_node node;
	int ret;
};
static void init_extent_btree_key(struct scoutfs_extent_btree_key *ebk,
				  u8 type, u64 major, u64 minor)
{
	ebk->type = type;
	ebk->major = cpu_to_be64(major);
	ebk->minor = cpu_to_be64(minor);
}

static int init_extent_from_btree_key(struct scoutfs_extent *ext, u8 type,
				      struct scoutfs_extent_btree_key *ebk,
				      unsigned int key_bytes)
{
	u64 start;
	u64 len;

	/* btree _next doesn't have last key limit */
	if (ebk->type != type)
		return -ENOENT;

	if (key_bytes != sizeof(struct scoutfs_extent_btree_key) ||
	    (ebk->type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE &&
	     ebk->type != SCOUTFS_FREE_EXTENT_BLOCKS_TYPE))
		return -EIO; /* XXX corruption, bad key */

	start = be64_to_cpu(ebk->major);
	len = be64_to_cpu(ebk->minor);
	if (ebk->type == SCOUTFS_FREE_EXTENT_BLOCKS_TYPE)
		swap(start, len);
	start -= len - 1;

	return scoutfs_extent_init(ext, ebk->type, 0, start, len, 0, 0);
}
/*
 * This is called by the extent core on behalf of the server who holds
 * the appropriate locks to protect the many btree items that can be
 * accessed on behalf of one extent operation.
 *
 * The free_blocks count in the super tracks the number of blocks in
 * the primary extent index. We update it here instead of expecting
 * callers to remember.
 */
static int server_extent_io(struct super_block *sb, int op,
			    struct scoutfs_extent *ext, void *data)
{
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_extent_btree_key ebk;
	SCOUTFS_BTREE_ITEM_REF(iref);
	bool mirror = false;
	u8 mirror_type;
	u8 mirror_op = 0;
	int ret;
	int err;

	trace_scoutfs_server_extent_io(sb, ext);

	if (WARN_ON_ONCE(ext->type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE &&
			 ext->type != SCOUTFS_FREE_EXTENT_BLOCKS_TYPE))
		return -EINVAL;

	if (ext->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE &&
	    (op == SEI_INSERT || op == SEI_DELETE)) {
		mirror = true;
		mirror_type = SCOUTFS_FREE_EXTENT_BLOCKS_TYPE;
		mirror_op = op == SEI_INSERT ? SEI_DELETE : SEI_INSERT;
	}

	init_extent_btree_key(&ebk, ext->type, ext->start + ext->len - 1,
			      ext->len);
	if (ext->type == SCOUTFS_FREE_EXTENT_BLOCKS_TYPE)
		swap(ebk.major, ebk.minor);

	if (op == SEI_NEXT || op == SEI_PREV) {
		if (op == SEI_NEXT)
			ret = scoutfs_btree_next(sb, &super->alloc_root,
						 &ebk, sizeof(ebk), &iref);
		else
			ret = scoutfs_btree_prev(sb, &super->alloc_root,
						 &ebk, sizeof(ebk), &iref);
		if (ret == 0) {
			ret = init_extent_from_btree_key(ext, ext->type,
							 iref.key,
							 iref.key_len);
			scoutfs_btree_put_iref(&iref);
		}

	} else if (op == SEI_INSERT) {
		ret = scoutfs_btree_insert(sb, &super->alloc_root,
					   &ebk, sizeof(ebk), NULL, 0);

	} else if (op == SEI_DELETE) {
		ret = scoutfs_btree_delete(sb, &super->alloc_root,
					   &ebk, sizeof(ebk));

	} else {
		ret = WARN_ON_ONCE(-EINVAL);
	}

	if (ret == 0 && mirror) {
		swap(ext->type, mirror_type);
		ret = server_extent_io(sb, op, ext, data);
		swap(ext->type, mirror_type);
		if (ret < 0) {
			err = server_extent_io(sb, mirror_op, ext, data);
			if (err)
				scoutfs_corruption(sb,
						   SC_SERVER_EXTENT_CLEANUP,
						   corrupt_server_extent_cleanup,
						   "op %u ext "SE_FMT" ret %d",
						   op, SE_ARG(ext), err);
		}
	}

	if (ret == 0 && ext->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
		if (op == SEI_INSERT)
			le64_add_cpu(&super->free_blocks, ext->len);
		else if (op == SEI_DELETE)
			le64_add_cpu(&super->free_blocks, -ext->len);
	}

	return ret;
}
/*
|
|
* Allocate an extent of the given length in the first smallest free
|
|
* extent that contains it. We allocate in multiples of segment blocks
|
|
* and expose that to callers today.
|
|
*
|
|
* This doesn't have the cursor that segment allocation does. It's
|
|
* possible that a recently freed segment can merge to form a larger
|
|
* free extent that can be very quickly allocated to a node. The hope is
|
|
* that doesn't happen very often.
|
|
*/
|
|
static int alloc_extent(struct super_block *sb, u64 blocks,
|
|
u64 *start, u64 *len)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct scoutfs_extent ext;
|
|
int ret;
|
|
|
|
*start = 0;
|
|
*len = 0;
|
|
|
|
down_write(&server->alloc_rwsem);
|
|
|
|
if (blocks & (SCOUTFS_SEGMENT_BLOCKS - 1)) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE, 0,
|
|
0, blocks, 0, 0);
|
|
ret = scoutfs_extent_next(sb, server_extent_io, &ext, NULL);
|
|
if (ret == -ENOENT)
|
|
ret = scoutfs_extent_prev(sb, server_extent_io, &ext, NULL);
|
|
if (ret) {
|
|
if (ret == -ENOENT)
|
|
ret = -ENOSPC;
|
|
goto out;
|
|
}
|
|
|
|
trace_scoutfs_server_alloc_extent_next(sb, &ext);
|
|
|
|
ext.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
|
ext.len = min(blocks, ext.len);
|
|
|
|
ret = scoutfs_extent_remove(sb, server_extent_io, &ext, NULL);
|
|
if (ret)
|
|
goto out;
|
|
|
|
trace_scoutfs_server_alloc_extent_allocated(sb, &ext);
|
|
|
|
*start = ext.start;
|
|
*len = ext.len;
|
|
ret = 0;
|
|
|
|
out:
|
|
up_write(&server->alloc_rwsem);
|
|
|
|
if (ret)
|
|
scoutfs_inc_counter(sb, server_extent_alloc_error);
|
|
else
|
|
scoutfs_inc_counter(sb, server_extent_alloc);
|
|
|
|
return ret;
|
|
}
|
|
|
|
struct pending_free_extent {
|
|
struct list_head head;
|
|
u64 start;
|
|
u64 len;
|
|
};
|
|
|
|
/*
|
|
* Now that the transaction's done we can apply all the pending frees.
|
|
* The list entries are totally unsorted so this is the first time that
|
|
* we can discover corruption from duplicated frees, etc. This can also
|
|
* fail on normal transient io or memory errors.
|
|
*
|
|
* We can't unwind if this fails. The caller can freak out or keep
|
|
* trying forever.
|
|
*/
|
|
static int apply_pending_frees(struct super_block *sb)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct pending_free_extent *pfe;
|
|
struct pending_free_extent *tmp;
|
|
struct scoutfs_extent ext;
|
|
int ret;
|
|
|
|
down_write(&server->alloc_rwsem);
|
|
|
|
list_for_each_entry_safe(pfe, tmp, &server->pending_frees, head) {
|
|
scoutfs_inc_counter(sb, server_free_pending_extent);
|
|
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, 0,
|
|
pfe->start, pfe->len, 0, 0);
|
|
trace_scoutfs_server_free_pending_extent(sb, &ext);
|
|
ret = scoutfs_extent_add(sb, server_extent_io, &ext, NULL);
|
|
if (ret) {
|
|
scoutfs_inc_counter(sb, server_free_pending_error);
|
|
break;
|
|
}
|
|
|
|
list_del_init(&pfe->head);
|
|
kfree(pfe);
|
|
}
|
|
|
|
up_write(&server->alloc_rwsem);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* If there are still pending frees to destroy it means the server didn't
|
|
* shut down cleanly and that's not well supported today so we want to
|
|
* have it holler if this happens. In the future we'd cleanly support
|
|
* forced shutdown that had been told that it's OK to throw away dirty
|
|
* state.
|
|
*/
|
|
static int destroy_pending_frees(struct super_block *sb)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct pending_free_extent *pfe;
|
|
struct pending_free_extent *tmp;
|
|
|
|
WARN_ON_ONCE(!list_empty(&server->pending_frees));
|
|
|
|
down_write(&server->alloc_rwsem);
|
|
|
|
list_for_each_entry_safe(pfe, tmp, &server->pending_frees, head) {
|
|
list_del_init(&pfe->head);
|
|
kfree(pfe);
|
|
}
|
|
|
|
up_write(&server->alloc_rwsem);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* We can't satisfy allocations with freed extents until the removed
|
|
* references to the freed extents have been committed. We add freed
|
|
* extents to a list that is only applied to the persistent indexes as
|
|
* the transaction is being committed and the current transaction won't
|
|
* try to allocate any more extents. If we didn't do this then we could
|
|
* write to referenced data as part of the commit that frees it. If the
|
|
* commit was interrupted the stable data could have been overwritten.
|
|
*/
|
|
static int free_extent(struct super_block *sb, u64 start, u64 len)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct pending_free_extent *pfe;
|
|
int ret;
|
|
|
|
scoutfs_inc_counter(sb, server_free_extent);
|
|
|
|
down_write(&server->alloc_rwsem);
|
|
|
|
pfe = kmalloc(sizeof(struct pending_free_extent), GFP_NOFS);
|
|
if (!pfe) {
|
|
ret = -ENOMEM;
|
|
} else {
|
|
pfe->start = start;
|
|
pfe->len = len;
|
|
list_add_tail(&pfe->head, &server->pending_frees);
|
|
ret = 0;
|
|
}
|
|
|
|
up_write(&server->alloc_rwsem);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* This is called by the compaction code which is running in the server.
|
|
* The server caller has held all the locks, etc.
|
|
*/
|
|
static int free_segno(struct super_block *sb, u64 segno)
|
|
{
|
|
scoutfs_inc_counter(sb, server_free_segno);
|
|
trace_scoutfs_free_segno(sb, segno);
|
|
return free_extent(sb, segno << SCOUTFS_SEGMENT_BLOCK_SHIFT,
|
|
SCOUTFS_SEGMENT_BLOCKS);
|
|
}
|
|
|
|
/*
|
|
* Allocate a segment on behalf of compaction or a node wanting to write
|
|
* a level 0 segment. It has to be aligned to the segment size because
|
|
* we address segments with aligned segment numbers instead of block
|
|
* offsets.
|
|
*
|
|
* We can use a simple cursor sweep of the index by start because all
|
|
* server extents are multiples of the segment size. Sweeping through
|
|
* the volume tries to spread out new segment writes and make it more
|
|
* rare to write to a recently freed segment which can cause a client to
|
|
* have to re-read the manifest.
|
|
*/
|
|
static int alloc_segno(struct super_block *sb, u64 *segno)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
|
struct scoutfs_extent ext;
|
|
u64 curs;
|
|
int ret;
|
|
|
|
down_write(&server->alloc_rwsem);
|
|
|
|
curs = ALIGN(le64_to_cpu(super->alloc_cursor), SCOUTFS_SEGMENT_BLOCKS);
|
|
*segno = 0;
|
|
|
|
do {
|
|
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, 0,
|
|
curs, 1, 0, 0);
|
|
ret = scoutfs_extent_next(sb, server_extent_io, &ext, NULL);
|
|
} while (ret == -ENOENT && curs && (curs = 0, 1));
|
|
if (ret) {
|
|
if (ret == -ENOENT)
|
|
ret = -ENOSPC;
|
|
goto out;
|
|
}
|
|
|
|
trace_scoutfs_server_alloc_segno_next(sb, &ext);
|
|
|
|
/* use cursor if within extent, otherwise start of next extent */
|
|
if (ext.start < curs)
|
|
ext.start = curs;
|
|
ext.len = SCOUTFS_SEGMENT_BLOCKS;
|
|
|
|
ret = scoutfs_extent_remove(sb, server_extent_io, &ext, NULL);
|
|
if (ret)
|
|
goto out;
|
|
|
|
super->alloc_cursor = cpu_to_le64(ext.start + ext.len);
|
|
|
|
*segno = ext.start >> SCOUTFS_SEGMENT_BLOCK_SHIFT;
|
|
|
|
trace_scoutfs_server_alloc_segno_allocated(sb, &ext);
|
|
trace_scoutfs_alloc_segno(sb, *segno);
|
|
scoutfs_inc_counter(sb, server_alloc_segno);
|
|
|
|
out:
|
|
up_write(&server->alloc_rwsem);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* "allocating" a segno removes an unknown segment from the allocator
|
|
* and returns it, "removing" a segno removes a specific segno from the
|
|
* allocator.
|
|
*/
|
|
static int remove_segno(struct super_block *sb, u64 segno)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct scoutfs_extent ext;
|
|
int ret;
|
|
|
|
trace_scoutfs_remove_segno(sb, segno);
|
|
|
|
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, 0,
|
|
segno << SCOUTFS_SEGMENT_BLOCK_SHIFT,
|
|
SCOUTFS_SEGMENT_BLOCKS, 0, 0);
|
|
|
|
down_write(&server->alloc_rwsem);
|
|
ret = scoutfs_extent_remove(sb, server_extent_io, &ext, NULL);
|
|
up_write(&server->alloc_rwsem);
|
|
return ret;
|
|
}
|
|
|
|
static void stop_server(struct server_info *server)
|
|
{
|
|
/* wait_event/wake_up provide barriers */
|
|
server->shutting_down = true;
|
|
wake_up(&server->waitq);
|
|
}
|
|
|
|
/*
|
|
* Queue compaction work if clients have capacity for processing
|
|
* requests and the manifest knows of levels with too many segments.
|
|
*/
|
|
static void try_queue_compact(struct server_info *server)
|
|
{
|
|
struct super_block *sb = server->sb;
|
|
bool can_request;
|
|
|
|
spin_lock(&server->lock);
|
|
can_request = server->nr_compacts <
|
|
(server->nr_clients * server->compacts_per_client);
|
|
spin_unlock(&server->lock);
|
|
if (can_request && scoutfs_manifest_should_compact(sb))
|
|
queue_work(server->wq, &server->compact_work);
|
|
}
|
|
|
|
/*
|
|
* This is called while still holding the rwsem that prevents commits so
|
|
* that the caller can be sure to be woken by the next commit after they
|
|
* queue and release the lock.
|
|
*
|
|
* It's important to realize that the caller's commit_waiter list node
|
|
* might be serviced by a currently running commit work while queueing
|
|
* another work run in the future. This caller can return from
|
|
* wait_for_commit() while the commit_work is still queued.
|
|
*
|
|
* This could queue delayed work but we're first trying to have batching
|
|
* work by having concurrent modification line up behind a commit in
|
|
* flight. Once the commit finishes it'll unlock and hopefully everyone
|
|
* will race to make their changes and they'll all be applied by the
|
|
* next commit after that.
|
|
*/
|
|
static void queue_commit_work(struct server_info *server,
|
|
struct commit_waiter *cw)
|
|
{
|
|
lockdep_assert_held(&server->commit_rwsem);
|
|
|
|
cw->ret = 0;
|
|
init_completion(&cw->comp);
|
|
llist_add(&cw->node, &server->commit_waiters);
|
|
queue_work(server->wq, &server->commit_work);
|
|
}
|
|
|
|
/*
|
|
* Wait for a commit during request processing and return its status.
|
|
*/
|
|
static inline int wait_for_commit(struct commit_waiter *cw)
|
|
{
|
|
wait_for_completion(&cw->comp);
|
|
return cw->ret;
|
|
}
|
|
|
|
/*
|
|
* A core function of request processing is to modify the manifest and
|
|
* allocator. Often the processing needs to make the modifications
|
|
* persistent before replying. We'd like to batch these commits as much
|
|
* as is reasonable so that we don't degrade to a few IO round trips per
|
|
* request.
|
|
*
|
|
* Getting that batching right is bound up in the concurrency of request
|
|
* processing so a clear way to implement the batched commits is to
|
|
* implement commits with a single pending work func like the
|
|
* processing.
|
|
*
|
|
* Processing paths acquire the rwsem for reading while they're making
|
|
* multiple dependent changes. When they're done and want it persistent
|
|
* they add themselves to the list of waiters and queue the commit work.
|
|
* This work runs, acquires the lock to exclude other writers, and
|
|
* performs the commit. Readers can run concurrently with these
|
|
* commits.
|
|
*/
|
|
static void scoutfs_server_commit_func(struct work_struct *work)
|
|
{
|
|
struct server_info *server = container_of(work, struct server_info,
|
|
commit_work);
|
|
struct super_block *sb = server->sb;
|
|
struct commit_waiter *cw;
|
|
struct commit_waiter *pos;
|
|
struct llist_node *node;
|
|
int ret;
|
|
|
|
trace_scoutfs_server_commit_work_enter(sb, 0, 0);
|
|
|
|
down_write(&server->commit_rwsem);
|
|
|
|
/* try to free first which can dirty the btrees */
|
|
ret = apply_pending_frees(sb);
|
|
if (ret) {
|
|
scoutfs_err(sb, "server error freeing extents: %d", ret);
|
|
goto out;
|
|
}
|
|
|
|
if (!scoutfs_btree_has_dirty(sb)) {
|
|
ret = 0;
|
|
goto out;
|
|
}
|
|
|
|
ret = scoutfs_btree_write_dirty(sb);
|
|
if (ret) {
|
|
scoutfs_err(sb, "server error writing btree blocks: %d", ret);
|
|
goto out;
|
|
}
|
|
|
|
ret = scoutfs_write_dirty_super(sb);
|
|
if (ret) {
|
|
scoutfs_err(sb, "server error writing super block: %d", ret);
|
|
goto out;
|
|
}
|
|
|
|
scoutfs_btree_write_complete(sb);
|
|
|
|
write_seqcount_begin(&server->stable_seqcount);
|
|
server->stable_manifest_root = SCOUTFS_SB(sb)->super.manifest.root;
|
|
write_seqcount_end(&server->stable_seqcount);
|
|
|
|
scoutfs_advance_dirty_super(sb);
|
|
ret = 0;
|
|
|
|
out:
|
|
node = llist_del_all(&server->commit_waiters);
|
|
|
|
/* waiters always wait on completion, cw could be free after complete */
|
|
llist_for_each_entry_safe(cw, pos, node, node) {
|
|
cw->ret = ret;
|
|
complete(&cw->comp);
|
|
}
|
|
|
|
up_write(&server->commit_rwsem);
|
|
trace_scoutfs_server_commit_work_exit(sb, 0, ret);
|
|
}
|
|
|
|
void scoutfs_init_ment_to_net(struct scoutfs_net_manifest_entry *net_ment,
|
|
struct scoutfs_manifest_entry *ment)
|
|
{
|
|
net_ment->segno = cpu_to_le64(ment->segno);
|
|
net_ment->seq = cpu_to_le64(ment->seq);
|
|
net_ment->first = ment->first;
|
|
net_ment->last = ment->last;
|
|
net_ment->level = ment->level;
|
|
}
|
|
|
|
void scoutfs_init_ment_from_net(struct scoutfs_manifest_entry *ment,
|
|
struct scoutfs_net_manifest_entry *net_ment)
|
|
{
|
|
ment->segno = le64_to_cpu(net_ment->segno);
|
|
ment->seq = le64_to_cpu(net_ment->seq);
|
|
ment->level = net_ment->level;
|
|
ment->first = net_ment->first;
|
|
ment->last = net_ment->last;
|
|
}
|
|
|
|
static int server_alloc_inodes(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
struct scoutfs_super_block *super = &sbi->super;
|
|
struct scoutfs_net_inode_alloc ial = { 0, };
|
|
struct commit_waiter cw;
|
|
__le64 lecount;
|
|
u64 ino;
|
|
u64 nr;
|
|
int ret;
|
|
|
|
if (arg_len != sizeof(lecount)) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
memcpy(&lecount, arg, arg_len);
|
|
|
|
down_read(&server->commit_rwsem);
|
|
|
|
spin_lock(&sbi->next_ino_lock);
|
|
ino = le64_to_cpu(super->next_ino);
|
|
nr = min(le64_to_cpu(lecount), U64_MAX - ino);
|
|
le64_add_cpu(&super->next_ino, nr);
|
|
spin_unlock(&sbi->next_ino_lock);
|
|
|
|
queue_commit_work(server, &cw);
|
|
up_read(&server->commit_rwsem);
|
|
|
|
ial.ino = cpu_to_le64(ino);
|
|
ial.nr = cpu_to_le64(nr);
|
|
|
|
ret = wait_for_commit(&cw);
|
|
out:
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret, &ial, sizeof(ial));
|
|
}
|
|
|
|
/*
|
|
* Give the client an extent allocation of len blocks. We leave the
|
|
* details to the extent allocator.
|
|
*/
|
|
static int server_alloc_extent(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct commit_waiter cw;
|
|
struct scoutfs_net_extent nex = {0,};
|
|
__le64 leblocks;
|
|
u64 start;
|
|
u64 len;
|
|
int ret;
|
|
|
|
if (arg_len != sizeof(leblocks)) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
memcpy(&leblocks, arg, arg_len);
|
|
|
|
down_read(&server->commit_rwsem);
|
|
ret = alloc_extent(sb, le64_to_cpu(leblocks), &start, &len);
|
|
if (ret == 0)
|
|
queue_commit_work(server, &cw);
|
|
up_read(&server->commit_rwsem);
|
|
if (ret == 0)
|
|
ret = wait_for_commit(&cw);
|
|
if (ret)
|
|
goto out;
|
|
|
|
nex.start = cpu_to_le64(start);
|
|
nex.len = cpu_to_le64(len);
|
|
out:
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret, &nex, sizeof(nex));
|
|
}
|
|
|
|
static bool invalid_net_extent_list(struct scoutfs_net_extent_list *nexl,
|
|
unsigned data_len)
|
|
{
|
|
return (data_len < sizeof(struct scoutfs_net_extent_list)) ||
|
|
(le64_to_cpu(nexl->nr) > SCOUTFS_NET_EXTENT_LIST_MAX_NR) ||
|
|
(data_len != offsetof(struct scoutfs_net_extent_list,
|
|
extents[le64_to_cpu(nexl->nr)]));
|
|
}
|
|
|
|
static int server_free_extents(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct scoutfs_net_extent_list *nexl;
|
|
struct commit_waiter cw;
|
|
int ret = 0;
|
|
int err;
|
|
u64 i;
|
|
|
|
nexl = arg;
|
|
if (invalid_net_extent_list(nexl, arg_len)) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
down_read(&server->commit_rwsem);
|
|
|
|
for (i = 0; i < le64_to_cpu(nexl->nr); i++) {
|
|
ret = free_extent(sb, le64_to_cpu(nexl->extents[i].start),
|
|
le64_to_cpu(nexl->extents[i].len));
|
|
if (ret)
|
|
break;
|
|
}
|
|
|
|
if (i > 0)
|
|
queue_commit_work(server, &cw);
|
|
up_read(&server->commit_rwsem);
|
|
|
|
if (i > 0) {
|
|
err = wait_for_commit(&cw);
|
|
if (ret == 0)
|
|
ret = err;
|
|
}
|
|
|
|
out:
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
|
|
}
|
|
|
|
/*
|
|
* We still special case segno allocation because it's aligned and we'd
|
|
* like to keep that detail in the server.
|
|
*/
|
|
static int server_alloc_segno(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct commit_waiter cw;
|
|
__le64 lesegno = 0;
|
|
u64 segno;
|
|
int ret;
|
|
|
|
if (arg_len != 0) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
down_read(&server->commit_rwsem);
|
|
ret = alloc_segno(sb, &segno);
|
|
if (ret == 0)
|
|
queue_commit_work(server, &cw);
|
|
up_read(&server->commit_rwsem);
|
|
if (ret == 0)
|
|
ret = wait_for_commit(&cw);
|
|
if (ret)
|
|
goto out;
|
|
|
|
lesegno = cpu_to_le64(segno);
|
|
out:
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret,
|
|
&lesegno, sizeof(lesegno));
|
|
}
|
|
|
|
static int server_record_segment(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct scoutfs_net_manifest_entry *net_ment;
|
|
struct scoutfs_manifest_entry ment;
|
|
struct commit_waiter cw;
|
|
int ret;
|
|
|
|
if (arg_len != sizeof(struct scoutfs_net_manifest_entry)) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
net_ment = arg;
|
|
|
|
retry:
|
|
down_read(&server->commit_rwsem);
|
|
scoutfs_manifest_lock(sb);
|
|
|
|
if (scoutfs_manifest_level0_full(sb)) {
|
|
scoutfs_manifest_unlock(sb);
|
|
up_read(&server->commit_rwsem);
|
|
/* XXX waits indefinitely? io errors? */
|
|
wait_event(server->waitq, !scoutfs_manifest_level0_full(sb));
|
|
goto retry;
|
|
}
|
|
|
|
scoutfs_init_ment_from_net(&ment, net_ment);
|
|
|
|
ret = scoutfs_manifest_add(sb, &ment);
|
|
scoutfs_manifest_unlock(sb);
|
|
|
|
if (ret == 0)
|
|
queue_commit_work(server, &cw);
|
|
up_read(&server->commit_rwsem);
|
|
|
|
if (ret == 0) {
|
|
ret = wait_for_commit(&cw);
|
|
if (ret == 0)
|
|
try_queue_compact(server);
|
|
}
|
|
|
|
out:
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
|
|
}
|
|
|
|
/*
|
|
* Give the client the next sequence number for their transaction. They
|
|
* provide their previous transaction sequence number that they've
|
|
* committed.
|
|
*
|
|
* We track the sequence numbers of transactions that clients have open.
|
|
* This limits the transaction sequence numbers that can be returned in
|
|
* the index of inodes by meta and data transaction numbers. We
|
|
* communicate the largest possible sequence number to clients via an
|
|
* rpc.
|
|
*
|
|
* The transaction sequence tracking is stored in a btree so it is
|
|
* shared across servers. Final entries are removed when processing a
|
|
* client's farewell or when it's removed.
|
|
*/
|
|
static int server_advance_seq(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
struct scoutfs_super_block *super = &sbi->super;
|
|
struct commit_waiter cw;
|
|
__le64 their_seq;
|
|
__le64 next_seq;
|
|
struct scoutfs_trans_seq_btree_key tsk;
|
|
u64 node_id = scoutfs_net_client_node_id(conn);
|
|
int ret;
|
|
|
|
if (arg_len != sizeof(__le64)) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
memcpy(&their_seq, arg, sizeof(their_seq));
|
|
|
|
down_read(&server->commit_rwsem);
|
|
down_write(&server->seq_rwsem);
|
|
|
|
if (their_seq != 0) {
|
|
tsk.trans_seq = le64_to_be64(their_seq);
|
|
tsk.node_id = cpu_to_be64(node_id);
|
|
|
|
ret = scoutfs_btree_delete(sb, &super->trans_seqs,
|
|
&tsk, sizeof(tsk));
|
|
if (ret < 0 && ret != -ENOENT)
|
|
goto out;
|
|
}
|
|
|
|
next_seq = super->next_trans_seq;
|
|
le64_add_cpu(&super->next_trans_seq, 1);
|
|
|
|
trace_scoutfs_trans_seq_advance(sb, node_id, le64_to_cpu(their_seq),
|
|
le64_to_cpu(next_seq));
|
|
|
|
tsk.trans_seq = le64_to_be64(next_seq);
|
|
tsk.node_id = cpu_to_be64(node_id);
|
|
|
|
ret = scoutfs_btree_insert(sb, &super->trans_seqs,
|
|
&tsk, sizeof(tsk), NULL, 0);
|
|
out:
|
|
up_write(&server->seq_rwsem);
|
|
if (ret == 0)
|
|
queue_commit_work(server, &cw);
|
|
up_read(&server->commit_rwsem);
|
|
if (ret == 0)
|
|
ret = wait_for_commit(&cw);
|
|
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret,
|
|
&next_seq, sizeof(next_seq));
|
|
}
|
|
|
|
/*
|
|
* Remove any transaction sequences owned by the client. They must have
|
|
* committed any final transaction by the time they get here via sending
|
|
* their farewell message. This can be called multiple times as the
|
|
* client's farewell is retransmitted so it's OK to not find any
|
|
* entries. This is called with the server commit rwsem held.
|
|
*/
|
|
static int remove_trans_seq(struct super_block *sb, u64 node_id)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
struct scoutfs_super_block *super = &sbi->super;
|
|
struct scoutfs_trans_seq_btree_key tsk;
|
|
SCOUTFS_BTREE_ITEM_REF(iref);
|
|
int ret = 0;
|
|
|
|
down_write(&server->seq_rwsem);
|
|
|
|
tsk.trans_seq = 0;
|
|
tsk.node_id = 0;
|
|
|
|
for (;;) {
|
|
ret = scoutfs_btree_next(sb, &super->trans_seqs,
|
|
&tsk, sizeof(tsk), &iref);
|
|
if (ret < 0) {
|
|
if (ret == -ENOENT)
|
|
ret = 0;
|
|
break;
|
|
}
|
|
|
|
memcpy(&tsk, iref.key, iref.key_len);
|
|
scoutfs_btree_put_iref(&iref);
|
|
|
|
if (be64_to_cpu(tsk.node_id) == node_id) {
|
|
trace_scoutfs_trans_seq_farewell(sb, node_id,
|
|
be64_to_cpu(tsk.trans_seq));
|
|
ret = scoutfs_btree_delete(sb, &super->trans_seqs,
|
|
&tsk, sizeof(tsk));
|
|
break;
|
|
}
|
|
|
|
be64_add_cpu(&tsk.trans_seq, 1);
|
|
tsk.node_id = 0;
|
|
}
|
|
|
|
up_write(&server->seq_rwsem);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Give the calling client the last valid trans_seq that it can return
|
|
* in results from the indices of trans seqs to inodes. These indices
|
|
* promise to only advance so we can't return results past those that
|
|
* are still outstanding and not yet visible in the indices. If there
|
|
* are no outstanding transactions (what? how?) we give them the max
|
|
* possible sequence.
|
|
*/
|
|
static int server_get_last_seq(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
struct scoutfs_super_block *super = &sbi->super;
|
|
struct scoutfs_trans_seq_btree_key tsk;
|
|
SCOUTFS_BTREE_ITEM_REF(iref);
|
|
u64 node_id = scoutfs_net_client_node_id(conn);
|
|
__le64 last_seq = 0;
|
|
int ret;
|
|
|
|
if (arg_len != 0) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
down_read(&server->seq_rwsem);
|
|
|
|
tsk.trans_seq = 0;
|
|
tsk.node_id = 0;
|
|
|
|
ret = scoutfs_btree_next(sb, &super->trans_seqs,
|
|
&tsk, sizeof(tsk), &iref);
|
|
if (ret == 0) {
|
|
if (iref.key_len != sizeof(tsk)) {
|
|
ret = -EINVAL;
|
|
} else {
|
|
memcpy(&tsk, iref.key, iref.key_len);
|
|
last_seq = cpu_to_le64(be64_to_cpu(tsk.trans_seq) - 1);
|
|
}
|
|
scoutfs_btree_put_iref(&iref);
|
|
|
|
} else if (ret == -ENOENT) {
|
|
last_seq = super->next_trans_seq;
|
|
le64_add_cpu(&last_seq, -1ULL);
|
|
ret = 0;
|
|
}
|
|
|
|
trace_scoutfs_trans_seq_last(sb, node_id, le64_to_cpu(last_seq));
|
|
|
|
up_read(&server->seq_rwsem);
|
|
out:
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret,
|
|
&last_seq, sizeof(last_seq));
|
|
}
|
|
|
|
static int server_get_manifest_root(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct scoutfs_btree_root root;
|
|
unsigned int start;
|
|
int ret;
|
|
|
|
if (arg_len == 0) {
|
|
do {
|
|
start = read_seqcount_begin(&server->stable_seqcount);
|
|
root = server->stable_manifest_root;
|
|
} while (read_seqcount_retry(&server->stable_seqcount, start));
|
|
ret = 0;
|
|
} else {
|
|
ret = -EINVAL;
|
|
}
|
|
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret,
|
|
&root, sizeof(root));
|
|
}
|
|
|
|
/*
|
|
* Sample the super stats that the client wants for statfs by serializing
|
|
* with each component.
|
|
*/
|
|
static int server_statfs(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
struct scoutfs_super_block *super = &sbi->super;
|
|
struct scoutfs_net_statfs nstatfs;
|
|
int ret;
|
|
|
|
if (arg_len == 0) {
|
|
/* uuid and total_segs are constant, so far */
|
|
memcpy(nstatfs.uuid, super->uuid, sizeof(nstatfs.uuid));
|
|
|
|
spin_lock(&sbi->next_ino_lock);
|
|
nstatfs.next_ino = super->next_ino;
|
|
spin_unlock(&sbi->next_ino_lock);
|
|
|
|
down_read(&server->alloc_rwsem);
|
|
nstatfs.total_blocks = super->total_blocks;
|
|
nstatfs.bfree = super->free_blocks;
|
|
up_read(&server->alloc_rwsem);
|
|
ret = 0;
|
|
} else {
|
|
ret = -EINVAL;
|
|
}
|
|
|
|
return scoutfs_net_response(sb, conn, cmd, id, ret,
|
|
&nstatfs, sizeof(nstatfs));
|
|
}
|
|
|
|
static int server_lock(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
u64 node_id = scoutfs_net_client_node_id(conn);
|
|
|
|
if (arg_len != sizeof(struct scoutfs_net_lock))
|
|
return -EINVAL;
|
|
|
|
return scoutfs_lock_server_request(sb, node_id, id, arg);
|
|
}
|
|
|
|
static int lock_response(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
void *resp, unsigned int resp_len,
|
|
int error, void *data)
|
|
{
|
|
u64 node_id = scoutfs_net_client_node_id(conn);
|
|
|
|
if (resp_len != sizeof(struct scoutfs_net_lock))
|
|
return -EINVAL;
|
|
|
|
return scoutfs_lock_server_response(sb, node_id, resp);
|
|
}
|
|
|
|
int scoutfs_server_lock_request(struct super_block *sb, u64 node_id,
|
|
struct scoutfs_net_lock *nl)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
|
|
return scoutfs_net_submit_request_node(sb, server->conn, node_id,
|
|
SCOUTFS_NET_CMD_LOCK,
|
|
nl, sizeof(*nl),
|
|
lock_response, NULL, NULL);
|
|
}
|
|
|
|
int scoutfs_server_lock_response(struct super_block *sb, u64 node_id,
|
|
u64 id, struct scoutfs_net_lock *nl)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
|
|
return scoutfs_net_response_node(sb, server->conn, node_id,
|
|
SCOUTFS_NET_CMD_LOCK, id, 0,
|
|
nl, sizeof(*nl));
|
|
}
|
|
|
|
static bool invalid_recover(struct scoutfs_net_lock_recover *nlr,
|
|
unsigned long bytes)
|
|
{
|
|
return ((bytes < sizeof(*nlr)) ||
|
|
(bytes != offsetof(struct scoutfs_net_lock_recover,
|
|
locks[le16_to_cpu(nlr->nr)])));
|
|
}
|
|
|
|
static int lock_recover_response(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
void *resp, unsigned int resp_len,
|
|
int error, void *data)
|
|
{
|
|
u64 node_id = scoutfs_net_client_node_id(conn);
|
|
|
|
if (invalid_recover(resp, resp_len))
|
|
return -EINVAL;
|
|
|
|
return scoutfs_lock_server_recover_response(sb, node_id, resp);
|
|
}
|
|
|
|
int scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id,
|
|
struct scoutfs_key *key)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
|
|
return scoutfs_net_submit_request_node(sb, server->conn, node_id,
|
|
SCOUTFS_NET_CMD_LOCK_RECOVER,
|
|
key, sizeof(*key),
|
|
lock_recover_response,
|
|
NULL, NULL);
|
|
}
|
|
|
|
static int insert_mounted_client(struct super_block *sb, u64 node_id,
|
|
char *name)
|
|
{
|
|
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
|
struct scoutfs_mounted_client_btree_key mck;
|
|
struct scoutfs_mounted_client_btree_val mcv;
|
|
|
|
mck.node_id = cpu_to_be64(node_id);
|
|
strncpy(mcv.name, name, sizeof(mcv.name));
|
|
|
|
return scoutfs_btree_insert(sb, &super->mounted_clients,
|
|
&mck, sizeof(mck), &mcv, sizeof(mcv));
|
|
}
|
|
|
|
/*
|
|
* Remove the record of a mounted client. The record can already be
|
|
* removed if we're processing a farewell on behalf of a client that
|
|
* already had a previous server process its farewell.
|
|
*
|
|
* When we remove the last mounted client that's voting we write a new
|
|
* quorum block with the updated unmount_barrier.
|
|
*
|
|
* The caller has to serialize with farewell processing.
|
|
*/
|
|
static int delete_mounted_client(struct super_block *sb, u64 node_id)
|
|
{
|
|
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
|
struct scoutfs_mounted_client_btree_key mck;
|
|
int ret;
|
|
|
|
mck.node_id = cpu_to_be64(node_id);
|
|
|
|
ret = scoutfs_btree_delete(sb, &super->mounted_clients,
|
|
&mck, sizeof(mck));
|
|
if (ret == -ENOENT)
|
|
ret = 0;
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Process an incoming greeting request in the server from the client.
|
|
* We try to send responses to failed greetings so that the sender can
|
|
* log some detail before shutting down. A failure to send a greeting
|
|
* response shuts down the connection.
|
|
*
|
|
* We allocate a new node_id for the first connect attempt from a
|
|
* client.
|
|
*
|
|
* If a client reconnects they'll send their initially assigned node_id
|
|
* in their greeting request.
|
|
*
|
|
* XXX We can lose allocated node_ids here as we record the node_id as
|
|
* live as we send a valid greeting response. The client might
|
|
* disconnect before they receive the response and resend an initial
|
|
* blank greeting. We could use a client uuid to associate with
|
|
* allocated node_ids.
|
|
*
|
|
* XXX The logic of this has gotten convoluted. The lock server can
|
|
* send a recovery request so it needs to be called after the core net
|
|
* greeting call enables messages. But we want the greeting reply to be
|
|
* sent first, so we currently queue it on the send queue before
|
|
* enabling messages. That means that a lot of errors that happen after
|
|
* the reply can't be sent to the client. They'll just see a disconnect
|
|
* and won't know what's happened. This all needs to be refactored.
|
|
*/
|
|
static int server_greeting(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
|
struct scoutfs_net_greeting *gr = arg;
|
|
struct scoutfs_net_greeting greet;
|
|
DECLARE_SERVER_INFO(sb, server);
|
|
struct commit_waiter cw;
|
|
__le64 node_id = 0;
|
|
bool sent_node_id;
|
|
bool first_contact;
|
|
bool farewell;
|
|
int ret = 0;
|
|
int err;
|
|
|
|
if (arg_len != sizeof(struct scoutfs_net_greeting)) {
|
|
ret = -EINVAL;
|
|
goto send_err;
|
|
}
|
|
|
|
if (gr->fsid != super->hdr.fsid) {
|
|
scoutfs_warn(sb, "client sent fsid 0x%llx, server has 0x%llx",
|
|
le64_to_cpu(gr->fsid),
|
|
le64_to_cpu(super->hdr.fsid));
|
|
ret = -EINVAL;
|
|
goto send_err;
|
|
}
|
|
|
|
if (gr->format_hash != super->format_hash) {
|
|
scoutfs_warn(sb, "client sent format 0x%llx, server has 0x%llx",
|
|
le64_to_cpu(gr->format_hash),
|
|
le64_to_cpu(super->format_hash));
|
|
ret = -EINVAL;
|
|
goto send_err;
|
|
}
|
|
|
|
if (gr->node_id == 0) {
|
|
down_read(&server->commit_rwsem);
|
|
|
|
spin_lock(&server->lock);
|
|
node_id = super->next_node_id;
|
|
le64_add_cpu(&super->next_node_id, 1);
|
|
spin_unlock(&server->lock);
|
|
|
|
mutex_lock(&server->farewell_mutex);
|
|
ret = insert_mounted_client(sb, le64_to_cpu(node_id), gr->name);
|
|
mutex_unlock(&server->farewell_mutex);
|
|
|
|
if (ret == 0)
|
|
queue_commit_work(server, &cw);
|
|
up_read(&server->commit_rwsem);
|
|
if (ret == 0) {
|
|
ret = wait_for_commit(&cw);
|
|
queue_work(server->wq, &server->farewell_work);
|
|
}
|
|
} else {
|
|
node_id = gr->node_id;
|
|
}
|
|
|
|
send_err:
|
|
err = ret;
|
|
if (err)
|
|
node_id = 0;
|
|
|
|
memset(greet.name, 0, sizeof(greet.name));
|
|
greet.fsid = super->hdr.fsid;
|
|
greet.format_hash = super->format_hash;
|
|
greet.server_term = cpu_to_le64(server->term);
|
|
greet.unmount_barrier = cpu_to_le64(server->qei.unmount_barrier);
|
|
greet.node_id = node_id;
|
|
greet.flags = 0;
|
|
|
|
/* queue greeting response to be sent first once messaging enabled */
|
|
ret = scoutfs_net_response(sb, conn, cmd, id, err,
|
|
&greet, sizeof(greet));
|
|
if (ret == 0 && err)
|
|
ret = err;
|
|
if (ret)
|
|
goto out;
|
|
|
|
/* have the net core enable messaging and resend */
|
|
sent_node_id = gr->node_id != 0;
|
|
first_contact = le64_to_cpu(gr->server_term) != server->term;
|
|
if (gr->flags & cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_FAREWELL))
|
|
farewell = true;
|
|
else
|
|
farewell = false;
|
|
|
|
scoutfs_net_server_greeting(sb, conn, le64_to_cpu(node_id), id,
|
|
sent_node_id, first_contact, farewell);
|
|
|
|
/* lock server might send recovery request */
|
|
if (le64_to_cpu(gr->server_term) != server->term) {
|
|
|
|
/* we're now doing two commits per greeting, not great */
|
|
down_read(&server->commit_rwsem);
|
|
|
|
ret = scoutfs_lock_server_greeting(sb, le64_to_cpu(node_id),
|
|
gr->server_term != 0);
|
|
if (ret == 0)
|
|
queue_commit_work(server, &cw);
|
|
up_read(&server->commit_rwsem);
|
|
if (ret == 0)
|
|
ret = wait_for_commit(&cw);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
struct farewell_request {
|
|
struct list_head entry;
|
|
u64 net_id;
|
|
u64 node_id;
|
|
};
|
|
|
|
static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref)
|
|
{
|
|
return (iref->key_len !=
|
|
sizeof(struct scoutfs_mounted_client_btree_key)) ||
|
|
(iref->val_len !=
|
|
sizeof(struct scoutfs_mounted_client_btree_val));
|
|
}
|
|
|
|
/*
 * This work processes farewell requests asynchronously. Requests from
 * voting quorum members can be held until they're no longer needed to
 * vote for quorum and elect a server to process farewell requests.
 *
 * This will hold farewell requests from voting clients until either it
 * isn't needed for quorum because a majority remains without it, or it
 * won't be needed for quorum because all the remaining mounted clients
 * are voting and waiting for farewell.
 *
 * When we remove the last mounted client record for the last voting
 * client then we increase the unmount_barrier and write it to the
 * server's quorum block. If voting clients don't get their farewell
 * response they'll attempt to form quorum again to start the server for
 * their farewell response but will find the increased unmount_barrier.
 * They'll know that their farewell has been processed and they can exit
 * without forming quorum.
 *
 * Responses that are waiting for clients who aren't voting are
 * immediately sent. Clients that don't have a mounted client record
 * have already had their farewell processed by another server and can
 * proceed.
 *
 * This can trust the quorum config found in the super that was read
 * when the server started. Only the current server can rewrite the
 * working config.
 *
 * Farewell responses are unique in that sending them causes the server
 * to shut down the connection to the client the next time the socket
 * disconnects. If the socket is destroyed before the client gets the
 * response they'll reconnect and we'll see them as a brand new client
 * who immediately sends a farewell. It'll be processed and it all
 * works out.
 *
 * If this worker sees an error it assumes that this server is done for
 * and that another had better take its place.
 */
static void farewell_worker(struct work_struct *work)
|
|
{
|
|
struct server_info *server = container_of(work, struct server_info,
|
|
farewell_work);
|
|
struct super_block *sb = server->sb;
|
|
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
|
struct scoutfs_quorum_config *conf = &super->quorum_config;
|
|
struct scoutfs_mounted_client_btree_key mck;
|
|
struct scoutfs_mounted_client_btree_val *mcv;
|
|
struct farewell_request *tmp;
|
|
struct farewell_request *fw;
|
|
SCOUTFS_BTREE_ITEM_REF(iref);
|
|
struct commit_waiter cw;
|
|
unsigned int nr_unmounting = 0;
|
|
unsigned int nr_mounted = 0;
|
|
unsigned int majority;
|
|
LIST_HEAD(reqs);
|
|
LIST_HEAD(send);
|
|
bool deleted = false;
|
|
bool voting;
|
|
bool more_reqs;
|
|
int ret;
|
|
|
|
majority = scoutfs_quorum_majority(sb, conf);
|
|
|
|
/* grab all the requests that are waiting */
|
|
mutex_lock(&server->farewell_mutex);
|
|
list_splice_init(&server->farewell_requests, &reqs);
|
|
mutex_unlock(&server->farewell_mutex);
|
|
|
|
/* count how many reqs requests are from voting clients */
|
|
nr_unmounting = 0;
|
|
list_for_each_entry_safe(fw, tmp, &reqs, entry) {
|
|
mck.node_id = cpu_to_be64(fw->node_id);
|
|
ret = scoutfs_btree_lookup(sb, &super->mounted_clients,
|
|
&mck, sizeof(mck), &iref);
|
|
if (ret == 0 && invalid_mounted_client_item(&iref)) {
|
|
scoutfs_btree_put_iref(&iref);
|
|
ret = -EIO;
|
|
}
|
|
if (ret < 0) {
|
|
if (ret == -ENOENT) {
|
|
list_move_tail(&fw->entry, &send);
|
|
continue;
|
|
}
|
|
goto out;
|
|
}
|
|
|
|
mcv = iref.val;
|
|
voting = scoutfs_quorum_voting_member(sb, conf, mcv->name);
|
|
scoutfs_btree_put_iref(&iref);
|
|
|
|
if (!voting) {
|
|
list_move_tail(&fw->entry, &send);
|
|
continue;
|
|
}
|
|
|
|
nr_unmounting++;
|
|
}
|
|
|
|
/* see how many mounted clients could vote for quorum */
|
|
memset(&mck, 0, sizeof(mck));
|
|
for (;;) {
|
|
ret = scoutfs_btree_next(sb, &super->mounted_clients,
|
|
&mck, sizeof(mck), &iref);
|
|
if (ret == 0 && invalid_mounted_client_item(&iref)) {
|
|
scoutfs_btree_put_iref(&iref);
|
|
ret = -EIO;
|
|
}
|
|
if (ret != 0) {
|
|
if (ret == -ENOENT)
|
|
break;
|
|
goto out;
|
|
}
|
|
|
|
memcpy(&mck, iref.key, sizeof(mck));
|
|
mcv = iref.val;
|
|
|
|
if (scoutfs_quorum_voting_member(sb, conf, mcv->name))
|
|
nr_mounted++;
|
|
|
|
scoutfs_btree_put_iref(&iref);
|
|
be64_add_cpu(&mck.node_id, 1);
|
|
|
|
}
|
|
|
|
/* send as many responses as we can to maintain quorum */
|
|
while ((fw = list_first_entry_or_null(&reqs, struct farewell_request,
|
|
entry)) &&
|
|
(nr_mounted > majority || nr_unmounting >= nr_mounted)) {
|
|
|
|
list_move_tail(&fw->entry, &send);
|
|
nr_mounted--;
|
|
nr_unmounting--;
|
|
deleted = true;
|
|
}
|
|
|
|
/* process and send farewell responses */
|
|
list_for_each_entry_safe(fw, tmp, &send, entry) {
|
|
|
|
down_read(&server->commit_rwsem);
|
|
|
|
ret = scoutfs_lock_server_farewell(sb, fw->node_id) ?:
|
|
remove_trans_seq(sb, fw->node_id) ?:
|
|
delete_mounted_client(sb, fw->node_id);
|
|
if (ret == 0)
|
|
queue_commit_work(server, &cw);
|
|
|
|
up_read(&server->commit_rwsem);
|
|
if (ret == 0)
|
|
ret = wait_for_commit(&cw);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
|
|
/* update the unmount barrier the first time we delete all mounted */
|
|
if (deleted && nr_mounted == 0) {
|
|
ret = scoutfs_quorum_update_barrier(sb, &server->qei,
|
|
server->qei.unmount_barrier + 1);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
|
|
/* and finally send all the responses */
|
|
list_for_each_entry_safe(fw, tmp, &send, entry) {
|
|
|
|
ret = scoutfs_net_response_node(sb, server->conn, fw->node_id,
|
|
SCOUTFS_NET_CMD_FAREWELL,
|
|
fw->net_id, 0, NULL, 0);
|
|
if (ret)
|
|
break;
|
|
|
|
list_del_init(&fw->entry);
|
|
kfree(fw);
|
|
}
|
|
|
|
ret = 0;
|
|
out:
|
|
mutex_lock(&server->farewell_mutex);
|
|
more_reqs = !list_empty(&server->farewell_requests);
|
|
list_splice_init(&reqs, &server->farewell_requests);
|
|
list_splice_init(&send, &server->farewell_requests);
|
|
mutex_unlock(&server->farewell_mutex);
|
|
|
|
if (ret < 0)
|
|
stop_server(server);
|
|
else if (more_reqs && !server->shutting_down)
|
|
queue_work(server->wq, &server->farewell_work);
|
|
}
|
|
|
|
static void free_farewell_requests(struct super_block *sb, u64 node_id)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct farewell_request *tmp;
|
|
struct farewell_request *fw;
|
|
|
|
mutex_lock(&server->farewell_mutex);
|
|
list_for_each_entry_safe(fw, tmp, &server->farewell_requests, entry) {
|
|
if (node_id == 0 || fw->node_id == node_id) {
|
|
list_del_init(&fw->entry);
|
|
kfree(fw);
|
|
}
|
|
}
|
|
mutex_unlock(&server->farewell_mutex);
|
|
}
|
|
|
|
/*
|
|
* The server is receiving a farewell message from a client that is
|
|
* unmounting. It won't send any more requests and once it receives our
|
|
* response it will not reconnect.
|
|
*
|
|
* XXX we should make sure that all our requests to the client have finished
|
|
* before we respond. Locking will have its own messaging for orderly
|
|
* shutdown. That leaves compaction which will be addressed as part of
|
|
* the larger work of recovering compactions that were in flight when
|
|
* a client crashed.
|
|
*/
|
|
static int server_farewell(struct super_block *sb,
|
|
struct scoutfs_net_connection *conn,
|
|
u8 cmd, u64 id, void *arg, u16 arg_len)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
u64 node_id = scoutfs_net_client_node_id(conn);
|
|
struct farewell_request *fw;
|
|
|
|
if (arg_len != 0)
|
|
return -EINVAL;
|
|
|
|
/* XXX tear down if we fence, or if we shut down */
|
|
|
|
fw = kmalloc(sizeof(struct farewell_request), GFP_NOFS);
|
|
if (fw == NULL)
|
|
return -ENOMEM;
|
|
|
|
fw->node_id = node_id;
|
|
fw->net_id = id;
|
|
|
|
mutex_lock(&server->farewell_mutex);
|
|
list_add_tail(&fw->entry, &server->farewell_requests);
|
|
mutex_unlock(&server->farewell_mutex);
|
|
|
|
queue_work(server->wq, &server->farewell_work);
|
|
|
|
/* response will be sent later */
|
|
return 0;
|
|
}
|
|
|
|
/* requests sent to clients are tracked so we can free resources */
|
|
struct compact_request {
|
|
struct list_head head;
|
|
u64 node_id;
|
|
struct scoutfs_net_compact_request req;
|
|
};
|
|
|
|
/*
|
|
* Find a node that can process our compaction request. Return a
|
|
* node_id if we found a client and added the compaction to the client
|
|
* and server counts. Returns 0 if no suitable clients were found.
|
|
*/
|
|
static u64 compact_request_start(struct super_block *sb,
|
|
struct compact_request *cr)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct server_client_info *last;
|
|
struct server_client_info *sci;
|
|
u64 node_id = 0;
|
|
|
|
spin_lock(&server->lock);
|
|
|
|
/* XXX no last_entry_or_null? :( */
|
|
if (!list_empty(&server->clients))
|
|
last = list_last_entry(&server->clients,
|
|
struct server_client_info, head);
|
|
else
|
|
last = NULL;
|
|
|
|
while ((sci = list_first_entry_or_null(&server->clients,
|
|
struct server_client_info,
|
|
head)) != NULL) {
|
|
list_move_tail(&sci->head, &server->clients);
|
|
if (sci->nr_compacts < server->compacts_per_client) {
|
|
list_add(&cr->head, &server->compacts);
|
|
server->nr_compacts++;
|
|
sci->nr_compacts++;
|
|
node_id = sci->node_id;
|
|
cr->node_id = node_id;
|
|
break;
|
|
}
|
|
if (sci == last)
|
|
break;
|
|
}
|
|
|
|
trace_scoutfs_server_compact_start(sb, le64_to_cpu(cr->req.id),
|
|
cr->req.ents[0].level, node_id,
|
|
node_id ? sci->nr_compacts : 0,
|
|
server->nr_compacts,
|
|
server->compacts_per_client);
|
|
|
|
spin_unlock(&server->lock);
|
|
|
|
return node_id;
|
|
}
|
|
|
|
/*
|
|
* Find a tracked compact request for the compaction id, remove it from
|
|
* the server and client counts, and return it to the caller.
|
|
*/
|
|
static struct compact_request *compact_request_done(struct super_block *sb,
|
|
u64 id)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct compact_request *ret = NULL;
|
|
struct server_client_info *sci;
|
|
struct compact_request *cr;
|
|
|
|
spin_lock(&server->lock);
|
|
|
|
list_for_each_entry(cr, &server->compacts, head) {
|
|
if (le64_to_cpu(cr->req.id) != id)
|
|
continue;
|
|
|
|
list_for_each_entry(sci, &server->clients, head) {
|
|
if (sci->node_id == cr->node_id) {
|
|
sci->nr_compacts--;
|
|
break;
|
|
}
|
|
}
|
|
|
|
server->nr_compacts--;
|
|
list_del_init(&cr->head);
|
|
ret = cr;
|
|
break;
|
|
}
|
|
|
|
trace_scoutfs_server_compact_done(sb, id, ret ? ret->node_id : 0,
|
|
server->nr_compacts);
|
|
|
|
spin_unlock(&server->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* When a client disconnects we forget the compactions that they had
|
|
* in flight so that we have capacity to send compaction requests to the
|
|
* remaining clients.
|
|
*
|
|
* XXX we do not free their allocated segnos because they could still be
|
|
* running and writing to those blocks. To do this safely we'd need
|
|
* full recovery procedures with fencing to ensure that they're not able
|
|
* to write to those blocks anymore.
|
|
*/
|
|
static void forget_client_compacts(struct super_block *sb,
|
|
struct server_client_info *sci)
|
|
{
|
|
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
|
struct compact_request *cr;
|
|
struct compact_request *pos;
|
|
LIST_HEAD(forget);
|
|
|
|
spin_lock(&server->lock);
|
|
list_for_each_entry_safe(cr, pos, &server->compacts, head) {
|
|
if (cr->node_id == sci->node_id) {
|
|
sci->nr_compacts--;
|
|
server->nr_compacts--;
|
|
list_move(&cr->head, &forget);
|
|
}
|
|
}
|
|
spin_unlock(&server->lock);
|
|
|
|
list_for_each_entry_safe(cr, pos, &forget, head) {
|
|
scoutfs_manifest_compact_done(sb, &cr->req);
|
|
list_del_init(&cr->head);
|
|
kfree(cr);
|
|
}
|
|
}
|
|
|
|
static int segno_in_ents(u64 segno, struct scoutfs_net_manifest_entry *ents,
|
|
unsigned int nr)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
if (ents[i].segno == 0)
|
|
break;
|
|
if (segno == le64_to_cpu(ents[i].segno))
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int remove_segnos(struct super_block *sb, __le64 *segnos,
|
|
unsigned int nr,
|
|
struct scoutfs_net_manifest_entry *unless,
|
|
unsigned int nr_unless, bool cleanup);
|
|
|
|
/*
|
|
* Free (unaligned) segnos if they're not found in the unless entries.
|
|
* If this returns an error then we've cleaned up partial frees on
|
|
* error. This panics if it sees an error and can't cleanup on error.
|
|
*
|
|
* There are variants of this for lots of add/del, alloc/remove data
|
|
* structures.
|
|
*/
|
|
static int free_segnos(struct super_block *sb, __le64 *segnos,
|
|
unsigned int nr,
|
|
struct scoutfs_net_manifest_entry *unless,
|
|
unsigned int nr_unless, bool cleanup)
|
|
|
|
{
|
|
u64 segno;
|
|
int ret = 0;
|
|
int i;
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
segno = le64_to_cpu(get_unaligned(&segnos[i]));
|
|
if (segno == 0)
|
|
break;
|
|
if (segno_in_ents(segno, unless, nr_unless))
|
|
continue;
|
|
|
|
ret = free_segno(sb, segno);
|
|
BUG_ON(ret < 0 && !cleanup);
|
|
if (ret < 0) {
|
|
remove_segnos(sb, segnos, i, unless, nr_unless, false);
|
|
break;
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* the segno array can be unaligned */
|
|
static int alloc_segnos(struct super_block *sb, __le64 * segnos,
|
|
unsigned int nr)
|
|
|
|
{
|
|
u64 segno;
|
|
int ret = 0;
|
|
int i;
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
ret = alloc_segno(sb, &segno);
|
|
if (ret < 0) {
|
|
free_segnos(sb, segnos, i, NULL, 0, false);
|
|
break;
|
|
}
|
|
put_unaligned(cpu_to_le64(segno), &segnos[i]);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int remove_segnos(struct super_block *sb, __le64 *segnos,
			 unsigned int nr,
			 struct scoutfs_net_manifest_entry *unless,
			 unsigned int nr_unless, bool cleanup)
{
	u64 segno;
	int ret = 0;
	int i;

	for (i = 0; i < nr; i++) {
		segno = le64_to_cpu(get_unaligned(&segnos[i]));
		if (segno == 0)
			break;
		if (segno_in_ents(segno, unless, nr_unless))
			continue;

		ret = remove_segno(sb, segno);
		BUG_ON(ret < 0 && !cleanup);
		if (ret < 0) {
			free_segnos(sb, segnos, i, unless, nr_unless, false);
			break;
		}
	}

	return ret;
}

static int remove_entry_segnos(struct super_block *sb,
			       struct scoutfs_net_manifest_entry *ents,
			       unsigned int nr,
			       struct scoutfs_net_manifest_entry *unless,
			       unsigned int nr_unless, bool cleanup);

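/*
 * Free the segnos of the given manifest entries if they're not found
 * in the unless entries, undoing partial work with
 * remove_entry_segnos() on error.
 */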
static int free_entry_segnos(struct super_block *sb,
			     struct scoutfs_net_manifest_entry *ents,
			     unsigned int nr,
			     struct scoutfs_net_manifest_entry *unless,
			     unsigned int nr_unless, bool cleanup)
{
	int ret = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (ents[i].segno == 0)
			break;
		if (segno_in_ents(le64_to_cpu(ents[i].segno),
				  unless, nr_unless))
			continue;

		ret = free_segno(sb, le64_to_cpu(ents[i].segno));
		BUG_ON(ret < 0 && !cleanup);
		if (ret < 0) {
			remove_entry_segnos(sb, ents, i, unless, nr_unless,
					    false);
			break;
		}
	}

	return ret;
}
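/*
 * The inverse of free_entry_segnos(): remove the entries' segnos that
 * aren't found in the unless entries, undoing partial work with
 * free_entry_segnos() on error.
 */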
static int remove_entry_segnos(struct super_block *sb,
			       struct scoutfs_net_manifest_entry *ents,
			       unsigned int nr,
			       struct scoutfs_net_manifest_entry *unless,
			       unsigned int nr_unless, bool cleanup)
{
	int ret = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (ents[i].segno == 0)
			break;
		if (segno_in_ents(le64_to_cpu(ents[i].segno),
				  unless, nr_unless))
			continue;

		ret = remove_segno(sb, le64_to_cpu(ents[i].segno));
		BUG_ON(ret < 0 && !cleanup);
		if (ret < 0) {
			free_entry_segnos(sb, ents, i, unless, nr_unless,
					  false);
			break;
		}
	}

	return ret;
}

static int del_manifest_entries(struct super_block *sb,
				struct scoutfs_net_manifest_entry *ents,
				unsigned int nr, bool cleanup);

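/*
 * Add each net manifest entry to the manifest, stopping at the first
 * entry with a zero segno.  A failure part way through is unwound by
 * deleting the entries that were already added.
 */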
static int add_manifest_entries(struct super_block *sb,
				struct scoutfs_net_manifest_entry *ents,
				unsigned int nr, bool cleanup)
{
	struct scoutfs_manifest_entry ment;
	int ret = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (ents[i].segno == 0)
			break;

		scoutfs_init_ment_from_net(&ment, &ents[i]);

		ret = scoutfs_manifest_add(sb, &ment);
		BUG_ON(ret < 0 && !cleanup);
		if (ret < 0) {
			del_manifest_entries(sb, ents, i, false);
			break;
		}
	}

	return ret;
}
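/*
 * The inverse of add_manifest_entries(): delete each net manifest
 * entry from the manifest, re-adding already deleted entries if a
 * deletion fails part way through.
 */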
static int del_manifest_entries(struct super_block *sb,
				struct scoutfs_net_manifest_entry *ents,
				unsigned int nr, bool cleanup)
{
	struct scoutfs_manifest_entry ment;
	int ret = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (ents[i].segno == 0)
			break;

		scoutfs_init_ment_from_net(&ment, &ents[i]);

		ret = scoutfs_manifest_del(sb, &ment);
		BUG_ON(ret < 0 && !cleanup);
		if (ret < 0) {
			add_manifest_entries(sb, ents, i, false);
			break;
		}
	}

	return ret;
}
/*
 * Process a received compaction response.  This is called in concurrent
 * processing work context so it's racing with other compaction
 * responses and new compaction requests being built and sent.
 *
 * If the compaction failed then we only have to free the allocated
 * output segnos sent in the request.
 *
 * If the compaction succeeded then we need to delete the input manifest
 * entries, add any new output manifest entries, and free allocated
 * segnos and input manifest segnos that aren't found in output segnos.
 *
 * And finally we always remove the compaction from the runtime client
 * accounting.
 *
 * As we finish a compaction we wake level0 writers if there's now space
 * in level 0 for a new segment.
 *
 * Errors in processing are taken as an indication that this server is
 * no longer able to do its job.  We return hard errors which shut down
 * the server in the hopes that another healthy server will start up.
 * We may want to revisit this.
 */
static int compact_response(struct super_block *sb,
			    struct scoutfs_net_connection *conn,
			    void *resp, unsigned int resp_len,
			    int error, void *data)
{
	struct server_info *server = SCOUTFS_SB(sb)->server_info;
	struct scoutfs_net_compact_response *cresp = NULL;
	struct compact_request *cr = NULL;
	bool level0_was_full = false;
	bool add_ents = false;
	bool del_ents = false;
	bool rem_segnos = false;
	struct commit_waiter cw;
	__le64 id;
	int ret;

	if (error) {
		/* an error response without an id is fatal */
		if (resp_len != sizeof(__le64)) {
			ret = -EINVAL;
			goto out;
		}

		memcpy(&id, resp, resp_len);

	} else {
		if (resp_len != sizeof(struct scoutfs_net_compact_response)) {
			ret = -EINVAL;
			goto out;
		}

		cresp = resp;
		id = cresp->id;
	}

	trace_scoutfs_server_compact_response(sb, le64_to_cpu(id), error);

	/* XXX we only free tracked requests on responses, must still exist */
	cr = compact_request_done(sb, le64_to_cpu(id));
	if (WARN_ON_ONCE(cr == NULL)) {
		ret = -ENOENT;
		goto out;
	}

	down_read(&server->commit_rwsem);
	scoutfs_manifest_lock(sb);

	level0_was_full = scoutfs_manifest_level0_full(sb);

	if (error) {
		ret = 0;
		goto cleanup;
	}

	/* delete old manifest entries */
	ret = del_manifest_entries(sb, cr->req.ents, ARRAY_SIZE(cr->req.ents),
				   true);
	if (ret)
		goto cleanup;
	add_ents = true;

	/* add new manifest entries */
	ret = add_manifest_entries(sb, cresp->ents, ARRAY_SIZE(cresp->ents),
				   true);
	if (ret)
		goto cleanup;
	del_ents = true;

	/* free allocated segnos not found in new entries */
	ret = free_segnos(sb, cr->req.segnos, ARRAY_SIZE(cr->req.segnos),
			  cresp->ents, ARRAY_SIZE(cresp->ents), true);
	if (ret)
		goto cleanup;
	rem_segnos = true;

	/* free input segnos not found in new entries */
	ret = free_entry_segnos(sb, cr->req.ents, ARRAY_SIZE(cr->req.ents),
				cresp->ents, ARRAY_SIZE(cresp->ents), true);
cleanup:
	/* cleanup partial commits on errors */
	if (ret < 0 && rem_segnos)
		remove_segnos(sb, cr->req.segnos, ARRAY_SIZE(cr->req.segnos),
			      cresp->ents, ARRAY_SIZE(cresp->ents), false);
	if (ret < 0 && del_ents)
		del_manifest_entries(sb, cresp->ents, ARRAY_SIZE(cresp->ents),
				     false);
	if (ret < 0 && add_ents)
		add_manifest_entries(sb, cr->req.ents,
				     ARRAY_SIZE(cr->req.ents), false);

	/* free all the allocated output segnos if compaction failed */
	if ((error || ret < 0) && cr != NULL)
		free_segnos(sb, cr->req.segnos, ARRAY_SIZE(cr->req.segnos),
			    NULL, 0, false);

	if (ret == 0 && level0_was_full && !scoutfs_manifest_level0_full(sb))
		wake_up(&server->waitq);

	if (ret == 0)
		queue_commit_work(server, &cw);
	scoutfs_manifest_unlock(sb);
	up_read(&server->commit_rwsem);

	if (cr) {
		scoutfs_manifest_compact_done(sb, &cr->req);
		kfree(cr);
	}

	if (ret == 0) {
		ret = wait_for_commit(&cw);
		if (ret == 0)
			try_queue_compact(server);
	}

out:
	return ret;
}
/*
 * The compaction worker executes as the manifest is updated and we see
 * that a level has too many segments and clients aren't yet processing
 * their maximum number of compaction requests.  Only one compaction
 * worker executes.
 *
 * We have the manifest build us a compaction request, find a client to
 * send it to, and record it for later completion processing.
 *
 * The manifest tracks pending compactions and won't use the same
 * segments as inputs to multiple compactions.  We track the number of
 * compactions in flight to each client to keep them balanced.
 */
static void scoutfs_server_compact_worker(struct work_struct *work)
{
	struct server_info *server = container_of(work, struct server_info,
						  compact_work);
	struct super_block *sb = server->sb;
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_net_compact_request *req;
	struct compact_request *cr;
	struct commit_waiter cw;
	int nr_segnos = 0;
	u64 node_id;
	__le64 id;
	int ret;

	trace_scoutfs_server_compact_work_enter(sb, 0, 0);

	cr = kzalloc(sizeof(struct compact_request), GFP_NOFS);
	if (!cr) {
		ret = -ENOMEM;
		goto out;
	}
	req = &cr->req;

	/* get the input manifest entries */
	ret = scoutfs_manifest_next_compact(sb, req);
	if (ret <= 0)
		goto out;

	nr_segnos = ret + SCOUTFS_COMPACTION_SEGNO_OVERHEAD;

	/* get the next id and allocate possible output segnos */
	down_read(&server->commit_rwsem);

	spin_lock(&server->lock);
	id = super->next_compact_id;
	le64_add_cpu(&super->next_compact_id, 1);
	spin_unlock(&server->lock);

	ret = alloc_segnos(sb, req->segnos, nr_segnos);
	if (ret == 0)
		queue_commit_work(server, &cw);
	up_read(&server->commit_rwsem);
	if (ret == 0)
		ret = wait_for_commit(&cw);
	if (ret)
		goto out;

	/* try to send to a node with capacity, they can disconnect */
retry:
	req->id = id;
	node_id = compact_request_start(sb, cr);
	if (node_id == 0) {
		ret = 0;
		goto out;
	}

	/* response processing can complete compaction before this returns */
	ret = scoutfs_net_submit_request_node(sb, server->conn, node_id,
					      SCOUTFS_NET_CMD_COMPACT,
					      req, sizeof(*req),
					      compact_response, NULL, NULL);
	if (ret < 0) {
		cr = compact_request_done(sb, le64_to_cpu(id));
		BUG_ON(cr == NULL); /* must still be there, no node cleanup */
	}
	if (ret == -ENOTCONN)
		goto retry;
	if (ret < 0)
		goto out;

	/* cr is now owned by response processing */
	cr = NULL;
	ret = 1;

out:
	if (ret <= 0 && cr != NULL) {
		scoutfs_manifest_compact_done(sb, req);

		/* don't need to wait for commit when freeing in cleanup */
		down_read(&server->commit_rwsem);
		free_segnos(sb, req->segnos, nr_segnos, NULL, 0, false);
		up_read(&server->commit_rwsem);

		kfree(cr);
	}

	if (ret > 0)
		try_queue_compact(server);

	trace_scoutfs_server_compact_work_exit(sb, 0, ret);
}
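/*
 * Map incoming request commands to their processing functions.  The
 * table is handed to the net layer when the server's listening
 * connection is allocated.
 */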
static scoutfs_net_request_t server_req_funcs[] = {
	[SCOUTFS_NET_CMD_GREETING]		= server_greeting,
	[SCOUTFS_NET_CMD_ALLOC_INODES]		= server_alloc_inodes,
	[SCOUTFS_NET_CMD_ALLOC_EXTENT]		= server_alloc_extent,
	[SCOUTFS_NET_CMD_FREE_EXTENTS]		= server_free_extents,
	[SCOUTFS_NET_CMD_ALLOC_SEGNO]		= server_alloc_segno,
	[SCOUTFS_NET_CMD_RECORD_SEGMENT]	= server_record_segment,
	[SCOUTFS_NET_CMD_ADVANCE_SEQ]		= server_advance_seq,
	[SCOUTFS_NET_CMD_GET_LAST_SEQ]		= server_get_last_seq,
	[SCOUTFS_NET_CMD_GET_MANIFEST_ROOT]	= server_get_manifest_root,
	[SCOUTFS_NET_CMD_STATFS]		= server_statfs,
	[SCOUTFS_NET_CMD_LOCK]			= server_lock,
	[SCOUTFS_NET_CMD_FAREWELL]		= server_farewell,
};
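/*
 * Called as a connection becomes usable.  Connections from greeted
 * clients have a nonzero node_id and are tracked so that compaction
 * requests can be sent to them.
 */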
static void server_notify_up(struct super_block *sb,
			     struct scoutfs_net_connection *conn,
			     void *info, u64 node_id)
{
	struct server_client_info *sci = info;
	DECLARE_SERVER_INFO(sb, server);

	if (node_id != 0) {
		sci->node_id = node_id;
		sci->nr_compacts = 0;
		spin_lock(&server->lock);
		list_add_tail(&sci->head, &server->clients);
		server->nr_clients++;
		trace_scoutfs_server_client_up(sb, node_id, server->nr_clients);
		spin_unlock(&server->lock);

		try_queue_compact(server);
	}
}
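/*
 * Called as a connection is torn down.  Greeted clients are removed
 * from tracking and their pending farewell requests and in-flight
 * compactions are forgotten; otherwise the server is shut down.
 */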
static void server_notify_down(struct super_block *sb,
			       struct scoutfs_net_connection *conn,
			       void *info, u64 node_id)
{
	struct server_client_info *sci = info;
	DECLARE_SERVER_INFO(sb, server);

	if (node_id != 0) {
		spin_lock(&server->lock);
		list_del_init(&sci->head);
		server->nr_clients--;
		trace_scoutfs_server_client_down(sb, node_id,
						 server->nr_clients);
		spin_unlock(&server->lock);

		free_farewell_requests(sb, node_id);

		forget_client_compacts(sb, sci);
		try_queue_compact(server);
	} else {
		stop_server(server);
	}
}
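/*
 * The server work runs once the mount has been elected leader.  It
 * binds the listening socket, reads the super block, sets up the
 * server subsystems, and accepts connections until it's told to shut
 * down, at which point it drains processing and tears everything back
 * down.
 */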
static void scoutfs_server_worker(struct work_struct *work)
{
	struct server_info *server = container_of(work, struct server_info,
						  work);
	struct super_block *sb = server->sb;
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super = &sbi->super;
	struct scoutfs_net_connection *conn = NULL;
	DECLARE_WAIT_QUEUE_HEAD(waitq);
	struct sockaddr_in sin;
	LIST_HEAD(conn_list);
	int ret;

	trace_scoutfs_server_work_enter(sb, 0, 0);

	conn = scoutfs_net_alloc_conn(sb, server_notify_up, server_notify_down,
				      sizeof(struct server_client_info),
				      server_req_funcs, "server");
	if (!conn) {
		ret = -ENOMEM;
		goto out;
	}

	sin = server->listen_sin;

	ret = scoutfs_net_bind(sb, conn, &sin);
	if (ret) {
		scoutfs_err(sb, "server failed to bind to "SIN_FMT", err %d%s",
			    SIN_ARG(&sin), ret,
			    ret == -EADDRNOTAVAIL ? " (Bad address?)"
						  : "");
		goto out;
	}

	ret = scoutfs_read_super(sb, super);
	if (ret)
		goto out;

	/* start up the server subsystems before accepting */
	ret = scoutfs_btree_setup(sb) ?:
	      scoutfs_manifest_setup(sb) ?:
	      scoutfs_lock_server_setup(sb);
	if (ret)
		goto shutdown;

	complete(&server->start_comp);

	scoutfs_advance_dirty_super(sb);
	server->stable_manifest_root = super->manifest.root;

	scoutfs_info(sb, "server started on "SIN_FMT, SIN_ARG(&sin));

	/* start accepting connections and processing work */
	server->conn = conn;
	scoutfs_net_listen(sb, conn);

	/* wait_event/wake_up provide barriers */
	wait_event_interruptible(server->waitq, server->shutting_down);

	scoutfs_info(sb, "server shutting down on "SIN_FMT, SIN_ARG(&sin));

shutdown:
	/* wait for request processing */
	scoutfs_net_shutdown(sb, conn);
	/* drain compact work queued by responses */
	cancel_work_sync(&server->compact_work);
	/* wait for commit queued by request processing */
	flush_work(&server->commit_work);
	server->conn = NULL;

	destroy_pending_frees(sb);
	scoutfs_manifest_destroy(sb);
	scoutfs_btree_destroy(sb);
	scoutfs_lock_server_destroy(sb);

out:
	scoutfs_net_free_conn(sb, conn);

	trace_scoutfs_server_work_exit(sb, 0, ret);

	server->err = ret;
	complete(&server->start_comp);
}
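/*
 * Queue the server work and wait for it to either finish starting up
 * or fail; startup errors are returned to the caller.
 */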
/* XXX can we call start multiple times? */
int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin,
			 u64 term, struct scoutfs_quorum_elected_info *qei)
{
	DECLARE_SERVER_INFO(sb, server);

	server->err = 0;
	server->shutting_down = false;
	server->listen_sin = *sin;
	server->term = term;
	server->qei = *qei;
	init_completion(&server->start_comp);

	queue_work(server->wq, &server->work);

	wait_for_completion(&server->start_comp);
	return server->err;
}
/*
 * Start shutdown on the server but don't wait for it to finish.
 */
void scoutfs_server_abort(struct super_block *sb)
{
	DECLARE_SERVER_INFO(sb, server);

	stop_server(server);
}
/*
 * Once the server is stopped we give the caller our election info
 * which might have been modified while we were running.
 */
void scoutfs_server_stop(struct super_block *sb,
			 struct scoutfs_quorum_elected_info *qei)
{
	DECLARE_SERVER_INFO(sb, server);

	stop_server(server);
	/* XXX not sure both are needed */
	cancel_work_sync(&server->work);
	cancel_work_sync(&server->commit_work);

	*qei = server->qei;
}
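/*
 * Allocate and initialize the per-mount server state and its
 * workqueue.  The server work itself only runs once this mount is
 * elected leader.
 */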
int scoutfs_server_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct server_info *server;

	server = kzalloc(sizeof(struct server_info), GFP_KERNEL);
	if (!server)
		return -ENOMEM;

	server->sb = sb;
	spin_lock_init(&server->lock);
	init_waitqueue_head(&server->waitq);
	INIT_WORK(&server->work, scoutfs_server_worker);
	init_rwsem(&server->commit_rwsem);
	init_llist_head(&server->commit_waiters);
	INIT_WORK(&server->commit_work, scoutfs_server_commit_func);
	seqcount_init(&server->stable_seqcount);
	init_rwsem(&server->seq_rwsem);
	init_rwsem(&server->alloc_rwsem);
	INIT_LIST_HEAD(&server->pending_frees);
	INIT_LIST_HEAD(&server->clients);
	server->compacts_per_client = 2;
	INIT_LIST_HEAD(&server->compacts);
	INIT_WORK(&server->compact_work, scoutfs_server_compact_worker);
	mutex_init(&server->farewell_mutex);
	INIT_LIST_HEAD(&server->farewell_requests);
	INIT_WORK(&server->farewell_work, farewell_worker);

	server->wq = alloc_workqueue("scoutfs_server",
				     WQ_UNBOUND | WQ_NON_REENTRANT, 0);
	if (!server->wq) {
		kfree(server);
		return -ENOMEM;
	}

	sbi->server_info = server;
	return 0;
}
/*
 * The caller should have already stopped the server but we do the same
 * just in case.
 */
void scoutfs_server_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct server_info *server = sbi->server_info;

	if (server) {
		stop_server(server);

		/* wait for server work to wait for everything to shut down */
		cancel_work_sync(&server->work);
		/* recv work/compaction could have left commit_work queued */
		cancel_work_sync(&server->commit_work);

		/* pending farewell requests are another server's problem */
		cancel_work_sync(&server->farewell_work);
		free_farewell_requests(sb, 0);

		trace_scoutfs_server_workqueue_destroy(sb, 0, 0);
		destroy_workqueue(server->wq);

		kfree(server);
		sbi->server_info = NULL;
	}
}