mirror of
https://github.com/versity/scoutfs.git
synced 2026-04-23 06:50:30 +00:00
Use the client's rid in networking instead of the node_id. The node_id no longer has to be allocated by the server and sent in the greeting. Instead the client sends its rid to the server in its greeting. The server then uses the client's announced rid just like it used to use its node_id. It's used to record clients in the btree and to identify clients in send and receive processing. The use of the rid in networking calls makes its way to locking and compaction, which now use the rid to identify clients instead of the node_id. Signed-off-by: Zach Brown <zab@versity.com>
542 lines
14 KiB
C
542 lines
14 KiB
C
/*
|
|
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License v2 as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/magic.h>
|
|
#include <linux/random.h>
|
|
#include <linux/statfs.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/debugfs.h>
|
|
#include <linux/percpu.h>
|
|
|
|
#include "super.h"
|
|
#include "block.h"
|
|
#include "export.h"
|
|
#include "format.h"
|
|
#include "inode.h"
|
|
#include "dir.h"
|
|
#include "msg.h"
|
|
#include "counters.h"
|
|
#include "triggers.h"
|
|
#include "trans.h"
|
|
#include "item.h"
|
|
#include "manifest.h"
|
|
#include "seg.h"
|
|
#include "bio.h"
|
|
#include "compact.h"
|
|
#include "data.h"
|
|
#include "lock.h"
|
|
#include "net.h"
|
|
#include "client.h"
|
|
#include "server.h"
|
|
#include "options.h"
|
|
#include "sysfs.h"
|
|
#include "quorum.h"
|
|
#include "scoutfs_trace.h"
|
|
|
|
static struct dentry *scoutfs_debugfs_root;
|
|
|
|
static DEFINE_PER_CPU(u64, clock_sync_ids) = 0;
|
|
|
|
/*
|
|
* Give the caller a unique clock sync id for a message they're about to
|
|
* send. We make the ids reasonably globally unique by using randomly
|
|
* initialized per-cpu 64bit counters.
|
|
*/
|
|
__le64 scoutfs_clock_sync_id(void)
|
|
{
|
|
u64 rnd = 0;
|
|
u64 ret;
|
|
u64 *id;
|
|
|
|
retry:
|
|
preempt_disable();
|
|
id = this_cpu_ptr(&clock_sync_ids);
|
|
if (*id == 0) {
|
|
if (rnd == 0) {
|
|
preempt_enable();
|
|
get_random_bytes(&rnd, sizeof(rnd));
|
|
goto retry;
|
|
}
|
|
*id = rnd;
|
|
}
|
|
|
|
ret = ++(*id);
|
|
preempt_enable();
|
|
|
|
return cpu_to_le64(ret);
|
|
}
|
|
|
|
/*
|
|
* Ask the server for the current statfs fields. The message is very
|
|
* cheap so we're not worrying about spinning in statfs flooding the
|
|
* server with requests. We can add a cache and stale results if that
|
|
* becomes a problem.
|
|
*
|
|
* We fake the number of free inodes value by assuming that we can fill
|
|
* free blocks with a certain number of inodes. We then the number of
|
|
* current inodes to that free count to determine the total possible
|
|
* inodes.
|
|
*
|
|
* The fsid that we report is constructed from the xor of the first two
|
|
* and second two little endian u32s that make up the uuid bytes.
|
|
*/
|
|
static int scoutfs_statfs(struct dentry *dentry, struct kstatfs *kst)
|
|
{
|
|
struct super_block *sb = dentry->d_inode->i_sb;
|
|
struct scoutfs_net_statfs nstatfs;
|
|
__le32 uuid[4];
|
|
int ret;
|
|
|
|
ret = scoutfs_client_statfs(sb, &nstatfs);
|
|
if (ret)
|
|
return ret;
|
|
|
|
kst->f_bfree = le64_to_cpu(nstatfs.bfree);
|
|
kst->f_type = SCOUTFS_SUPER_MAGIC;
|
|
kst->f_bsize = SCOUTFS_BLOCK_SIZE;
|
|
kst->f_blocks = le64_to_cpu(nstatfs.total_blocks);
|
|
kst->f_bavail = kst->f_bfree;
|
|
|
|
kst->f_ffree = kst->f_bfree * 16;
|
|
kst->f_files = kst->f_ffree + le64_to_cpu(nstatfs.next_ino);
|
|
|
|
BUILD_BUG_ON(sizeof(uuid) != sizeof(nstatfs.uuid));
|
|
memcpy(uuid, &nstatfs, sizeof(uuid));
|
|
kst->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[1]);
|
|
kst->f_fsid.val[1] = le32_to_cpu(uuid[2]) ^ le32_to_cpu(uuid[3]);
|
|
kst->f_namelen = SCOUTFS_NAME_LEN;
|
|
kst->f_frsize = SCOUTFS_BLOCK_SIZE;
|
|
/* the vfs fills f_flags */
|
|
|
|
/*
|
|
* We don't take cluster locks in statfs which makes it a very
|
|
* convenient place to trigger lock reclaim for debugging. We
|
|
* try to free as many locks as possible.
|
|
*/
|
|
if (scoutfs_trigger(sb, STATFS_LOCK_PURGE))
|
|
scoutfs_free_unused_locks(sb, -1UL);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
|
|
{
|
|
struct super_block *sb = root->d_sb;
|
|
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
|
|
|
seq_printf(seq, ",server_addr="SIN_FMT, SIN_ARG(&opts->server_addr));
|
|
|
|
return 0;
|
|
}
|
|
|
|
static ssize_t server_addr_show(struct kobject *kobj,
|
|
struct kobj_attribute *attr, char *buf)
|
|
{
|
|
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
|
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
|
|
|
return snprintf(buf, PAGE_SIZE, SIN_FMT"\n",
|
|
SIN_ARG(&opts->server_addr));
|
|
}
|
|
SCOUTFS_ATTR_RO(server_addr);
|
|
|
|
static struct attribute *mount_options_attrs[] = {
|
|
SCOUTFS_ATTR_PTR(server_addr),
|
|
NULL,
|
|
};
|
|
|
|
static int scoutfs_sync_fs(struct super_block *sb, int wait)
|
|
{
|
|
trace_scoutfs_sync_fs(sb, wait);
|
|
scoutfs_inc_counter(sb, trans_commit_sync_fs);
|
|
|
|
return scoutfs_trans_sync(sb, wait);
|
|
}
|
|
|
|
/*
|
|
* This destroys all the state that's built up in the sb info during
|
|
* mount. It's called by us on errors during mount if we haven't set
|
|
* s_root, by mount after returning errors if we have set s_root, and by
|
|
* unmount after having synced the super.
|
|
*/
|
|
static void scoutfs_put_super(struct super_block *sb)
|
|
{
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
|
|
trace_scoutfs_put_super(sb);
|
|
|
|
sbi->shutdown = true;
|
|
|
|
scoutfs_data_destroy(sb);
|
|
|
|
scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
|
|
sbi->rid_lock = NULL;
|
|
|
|
scoutfs_shutdown_trans(sb);
|
|
scoutfs_client_destroy(sb);
|
|
scoutfs_inode_destroy(sb);
|
|
|
|
/* the server locks the listen address and compacts */
|
|
scoutfs_lock_shutdown(sb);
|
|
scoutfs_server_destroy(sb);
|
|
scoutfs_net_destroy(sb);
|
|
scoutfs_seg_destroy(sb);
|
|
scoutfs_lock_destroy(sb);
|
|
|
|
/* server clears quorum leader flag during shutdown */
|
|
scoutfs_quorum_destroy(sb);
|
|
|
|
scoutfs_item_destroy(sb);
|
|
scoutfs_destroy_triggers(sb);
|
|
scoutfs_options_destroy(sb);
|
|
scoutfs_sysfs_destroy_attrs(sb, &sbi->mopts_ssa);
|
|
debugfs_remove(sbi->debug_root);
|
|
scoutfs_destroy_counters(sb);
|
|
scoutfs_destroy_sysfs(sb);
|
|
kfree(sbi);
|
|
|
|
sb->s_fs_info = NULL;
|
|
}
|
|
|
|
static const struct super_operations scoutfs_super_ops = {
|
|
.alloc_inode = scoutfs_alloc_inode,
|
|
.drop_inode = scoutfs_drop_inode,
|
|
.evict_inode = scoutfs_evict_inode,
|
|
.destroy_inode = scoutfs_destroy_inode,
|
|
.sync_fs = scoutfs_sync_fs,
|
|
.statfs = scoutfs_statfs,
|
|
.show_options = scoutfs_show_options,
|
|
.put_super = scoutfs_put_super,
|
|
};
|
|
|
|
/*
|
|
* Write the caller's super. The caller has always read a valid super
|
|
* before modifying and writing it. The caller's super is modified
|
|
* to reflect the write.
|
|
*
|
|
* XXX it'd be pretty easy to preallocate to avoid failure here.
|
|
*/
|
|
int scoutfs_write_super(struct super_block *sb,
|
|
struct scoutfs_super_block *caller)
|
|
{
|
|
struct scoutfs_super_block *super;
|
|
struct page *page;
|
|
int ret;
|
|
|
|
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
|
|
if (!page)
|
|
return -ENOMEM;
|
|
|
|
le64_add_cpu(&caller->hdr.seq, 1);
|
|
|
|
super = page_address(page);
|
|
memcpy(super, caller, sizeof(*super));
|
|
super->hdr.crc = scoutfs_block_calc_crc(&super->hdr);
|
|
|
|
ret = scoutfs_bio_write(sb, &page, le64_to_cpu(super->hdr.blkno), 1);
|
|
WARN_ON_ONCE(ret);
|
|
|
|
__free_page(page);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Read the super block. If it's valid store it in the caller's super
|
|
* struct.
|
|
*/
|
|
int scoutfs_read_super(struct super_block *sb,
|
|
struct scoutfs_super_block *super_res)
|
|
{
|
|
struct scoutfs_super_block *super;
|
|
struct page *page;
|
|
__le32 calc;
|
|
int ret;
|
|
|
|
page = alloc_page(GFP_KERNEL);
|
|
if (!page)
|
|
return -ENOMEM;
|
|
|
|
ret = scoutfs_bio_read(sb, &page, SCOUTFS_SUPER_BLKNO, 1);
|
|
if (ret) {
|
|
scoutfs_err(sb, "error reading super block: %d", ret);
|
|
goto out;
|
|
}
|
|
|
|
super = scoutfs_page_block_address(&page, 0);
|
|
|
|
if (super->hdr.magic != cpu_to_le32(SCOUTFS_BLOCK_MAGIC_SUPER)) {
|
|
scoutfs_err(sb, "super block has invalid magic value 0x%08x",
|
|
le32_to_cpu(super->hdr.magic));
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
calc = scoutfs_block_calc_crc(&super->hdr);
|
|
if (calc != super->hdr.crc) {
|
|
scoutfs_err(sb, "super block has invalid crc 0x%08x, calculated 0x%08x",
|
|
le32_to_cpu(super->hdr.crc), le32_to_cpu(calc));
|
|
ret = -EINVAL;
|
|
}
|
|
|
|
if (le64_to_cpu(super->hdr.blkno) != SCOUTFS_SUPER_BLKNO) {
|
|
scoutfs_err(sb, "super block has invalid block number %llu, data read from %llu",
|
|
le64_to_cpu(super->hdr.blkno), SCOUTFS_SUPER_BLKNO);
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
|
|
if (super->format_hash != cpu_to_le64(SCOUTFS_FORMAT_HASH)) {
|
|
scoutfs_err(sb, "super block has invalid format hash 0x%llx, expected 0x%llx",
|
|
le64_to_cpu(super->format_hash),
|
|
SCOUTFS_FORMAT_HASH);
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
/* XXX do we want more rigorous invalid super checking? */
|
|
|
|
if (super->quorum_count == 0 ||
|
|
super->quorum_count > SCOUTFS_QUORUM_MAX_COUNT) {
|
|
scoutfs_err(sb, "super block has invalid quorum count %u, must be > 0 and <= %u",
|
|
super->quorum_count, SCOUTFS_QUORUM_MAX_COUNT);
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
*super_res = *super;
|
|
ret = 0;
|
|
out:
|
|
__free_page(page);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* This needs to be setup after reading the super because it uses the
|
|
* fsid found in the super block.
|
|
*/
|
|
static int scoutfs_debugfs_setup(struct super_block *sb)
|
|
{
|
|
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
|
char name[32];
|
|
|
|
snprintf(name, ARRAY_SIZE(name), SCSBF, SCSB_ARGS(sb));
|
|
|
|
sbi->debug_root = debugfs_create_dir(name, scoutfs_debugfs_root);
|
|
if (!sbi->debug_root)
|
|
return -ENOMEM;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Calculate a random id for the mount very early, it's used in tracing
|
|
* and message output. The system assumes that a rid of 0 can't exist. We're
|
|
* also paranoid and avoid rids that are likely the result of bad rng.
|
|
*/
|
|
static int assign_random_id(struct scoutfs_sb_info *sbi)
|
|
{
|
|
unsigned int attempts = 0;
|
|
|
|
do {
|
|
if (++attempts == 100)
|
|
return -EIO;
|
|
get_random_bytes(&sbi->rid, sizeof(sbi->rid));
|
|
} while (sbi->rid == 0 || sbi->rid == ~0ULL);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
|
{
|
|
struct scoutfs_sb_info *sbi;
|
|
struct mount_options opts;
|
|
struct inode *inode;
|
|
int ret;
|
|
|
|
trace_scoutfs_fill_super(sb);
|
|
|
|
sb->s_magic = SCOUTFS_SUPER_MAGIC;
|
|
sb->s_maxbytes = MAX_LFS_FILESIZE;
|
|
sb->s_op = &scoutfs_super_ops;
|
|
sb->s_export_op = &scoutfs_export_ops;
|
|
|
|
/* btree blocks use long lived bh->b_data refs */
|
|
mapping_set_gfp_mask(sb->s_bdev->bd_inode->i_mapping, GFP_NOFS);
|
|
|
|
sbi = kzalloc(sizeof(struct scoutfs_sb_info), GFP_KERNEL);
|
|
sb->s_fs_info = sbi;
|
|
sbi->sb = sb;
|
|
if (!sbi)
|
|
return -ENOMEM;
|
|
|
|
ret = assign_random_id(sbi);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
spin_lock_init(&sbi->next_ino_lock);
|
|
init_waitqueue_head(&sbi->trans_hold_wq);
|
|
spin_lock_init(&sbi->data_wait_root.lock);
|
|
sbi->data_wait_root.root = RB_ROOT;
|
|
spin_lock_init(&sbi->trans_write_lock);
|
|
INIT_DELAYED_WORK(&sbi->trans_write_work, scoutfs_trans_write_func);
|
|
init_waitqueue_head(&sbi->trans_write_wq);
|
|
scoutfs_sysfs_init_attrs(sb, &sbi->mopts_ssa);
|
|
|
|
ret = scoutfs_parse_options(sb, data, &opts);
|
|
if (ret)
|
|
goto out;
|
|
|
|
sbi->opts = opts;
|
|
|
|
ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SIZE);
|
|
if (ret != SCOUTFS_BLOCK_SIZE) {
|
|
scoutfs_err(sb, "failed to set blocksize, returned %d", ret);
|
|
ret = -EIO;
|
|
goto out;
|
|
}
|
|
|
|
ret = scoutfs_read_super(sb, &SCOUTFS_SB(sb)->super) ?:
|
|
scoutfs_debugfs_setup(sb) ?:
|
|
scoutfs_setup_sysfs(sb) ?:
|
|
scoutfs_setup_counters(sb) ?:
|
|
scoutfs_options_setup(sb) ?:
|
|
scoutfs_sysfs_create_attrs(sb, &sbi->mopts_ssa,
|
|
mount_options_attrs, "mount_options") ?:
|
|
scoutfs_setup_triggers(sb) ?:
|
|
scoutfs_seg_setup(sb) ?:
|
|
scoutfs_item_setup(sb) ?:
|
|
scoutfs_inode_setup(sb) ?:
|
|
scoutfs_data_setup(sb) ?:
|
|
scoutfs_setup_trans(sb) ?:
|
|
scoutfs_lock_setup(sb) ?:
|
|
scoutfs_net_setup(sb) ?:
|
|
scoutfs_quorum_setup(sb) ?:
|
|
scoutfs_server_setup(sb) ?:
|
|
scoutfs_client_setup(sb) ?:
|
|
scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
|
|
&sbi->rid_lock);
|
|
if (ret)
|
|
goto out;
|
|
|
|
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO);
|
|
if (IS_ERR(inode)) {
|
|
ret = PTR_ERR(inode);
|
|
goto out;
|
|
}
|
|
|
|
sb->s_root = d_make_root(inode);
|
|
if (!sb->s_root) {
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
ret = scoutfs_client_advance_seq(sb, &sbi->trans_seq);
|
|
if (ret)
|
|
goto out;
|
|
|
|
scoutfs_trans_restart_sync_deadline(sb);
|
|
// scoutfs_scan_orphans(sb);
|
|
ret = 0;
|
|
out:
|
|
/* on error, generic_shutdown_super calls put_super if s_root */
|
|
if (ret && !sb->s_root)
|
|
scoutfs_put_super(sb);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static struct dentry *scoutfs_mount(struct file_system_type *fs_type, int flags,
|
|
const char *dev_name, void *data)
|
|
{
|
|
return mount_bdev(fs_type, flags, dev_name, data, scoutfs_fill_super);
|
|
}
|
|
|
|
/*
 * The file system type's kill_sb method.  It records a trace event and
 * then lets kill_block_super() do the work, which eventually calls
 * ->put_super if s_root is set.
 */
static void scoutfs_kill_sb(struct super_block *sb)
{
	trace_scoutfs_kill_sb(sb);
	kill_block_super(sb);
}
|
|
|
|
static struct file_system_type scoutfs_fs_type = {
|
|
.owner = THIS_MODULE,
|
|
.name = "scoutfs",
|
|
.mount = scoutfs_mount,
|
|
.kill_sb = scoutfs_kill_sb,
|
|
.fs_flags = FS_REQUIRES_DEV,
|
|
};
|
|
MODULE_ALIAS_FS("scoutfs");
|
|
|
|
/* safe to call at any failure point in _init */
|
|
static void teardown_module(void)
|
|
{
|
|
debugfs_remove(scoutfs_debugfs_root);
|
|
scoutfs_dir_exit();
|
|
scoutfs_inode_exit();
|
|
scoutfs_sysfs_exit();
|
|
}
|
|
|
|
static int __init scoutfs_module_init(void)
|
|
{
|
|
int ret;
|
|
|
|
/*
|
|
* gcc only recently learned to let __attribute__(section) add
|
|
* SHT_NOTE notes. But the assembler always could.
|
|
*/
|
|
__asm__ __volatile__ (
|
|
".section .note.git_describe,\"a\"\n"
|
|
".string \""SCOUTFS_GIT_DESCRIBE"\\n\"\n"
|
|
".previous\n");
|
|
|
|
scoutfs_init_counters();
|
|
|
|
ret = scoutfs_sysfs_init();
|
|
if (ret)
|
|
return ret;
|
|
|
|
scoutfs_debugfs_root = debugfs_create_dir("scoutfs", NULL);
|
|
if (!scoutfs_debugfs_root) {
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
ret = scoutfs_inode_init() ?:
|
|
scoutfs_dir_init() ?:
|
|
register_filesystem(&scoutfs_fs_type);
|
|
out:
|
|
if (ret)
|
|
teardown_module();
|
|
return ret;
|
|
}
|
|
module_init(scoutfs_module_init)
|
|
|
|
static void __exit scoutfs_module_exit(void)
|
|
{
|
|
unregister_filesystem(&scoutfs_fs_type);
|
|
teardown_module();
|
|
}
|
|
module_exit(scoutfs_module_exit)
|
|
|
|
/* Module metadata, including the git describe string of the build. */
MODULE_AUTHOR("Zach Brown <zab@versity.com>");
MODULE_LICENSE("GPL");
MODULE_INFO(git_describe, SCOUTFS_GIT_DESCRIBE);
|