mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-03 10:55:20 +00:00
Initial commit
This is the initial commit of the repo that will track development against distro kernels. This is an import of a prototype branch in the upstream kernel that only had a few initial commits. It needed to move to the old readdir interface and use find_or_create_page() instead of pagecache_get_page() to build in older distro kernels.
This commit is contained in:
7
kmod/.gitignore
vendored
Normal file
7
kmod/.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
src/*.o
|
||||
src/*.ko
|
||||
src/*.mod.c
|
||||
src/*.cmd
|
||||
src/.tmp_versions/
|
||||
src/Module.symvers
|
||||
src/modules.order
|
||||
4
kmod/Makefile
Normal file
4
kmod/Makefile
Normal file
@@ -0,0 +1,4 @@
|
||||
ALL: module
|
||||
|
||||
module:
|
||||
make CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(PWD)/src
|
||||
10
kmod/src/Kconfig
Normal file
10
kmod/src/Kconfig
Normal file
@@ -0,0 +1,10 @@
|
||||
config SCOUTFS_FS
|
||||
tristate "scoutfs filesystem"
|
||||
help
|
||||
scoutfs is a clustered file system that stores data in large
|
||||
blocks in shared block storage.
|
||||
|
||||
To compile this file system support as a module, choose M here. The
|
||||
module will be called scoutfs.
|
||||
|
||||
If unsure, say N.
|
||||
3
kmod/src/Makefile
Normal file
3
kmod/src/Makefile
Normal file
@@ -0,0 +1,3 @@
|
||||
obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o
|
||||
|
||||
scoutfs-y += dir.o inode.o item.o lsm.o mkfs.o super.o
|
||||
551
kmod/src/dir.c
Normal file
551
kmod/src/dir.c
Normal file
@@ -0,0 +1,551 @@
|
||||
/*
|
||||
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/crc32c.h>
|
||||
#include <linux/uio.h>
|
||||
|
||||
#include "format.h"
|
||||
#include "dir.h"
|
||||
#include "inode.h"
|
||||
#include "key.h"
|
||||
#include "item.h"
|
||||
#include "super.h"
|
||||
|
||||
/*
|
||||
* Directory entries are stored in items whose offset is determined by
|
||||
* the hash of the entry's name. This was primarily chosen to minimize
|
||||
* the amount of data stored for each entry.
|
||||
*
|
||||
* Because we're hashing the name we need to worry about collisions. We
|
||||
* store all the entries with the same hash value in the item. This was
|
||||
* done so that create works with one specific item.
|
||||
*
|
||||
* readdir iterates over these items in hash order. The high bits of
|
||||
* the entry's readdir f_pos come from the item offset while the low
|
||||
* bits come from a collision number in the entry.
|
||||
*
|
||||
* The full readdir position, and thus the absolute max number of
|
||||
* entries in a directory, is limited to 2^31 to avoid the risk of
|
||||
* breaking legacy environments. Even with a relatively small 27bit
|
||||
* item offset allowing 16 colliding entries gets well into hundreds of
|
||||
* millions of entries before an item fills up and we return a premature
|
||||
* ENOSPC. Hundreds of millions in a single dir ought to be, wait for
|
||||
* it, good enough for anybody.
|
||||
*
|
||||
* Each item's contents are protected by the dir inode's i_mutex that
|
||||
* callers acquire before calling our dir operations. If we wanted more
|
||||
* fine grained concurrency, and we might, we'd have to be careful to
|
||||
* manage the shared items.
|
||||
*/
|
||||
|
||||
/*
 * Translate the S_IF* bits of a vfs mode into the on-disk scoutfs
 * dirent type value.
 */
static unsigned int mode_to_type(umode_t mode)
{
#define S_SHIFT 12
	static unsigned char types[S_IFMT >> S_SHIFT] = {
		[S_IFSOCK >> S_SHIFT]	= SCOUTFS_DT_SOCK,
		[S_IFLNK >> S_SHIFT]	= SCOUTFS_DT_LNK,
		[S_IFREG >> S_SHIFT]	= SCOUTFS_DT_REG,
		[S_IFBLK >> S_SHIFT]	= SCOUTFS_DT_BLK,
		[S_IFDIR >> S_SHIFT]	= SCOUTFS_DT_DIR,
		[S_IFCHR >> S_SHIFT]	= SCOUTFS_DT_CHR,
		[S_IFIFO >> S_SHIFT]	= SCOUTFS_DT_FIFO,
	};
	unsigned int type = types[(mode & S_IFMT) >> S_SHIFT];
#undef S_SHIFT

	return type;
}
|
||||
|
||||
#if 0
|
||||
static unsigned int dentry_type(unsigned int type)
|
||||
{
|
||||
static unsigned char types[] = {
|
||||
[SCOUTFS_DT_FIFO] = DT_FIFO,
|
||||
[SCOUTFS_DT_CHR] = DT_CHR,
|
||||
[SCOUTFS_DT_DIR] = DT_DIR,
|
||||
[SCOUTFS_DT_BLK] = DT_BLK,
|
||||
[SCOUTFS_DT_REG] = DT_REG,
|
||||
[SCOUTFS_DT_LNK] = DT_LNK,
|
||||
[SCOUTFS_DT_SOCK] = DT_SOCK,
|
||||
[SCOUTFS_DT_WHT] = DT_WHT,
|
||||
};
|
||||
|
||||
if (type < ARRAY_SIZE(types))
|
||||
return types[type];
|
||||
|
||||
return DT_UNKNOWN;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* true iff the two names have identical lengths and bytes */
static int names_equal(const char *name_a, int len_a, const char *name_b,
		       int len_b)
{
	if (len_a != len_b)
		return 0;

	return memcmp(name_a, name_b, len_a) == 0;
}
|
||||
|
||||
/*
 * Return the offset portion of a dirent key from the hash of the name.
 *
 * The per-inode salt (set at inode creation) varies the hashing from
 * directory to directory.
 *
 * XXX This crc nonsense is a quick hack.  We'll want something a
 * lot stronger like siphash.
 */
static u32 name_hash(struct inode *dir, const char *name, unsigned int len)
{
	struct scoutfs_inode_info *ci = SCOUTFS_I(dir);

	/* keep only the high SCOUTFS_DIRENT_OFF_BITS bits of the crc */
	return crc32c(ci->salt, name, len) >> (32 - SCOUTFS_DIRENT_OFF_BITS);
}
|
||||
|
||||
static unsigned int dent_bytes(unsigned int name_len)
|
||||
{
|
||||
return sizeof(struct scoutfs_dirent) + name_len;
|
||||
}
|
||||
|
||||
static unsigned int dent_val_off(struct scoutfs_item *item,
|
||||
struct scoutfs_dirent *dent)
|
||||
{
|
||||
return (char *)dent - (char *)item->val;
|
||||
}
|
||||
|
||||
static inline struct scoutfs_dirent *next_dent(struct scoutfs_item *item,
|
||||
struct scoutfs_dirent *dent)
|
||||
{
|
||||
unsigned int next_off;
|
||||
|
||||
next_off = dent_val_off(item, dent) + dent_bytes(dent->name_len);
|
||||
if (next_off == item->val_len)
|
||||
return NULL;
|
||||
|
||||
return item->val + next_off;
|
||||
}
|
||||
|
||||
/* walk every dirent packed into an item's value; dent ends NULL */
#define for_each_item_dent(item, dent) \
	for (dent = item->val; dent; dent = next_dent(item, dent))

/* per-dentry state hung off dentry->d_fsdata */
struct dentry_info {
	/*
	 * The key offset and collision nr are stored so that we don't
	 * have to either hash the name to find the item or compare
	 * names to find the dirent in the item.
	 */
	u32 key_offset;	/* hash offset of the item holding our dirent */
	u8 coll_nr;	/* collision slot of our dirent within it */
};

static struct kmem_cache *scoutfs_dentry_cachep;
|
||||
|
||||
/*
 * Return the dentry_info attached to d_fsdata, allocating and
 * attaching one first if needed.  Losing a racing attach is fine:
 * the loser frees its copy and returns the winner's.
 *
 * Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct dentry_info *alloc_dentry_info(struct dentry *dentry)
{
	struct dentry_info *di;

	/* XXX read mb? */
	if (dentry->d_fsdata)
		return dentry->d_fsdata;

	di = kmem_cache_zalloc(scoutfs_dentry_cachep, GFP_NOFS);
	if (!di)
		return ERR_PTR(-ENOMEM);

	/* only the first attach under d_lock sticks */
	spin_lock(&dentry->d_lock);
	if (!dentry->d_fsdata)
		dentry->d_fsdata = di;
	spin_unlock(&dentry->d_lock);

	/* we lost the race; free our copy and use the winner's */
	if (di != dentry->d_fsdata)
		kmem_cache_free(scoutfs_dentry_cachep, di);

	return dentry->d_fsdata;
}
|
||||
|
||||
/*
 * Lookup searches for an entry for the given name amongst the entries
 * stored in the item at the name's hash.
 *
 * On a hit the item offset and collision nr are remembered in the
 * dentry's fsdata so later unlink doesn't have to rehash the name or
 * compare names in the item.
 */
static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
				     unsigned int flags)
{
	struct super_block *sb = dir->i_sb;
	struct scoutfs_dirent *dent;
	struct scoutfs_item *item;
	struct dentry_info *di;
	struct scoutfs_key key;
	struct inode *inode;
	u64 ino = 0;
	u32 h = 0;
	u32 nr = 0;
	int ret;

	di = alloc_dentry_info(dentry);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	}

	if (dentry->d_name.len > SCOUTFS_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out;
	}

	h = name_hash(dir, dentry->d_name.name, dentry->d_name.len);
	scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);

	item = scoutfs_item_lookup(sb, &key);
	if (IS_ERR(item)) {
		ret = PTR_ERR(item);
		goto out;
	}

	/* walk the colliding entries looking for an exact name match */
	ret = -ENOENT;
	for_each_item_dent(item, dent) {
		if (names_equal(dentry->d_name.name, dentry->d_name.len,
				dent->name, dent->name_len)) {
			ino = le64_to_cpu(dent->ino);
			nr = dent->coll_nr;
			ret = 0;
			break;
		}
	}

	scoutfs_item_put(item);
out:
	if (ret == -ENOENT) {
		/* negative dentry: hand d_splice_alias a NULL inode */
		inode = NULL;
	} else if (ret) {
		inode = ERR_PTR(ret);
	} else {
		di->key_offset = h;
		di->coll_nr = nr;
		inode = scoutfs_iget(sb, ino);
	}

	/*
	 * NOTE(review): some older d_splice_alias() implementations
	 * don't accept an IS_ERR() inode -- confirm the target kernel
	 * does, or return ERR_CAST(inode) directly on error.
	 */
	return d_splice_alias(inode, dentry);
}
|
||||
|
||||
/* this exists upstream so we can just delete it in a forward port */
|
||||
static int dir_emit_dots(struct file *file, void *dirent, filldir_t filldir)
|
||||
{
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct inode *parent = dentry->d_parent->d_inode;
|
||||
|
||||
if (file->f_pos == 0) {
|
||||
if (!filldir(dirent, ".", 1, 1, scoutfs_ino(inode), DT_DIR))
|
||||
return 0;
|
||||
file->f_pos = 1;
|
||||
}
|
||||
|
||||
if (file->f_pos == 1) {
|
||||
if (!filldir(dirent, "..", 2, 1, scoutfs_ino(parent), DT_DIR))
|
||||
return 0;
|
||||
file->f_pos = 2;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * readdir finds the next entry at or past the hash|coll_nr stored in
 * the ctx->pos (f_pos).
 *
 * It will need to be careful not to read past the region of the dirent
 * hash offset keys that it has access to.
 */
static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	struct scoutfs_dirent *dent;
	struct scoutfs_key last_key;
	struct scoutfs_item *item;
	struct scoutfs_key key;
	u32 nr;
	u32 off;
	u64 pos;
	int ret = 0;

	/* "." and ".." occupy f_pos 0 and 1 and aren't stored in items */
	if (!dir_emit_dots(file, dirent, filldir))
		return 0;

	/* the largest possible dirent key belonging to this dir */
	scoutfs_set_key(&last_key, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
			SCOUTFS_DIRENT_OFF_MASK);

	do {
		/* split f_pos into item hash offset and collision nr */
		off = file->f_pos >> SCOUTFS_DIRENT_COLL_BITS;
		nr = file->f_pos & SCOUTFS_DIRENT_COLL_MASK;

		scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
				off);
		item = scoutfs_item_next(sb, &key);
		if (IS_ERR(item)) {
			ret = PTR_ERR(item);
			/* no more items anywhere: clean end of dir */
			if (ret == -ENOENT)
				ret = 0;
			break;
		}

		/* stop once iteration leaves this dir's dirent keys */
		if (scoutfs_key_cmp(&item->key, &last_key) > 0) {
			scoutfs_item_put(item);
			break;
		}

		/* reset nr to 0 if we found the next item */
		if (scoutfs_key_offset(&item->key) != off)
			nr = 0;

		pos = scoutfs_key_offset(&item->key)
			<< SCOUTFS_DIRENT_COLL_BITS;
		for_each_item_dent(item, dent) {
			/* skip entries before the resume collision nr */
			if (dent->coll_nr < nr)
				continue;

			/* dent stays non-NULL here: filldir buffer full */
			if (!filldir(dirent, dent->name, dent->name_len, pos,
				     le64_to_cpu(dent->ino), dent->type))
				break;

			/* resume after this entry on the next call */
			file->f_pos = (pos | dent->coll_nr) + 1;
		}

		scoutfs_item_put(item);

		/* advance to the next hash value if we finished item */
		if (dent == NULL)
			file->f_pos = pos + (1 << SCOUTFS_DIRENT_COLL_BITS);

	} while (dent == NULL);

	return ret;
}
|
||||
|
||||
static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
dev_t rdev)
|
||||
{
|
||||
struct super_block *sb = dir->i_sb;
|
||||
struct inode *inode = NULL;
|
||||
struct scoutfs_dirent *dent;
|
||||
struct scoutfs_item *item;
|
||||
struct dentry_info *di;
|
||||
struct scoutfs_key key;
|
||||
int bytes;
|
||||
int ret;
|
||||
int off;
|
||||
u64 nr;
|
||||
u64 h;
|
||||
|
||||
di = alloc_dentry_info(dentry);
|
||||
if (IS_ERR(di))
|
||||
return PTR_ERR(di);
|
||||
|
||||
if (dentry->d_name.len > SCOUTFS_NAME_LEN)
|
||||
return -ENAMETOOLONG;
|
||||
|
||||
inode = scoutfs_new_inode(sb, dir, mode, rdev);
|
||||
if (IS_ERR(inode))
|
||||
return PTR_ERR(inode);
|
||||
|
||||
h = name_hash(dir, dentry->d_name.name, dentry->d_name.len);
|
||||
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
|
||||
bytes = dent_bytes(dentry->d_name.len);
|
||||
|
||||
item = scoutfs_item_lookup(sb, &key);
|
||||
if (item == ERR_PTR(-ENOENT)) {
|
||||
item = scoutfs_item_create(sb, &key, bytes);
|
||||
if (!IS_ERR(item)) {
|
||||
/* mark a newly created item */
|
||||
dent = item->val;
|
||||
dent->name_len = 0;
|
||||
}
|
||||
}
|
||||
if (IS_ERR(item)) {
|
||||
ret = PTR_ERR(item);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
nr = 0;
|
||||
for_each_item_dent(item, dent) {
|
||||
/* the common case of a newly created item */
|
||||
if (!dent->name_len)
|
||||
break;
|
||||
|
||||
/* XXX check for eexist? can't happen? */
|
||||
|
||||
/* found a free coll nr, insert here */
|
||||
if (nr < dent->coll_nr) {
|
||||
off = dent_val_off(item, dent);
|
||||
ret = scoutfs_item_expand(item, off, bytes);
|
||||
if (!ret)
|
||||
dent = item->val + off;
|
||||
break;
|
||||
}
|
||||
|
||||
/* the item's full */
|
||||
if (nr++ == SCOUTFS_DIRENT_COLL_MASK) {
|
||||
ret = -ENOSPC;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ret) {
|
||||
dent->ino = cpu_to_le64(scoutfs_ino(inode));
|
||||
dent->type = mode_to_type(inode->i_mode);
|
||||
dent->coll_nr = nr;
|
||||
dent->name_len = dentry->d_name.len;
|
||||
memcpy(dent->name, dentry->d_name.name, dent->name_len);
|
||||
di->key_offset = h;
|
||||
di->coll_nr = nr;
|
||||
}
|
||||
|
||||
scoutfs_item_put(item);
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
|
||||
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
||||
|
||||
if (S_ISDIR(mode)) {
|
||||
inc_nlink(inode);
|
||||
inc_nlink(dir);
|
||||
}
|
||||
|
||||
mark_inode_dirty(inode);
|
||||
mark_inode_dirty(dir);
|
||||
|
||||
insert_inode_hash(inode);
|
||||
d_instantiate(dentry, inode);
|
||||
out:
|
||||
/* XXX delete the inode item here */
|
||||
if (ret && !IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* XXX hmm, do something with excl? */
|
||||
static int scoutfs_create(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool excl)
|
||||
{
|
||||
return scoutfs_mknod(dir, dentry, mode | S_IFREG, 0);
|
||||
}
|
||||
|
||||
/* mkdir is mknod with the dir type; mknod handles the link counts */
static int scoutfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	return scoutfs_mknod(dir, dentry, S_IFDIR | mode, 0);
}
|
||||
|
||||
/*
|
||||
* Unlink removes the entry from its item and removes the item if ours
|
||||
* was the only remaining entry.
|
||||
*/
|
||||
static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
struct super_block *sb = dir->i_sb;
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct timespec ts = current_kernel_time();
|
||||
struct scoutfs_dirent *dent;
|
||||
struct scoutfs_item *item;
|
||||
struct dentry_info *di;
|
||||
struct scoutfs_key key;
|
||||
int ret = 0;
|
||||
|
||||
if (WARN_ON_ONCE(!dentry->d_fsdata))
|
||||
return -EINVAL;
|
||||
di = dentry->d_fsdata;
|
||||
|
||||
trace_printk("dir size %llu entry k_off nr %u %u\n",
|
||||
i_size_read(inode), di->key_offset, di->coll_nr);
|
||||
|
||||
if (S_ISDIR(inode->i_mode) && i_size_read(inode))
|
||||
return -ENOTEMPTY;
|
||||
|
||||
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY,
|
||||
di->key_offset);
|
||||
|
||||
item = scoutfs_item_lookup(sb, &key);
|
||||
if (IS_ERR(item)) {
|
||||
ret = PTR_ERR(item);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* XXX error to not find the coll nr we were looking for? */
|
||||
for_each_item_dent(item, dent) {
|
||||
if (dent->coll_nr != di->coll_nr)
|
||||
continue;
|
||||
|
||||
/* XXX compare names and eio? */
|
||||
|
||||
if (item->val_len == dent_bytes(dent->name_len)) {
|
||||
scoutfs_item_delete(sb, item);
|
||||
ret = 0;
|
||||
} else {
|
||||
ret = scoutfs_item_shrink(item,
|
||||
dent_val_off(item, dent),
|
||||
dent_bytes(dent->name_len));
|
||||
}
|
||||
dent = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
scoutfs_item_put(item);
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
dir->i_ctime = ts;
|
||||
dir->i_mtime = ts;
|
||||
i_size_write(dir, i_size_read(dir) - dentry->d_name.len);
|
||||
|
||||
inode->i_ctime = ts;
|
||||
drop_nlink(inode);
|
||||
if (S_ISDIR(inode->i_mode)) {
|
||||
drop_nlink(dir);
|
||||
drop_nlink(inode);
|
||||
}
|
||||
mark_inode_dirty(inode);
|
||||
mark_inode_dirty(dir);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Uses the old ->readdir() interface so the module builds against
 * older distro kernels; a forward port moves to iterate()/dir_emit().
 */
const struct file_operations scoutfs_dir_fops = {
	.readdir	= scoutfs_readdir,
};

const struct inode_operations scoutfs_dir_iops = {
	.lookup		= scoutfs_lookup,
	.mknod		= scoutfs_mknod,
	.create		= scoutfs_create,
	.mkdir		= scoutfs_mkdir,
	.unlink		= scoutfs_unlink,
	/* rmdir shares unlink, which handles the extra dir link counts */
	.rmdir		= scoutfs_unlink,
};
|
||||
|
||||
void scoutfs_dir_exit(void)
|
||||
{
|
||||
if (scoutfs_dentry_cachep) {
|
||||
kmem_cache_destroy(scoutfs_dentry_cachep);
|
||||
scoutfs_dentry_cachep = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
int scoutfs_dir_init(void)
|
||||
{
|
||||
scoutfs_dentry_cachep = kmem_cache_create("scoutfs_dentry_info",
|
||||
sizeof(struct dentry_info), 0,
|
||||
SLAB_RECLAIM_ACCOUNT, NULL);
|
||||
if (!scoutfs_dentry_cachep)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
10
kmod/src/dir.h
Normal file
10
kmod/src/dir.h
Normal file
@@ -0,0 +1,10 @@
|
||||
#ifndef _SCOUTFS_DIR_H_
#define _SCOUTFS_DIR_H_

/* directory entry operations implemented in dir.c */
extern const struct file_operations scoutfs_dir_fops;
extern const struct inode_operations scoutfs_dir_iops;

/* set up and tear down the dentry_info kmem cache */
int scoutfs_dir_init(void);
void scoutfs_dir_exit(void);

#endif
||||
122
kmod/src/format.h
Normal file
122
kmod/src/format.h
Normal file
@@ -0,0 +1,122 @@
|
||||
#ifndef _SCOUTFS_FORMAT_H_
|
||||
#define _SCOUTFS_FORMAT_H_
|
||||
|
||||
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */

/* large 4MB blocks (1 << 22) */
#define SCOUTFS_BLOCK_SHIFT 22
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)

/*
 * This bloom size is chosen to have a roughly 1% false positive rate
 * for ~90k items which is roughly the worst case for a block full of
 * dirents with reasonably small names.  Pathologically smaller items
 * could be even more dense.
 */
#define SCOUTFS_BLOOM_FILTER_BYTES (128 * 1024)
#define SCOUTFS_BLOOM_FILTER_BITS (SCOUTFS_BLOOM_FILTER_BYTES * 8)
/* bits needed to index a single bit in the filter */
#define SCOUTFS_BLOOM_INDEX_BITS (ilog2(SCOUTFS_BLOOM_FILTER_BITS))
#define SCOUTFS_BLOOM_INDEX_MASK ((1 << SCOUTFS_BLOOM_INDEX_BITS) - 1)
#define SCOUTFS_BLOOM_INDEX_NR 7
|
||||
|
||||
/*
 * We should be able to make the offset smaller if neither dirents nor
 * data items use the full 64 bits.
 */
struct scoutfs_key {
	__le64 inode;	/* inode number the item belongs to */
	u8 type;	/* SCOUTFS_*_KEY item type */
	__le64 offset;	/* type-specific offset (e.g. dirent name hash) */
} __packed;

#define SCOUTFS_INODE_KEY 128
#define SCOUTFS_DIRENT_KEY 192

struct scoutfs_lsm_block {
	struct scoutfs_key first;	/* smallest item key in the block */
	struct scoutfs_key last;	/* largest item key in the block */
	__le32 nr_items;
	/* u8 bloom[SCOUTFS_BLOOM_BYTES]; */
	/* struct scoutfs_item_header items[0] .. */
} __packed;

struct scoutfs_item_header {
	struct scoutfs_key key;
	__le16 val_len;		/* bytes of item value after the header */
} __packed;
|
||||
|
||||
|
||||
struct scoutfs_timespec {
	__le64 sec;
	__le32 nsec;
} __packed;

/*
 * XXX
 * - otime?
 * - compat flags?
 * - version?
 * - generation?
 * - be more careful with rdev?
 */
struct scoutfs_inode {
	__le64 size;
	__le64 blocks;
	__le32 nlink;
	__le32 uid;
	__le32 gid;
	__le32 mode;
	__le32 rdev;
	__le32 salt;	/* random per-inode salt for dirent name hashing */
	struct scoutfs_timespec atime;
	struct scoutfs_timespec ctime;
	struct scoutfs_timespec mtime;
} __packed;

#define SCOUTFS_ROOT_INO 1
|
||||
|
||||
/*
 * Dirents are stored in items with an offset of the hash of their name.
 * Colliding names are packed into the value.
 */
struct scoutfs_dirent {
	__le64 ino;		/* target inode number */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8 type:4,		/* SCOUTFS_DT_* file type */
	     coll_nr:4;		/* hash collision slot within the item */
#else
	__u8 coll_nr:4,
	     type:4;
#endif
	__u8 name_len;		/* bytes in name[]; 0 marks a fresh item */
	__u8 name[0];
} __packed;

#define SCOUTFS_NAME_LEN 255

/*
 * We only use 31 bits for readdir positions so that we don't confuse
 * old signed 32bit f_pos applications or those on the other side of
 * network protocols that have limited readdir positions.
 */

/* high f_pos bits: the dirent item's key offset (name hash) */
#define SCOUTFS_DIRENT_OFF_BITS 27
#define SCOUTFS_DIRENT_OFF_MASK ((1 << SCOUTFS_DIRENT_OFF_BITS) - 1)
/* low f_pos bits: the collision nr within the item */
#define SCOUTFS_DIRENT_COLL_BITS 4
#define SCOUTFS_DIRENT_COLL_MASK ((1 << SCOUTFS_DIRENT_COLL_BITS) - 1)

/* getdents returns the *next* pos with each entry. so we can't return ~0 */
#define SCOUTFS_DIRENT_MAX_POS \
	(((1 << (SCOUTFS_DIRENT_OFF_BITS + SCOUTFS_DIRENT_COLL_BITS)) - 1) - 1)

/* on-disk file type values stored in scoutfs_dirent.type */
enum {
	SCOUTFS_DT_FIFO = 0,
	SCOUTFS_DT_CHR,
	SCOUTFS_DT_DIR,
	SCOUTFS_DT_BLK,
	SCOUTFS_DT_REG,
	SCOUTFS_DT_LNK,
	SCOUTFS_DT_SOCK,
	SCOUTFS_DT_WHT,
};
|
||||
|
||||
#endif
|
||||
272
kmod/src/inode.c
Normal file
272
kmod/src/inode.c
Normal file
@@ -0,0 +1,272 @@
|
||||
/*
|
||||
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
#include "format.h"
|
||||
#include "super.h"
|
||||
#include "key.h"
|
||||
#include "inode.h"
|
||||
#include "item.h"
|
||||
#include "dir.h"
|
||||
|
||||
/*
|
||||
* XXX
|
||||
* - worry about i_ino trunctation, not sure if we do anything
|
||||
* - use inode item value lengths for forward/back compat
|
||||
*/
|
||||
|
||||
static struct kmem_cache *scoutfs_inode_cachep;
|
||||
|
||||
static void scoutfs_inode_ctor(void *obj)
|
||||
{
|
||||
struct scoutfs_inode_info *ci = obj;
|
||||
|
||||
inode_init_once(&ci->inode);
|
||||
}
|
||||
|
||||
struct inode *scoutfs_alloc_inode(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_inode_info *ci;
|
||||
|
||||
ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
|
||||
if (!ci)
|
||||
return NULL;
|
||||
|
||||
return &ci->inode;
|
||||
}
|
||||
|
||||
/* RCU callback that finally frees the inode after the grace period */
static void scoutfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	trace_printk("freeing inode %p\n", inode);
	kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
}

/* defer freeing so rcu walkers of the inode can finish safely */
void scoutfs_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, scoutfs_i_callback);
}
|
||||
|
||||
/*
 * Called once new inode allocation or inode reading has initialized
 * enough of the inode for us to set the ops based on the mode.
 */
static void set_inode_ops(struct inode *inode)
{
	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		/* regular file ops aren't implemented yet */
		// inode->i_mapping->a_ops = &scoutfs_file_aops;
		// inode->i_op = &scoutfs_file_iops;
		// inode->i_fop = &scoutfs_file_fops;
		break;
	case S_IFDIR:
		inode->i_op = &scoutfs_dir_iops;
		inode->i_fop = &scoutfs_dir_fops;
		break;
	case S_IFLNK:
		/* symlink ops aren't implemented yet */
		// inode->i_op = &scoutfs_symlink_iops;
		break;
	default:
		/* char/block/fifo/sock get the generic special ops */
		// inode->i_op = &scoutfs_special_iops;
		init_special_inode(inode, inode->i_mode, inode->i_rdev);
		break;
	}
}
|
||||
|
||||
/*
 * Populate a vfs inode from its on-disk little endian inode item
 * value.  The inverse of store_inode().
 */
static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
{
	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);

	i_size_write(inode, le64_to_cpu(cinode->size));
	set_nlink(inode, le32_to_cpu(cinode->nlink));
	i_uid_write(inode, le32_to_cpu(cinode->uid));
	i_gid_write(inode, le32_to_cpu(cinode->gid));
	inode->i_mode = le32_to_cpu(cinode->mode);
	inode->i_rdev = le32_to_cpu(cinode->rdev);
	inode->i_atime.tv_sec = le64_to_cpu(cinode->atime.sec);
	inode->i_atime.tv_nsec = le32_to_cpu(cinode->atime.nsec);
	inode->i_mtime.tv_sec = le64_to_cpu(cinode->mtime.sec);
	inode->i_mtime.tv_nsec = le32_to_cpu(cinode->mtime.nsec);
	inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec);
	inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);

	/* salt seeds dirent name hashing for this inode's entries */
	ci->salt = le32_to_cpu(cinode->salt);
}
|
||||
|
||||
static int scoutfs_read_locked_inode(struct inode *inode)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_item *item;
|
||||
struct scoutfs_key key;
|
||||
|
||||
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
|
||||
|
||||
item = scoutfs_item_lookup(sb, &key);
|
||||
if (IS_ERR(item))
|
||||
return PTR_ERR(item);
|
||||
|
||||
load_inode(inode, item->val);
|
||||
scoutfs_item_put(item);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int scoutfs_iget_test(struct inode *inode, void *arg)
|
||||
{
|
||||
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
|
||||
u64 *ino = arg;
|
||||
|
||||
return ci->ino == *ino;
|
||||
}
|
||||
|
||||
static int scoutfs_iget_set(struct inode *inode, void *arg)
|
||||
{
|
||||
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
|
||||
u64 *ino = arg;
|
||||
|
||||
inode->i_ino = *ino;
|
||||
ci->ino = *ino;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Find or load the inode with the given number.  Freshly hashed
 * inodes are read from their inode item before being unlocked.
 * Returns an ERR_PTR on allocation or read failure.
 */
struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
{
	struct inode *inode;
	int ret;

	/* match on the full 64bit ino, not just the hashed i_ino */
	inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set,
			     &ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	if (inode->i_state & I_NEW) {
		ret = scoutfs_read_locked_inode(inode);
		if (ret) {
			/* unhashes and releases the half-built inode */
			iget_failed(inode);
			inode = ERR_PTR(ret);
		} else {
			set_inode_ops(inode);
			unlock_new_inode(inode);
		}
	}

	return inode;
}
|
||||
|
||||
/*
 * Fill an on-disk little endian inode value from the vfs inode.
 * The inverse of load_inode().
 */
static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
{
	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);

	cinode->size = cpu_to_le64(i_size_read(inode));
	cinode->nlink = cpu_to_le32(inode->i_nlink);
	cinode->uid = cpu_to_le32(i_uid_read(inode));
	cinode->gid = cpu_to_le32(i_gid_read(inode));
	cinode->mode = cpu_to_le32(inode->i_mode);
	cinode->rdev = cpu_to_le32(inode->i_rdev);
	cinode->atime.sec = cpu_to_le64(inode->i_atime.tv_sec);
	cinode->atime.nsec = cpu_to_le32(inode->i_atime.tv_nsec);
	cinode->ctime.sec = cpu_to_le64(inode->i_ctime.tv_sec);
	cinode->ctime.nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
	cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec);
	cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec);

	cinode->salt = cpu_to_le32(ci->salt);
}
|
||||
|
||||
/*
 * Every time we modify the inode in memory we copy it to its inode
 * item.  This lets us write out blocks of items without having to track
 * down dirty vfs inodes and safely copy them into items before writing.
 *
 * Returns 0 or a negative errno if the inode item couldn't be found.
 */
int scoutfs_inode_update(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct scoutfs_item *item;
	struct scoutfs_key key;

	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);

	item = scoutfs_item_lookup(sb, &key);
	if (IS_ERR(item))
		return PTR_ERR(item);

	store_inode(item->val, inode);
	scoutfs_item_put(item);

	return 0;
}
|
||||
|
||||
/*
|
||||
* Allocate and initialize a new inode. The caller is responsible for
|
||||
* creating links to it and updating it. @dir can be null.
|
||||
*/
|
||||
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t rdev)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_inode_info *ci;
|
||||
struct scoutfs_item *item;
|
||||
struct scoutfs_key key;
|
||||
struct inode *inode;
|
||||
|
||||
inode = new_inode(sb);
|
||||
if (!inode)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
ci = SCOUTFS_I(inode);
|
||||
ci->ino = atomic64_inc_return(&sbi->next_ino);
|
||||
get_random_bytes(&ci->salt, sizeof(ci->salt));
|
||||
|
||||
inode->i_ino = ci->ino;
|
||||
inode_init_owner(inode, dir, mode);
|
||||
inode_set_bytes(inode, 0);
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
||||
inode->i_rdev = rdev;
|
||||
set_inode_ops(inode);
|
||||
|
||||
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
|
||||
|
||||
item = scoutfs_item_create(inode->i_sb, &key,
|
||||
sizeof(struct scoutfs_inode));
|
||||
if (IS_ERR(item)) {
|
||||
iput(inode);
|
||||
inode = ERR_CAST(item);
|
||||
}
|
||||
return inode;
|
||||
}
|
||||
|
||||
void scoutfs_inode_exit(void)
|
||||
{
|
||||
if (scoutfs_inode_cachep) {
|
||||
rcu_barrier();
|
||||
kmem_cache_destroy(scoutfs_inode_cachep);
|
||||
scoutfs_inode_cachep = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
int scoutfs_inode_init(void)
|
||||
{
|
||||
scoutfs_inode_cachep = kmem_cache_create("scoutfs_inode_info",
|
||||
sizeof(struct scoutfs_inode_info), 0,
|
||||
SLAB_RECLAIM_ACCOUNT,
|
||||
scoutfs_inode_ctor);
|
||||
if (!scoutfs_inode_cachep)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
32
kmod/src/inode.h
Normal file
32
kmod/src/inode.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef _SCOUTFS_INODE_H_
|
||||
#define _SCOUTFS_INODE_H_
|
||||
|
||||
/* scoutfs per-inode info embedding the vfs inode */
struct scoutfs_inode_info {
	u64 ino;	/* full 64bit inode number (i_ino may truncate) */
	u32 salt;	/* random per-inode salt for dirent name hashing */

	struct inode inode;
};

static inline struct scoutfs_inode_info *SCOUTFS_I(struct inode *inode)
{
	return container_of(inode, struct scoutfs_inode_info, inode);
}

static inline u64 scoutfs_ino(struct inode *inode)
{
	return SCOUTFS_I(inode)->ino;
}
|
||||
|
||||
struct inode *scoutfs_alloc_inode(struct super_block *sb);
|
||||
void scoutfs_destroy_inode(struct inode *inode);
|
||||
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
|
||||
int scoutfs_inode_update(struct inode *inode);
|
||||
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t rdev);
|
||||
|
||||
void scoutfs_inode_exit(void);
|
||||
int scoutfs_inode_init(void);
|
||||
|
||||
#endif
|
||||
423
kmod/src/item.c
Normal file
423
kmod/src/item.c
Normal file
@@ -0,0 +1,423 @@
|
||||
/*
|
||||
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "key.h"
|
||||
#include "item.h"
|
||||
|
||||
/*
|
||||
* describe:
|
||||
* - tracks per-item dirty state for writing
|
||||
* - decouples vfs cache lifetimes from item lifetimes
|
||||
* - item-granular cache for things vfs doesn't cache (readdir, xattr)
|
||||
*
|
||||
* XXX:
|
||||
* - warnings for invalid keys/lens
|
||||
* - memory pressure
|
||||
*/
|
||||
|
||||
/*
 * NOTE(review): these constants appear unused in this file -- the
 * FI_NEXT/FI_PREV enum below is what find_item() actually takes.
 * Candidate for removal; confirm nothing else references them.
 */
enum {
	ITW_NEXT = 1,
	ITW_PREV,
};
|
||||
|
||||
static inline struct scoutfs_item *node_item(struct super_block *sb,
|
||||
struct rb_root *root,
|
||||
struct rb_node *node)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
unsigned long off;
|
||||
|
||||
if (root == &sbi->item_root)
|
||||
off = offsetof(struct scoutfs_item, node);
|
||||
else
|
||||
off = offsetof(struct scoutfs_item, dirty_node);
|
||||
|
||||
return (void *)((char *)node - off);
|
||||
}
|
||||
|
||||
static inline struct rb_node *item_node(struct super_block *sb,
|
||||
struct rb_root *root,
|
||||
struct scoutfs_item *item)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
unsigned long off;
|
||||
|
||||
if (root == &sbi->item_root)
|
||||
off = offsetof(struct scoutfs_item, node);
|
||||
else
|
||||
off = offsetof(struct scoutfs_item, dirty_node);
|
||||
|
||||
return (void *)((char *)item + off);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a new item in the tree. The caller must have done a lookup to
|
||||
* ensure that the key is not already present.
|
||||
*/
|
||||
static void insert_item(struct super_block *sb, struct rb_root *root,
|
||||
struct scoutfs_item *ins)
|
||||
{
|
||||
struct rb_node **node = &root->rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct scoutfs_item *item;
|
||||
int cmp;
|
||||
|
||||
while (*node) {
|
||||
parent = *node;
|
||||
item = node_item(sb, root, *node);
|
||||
|
||||
cmp = scoutfs_key_cmp(&ins->key, &item->key);
|
||||
BUG_ON(cmp == 0);
|
||||
if (cmp < 0)
|
||||
node = &(*node)->rb_left;
|
||||
else
|
||||
node = &(*node)->rb_right;
|
||||
}
|
||||
|
||||
rb_link_node(item_node(sb, root, ins), parent, node);
|
||||
rb_insert_color(item_node(sb, root, ins), root);
|
||||
}
|
||||
|
||||
/* lookup modes for find_item(): 0 means exact match only */
enum {
	FI_NEXT = 1,	/* return smallest item with key >= search key */
	FI_PREV,	/* return largest item with key <= search key */
};

/*
 * Walk the tree looking for an item.
 *
 * If FI_NEXT or FI_PREV is specified then the nearest item in that
 * direction is returned when the exact key isn't found; with np == 0
 * only an exact match is returned.  Returns NULL when nothing
 * qualifies.  Caller holds item_lock.
 */
static struct scoutfs_item *find_item(struct super_block *sb,
				      struct rb_root *root,
				      struct scoutfs_key *key, int np)
{
	struct rb_node *node = root->rb_node;
	struct scoutfs_item *found = NULL;
	struct scoutfs_item *item;
	int cmp;

	while (node) {
		item = node_item(sb, root, node);

		cmp = scoutfs_key_cmp(key, &item->key);
		if (cmp < 0) {
			/* item is greater: remember as a NEXT candidate */
			if (np == FI_NEXT)
				found = item;
			node = node->rb_left;
		} else if (cmp > 0) {
			/* item is smaller: remember as a PREV candidate */
			if (np == FI_PREV)
				found = item;
			node = node->rb_right;
		} else {
			/* exact match wins in every mode */
			found = item;
			break;
		}
	}

	return found;
}
|
||||
|
||||
static struct scoutfs_item *alloc_item(struct scoutfs_key *key,
|
||||
unsigned int val_len)
|
||||
{
|
||||
struct scoutfs_item *item;
|
||||
void *val;
|
||||
|
||||
item = kmalloc(sizeof(struct scoutfs_item), GFP_NOFS);
|
||||
val = kmalloc(val_len, GFP_NOFS);
|
||||
if (!item || !val) {
|
||||
kfree(item);
|
||||
kfree(val);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
RB_CLEAR_NODE(&item->node);
|
||||
RB_CLEAR_NODE(&item->dirty_node);
|
||||
atomic_set(&item->refcount, 1);
|
||||
item->key = *key;
|
||||
item->val_len = val_len;
|
||||
item->val = val;
|
||||
|
||||
return item;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a new item stored at the given key. Return it with a reference.
|
||||
* return an ERR_PTR with ENOMEM or EEXIST.
|
||||
*
|
||||
* The caller is responsible for initializing the item's value.
|
||||
*/
|
||||
struct scoutfs_item *scoutfs_item_create(struct super_block *sb,
|
||||
struct scoutfs_key *key,
|
||||
unsigned int val_len)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_item *existing;
|
||||
struct scoutfs_item *item;
|
||||
unsigned long flags;
|
||||
|
||||
item = alloc_item(key, val_len);
|
||||
if (IS_ERR(item))
|
||||
return item;
|
||||
|
||||
spin_lock_irqsave(&sbi->item_lock, flags);
|
||||
|
||||
existing = find_item(sb, &sbi->item_root, key, 0);
|
||||
if (!existing) {
|
||||
insert_item(sb, &sbi->item_root, item);
|
||||
insert_item(sb, &sbi->dirty_item_root, item);
|
||||
atomic_add(2, &item->refcount);
|
||||
}
|
||||
spin_unlock_irqrestore(&sbi->item_lock, flags);
|
||||
|
||||
if (existing) {
|
||||
scoutfs_item_put(item);
|
||||
item = ERR_PTR(-EEXIST);
|
||||
}
|
||||
|
||||
trace_printk("item %p key "CKF" val_len %d\n", item, CKA(key), val_len);
|
||||
|
||||
return item;
|
||||
}
|
||||
|
||||
/*
 * Remove the item from both the main and dirty trees, dropping each
 * tree's reference.  The caller is still responsible for unlocking and
 * putting its own reference to the item.
 *
 * We don't try and optimize away the lock for items that are already
 * removed from the tree.  The caller's locking and item behaviour means
 * that racing to remove an item is extremely rare.
 *
 * XXX for now we're just removing it from the rbtree.  We'd need to
 * leave behind a deletion record for lsm.
 */
void scoutfs_item_delete(struct super_block *sb, struct scoutfs_item *item)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	unsigned long flags;

	spin_lock_irqsave(&sbi->item_lock, flags);

	/* RB_EMPTY_NODE tells us whether each tree still links the item */
	if (!RB_EMPTY_NODE(&item->dirty_node)) {
		rb_erase(&item->dirty_node, &sbi->dirty_item_root);
		RB_CLEAR_NODE(&item->dirty_node);
		scoutfs_item_put(item);
	}

	if (!RB_EMPTY_NODE(&item->node)) {
		rb_erase(&item->node, &sbi->item_root);
		RB_CLEAR_NODE(&item->node);
		scoutfs_item_put(item);
	}

	spin_unlock_irqrestore(&sbi->item_lock, flags);
}
|
||||
|
||||
static struct scoutfs_item *item_lookup(struct super_block *sb,
|
||||
struct scoutfs_key *key, int np)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_item *item;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&sbi->item_lock, flags);
|
||||
|
||||
item = find_item(sb, &sbi->item_root, key, np);
|
||||
if (item)
|
||||
atomic_inc(&item->refcount);
|
||||
else
|
||||
item = ERR_PTR(-ENOENT);
|
||||
|
||||
spin_unlock_irqrestore(&sbi->item_lock, flags);
|
||||
|
||||
return item;
|
||||
}
|
||||
|
||||
/* Return a referenced item with exactly the given key, or ERR_PTR(-ENOENT). */
struct scoutfs_item *scoutfs_item_lookup(struct super_block *sb,
					 struct scoutfs_key *key)
{
	return item_lookup(sb, key, 0);
}

/* Return a referenced item with the smallest key >= key, or ERR_PTR(-ENOENT). */
struct scoutfs_item *scoutfs_item_next(struct super_block *sb,
				       struct scoutfs_key *key)
{
	return item_lookup(sb, key, FI_NEXT);
}

/* Return a referenced item with the largest key <= key, or ERR_PTR(-ENOENT). */
struct scoutfs_item *scoutfs_item_prev(struct super_block *sb,
				       struct scoutfs_key *key)
{
	return item_lookup(sb, key, FI_PREV);
}
|
||||
|
||||
/*
|
||||
* Expand the item's value by inserting bytes at the given offset. The
|
||||
* new bytes are not initialized.
|
||||
*/
|
||||
int scoutfs_item_expand(struct scoutfs_item *item, int off, int bytes)
|
||||
{
|
||||
void *val;
|
||||
|
||||
/* XXX bytes too big */
|
||||
if (WARN_ON_ONCE(off < 0 || off > item->val_len))
|
||||
return -EINVAL;
|
||||
|
||||
val = kmalloc(item->val_len + bytes, GFP_NOFS);
|
||||
if (!val)
|
||||
return -ENOMEM;
|
||||
|
||||
memcpy(val, item->val, off);
|
||||
memcpy(val + off + bytes, item->val + off, item->val_len - off);
|
||||
|
||||
kfree(item->val);
|
||||
item->val = val;
|
||||
item->val_len += bytes;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Shrink the item's value by removing bytes at the given offset.  The
 * range must be a proper non-empty subset of the value -- removing the
 * entire value is rejected (delete the item instead).  Returns 0,
 * -EINVAL, or -ENOMEM.
 */
int scoutfs_item_shrink(struct scoutfs_item *item, int off, int bytes)
{
	void *val;

	if (WARN_ON_ONCE(off < 0 || off >= item->val_len ||
			 bytes <= 0 || (off + bytes) > item->val_len ||
			 bytes == item->val_len))
		return -EINVAL;

	val = kmalloc(item->val_len - bytes, GFP_NOFS);
	if (!val)
		return -ENOMEM;

	/* copy the head, then the tail past the removed range */
	memcpy(val, item->val, off);
	memcpy(val + off, item->val + off + bytes,
	       item->val_len - (off + bytes));

	kfree(item->val);
	item->val = val;
	item->val_len -= bytes;

	return 0;
}
|
||||
|
||||
void scoutfs_item_mark_dirty(struct super_block *sb, struct scoutfs_item *item)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&sbi->item_lock, flags);
|
||||
|
||||
if (RB_EMPTY_NODE(&item->dirty_node)) {
|
||||
insert_item(sb, &sbi->dirty_item_root, item);
|
||||
atomic_inc(&item->refcount);
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&sbi->item_lock, flags);
|
||||
}
|
||||
|
||||
/*
 * Mark all the dirty items clean by emptying the dirty rbtree.  The
 * caller should be preventing writes from dirtying new items.
 *
 * We erase leaf nodes with no children to minimize rotation overhead
 * during erase.  Dirty items must be in the main rbtree if they're in
 * the dirty rbtree so the puts here shouldn't free the items.
 */
void scoutfs_item_all_clean(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct rb_root *root = &sbi->dirty_item_root;
	struct scoutfs_item *item;
	struct rb_node *node;
	unsigned long flags;

	spin_lock_irqsave(&sbi->item_lock, flags);

	/* descend to a leaf, erase it, and resume from its parent */
	node = sbi->dirty_item_root.rb_node;
	while (node) {
		if (node->rb_left)
			node = node->rb_left;
		else if (node->rb_right)
			node = node->rb_right;
		else {
			item = node_item(sb, root, node);
			/* grab the parent before erase invalidates links */
			node = rb_parent(node);

			trace_printk("item %p key "CKF"\n",
				     item, CKA(&item->key));
			rb_erase(&item->dirty_node, root);
			RB_CLEAR_NODE(&item->dirty_node);
			scoutfs_item_put(item);
		}
	}

	spin_unlock_irqrestore(&sbi->item_lock, flags);
}
|
||||
|
||||
/*
 * If the given item is NULL then the first dirty item is returned.  If
 * an item is given then the dirty item after it is returned.  NULL is
 * returned when there are no more dirty items.
 *
 * The caller is given a reference that it has to put.  The reference on
 * the given item is always dropped, including when NULL is returned --
 * this makes the function convenient for reference-consuming iteration.
 */
struct scoutfs_item *scoutfs_item_next_dirty(struct super_block *sb,
					     struct scoutfs_item *item)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_item *next_item;
	struct rb_node *node;
	unsigned long flags;

	spin_lock_irqsave(&sbi->item_lock, flags);

	if (item)
		node = rb_next(&item->dirty_node);
	else
		node = rb_first(&sbi->dirty_item_root);

	if (node) {
		next_item = node_item(sb, &sbi->dirty_item_root, node);
		atomic_inc(&next_item->refcount);
	} else {
		next_item = NULL;
	}

	spin_unlock_irqrestore(&sbi->item_lock, flags);

	/* drop the caller's reference on the previous item (NULL-safe) */
	scoutfs_item_put(item);

	return next_item;
}
|
||||
|
||||
void scoutfs_item_put(struct scoutfs_item *item)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(item) && atomic_dec_and_test(&item->refcount)) {
|
||||
WARN_ON_ONCE(!RB_EMPTY_NODE(&item->node));
|
||||
WARN_ON_ONCE(!RB_EMPTY_NODE(&item->dirty_node));
|
||||
kfree(item);
|
||||
}
|
||||
}
|
||||
37
kmod/src/item.h
Normal file
37
kmod/src/item.h
Normal file
@@ -0,0 +1,37 @@
|
||||
#ifndef _SCOUTFS_ITEM_H_
#define _SCOUTFS_ITEM_H_

#include "format.h"

/*
 * A cached key/value item.  Items live in the per-sb item rbtree and,
 * while dirty, also in the dirty-item rbtree; each tree holds a
 * reference in addition to callers' references.
 */
struct scoutfs_item {
	struct rb_node node;		/* linkage in sbi->item_root */
	struct rb_node dirty_node;	/* linkage in sbi->dirty_item_root */
	atomic_t refcount;

	/* the key is constant for the life of the item */
	struct scoutfs_key key;

	/* the value can be changed by expansion or shrinking */
	unsigned int val_len;
	void *val;
};

struct scoutfs_item *scoutfs_item_create(struct super_block *sb,
					 struct scoutfs_key *key,
					 unsigned int val_len);
struct scoutfs_item *scoutfs_item_lookup(struct super_block *sb,
					 struct scoutfs_key *key);
struct scoutfs_item *scoutfs_item_next(struct super_block *sb,
				       struct scoutfs_key *key);
struct scoutfs_item *scoutfs_item_prev(struct super_block *sb,
				       struct scoutfs_key *key);
int scoutfs_item_expand(struct scoutfs_item *item, int off, int bytes);
int scoutfs_item_shrink(struct scoutfs_item *item, int off, int bytes);
void scoutfs_item_delete(struct super_block *sb, struct scoutfs_item *item);
void scoutfs_item_mark_dirty(struct super_block *sb, struct scoutfs_item *item);
struct scoutfs_item *scoutfs_item_next_dirty(struct super_block *sb,
					     struct scoutfs_item *item);
void scoutfs_item_all_clean(struct super_block *sb);
void scoutfs_item_put(struct scoutfs_item *item);

#endif
|
||||
43
kmod/src/key.h
Normal file
43
kmod/src/key.h
Normal file
@@ -0,0 +1,43 @@
|
||||
#ifndef _SCOUTFS_KEY_H_
#define _SCOUTFS_KEY_H_

#include <linux/types.h>
#include "format.h"

/* printk format string and matching args for a struct scoutfs_key */
#define CKF "%llu.%u.%llu"
#define CKA(key) \
	le64_to_cpu((key)->inode), (key)->type, le64_to_cpu((key)->offset)

static inline u64 scoutfs_key_inode(struct scoutfs_key *key)
{
	return le64_to_cpu(key->inode);
}

static inline u64 scoutfs_key_offset(struct scoutfs_key *key)
{
	return le64_to_cpu(key->offset);
}

/* three-way compare of little-endian u64s in cpu order */
static inline int le64_cmp(__le64 a, __le64 b)
{
	return le64_to_cpu(a) < le64_to_cpu(b) ? -1 :
	       le64_to_cpu(a) > le64_to_cpu(b) ? 1 : 0;
}

/*
 * Total key order: inode, then type, then offset.  The u8 types are
 * widened to short so the subtraction can't wrap.
 */
static inline int scoutfs_key_cmp(struct scoutfs_key *a, struct scoutfs_key *b)
{
	return le64_cmp(a->inode, b->inode) ?:
	       ((short)a->type - (short)b->type) ?:
	       le64_cmp(a->offset, b->offset);
}


static inline void scoutfs_set_key(struct scoutfs_key *key, u64 inode, u8 type,
				   u64 offset)
{
	key->inode = cpu_to_le64(inode);
	key->type = type;
	key->offset = cpu_to_le64(offset);
}

#endif
|
||||
330
kmod/src/lsm.c
Normal file
330
kmod/src/lsm.c
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/crc32c.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/writeback.h>
|
||||
|
||||
#include "format.h"
|
||||
#include "dir.h"
|
||||
#include "inode.h"
|
||||
#include "key.h"
|
||||
#include "item.h"
|
||||
#include "super.h"
|
||||
#include "lsm.h"
|
||||
|
||||
#define PAGE_CACHE_PAGE_BITS (PAGE_CACHE_SIZE * 8)
|
||||
|
||||
/*
 * Produce a 64-bit hash of the key by concatenating two crc32c values
 * seeded with the two 32-bit halves of the caller's hash key.
 *
 * XXX garbage hack until we have siphash.
 */
static u64 bloom_hash(struct scoutfs_key *key, __le64 *hash_key)
{
	__le32 *salts = (void *)hash_key;

	return ((u64)crc32c(le32_to_cpu(salts[0]), key, sizeof(*key)) << 32) |
	       crc32c(le32_to_cpu(salts[1]), key, sizeof(*key));
}
|
||||
|
||||
/*
 * Set the caller's SCOUTFS_BLOOM_INDEX_NR bloom indices for their item
 * key.  Each 64-bit hash is sliced into SCOUTFS_BLOOM_INDEX_BITS-wide
 * indices; a fresh hash (with the next hash key pair) is computed
 * whenever the previous one is used up.
 */
static void get_bloom_indices(struct super_block *sb,
			      struct scoutfs_key *key, u32 *ind)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	__le64 *hash_key = sbi->bloom_hash_keys;
	u64 hash;
	int h;
	int i;

	for (i = 0; ; ) {
		hash = bloom_hash(key, hash_key);
		hash_key += 2;	/* bloom_hash consumes two 32-bit salts */

		for (h = 0; h < 64 / SCOUTFS_BLOOM_INDEX_BITS; h++) {
			ind[i++] = hash & SCOUTFS_BLOOM_INDEX_MASK;
			if (i == SCOUTFS_BLOOM_INDEX_NR)
				return;

			hash >>= SCOUTFS_BLOOM_INDEX_BITS;
		}
	}
}
|
||||
|
||||
/*
 * State for filling one large on-disk block through a run of locked
 * page cache pages (see copy_to_pages()).
 */
struct pages {
	/* fixed for the group of pages */
	struct address_space *mapping;
	struct page **pages;
	pgoff_t pgoff;

	/* number of pages stored in the pages array */
	int nr;
	/* byte offset of the free space at end of current page */
	int off;
	/* bytes remaining in the overall large block */
	int remaining;
};
|
||||
|
||||
/*
|
||||
* The caller has our fixed-size bloom filter in the locked pages
|
||||
* starting at the given byte offset in the first page. Our job is to
|
||||
* hash the key and set its bits in the bloom filter.
|
||||
*/
|
||||
static void set_bloom_bits(struct super_block *sb, struct page **pages,
|
||||
unsigned int offset, struct scoutfs_key *key)
|
||||
{
|
||||
u32 inds[SCOUTFS_BLOOM_INDEX_NR];
|
||||
struct page *page;
|
||||
int offset_bits = offset * 8;
|
||||
int full_bit;
|
||||
int page_bit;
|
||||
void *addr;
|
||||
int i;
|
||||
|
||||
get_bloom_indices(sb, key, inds);
|
||||
|
||||
for (i = 0; i < SCOUTFS_BLOOM_INDEX_NR; i++) {
|
||||
full_bit = offset_bits + inds[i];
|
||||
page = pages[full_bit / PAGE_CACHE_PAGE_BITS];
|
||||
page_bit = full_bit % PAGE_CACHE_PAGE_BITS;
|
||||
|
||||
addr = kmap_atomic(page);
|
||||
set_bit_le(page_bit, addr);
|
||||
kunmap_atomic(addr);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Copy count bytes into the block's pages, allocating and locking new
 * page cache pages as page boundaries are crossed.  A NULL ptr reserves
 * the space without writing it (used for the header and bloom filter).
 *
 * XXX the zeroing here is unreliable.  We'll want to zero the bloom but
 * not all the pages that are about to be overwritten.  Bleh.
 *
 * Returns the number of bytes copied if there was room.  Returns 0 if
 * there wasn't (and copies nothing).  Returns -errno on a hard failure;
 * note a mid-copy -ENOMEM leaves the block partially advanced.
 */
static int copy_to_pages(struct pages *pgs, void *ptr, size_t count)
{
	struct page *page;
	int ret = count;
	void *addr;
	int bytes;

	/* all-or-nothing space check up front */
	if (count > pgs->remaining)
		return 0;

	while (count) {
		/* off == PAGE_CACHE_SIZE means the current page is full */
		if (pgs->off == PAGE_CACHE_SIZE) {
			page = find_or_create_page(pgs->mapping,
						   pgs->pgoff + pgs->nr,
						   GFP_NOFS | __GFP_ZERO);
			trace_printk("page %p\n", page);
			if (!page) {
				ret = -ENOMEM;
				break;
			}

			pgs->pages[pgs->nr++] = page;
			pgs->off = 0;
		} else {
			page = pgs->pages[pgs->nr - 1];
		}

		/*
		 * NOTE(review): mixed int/size_t operands -- the
		 * kernel's type-strict min() may need min_t() here;
		 * confirm against the target kernel.
		 */
		bytes = min(PAGE_CACHE_SIZE - pgs->off, count);

		trace_printk("page %p off %d ptr %p count %zu bytes %d remaining %d\n",
			     page, pgs->off, ptr, count, bytes, pgs->remaining);

		if (ptr) {
			addr = kmap_atomic(page);
			memcpy(addr + pgs->off, ptr, bytes);
			kunmap_atomic(addr);
			ptr += bytes;
		}
		count -= bytes;
		pgs->off += bytes;
		pgs->remaining -= bytes;
	}

	return ret;
}
|
||||
|
||||
/*
 * Unlock and release the block's pages.  On success (dirty == true)
 * they're marked uptodate and dirty so writeback sends them to disk.
 *
 * Fix: the original called SetPageUptodate() even on the error path
 * (dirty == false), which could expose partially written page contents
 * as valid data to later reads of those disk blocks.  Only promote
 * pages to uptodate when we're also dirtying them.
 */
static void drop_pages(struct pages *pgs, bool dirty)
{
	struct page *page;
	int i;

	if (!pgs->pages)
		return;

	for (i = 0; i < pgs->nr; i++) {
		page = pgs->pages[i];

		if (dirty) {
			SetPageUptodate(page);
			set_page_dirty(page);
		}
		unlock_page(page);
		page_cache_release(page);
	}
}
|
||||
|
||||
/*
|
||||
* Write dirty items from the given item into dirty page cache pages in
|
||||
* the block device at the given large block number.
|
||||
*
|
||||
* All the page cache pages are locked and pinned while they're being
|
||||
* dirtied. The intent is to have a single large IO leave once they're
|
||||
* all ready. This is an easy way to do that while maintaining
|
||||
* consistency with the block device page cache. But it might not work :).
|
||||
*
|
||||
* We do one sweep over the items. The item's aren't indexed. We might
|
||||
* want to change that.
|
||||
*
|
||||
* Even though we're doing one sweep over the items we're holding the
|
||||
* bloom filter and header pinned until the items are done. If we didn't
|
||||
* mind the risk of the blocks going out of order we wouldn't need the
|
||||
* allocated array of page pointers.
|
||||
*/
|
||||
static struct scoutfs_item *dirty_block_pages(struct super_block *sb,
|
||||
struct scoutfs_item *item, u64 blkno)
|
||||
{
|
||||
struct scoutfs_item_header ihdr;
|
||||
struct scoutfs_lsm_block lblk;
|
||||
struct pages pgs;
|
||||
void *addr;
|
||||
int ret;
|
||||
|
||||
/* assuming header starts page, and pgoff shift calculation */
|
||||
BUILD_BUG_ON(SCOUTFS_BLOCK_SHIFT < PAGE_CACHE_SHIFT);
|
||||
|
||||
if (WARN_ON_ONCE(!item))
|
||||
return item;
|
||||
|
||||
/* XXX not super thrilled with this allocation */
|
||||
pgs.pages = kmalloc_array(SCOUTFS_BLOCK_SIZE / PAGE_CACHE_SIZE,
|
||||
sizeof(struct page *), GFP_NOFS);
|
||||
if (!pgs.pages) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pgs.mapping = sb->s_bdev->bd_inode->i_mapping;
|
||||
pgs.pgoff = blkno >> (SCOUTFS_BLOCK_SHIFT - PAGE_CACHE_SHIFT);
|
||||
pgs.nr = 0;
|
||||
pgs.off = PAGE_CACHE_SIZE,
|
||||
pgs.remaining = SCOUTFS_BLOCK_SIZE;
|
||||
|
||||
/* reserve space at the start of the block for header and bloom */
|
||||
ret = copy_to_pages(&pgs, NULL, sizeof(lblk));
|
||||
if (ret > 0)
|
||||
ret = copy_to_pages(&pgs, NULL, SCOUTFS_BLOOM_FILTER_BYTES);
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
|
||||
lblk.first = item->key;
|
||||
lblk.nr_items = 0;
|
||||
do {
|
||||
trace_printk("item %p key "CKF"\n", item, CKA(&item->key));
|
||||
|
||||
ihdr.key = item->key;
|
||||
ihdr.val_len = cpu_to_le16(item->val_len);
|
||||
ret = copy_to_pages(&pgs, &ihdr, sizeof(ihdr));
|
||||
if (ret > 0)
|
||||
ret = copy_to_pages(&pgs, item->val, item->val_len);
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
|
||||
lblk.last = item->key;
|
||||
le32_add_cpu(&lblk.nr_items, 1);
|
||||
|
||||
/* set each item's bloom bits */
|
||||
set_bloom_bits(sb, pgs.pages, sizeof(lblk), &item->key);
|
||||
|
||||
item = scoutfs_item_next_dirty(sb, item);
|
||||
} while (item);
|
||||
|
||||
/* copy the filled in header to the start of the block */
|
||||
addr = kmap_atomic(pgs.pages[0]);
|
||||
memcpy(addr, &lblk, sizeof(lblk));
|
||||
kunmap_atomic(addr);
|
||||
|
||||
out:
|
||||
/* dirty if no error (null ok!), unlock, and release */
|
||||
drop_pages(&pgs, !IS_ERR(item));
|
||||
kfree(pgs.pages);
|
||||
if (ret < 0) {
|
||||
scoutfs_item_put(item);
|
||||
item = ERR_PTR(ret);
|
||||
}
|
||||
return item;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sync dirty data by writing all the dirty items into a series of level
|
||||
* 0 blocks.
|
||||
*
|
||||
* This is an initial first pass, the full method will need to:
|
||||
* - wait for pending writers
|
||||
* - block future writers
|
||||
* - update our manifest regardless of server communication
|
||||
* - communicate blocks and key ranges to server
|
||||
* - ensure that racing sync/dirty don't livelock
|
||||
*/
|
||||
int scoutfs_sync_fs(struct super_block *sb, int wait)
|
||||
{
|
||||
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_item *item;
|
||||
u64 blknos[16]; /* XXX */
|
||||
u64 blkno;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
item = scoutfs_item_next_dirty(sb, NULL);
|
||||
if (!item)
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(blknos); i++) {
|
||||
blkno = atomic64_inc_return(&sbi->next_blkno);
|
||||
|
||||
item = dirty_block_pages(sb, item, blkno);
|
||||
if (IS_ERR(item)) {
|
||||
ret = PTR_ERR(item);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* start each block's IO */
|
||||
ret = filemap_flush(mapping);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (!item)
|
||||
break;
|
||||
}
|
||||
/* dirty items should have been limited */
|
||||
WARN_ON_ONCE(i >= ARRAY_SIZE(blknos));
|
||||
|
||||
/* then wait for all block IO to finish */
|
||||
if (wait) {
|
||||
ret = filemap_write_and_wait(mapping);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* mark everything clean */
|
||||
scoutfs_item_all_clean(sb);
|
||||
ret = 0;
|
||||
out:
|
||||
trace_printk("ret %d\n", ret);
|
||||
WARN_ON_ONCE(ret);
|
||||
return ret;
|
||||
}
|
||||
6
kmod/src/lsm.h
Normal file
6
kmod/src/lsm.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef _SCOUTFS_LSM_H_
#define _SCOUTFS_LSM_H_

/* super_operations ->sync_fs: write dirty items into level 0 blocks */
int scoutfs_sync_fs(struct super_block *sb, int wait);

#endif
|
||||
52
kmod/src/mkfs.c
Normal file
52
kmod/src/mkfs.c
Normal file
@@ -0,0 +1,52 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "item.h"
|
||||
#include "key.h"
|
||||
#include "mkfs.h"
|
||||
|
||||
/*
|
||||
* For now a file system system only exists in the item cache for the
|
||||
* duration of the mount. This "mkfs" hack creates a root dir inode in
|
||||
* the item cache on mount so that we can run tests in memory and not
|
||||
* worry about user space or persistent storage.
|
||||
*/
|
||||
int scoutfs_mkfs(struct super_block *sb)
|
||||
{
|
||||
const struct timespec ts = current_kernel_time();
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_inode *cinode;
|
||||
struct scoutfs_item *item;
|
||||
struct scoutfs_key key;
|
||||
int i;
|
||||
|
||||
atomic64_set(&sbi->next_ino, SCOUTFS_ROOT_INO + 1);
|
||||
atomic64_set(&sbi->next_blkno, 2);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(sbi->bloom_hash_keys); i++) {
|
||||
get_random_bytes(&sbi->bloom_hash_keys[i],
|
||||
sizeof(sbi->bloom_hash_keys[i]));
|
||||
}
|
||||
|
||||
scoutfs_set_key(&key, SCOUTFS_ROOT_INO, SCOUTFS_INODE_KEY, 0);
|
||||
|
||||
item = scoutfs_item_create(sb, &key, sizeof(struct scoutfs_inode));
|
||||
if (IS_ERR(item))
|
||||
return PTR_ERR(item);
|
||||
|
||||
cinode = item->val;
|
||||
memset(cinode, 0, sizeof(struct scoutfs_inode));
|
||||
cinode->nlink = cpu_to_le32(2);
|
||||
cinode->mode = cpu_to_le32(S_IFDIR | 0755);
|
||||
cinode->atime.sec = cpu_to_le64(ts.tv_sec);
|
||||
cinode->atime.nsec = cpu_to_le32(ts.tv_nsec);
|
||||
cinode->ctime = cinode->atime;
|
||||
cinode->mtime = cinode->atime;
|
||||
get_random_bytes(&cinode->salt, sizeof(cinode->salt));
|
||||
|
||||
scoutfs_item_put(item);
|
||||
return 0;
|
||||
}
|
||||
6
kmod/src/mkfs.h
Normal file
6
kmod/src/mkfs.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef _SCOUTFS_MKFS_H_
#define _SCOUTFS_MKFS_H_

/* seed the in-memory item cache with a root dir inode at mount time */
int scoutfs_mkfs(struct super_block *sb);

#endif
|
||||
103
kmod/src/super.c
Normal file
103
kmod/src/super.c
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/magic.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "format.h"
|
||||
#include "mkfs.h"
|
||||
#include "inode.h"
|
||||
#include "dir.h"
|
||||
#include "lsm.h"
|
||||
|
||||
/* vfs super_block operations implemented so far */
static const struct super_operations scoutfs_super_ops = {
	.alloc_inode = scoutfs_alloc_inode,
	.destroy_inode = scoutfs_destroy_inode,
	.sync_fs = scoutfs_sync_fs,
};
|
||||
|
||||
/*
 * Set up a mounting super block: allocate per-sb info, "mkfs" the
 * in-memory item cache, and instantiate the root dentry.
 *
 * Error returns rely on mount_bdev() tearing the sb down through
 * scoutfs_kill_sb(), which frees s_fs_info.
 *
 * NOTE(review): nothing frees items left in sbi->item_root at unmount
 * or on these error paths -- confirm whether that leak is acceptable
 * for this prototype.
 */
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct scoutfs_sb_info *sbi;
	struct inode *inode;
	int ret;

	sb->s_magic = SCOUTFS_SUPER_MAGIC;
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_op = &scoutfs_super_ops;

	sbi = kzalloc(sizeof(struct scoutfs_sb_info), GFP_KERNEL);
	sb->s_fs_info = sbi;
	if (!sbi)
		return -ENOMEM;

	spin_lock_init(&sbi->item_lock);
	sbi->item_root = RB_ROOT;
	sbi->dirty_item_root = RB_ROOT;

	ret = scoutfs_mkfs(sb);
	if (ret)
		return ret;

	inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	/* d_make_root() drops the inode itself on failure */
	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		return -ENOMEM;

	return 0;
}
|
||||
|
||||
/* file_system_type ->mount: standard block-device mount */
static struct dentry *scoutfs_mount(struct file_system_type *fs_type, int flags,
				    const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, scoutfs_fill_super);
}

/* tear down the block super, then free our per-sb info */
static void scoutfs_kill_sb(struct super_block *sb)
{
	kill_block_super(sb);
	kfree(sb->s_fs_info);
}

static struct file_system_type scoutfs_fs_type = {
	.owner = THIS_MODULE,
	.name = "scoutfs",
	.mount = scoutfs_mount,
	.kill_sb = scoutfs_kill_sb,
	.fs_flags = FS_REQUIRES_DEV,
};

/*
 * Module init: set up the inode and dir caches, then register the fs.
 * The ?: chain stops at the first nonzero (error) return.
 * NOTE(review): a failure partway through leaves the earlier caches
 * allocated until module unload -- confirm that's intended.
 */
static int __init scoutfs_module_init(void)
{
	return scoutfs_inode_init() ?:
	       scoutfs_dir_init() ?:
	       register_filesystem(&scoutfs_fs_type);
}
module_init(scoutfs_module_init)

/* unregister first so no new mounts race the cache teardown */
static void __exit scoutfs_module_exit(void)
{
	unregister_filesystem(&scoutfs_fs_type);
	scoutfs_dir_exit();
	scoutfs_inode_exit();
}
module_exit(scoutfs_module_exit)
|
||||
|
||||
MODULE_AUTHOR("Zach Brown <zab@versity.com>");
|
||||
MODULE_LICENSE("GPL");
|
||||
22
kmod/src/super.h
Normal file
22
kmod/src/super.h
Normal file
@@ -0,0 +1,22 @@
|
||||
#ifndef _SCOUTFS_SUPER_H_
#define _SCOUTFS_SUPER_H_

#include <linux/rbtree.h>

/* per-mount state hung off sb->s_fs_info */
struct scoutfs_sb_info {
	atomic64_t next_ino;	/* next inode number to hand out */
	atomic64_t next_blkno;	/* next large block number to write */

	/* salts for the bloom filter hashes (see get_bloom_indices) */
	__le64 bloom_hash_keys[6]; /* XXX */

	/* protects both item rbtrees and item tree membership */
	spinlock_t item_lock;
	struct rb_root item_root;
	struct rb_root dirty_item_root;
};

static inline struct scoutfs_sb_info *SCOUTFS_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

#endif
|
||||
Reference in New Issue
Block a user