Initial commit

This is the initial commit of the repo that will track development
against distro kernels.

This is an import of a prototype branch from the upstream kernel that
only had a few initial commits.  It needed to move to the old readdir
interface and to use find_or_create_page() instead of
pagecache_get_page() to build against older distro kernels.
commit 25a1e8d1b7
Author: Zach Brown
Date: 2016-02-05 13:15:41 -08:00
18 changed files with 2033 additions and 0 deletions
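
For context on the pagecache change: on kernels that have it, pagecache_get_page() with FGP_LOCK | FGP_ACCESSED | FGP_CREAT is roughly equivalent to the older find_or_create_page(), so a small compat wrapper can straddle both interfaces. A sketch, where HAVE_PAGECACHE_GET_PAGE is a hypothetical build flag that this commit does not define:

#include <linux/pagemap.h>

/* hedged compat sketch; the code in this commit simply calls find_or_create_page() */
static inline struct page *
compat_find_or_create_page(struct address_space *mapping, pgoff_t index,
                           gfp_t gfp)
{
#ifdef HAVE_PAGECACHE_GET_PAGE
        return pagecache_get_page(mapping, index,
                                  FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
#else
        return find_or_create_page(mapping, index, gfp);
#endif
}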

kmod/.gitignore (vendored, new file, 7 lines)

@@ -0,0 +1,7 @@
src/*.o
src/*.ko
src/*.mod.c
src/*.cmd
src/.tmp_versions/
src/Module.symvers
src/modules.order

kmod/Makefile (new file, 4 lines)

@@ -0,0 +1,4 @@
ALL: module

module:
	make CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(PWD)/src
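
SK_KSRC isn't documented here; from its use above it should point at the build tree of the target distro kernel, conventionally something like /lib/modules/$(uname -r)/build (an assumption, the commit doesn't say).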

kmod/src/Kconfig (new file, 10 lines)

@@ -0,0 +1,10 @@
config SCOUTFS_FS
	tristate "scoutfs filesystem"
	help
	  scoutfs is a clustered file system that stores data in large
	  blocks in shared block storage.

	  To compile this file system support as a module, choose M here. The
	  module will be called scoutfs.

	  If unsure, say N.

kmod/src/Makefile (new file, 3 lines)

@@ -0,0 +1,3 @@
obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o
scoutfs-y += dir.o inode.o item.o lsm.o mkfs.o super.o

kmod/src/dir.c (new file, 551 lines)

@@ -0,0 +1,551 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/uio.h>
#include "format.h"
#include "dir.h"
#include "inode.h"
#include "key.h"
#include "item.h"
#include "super.h"
/*
* Directory entries are stored in items whose offset is determined by
* the hash of the entry's name. This was primarily chosen to minimize
* the amount of data stored for each entry.
*
* Because we're hashing the name we need to worry about collisions. We
* store all the entries with the same hash value in the item. This was
* done so that create works with one specific item.
*
* readdir iterates over these items in hash order. The high bits of
* the entry's readdir f_pos come from the item offset while the low
* bits come from a collision number in the entry.
*
* The full readdir position, and thus the absolute max number of
* entries in a directory, is limited to 2^31 to avoid the risk of
* breaking legacy environments. Even with a relatively small 27-bit
* item offset, allowing 16 colliding entries gets well into hundreds of
* millions of entries before an item fills up and we return a premature
* ENOSPC. Hundreds of millions in a single dir ought to be, wait for
* it, good enough for anybody.
*
* Each item's contents are protected by the dir inode's i_mutex that
* callers acquire before calling our dir operations. If we wanted more
* fine-grained concurrency, and we might, we'd have to be careful to
* manage the shared items.
*/
static unsigned int mode_to_type(umode_t mode)
{
#define S_SHIFT 12
static unsigned char mode_types[S_IFMT >> S_SHIFT] = {
[S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO,
[S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR,
[S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR,
[S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK,
[S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG,
[S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK,
[S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK,
};
return mode_types[(mode & S_IFMT) >> S_SHIFT];
#undef S_SHIFT
}
#if 0
static unsigned int dentry_type(unsigned int type)
{
static unsigned char types[] = {
[SCOUTFS_DT_FIFO] = DT_FIFO,
[SCOUTFS_DT_CHR] = DT_CHR,
[SCOUTFS_DT_DIR] = DT_DIR,
[SCOUTFS_DT_BLK] = DT_BLK,
[SCOUTFS_DT_REG] = DT_REG,
[SCOUTFS_DT_LNK] = DT_LNK,
[SCOUTFS_DT_SOCK] = DT_SOCK,
[SCOUTFS_DT_WHT] = DT_WHT,
};
if (type < ARRAY_SIZE(types))
return types[type];
return DT_UNKNOWN;
}
#endif
static int names_equal(const char *name_a, int len_a, const char *name_b,
int len_b)
{
return (len_a == len_b) && !memcmp(name_a, name_b, len_a);
}
/*
* Return the offset portion of a dirent key from the hash of the name.
*
* XXX This crc nonsense is a quick hack. We'll want something a
* lot stronger like siphash.
*/
static u32 name_hash(struct inode *dir, const char *name, unsigned int len)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(dir);
return crc32c(ci->salt, name, len) >> (32 - SCOUTFS_DIRENT_OFF_BITS);
}
static unsigned int dent_bytes(unsigned int name_len)
{
return sizeof(struct scoutfs_dirent) + name_len;
}
static unsigned int dent_val_off(struct scoutfs_item *item,
struct scoutfs_dirent *dent)
{
return (char *)dent - (char *)item->val;
}
static inline struct scoutfs_dirent *next_dent(struct scoutfs_item *item,
struct scoutfs_dirent *dent)
{
unsigned int next_off;
next_off = dent_val_off(item, dent) + dent_bytes(dent->name_len);
if (next_off == item->val_len)
return NULL;
return item->val + next_off;
}
#define for_each_item_dent(item, dent) \
for (dent = item->val; dent; dent = next_dent(item, dent))
struct dentry_info {
/*
* The key offset and collision nr are stored so that we don't
* have to either hash the name to find the item or compare
* names to find the dirent in the item.
*/
u32 key_offset;
u8 coll_nr;
};
static struct kmem_cache *scoutfs_dentry_cachep;
static struct dentry_info *alloc_dentry_info(struct dentry *dentry)
{
struct dentry_info *di;
/* XXX read mb? */
if (dentry->d_fsdata)
return dentry->d_fsdata;
di = kmem_cache_zalloc(scoutfs_dentry_cachep, GFP_NOFS);
if (!di)
return ERR_PTR(-ENOMEM);
spin_lock(&dentry->d_lock);
if (!dentry->d_fsdata)
dentry->d_fsdata = di;
spin_unlock(&dentry->d_lock);
if (di != dentry->d_fsdata)
kmem_cache_free(scoutfs_dentry_cachep, di);
return dentry->d_fsdata;
}
/*
* Lookup searches for an entry for the given name amongst the entries
* stored in the item at the name's hash.
*/
static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct super_block *sb = dir->i_sb;
struct scoutfs_dirent *dent;
struct scoutfs_item *item;
struct dentry_info *di;
struct scoutfs_key key;
struct inode *inode;
u64 ino = 0;
u32 h = 0;
u32 nr = 0;
int ret;
di = alloc_dentry_info(dentry);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
}
if (dentry->d_name.len > SCOUTFS_NAME_LEN) {
ret = -ENAMETOOLONG;
goto out;
}
h = name_hash(dir, dentry->d_name.name, dentry->d_name.len);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
item = scoutfs_item_lookup(sb, &key);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
ret = -ENOENT;
for_each_item_dent(item, dent) {
if (names_equal(dentry->d_name.name, dentry->d_name.len,
dent->name, dent->name_len)) {
ino = le64_to_cpu(dent->ino);
nr = dent->coll_nr;
ret = 0;
break;
}
}
scoutfs_item_put(item);
out:
if (ret == -ENOENT) {
inode = NULL;
} else if (ret) {
inode = ERR_PTR(ret);
} else {
di->key_offset = h;
di->coll_nr = nr;
inode = scoutfs_iget(sb, ino);
}
return d_splice_alias(inode, dentry);
}
/* this exists upstream so we can just delete it in a forward port */
static int dir_emit_dots(struct file *file, void *dirent, filldir_t filldir)
{
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct inode *parent = dentry->d_parent->d_inode;
if (file->f_pos == 0) {
if (!filldir(dirent, ".", 1, 1, scoutfs_ino(inode), DT_DIR))
return 0;
file->f_pos = 1;
}
if (file->f_pos == 1) {
if (!filldir(dirent, "..", 2, 1, scoutfs_ino(parent), DT_DIR))
return 0;
file->f_pos = 2;
}
return 1;
}
/*
* readdir finds the next entry at or past the hash|coll_nr stored in
* f_pos (ctx->pos in the upstream readdir interface).
*
* It will need to be careful not to read past the region of the dirent
* hash offset keys that it has access to.
*/
static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct scoutfs_dirent *dent;
struct scoutfs_key last_key;
struct scoutfs_item *item;
struct scoutfs_key key;
u32 nr;
u32 off;
u64 pos;
int ret = 0;
if (!dir_emit_dots(file, dirent, filldir))
return 0;
scoutfs_set_key(&last_key, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
SCOUTFS_DIRENT_OFF_MASK);
do {
off = file->f_pos >> SCOUTFS_DIRENT_COLL_BITS;
nr = file->f_pos & SCOUTFS_DIRENT_COLL_MASK;
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
off);
item = scoutfs_item_next(sb, &key);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
if (ret == -ENOENT)
ret = 0;
break;
}
if (scoutfs_key_cmp(&item->key, &last_key) > 0) {
scoutfs_item_put(item);
break;
}
/* reset nr to 0 if we found the next item */
if (scoutfs_key_offset(&item->key) != off)
nr = 0;
pos = scoutfs_key_offset(&item->key)
<< SCOUTFS_DIRENT_COLL_BITS;
for_each_item_dent(item, dent) {
if (dent->coll_nr < nr)
continue;
if (!filldir(dirent, dent->name, dent->name_len, pos,
le64_to_cpu(dent->ino), dent->type))
break;
file->f_pos = (pos | dent->coll_nr) + 1;
}
scoutfs_item_put(item);
/* advance to the next hash value if we finished the item */
if (dent == NULL)
file->f_pos = pos + (1 << SCOUTFS_DIRENT_COLL_BITS);
} while (dent == NULL);
return ret;
}
static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = NULL;
struct scoutfs_dirent *dent;
struct scoutfs_item *item;
struct dentry_info *di;
struct scoutfs_key key;
int bytes;
int ret;
int off;
u64 nr;
u64 h;
di = alloc_dentry_info(dentry);
if (IS_ERR(di))
return PTR_ERR(di);
if (dentry->d_name.len > SCOUTFS_NAME_LEN)
return -ENAMETOOLONG;
inode = scoutfs_new_inode(sb, dir, mode, rdev);
if (IS_ERR(inode))
return PTR_ERR(inode);
h = name_hash(dir, dentry->d_name.name, dentry->d_name.len);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
bytes = dent_bytes(dentry->d_name.len);
item = scoutfs_item_lookup(sb, &key);
if (item == ERR_PTR(-ENOENT)) {
item = scoutfs_item_create(sb, &key, bytes);
if (!IS_ERR(item)) {
/* mark a newly created item */
dent = item->val;
dent->name_len = 0;
}
}
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
ret = 0;
nr = 0;
for_each_item_dent(item, dent) {
/* the common case of a newly created item */
if (!dent->name_len)
break;
/* XXX check for eexist? can't happen? */
/* found a free coll nr, insert here */
if (nr < dent->coll_nr) {
off = dent_val_off(item, dent);
ret = scoutfs_item_expand(item, off, bytes);
if (!ret)
dent = item->val + off;
break;
}
/* the item's full */
if (nr++ == SCOUTFS_DIRENT_COLL_MASK) {
ret = -ENOSPC;
break;
}
}
if (!ret) {
dent->ino = cpu_to_le64(scoutfs_ino(inode));
dent->type = mode_to_type(inode->i_mode);
dent->coll_nr = nr;
dent->name_len = dentry->d_name.len;
memcpy(dent->name, dentry->d_name.name, dent->name_len);
di->key_offset = h;
di->coll_nr = nr;
}
scoutfs_item_put(item);
if (ret)
goto out;
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
if (S_ISDIR(mode)) {
inc_nlink(inode);
inc_nlink(dir);
}
mark_inode_dirty(inode);
mark_inode_dirty(dir);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
out:
/* XXX delete the inode item here */
if (ret && !IS_ERR_OR_NULL(inode))
iput(inode);
return ret;
}
/* XXX hmm, do something with excl? */
static int scoutfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
return scoutfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
static int scoutfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return scoutfs_mknod(dir, dentry, mode | S_IFDIR, 0);
}
/*
* Unlink removes the entry from its item and removes the item if ours
* was the only remaining entry.
*/
static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct timespec ts = current_kernel_time();
struct scoutfs_dirent *dent;
struct scoutfs_item *item;
struct dentry_info *di;
struct scoutfs_key key;
int ret = 0;
if (WARN_ON_ONCE(!dentry->d_fsdata))
return -EINVAL;
di = dentry->d_fsdata;
trace_printk("dir size %llu entry k_off nr %u %u\n",
i_size_read(inode), di->key_offset, di->coll_nr);
if (S_ISDIR(inode->i_mode) && i_size_read(inode))
return -ENOTEMPTY;
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY,
di->key_offset);
item = scoutfs_item_lookup(sb, &key);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
/* XXX error to not find the coll nr we were looking for? */
for_each_item_dent(item, dent) {
if (dent->coll_nr != di->coll_nr)
continue;
/* XXX compare names and eio? */
if (item->val_len == dent_bytes(dent->name_len)) {
scoutfs_item_delete(sb, item);
ret = 0;
} else {
ret = scoutfs_item_shrink(item,
dent_val_off(item, dent),
dent_bytes(dent->name_len));
}
dent = NULL;
break;
}
scoutfs_item_put(item);
if (ret)
goto out;
dir->i_ctime = ts;
dir->i_mtime = ts;
i_size_write(dir, i_size_read(dir) - dentry->d_name.len);
inode->i_ctime = ts;
drop_nlink(inode);
if (S_ISDIR(inode->i_mode)) {
drop_nlink(dir);
drop_nlink(inode);
}
mark_inode_dirty(inode);
mark_inode_dirty(dir);
out:
return ret;
}
const struct file_operations scoutfs_dir_fops = {
.readdir = scoutfs_readdir,
};
const struct inode_operations scoutfs_dir_iops = {
.lookup = scoutfs_lookup,
.mknod = scoutfs_mknod,
.create = scoutfs_create,
.mkdir = scoutfs_mkdir,
.unlink = scoutfs_unlink,
.rmdir = scoutfs_unlink,
};
void scoutfs_dir_exit(void)
{
if (scoutfs_dentry_cachep) {
kmem_cache_destroy(scoutfs_dentry_cachep);
scoutfs_dentry_cachep = NULL;
}
}
int scoutfs_dir_init(void)
{
scoutfs_dentry_cachep = kmem_cache_create("scoutfs_dentry_info",
sizeof(struct dentry_info), 0,
SLAB_RECLAIM_ACCOUNT, NULL);
if (!scoutfs_dentry_cachep)
return -ENOMEM;
return 0;
}
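
To make the f_pos packing from the comment at the top of dir.c concrete, here is an illustrative sketch (not code from the commit) using the format.h constants; with 27 offset bits and 4 collision bits the largest encodable position is 2^31 - 1:

/* illustrative only: readdir positions pack a hash offset over a collision nr */
static inline u64 dirent_pos(u32 hash_off, u8 coll_nr)
{
        return ((u64)hash_off << SCOUTFS_DIRENT_COLL_BITS) | coll_nr;
}

static inline u32 dirent_pos_off(u64 pos)
{
        return pos >> SCOUTFS_DIRENT_COLL_BITS;
}

static inline u8 dirent_pos_nr(u64 pos)
{
        return pos & SCOUTFS_DIRENT_COLL_MASK;
}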

kmod/src/dir.h (new file, 10 lines)

@@ -0,0 +1,10 @@
#ifndef _SCOUTFS_DIR_H_
#define _SCOUTFS_DIR_H_
extern const struct file_operations scoutfs_dir_fops;
extern const struct inode_operations scoutfs_dir_iops;
int scoutfs_dir_init(void);
void scoutfs_dir_exit(void);
#endif

kmod/src/format.h (new file, 122 lines)

@@ -0,0 +1,122 @@
#ifndef _SCOUTFS_FORMAT_H_
#define _SCOUTFS_FORMAT_H_
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */
#define SCOUTFS_BLOCK_SHIFT 22
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
/*
* This bloom size is chosen to have a roughly 1% false positive rate
* for ~90k items, which is about the worst case for a block full of
* dirents with reasonably small names. Pathologically smaller items
* could be even more dense.
*/
#define SCOUTFS_BLOOM_FILTER_BYTES (128 * 1024)
#define SCOUTFS_BLOOM_FILTER_BITS (SCOUTFS_BLOOM_FILTER_BYTES * 8)
#define SCOUTFS_BLOOM_INDEX_BITS (ilog2(SCOUTFS_BLOOM_FILTER_BITS))
#define SCOUTFS_BLOOM_INDEX_MASK ((1 << SCOUTFS_BLOOM_INDEX_BITS) - 1)
#define SCOUTFS_BLOOM_INDEX_NR 7
/*
* We should be able to make the offset smaller if neither dirents nor
* data items use the full 64 bits.
*/
struct scoutfs_key {
__le64 inode;
u8 type;
__le64 offset;
} __packed;
#define SCOUTFS_INODE_KEY 128
#define SCOUTFS_DIRENT_KEY 192
struct scoutfs_lsm_block {
struct scoutfs_key first;
struct scoutfs_key last;
__le32 nr_items;
/* u8 bloom[SCOUTFS_BLOOM_BYTES]; */
/* struct scoutfs_item_header items[0] .. */
} __packed;
struct scoutfs_item_header {
struct scoutfs_key key;
__le16 val_len;
} __packed;
struct scoutfs_timespec {
__le64 sec;
__le32 nsec;
} __packed;
/*
* XXX
* - otime?
* - compat flags?
* - version?
* - generation?
* - be more careful with rdev?
*/
struct scoutfs_inode {
__le64 size;
__le64 blocks;
__le32 nlink;
__le32 uid;
__le32 gid;
__le32 mode;
__le32 rdev;
__le32 salt;
struct scoutfs_timespec atime;
struct scoutfs_timespec ctime;
struct scoutfs_timespec mtime;
} __packed;
#define SCOUTFS_ROOT_INO 1
/*
* Dirents are stored in items with an offset of the hash of their name.
* Colliding names are packed into the value.
*/
struct scoutfs_dirent {
__le64 ino;
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 type:4,
coll_nr:4;
#else
__u8 coll_nr:4,
type:4;
#endif
__u8 name_len;
__u8 name[0];
} __packed;
#define SCOUTFS_NAME_LEN 255
/*
* We only use 31 bits for readdir positions so that we don't confuse
* old signed 32-bit f_pos applications or those on the other side of
* network protocols that have limited readdir positions.
*/
#define SCOUTFS_DIRENT_OFF_BITS 27
#define SCOUTFS_DIRENT_OFF_MASK ((1 << SCOUTFS_DIRENT_OFF_BITS) - 1)
#define SCOUTFS_DIRENT_COLL_BITS 4
#define SCOUTFS_DIRENT_COLL_MASK ((1 << SCOUTFS_DIRENT_COLL_BITS) - 1)
/* getdents returns the *next* pos with each entry, so we can't return ~0 */
#define SCOUTFS_DIRENT_MAX_POS \
(((1 << (SCOUTFS_DIRENT_OFF_BITS + SCOUTFS_DIRENT_COLL_BITS)) - 1) - 1)
enum {
SCOUTFS_DT_FIFO = 0,
SCOUTFS_DT_CHR,
SCOUTFS_DT_DIR,
SCOUTFS_DT_BLK,
SCOUTFS_DT_REG,
SCOUTFS_DT_LNK,
SCOUTFS_DT_SOCK,
SCOUTFS_DT_WHT,
};
#endif
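
The bloom sizing comment in this header can be sanity-checked with the standard estimate. With m = SCOUTFS_BLOOM_FILTER_BITS = 1,048,576 bits, k = SCOUTFS_BLOOM_INDEX_NR = 7 probes, and n = 90,000 items:

p ≈ (1 - e^(-k*n/m))^k = (1 - e^(-0.60))^7 ≈ 0.45^7 ≈ 0.4%

which is comfortably under the 1% false positive rate the comment aims for.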

kmod/src/inode.c (new file, 272 lines)

@@ -0,0 +1,272 @@
/*
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/random.h>
#include "format.h"
#include "super.h"
#include "key.h"
#include "inode.h"
#include "item.h"
#include "dir.h"
/*
* XXX
* - worry about i_ino truncation, not sure if we do anything
* - use inode item value lengths for forward/back compat
*/
static struct kmem_cache *scoutfs_inode_cachep;
static void scoutfs_inode_ctor(void *obj)
{
struct scoutfs_inode_info *ci = obj;
inode_init_once(&ci->inode);
}
struct inode *scoutfs_alloc_inode(struct super_block *sb)
{
struct scoutfs_inode_info *ci;
ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
return &ci->inode;
}
static void scoutfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
trace_printk("freeing inode %p\n", inode);
kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
}
void scoutfs_destroy_inode(struct inode *inode)
{
call_rcu(&inode->i_rcu, scoutfs_i_callback);
}
/*
* Called once new inode allocation or inode reading has initialized
* enough of the inode for us to set the ops based on the mode.
*/
static void set_inode_ops(struct inode *inode)
{
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
// inode->i_mapping->a_ops = &scoutfs_file_aops;
// inode->i_op = &scoutfs_file_iops;
// inode->i_fop = &scoutfs_file_fops;
break;
case S_IFDIR:
inode->i_op = &scoutfs_dir_iops;
inode->i_fop = &scoutfs_dir_fops;
break;
case S_IFLNK:
// inode->i_op = &scoutfs_symlink_iops;
break;
default:
// inode->i_op = &scoutfs_special_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
break;
}
}
static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
i_size_write(inode, le64_to_cpu(cinode->size));
set_nlink(inode, le32_to_cpu(cinode->nlink));
i_uid_write(inode, le32_to_cpu(cinode->uid));
i_gid_write(inode, le32_to_cpu(cinode->gid));
inode->i_mode = le32_to_cpu(cinode->mode);
inode->i_rdev = le32_to_cpu(cinode->rdev);
inode->i_atime.tv_sec = le64_to_cpu(cinode->atime.sec);
inode->i_atime.tv_nsec = le32_to_cpu(cinode->atime.nsec);
inode->i_mtime.tv_sec = le64_to_cpu(cinode->mtime.sec);
inode->i_mtime.tv_nsec = le32_to_cpu(cinode->mtime.nsec);
inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec);
inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);
ci->salt = le32_to_cpu(cinode->salt);
}
static int scoutfs_read_locked_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_item *item;
struct scoutfs_key key;
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
item = scoutfs_item_lookup(sb, &key);
if (IS_ERR(item))
return PTR_ERR(item);
load_inode(inode, item->val);
scoutfs_item_put(item);
return 0;
}
static int scoutfs_iget_test(struct inode *inode, void *arg)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
u64 *ino = arg;
return ci->ino == *ino;
}
static int scoutfs_iget_set(struct inode *inode, void *arg)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
u64 *ino = arg;
inode->i_ino = *ino;
ci->ino = *ino;
return 0;
}
struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
{
struct inode *inode;
int ret;
inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set,
&ino);
if (!inode)
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
ret = scoutfs_read_locked_inode(inode);
if (ret) {
iget_failed(inode);
inode = ERR_PTR(ret);
} else {
set_inode_ops(inode);
unlock_new_inode(inode);
}
}
return inode;
}
static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
cinode->size = cpu_to_le64(i_size_read(inode));
cinode->nlink = cpu_to_le32(inode->i_nlink);
cinode->uid = cpu_to_le32(i_uid_read(inode));
cinode->gid = cpu_to_le32(i_gid_read(inode));
cinode->mode = cpu_to_le32(inode->i_mode);
cinode->rdev = cpu_to_le32(inode->i_rdev);
cinode->atime.sec = cpu_to_le64(inode->i_atime.tv_sec);
cinode->atime.nsec = cpu_to_le32(inode->i_atime.tv_nsec);
cinode->ctime.sec = cpu_to_le64(inode->i_ctime.tv_sec);
cinode->ctime.nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec);
cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
cinode->salt = cpu_to_le32(ci->salt);
}
/*
* Every time we modify the inode in memory we copy it to its inode
* item. This lets us write out blocks of items without having to track
* down dirty vfs inodes and safely copy them into items before writing.
*/
int scoutfs_inode_update(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_item *item;
struct scoutfs_key key;
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
item = scoutfs_item_lookup(sb, &key);
if (IS_ERR(item))
return PTR_ERR(item);
store_inode(item->val, inode);
scoutfs_item_put(item);
return 0;
}
/*
* Allocate and initialize a new inode. The caller is responsible for
* creating links to it and updating it. @dir can be null.
*/
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode, dev_t rdev)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_inode_info *ci;
struct scoutfs_item *item;
struct scoutfs_key key;
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
ci = SCOUTFS_I(inode);
ci->ino = atomic64_inc_return(&sbi->next_ino);
get_random_bytes(&ci->salt, sizeof(ci->salt));
inode->i_ino = ci->ino;
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_rdev = rdev;
set_inode_ops(inode);
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
item = scoutfs_item_create(inode->i_sb, &key,
sizeof(struct scoutfs_inode));
if (IS_ERR(item)) {
iput(inode);
inode = ERR_CAST(item);
}
return inode;
}
void scoutfs_inode_exit(void)
{
if (scoutfs_inode_cachep) {
rcu_barrier();
kmem_cache_destroy(scoutfs_inode_cachep);
scoutfs_inode_cachep = NULL;
}
}
int scoutfs_inode_init(void)
{
scoutfs_inode_cachep = kmem_cache_create("scoutfs_inode_info",
sizeof(struct scoutfs_inode_info), 0,
SLAB_RECLAIM_ACCOUNT,
scoutfs_inode_ctor);
if (!scoutfs_inode_cachep)
return -ENOMEM;
return 0;
}
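
Nothing in this commit calls scoutfs_inode_update() yet, but the comment above it implies a call pattern along these lines (an illustrative sketch, not committed code):

/* illustrative: mutate the vfs inode, then mirror it into its inode item */
static int touch_ctime(struct inode *inode)
{
        inode->i_ctime = CURRENT_TIME;
        mark_inode_dirty(inode);

        /* copy the fields into the item so sync writes them with other items */
        return scoutfs_inode_update(inode);
}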

kmod/src/inode.h (new file, 32 lines)

@@ -0,0 +1,32 @@
#ifndef _SCOUTFS_INODE_H_
#define _SCOUTFS_INODE_H_
struct scoutfs_inode_info {
u64 ino;
u32 salt;
struct inode inode;
};
static inline struct scoutfs_inode_info *SCOUTFS_I(struct inode *inode)
{
return container_of(inode, struct scoutfs_inode_info, inode);
}
static inline u64 scoutfs_ino(struct inode *inode)
{
return SCOUTFS_I(inode)->ino;
}
struct inode *scoutfs_alloc_inode(struct super_block *sb);
void scoutfs_destroy_inode(struct inode *inode);
struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
int scoutfs_inode_update(struct inode *inode);
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode, dev_t rdev);
void scoutfs_inode_exit(void);
int scoutfs_inode_init(void);
#endif

kmod/src/item.c (new file, 423 lines)

@@ -0,0 +1,423 @@
/*
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include "super.h"
#include "key.h"
#include "item.h"
/*
* The item cache:
* - tracks per-item dirty state for writing
* - decouples vfs cache lifetimes from item lifetimes
* - item-granular cache for things vfs doesn't cache (readdir, xattr)
*
* XXX:
* - warnings for invalid keys/lens
* - memory pressure
*/
enum {
ITW_NEXT = 1,
ITW_PREV,
};
static inline struct scoutfs_item *node_item(struct super_block *sb,
struct rb_root *root,
struct rb_node *node)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned long off;
if (root == &sbi->item_root)
off = offsetof(struct scoutfs_item, node);
else
off = offsetof(struct scoutfs_item, dirty_node);
return (void *)((char *)node - off);
}
static inline struct rb_node *item_node(struct super_block *sb,
struct rb_root *root,
struct scoutfs_item *item)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned long off;
if (root == &sbi->item_root)
off = offsetof(struct scoutfs_item, node);
else
off = offsetof(struct scoutfs_item, dirty_node);
return (void *)((char *)item + off);
}
/*
* Insert a new item in the tree. The caller must have done a lookup to
* ensure that the key is not already present.
*/
static void insert_item(struct super_block *sb, struct rb_root *root,
struct scoutfs_item *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_item *item;
int cmp;
while (*node) {
parent = *node;
item = node_item(sb, root, *node);
cmp = scoutfs_key_cmp(&ins->key, &item->key);
BUG_ON(cmp == 0);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
}
rb_link_node(item_node(sb, root, ins), parent, node);
rb_insert_color(item_node(sb, root, ins), root);
}
enum {
FI_NEXT = 1,
FI_PREV,
};
/*
* Walk the tree looking for an item.
*
* If NEXT or PREV are specified then those will be returned
* if the specific item isn't found.
*/
static struct scoutfs_item *find_item(struct super_block *sb,
struct rb_root *root,
struct scoutfs_key *key, int np)
{
struct rb_node *node = root->rb_node;
struct scoutfs_item *found = NULL;
struct scoutfs_item *item;
int cmp;
while (node) {
item = node_item(sb, root, node);
cmp = scoutfs_key_cmp(key, &item->key);
if (cmp < 0) {
if (np == FI_NEXT)
found = item;
node = node->rb_left;
} else if (cmp > 0) {
if (np == FI_PREV)
found = item;
node = node->rb_right;
} else {
found = item;
break;
}
}
return found;
}
static struct scoutfs_item *alloc_item(struct scoutfs_key *key,
unsigned int val_len)
{
struct scoutfs_item *item;
void *val;
item = kmalloc(sizeof(struct scoutfs_item), GFP_NOFS);
val = kmalloc(val_len, GFP_NOFS);
if (!item || !val) {
kfree(item);
kfree(val);
return ERR_PTR(-ENOMEM);
}
RB_CLEAR_NODE(&item->node);
RB_CLEAR_NODE(&item->dirty_node);
atomic_set(&item->refcount, 1);
item->key = *key;
item->val_len = val_len;
item->val = val;
return item;
}
/*
* Create a new item stored at the given key. Return it with a reference,
* or an ERR_PTR with -ENOMEM or -EEXIST.
*
* The caller is responsible for initializing the item's value.
*/
struct scoutfs_item *scoutfs_item_create(struct super_block *sb,
struct scoutfs_key *key,
unsigned int val_len)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_item *existing;
struct scoutfs_item *item;
unsigned long flags;
item = alloc_item(key, val_len);
if (IS_ERR(item))
return item;
spin_lock_irqsave(&sbi->item_lock, flags);
existing = find_item(sb, &sbi->item_root, key, 0);
if (!existing) {
insert_item(sb, &sbi->item_root, item);
insert_item(sb, &sbi->dirty_item_root, item);
atomic_add(2, &item->refcount);
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
if (existing) {
scoutfs_item_put(item);
item = ERR_PTR(-EEXIST);
}
trace_printk("item %p key "CKF" val_len %d\n", item, CKA(key), val_len);
return item;
}
/*
* The caller is still responsible for unlocking and putting the item.
*
* We don't try to optimize away the lock for items that are already
* removed from the tree. The caller's locking and item behaviour mean
* that racing to remove an item is extremely rare.
*
* XXX for now we're just removing it from the rbtree. We'd need to leave
* behind a deletion record for lsm.
*/
void scoutfs_item_delete(struct super_block *sb, struct scoutfs_item *item)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
if (!RB_EMPTY_NODE(&item->dirty_node)) {
rb_erase(&item->dirty_node, &sbi->dirty_item_root);
RB_CLEAR_NODE(&item->dirty_node);
scoutfs_item_put(item);
}
if (!RB_EMPTY_NODE(&item->node)) {
rb_erase(&item->node, &sbi->item_root);
RB_CLEAR_NODE(&item->node);
scoutfs_item_put(item);
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
}
static struct scoutfs_item *item_lookup(struct super_block *sb,
struct scoutfs_key *key, int np)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_item *item;
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
item = find_item(sb, &sbi->item_root, key, np);
if (item)
atomic_inc(&item->refcount);
else
item = ERR_PTR(-ENOENT);
spin_unlock_irqrestore(&sbi->item_lock, flags);
return item;
}
struct scoutfs_item *scoutfs_item_lookup(struct super_block *sb,
struct scoutfs_key *key)
{
return item_lookup(sb, key, 0);
}
struct scoutfs_item *scoutfs_item_next(struct super_block *sb,
struct scoutfs_key *key)
{
return item_lookup(sb, key, FI_NEXT);
}
struct scoutfs_item *scoutfs_item_prev(struct super_block *sb,
struct scoutfs_key *key)
{
return item_lookup(sb, key, FI_PREV);
}
/*
* Expand the item's value by inserting bytes at the given offset. The
* new bytes are not initialized.
*/
int scoutfs_item_expand(struct scoutfs_item *item, int off, int bytes)
{
void *val;
/* XXX bytes too big */
if (WARN_ON_ONCE(off < 0 || off > item->val_len))
return -EINVAL;
val = kmalloc(item->val_len + bytes, GFP_NOFS);
if (!val)
return -ENOMEM;
memcpy(val, item->val, off);
memcpy(val + off + bytes, item->val + off, item->val_len - off);
kfree(item->val);
item->val = val;
item->val_len += bytes;
return 0;
}
/*
* Shrink the item's value by removing bytes at the given offset.
*/
int scoutfs_item_shrink(struct scoutfs_item *item, int off, int bytes)
{
void *val;
if (WARN_ON_ONCE(off < 0 || off >= item->val_len ||
bytes <= 0 || (off + bytes) > item->val_len ||
bytes == item->val_len))
return -EINVAL;
val = kmalloc(item->val_len - bytes, GFP_NOFS);
if (!val)
return -ENOMEM;
memcpy(val, item->val, off);
memcpy(val + off, item->val + off + bytes,
item->val_len - (off + bytes));
kfree(item->val);
item->val = val;
item->val_len -= bytes;
return 0;
}
void scoutfs_item_mark_dirty(struct super_block *sb, struct scoutfs_item *item)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
if (RB_EMPTY_NODE(&item->dirty_node)) {
insert_item(sb, &sbi->dirty_item_root, item);
atomic_inc(&item->refcount);
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
}
/*
* Mark all the dirty items clean by emptying the dirty rbtree. The
* caller should be preventing writes from dirtying new items.
*
* We erase leaf nodes with no children to minimize rotation
* overhead during erase. Dirty items must be in the main rbtree if
* they're in the dirty rbtree so the puts here shouldn't free the
* items.
*/
void scoutfs_item_all_clean(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct rb_root *root = &sbi->dirty_item_root;
struct scoutfs_item *item;
struct rb_node *node;
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
node = sbi->dirty_item_root.rb_node;
while (node) {
if (node->rb_left)
node = node->rb_left;
else if (node->rb_right)
node = node->rb_right;
else {
item = node_item(sb, root, node);
node = rb_parent(node);
trace_printk("item %p key "CKF"\n",
item, CKA(&item->key));
rb_erase(&item->dirty_node, root);
RB_CLEAR_NODE(&item->dirty_node);
scoutfs_item_put(item);
}
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
}
/*
* If the item is null then the first dirty item is returned. If an
* item is given then the next dirty item is returned. NULL is returned
* if there are no more dirty items.
*
* The caller is given a reference that it has to put. The given item
* always has its reference dropped, including when NULL is returned.
*/
struct scoutfs_item *scoutfs_item_next_dirty(struct super_block *sb,
struct scoutfs_item *item)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_item *next_item;
struct rb_node *node;
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
if (item)
node = rb_next(&item->dirty_node);
else
node = rb_first(&sbi->dirty_item_root);
if (node) {
next_item = node_item(sb, &sbi->dirty_item_root, node);
atomic_inc(&next_item->refcount);
} else {
next_item = NULL;
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
scoutfs_item_put(item);
return next_item;
}
void scoutfs_item_put(struct scoutfs_item *item)
{
if (!IS_ERR_OR_NULL(item) && atomic_dec_and_test(&item->refcount)) {
WARN_ON_ONCE(!RB_EMPTY_NODE(&item->node));
WARN_ON_ONCE(!RB_EMPTY_NODE(&item->dirty_node));
kfree(item);
}
}
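
A minimal usage sketch of the item API (hypothetical helper, not from the commit); note that scoutfs_item_create() inserts the new item into both the item and dirty trees, so it is born dirty:

/* illustrative: create an item holding a single little-endian u64 value */
static int store_u64_item(struct super_block *sb, struct scoutfs_key *key,
                          u64 v)
{
        struct scoutfs_item *item;

        item = scoutfs_item_create(sb, key, sizeof(__le64));
        if (IS_ERR(item))
                return PTR_ERR(item);

        *(__le64 *)item->val = cpu_to_le64(v);

        /* the item is already dirty; just drop our reference */
        scoutfs_item_put(item);
        return 0;
}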

kmod/src/item.h (new file, 37 lines)

@@ -0,0 +1,37 @@
#ifndef _SCOUTFS_ITEM_H_
#define _SCOUTFS_ITEM_H_
#include "format.h"
struct scoutfs_item {
struct rb_node node;
struct rb_node dirty_node;
atomic_t refcount;
/* the key is constant for the life of the item */
struct scoutfs_key key;
/* the value can be changed by expansion or shrinking */
unsigned int val_len;
void *val;
};
struct scoutfs_item *scoutfs_item_create(struct super_block *sb,
struct scoutfs_key *key,
unsigned int val_len);
struct scoutfs_item *scoutfs_item_lookup(struct super_block *sb,
struct scoutfs_key *key);
struct scoutfs_item *scoutfs_item_next(struct super_block *sb,
struct scoutfs_key *key);
struct scoutfs_item *scoutfs_item_prev(struct super_block *sb,
struct scoutfs_key *key);
int scoutfs_item_expand(struct scoutfs_item *item, int off, int bytes);
int scoutfs_item_shrink(struct scoutfs_item *item, int off, int bytes);
void scoutfs_item_delete(struct super_block *sb, struct scoutfs_item *item);
void scoutfs_item_mark_dirty(struct super_block *sb, struct scoutfs_item *item);
struct scoutfs_item *scoutfs_item_next_dirty(struct super_block *sb,
struct scoutfs_item *item);
void scoutfs_item_all_clean(struct super_block *sb);
void scoutfs_item_put(struct scoutfs_item *item);
#endif

kmod/src/key.h (new file, 43 lines)

@@ -0,0 +1,43 @@
#ifndef _SCOUTFS_KEY_H_
#define _SCOUTFS_KEY_H_
#include <linux/types.h>
#include "format.h"
#define CKF "%llu.%u.%llu"
#define CKA(key) \
le64_to_cpu((key)->inode), (key)->type, le64_to_cpu((key)->offset)
static inline u64 scoutfs_key_inode(struct scoutfs_key *key)
{
return le64_to_cpu(key->inode);
}
static inline u64 scoutfs_key_offset(struct scoutfs_key *key)
{
return le64_to_cpu(key->offset);
}
static inline int le64_cmp(__le64 a, __le64 b)
{
return le64_to_cpu(a) < le64_to_cpu(b) ? -1 :
le64_to_cpu(a) > le64_to_cpu(b) ? 1 : 0;
}
static inline int scoutfs_key_cmp(struct scoutfs_key *a, struct scoutfs_key *b)
{
return le64_cmp(a->inode, b->inode) ?:
((short)a->type - (short)b->type) ?:
le64_cmp(a->offset, b->offset);
}
static inline void scoutfs_set_key(struct scoutfs_key *key, u64 inode, u8 type,
u64 offset)
{
key->inode = cpu_to_le64(inode);
key->type = type;
key->offset = cpu_to_le64(offset);
}
#endif
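
A small example of the helpers above, showing that keys sort by inode, then type, then offset (illustrative, not committed code):

/* illustrative: type takes precedence over offset in the sort order */
static void key_order_example(void)
{
        struct scoutfs_key a, b;

        scoutfs_set_key(&a, 1, SCOUTFS_INODE_KEY, ~0ULL);
        scoutfs_set_key(&b, 1, SCOUTFS_DIRENT_KEY, 0);

        /* SCOUTFS_INODE_KEY (128) sorts before SCOUTFS_DIRENT_KEY (192) */
        WARN_ON_ONCE(scoutfs_key_cmp(&a, &b) >= 0);
}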

kmod/src/lsm.c (new file, 330 lines)

@@ -0,0 +1,330 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include "format.h"
#include "dir.h"
#include "inode.h"
#include "key.h"
#include "item.h"
#include "super.h"
#include "lsm.h"
#define PAGE_CACHE_PAGE_BITS (PAGE_CACHE_SIZE * 8)
/* XXX garbage hack until we have siphash */
static u64 bloom_hash(struct scoutfs_key *key, __le64 *hash_key)
{
__le32 *salts = (void *)hash_key;
return ((u64)crc32c(le32_to_cpu(salts[0]), key, sizeof(*key)) << 32) |
crc32c(le32_to_cpu(salts[1]), key, sizeof(*key));
}
/*
* Set the caller's bloom indices for their item key.
*/
static void get_bloom_indices(struct super_block *sb,
struct scoutfs_key *key, u32 *ind)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
__le64 *hash_key = sbi->bloom_hash_keys;
u64 hash;
int h;
int i;
for (i = 0; ; ) {
hash = bloom_hash(key, hash_key);
hash_key += 2;
for (h = 0; h < 64 / SCOUTFS_BLOOM_INDEX_BITS; h++) {
ind[i++] = hash & SCOUTFS_BLOOM_INDEX_MASK;
if (i == SCOUTFS_BLOOM_INDEX_NR)
return;
hash >>= SCOUTFS_BLOOM_INDEX_BITS;
}
}
}
struct pages {
/* fixed for the group of pages */
struct address_space *mapping;
struct page **pages;
pgoff_t pgoff;
/* number of pages stored in the pages array */
int nr;
/* byte offset of the free space at end of current page */
int off;
/* bytes remaining in the overall large block */
int remaining;
};
/*
* The caller has our fixed-size bloom filter in the locked pages
* starting at the given byte offset in the first page. Our job is to
* hash the key and set its bits in the bloom filter.
*/
static void set_bloom_bits(struct super_block *sb, struct page **pages,
unsigned int offset, struct scoutfs_key *key)
{
u32 inds[SCOUTFS_BLOOM_INDEX_NR];
struct page *page;
int offset_bits = offset * 8;
int full_bit;
int page_bit;
void *addr;
int i;
get_bloom_indices(sb, key, inds);
for (i = 0; i < SCOUTFS_BLOOM_INDEX_NR; i++) {
full_bit = offset_bits + inds[i];
page = pages[full_bit / PAGE_CACHE_PAGE_BITS];
page_bit = full_bit % PAGE_CACHE_PAGE_BITS;
addr = kmap_atomic(page);
set_bit_le(page_bit, addr);
kunmap_atomic(addr);
}
}
/*
* XXX the zeroing here is unreliable. We'll want to zero the bloom but
* not all the pages that are about to be overwritten. Bleh.
*
* Returns the number of bytes copied if there was room. Returns 0 if
* there wasn't. Returns -errno on a hard failure.
*/
static int copy_to_pages(struct pages *pgs, void *ptr, size_t count)
{
struct page *page;
int ret = count;
void *addr;
int bytes;
if (count > pgs->remaining)
return 0;
while (count) {
if (pgs->off == PAGE_CACHE_SIZE) {
page = find_or_create_page(pgs->mapping,
pgs->pgoff + pgs->nr,
GFP_NOFS | __GFP_ZERO);
trace_printk("page %p\n", page);
if (!page) {
ret = -ENOMEM;
break;
}
pgs->pages[pgs->nr++] = page;
pgs->off = 0;
} else {
page = pgs->pages[pgs->nr - 1];
}
bytes = min(PAGE_CACHE_SIZE - pgs->off, count);
trace_printk("page %p off %d ptr %p count %zu bytes %d remaining %d\n",
page, pgs->off, ptr, count, bytes, pgs->remaining);
if (ptr) {
addr = kmap_atomic(page);
memcpy(addr + pgs->off, ptr, bytes);
kunmap_atomic(addr);
ptr += bytes;
}
count -= bytes;
pgs->off += bytes;
pgs->remaining -= bytes;
}
return ret;
}
static void drop_pages(struct pages *pgs, bool dirty)
{
struct page *page;
int i;
if (!pgs->pages)
return;
for (i = 0; i < pgs->nr; i++) {
page = pgs->pages[i];
SetPageUptodate(page);
if (dirty)
set_page_dirty(page);
unlock_page(page);
page_cache_release(page);
}
}
/*
* Write dirty items from the given item into dirty page cache pages in
* the block device at the given large block number.
*
* All the page cache pages are locked and pinned while they're being
* dirtied. The intent is for a single large IO to go out once they're
* all ready. This is an easy way to do that while maintaining
* consistency with the block device page cache. But it might not work :).
*
* We do one sweep over the items. The items aren't indexed. We might
* want to change that.
*
* Even though we're doing one sweep over the items we're holding the
* bloom filter and header pinned until the items are done. If we didn't
* mind the risk of the blocks going out of order we wouldn't need the
* allocated array of page pointers.
*/
static struct scoutfs_item *dirty_block_pages(struct super_block *sb,
struct scoutfs_item *item, u64 blkno)
{
struct scoutfs_item_header ihdr;
struct scoutfs_lsm_block lblk;
struct pages pgs;
void *addr;
int ret;
/* assumes the header starts a page and that the pgoff shift calculation holds */
BUILD_BUG_ON(SCOUTFS_BLOCK_SHIFT < PAGE_CACHE_SHIFT);
if (WARN_ON_ONCE(!item))
return item;
/* XXX not super thrilled with this allocation */
pgs.pages = kmalloc_array(SCOUTFS_BLOCK_SIZE / PAGE_CACHE_SIZE,
sizeof(struct page *), GFP_NOFS);
if (!pgs.pages) {
ret = -ENOMEM;
goto out;
}
pgs.mapping = sb->s_bdev->bd_inode->i_mapping;
pgs.pgoff = blkno >> (SCOUTFS_BLOCK_SHIFT - PAGE_CACHE_SHIFT);
pgs.nr = 0;
pgs.off = PAGE_CACHE_SIZE;
pgs.remaining = SCOUTFS_BLOCK_SIZE;
/* reserve space at the start of the block for header and bloom */
ret = copy_to_pages(&pgs, NULL, sizeof(lblk));
if (ret > 0)
ret = copy_to_pages(&pgs, NULL, SCOUTFS_BLOOM_FILTER_BYTES);
if (ret <= 0)
goto out;
lblk.first = item->key;
lblk.nr_items = 0;
do {
trace_printk("item %p key "CKF"\n", item, CKA(&item->key));
ihdr.key = item->key;
ihdr.val_len = cpu_to_le16(item->val_len);
ret = copy_to_pages(&pgs, &ihdr, sizeof(ihdr));
if (ret > 0)
ret = copy_to_pages(&pgs, item->val, item->val_len);
if (ret <= 0)
goto out;
lblk.last = item->key;
le32_add_cpu(&lblk.nr_items, 1);
/* set each item's bloom bits */
set_bloom_bits(sb, pgs.pages, sizeof(lblk), &item->key);
item = scoutfs_item_next_dirty(sb, item);
} while (item);
/* copy the filled in header to the start of the block */
addr = kmap_atomic(pgs.pages[0]);
memcpy(addr, &lblk, sizeof(lblk));
kunmap_atomic(addr);
out:
/* dirty if no error (null ok!), unlock, and release */
drop_pages(&pgs, !IS_ERR(item));
kfree(pgs.pages);
if (ret < 0) {
scoutfs_item_put(item);
item = ERR_PTR(ret);
}
return item;
}
/*
* Sync dirty data by writing all the dirty items into a series of level
* 0 blocks.
*
* This is an initial first pass, the full method will need to:
* - wait for pending writers
* - block future writers
* - update our manifest regardless of server communication
* - communicate blocks and key ranges to server
* - ensure that racing sync/dirty don't livelock
*/
int scoutfs_sync_fs(struct super_block *sb, int wait)
{
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_item *item;
u64 blknos[16]; /* XXX */
u64 blkno;
int ret = 0;
int i;
item = scoutfs_item_next_dirty(sb, NULL);
if (!item)
return 0;
for (i = 0; i < ARRAY_SIZE(blknos); i++) {
blkno = atomic64_inc_return(&sbi->next_blkno);
item = dirty_block_pages(sb, item, blkno);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
/* start each block's IO */
ret = filemap_flush(mapping);
if (ret)
goto out;
if (!item)
break;
}
/* dirty items should have been limited */
WARN_ON_ONCE(i >= ARRAY_SIZE(blknos));
/* then wait for all block IO to finish */
if (wait) {
ret = filemap_write_and_wait(mapping);
if (ret)
goto out;
}
/* mark everything clean */
scoutfs_item_all_clean(sb);
ret = 0;
out:
trace_printk("ret %d\n", ret);
WARN_ON_ONCE(ret);
return ret;
}
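
The read side isn't in this commit; a hypothetical lookup-time check mirroring set_bloom_bits() in the same file, using test_bit_le() as the counterpart of the set_bit_le() above, might look like this sketch:

/* illustrative: all seven bits set means the key may be in the block */
static bool bloom_may_contain(struct super_block *sb, struct page **pages,
                              unsigned int offset, struct scoutfs_key *key)
{
        u32 inds[SCOUTFS_BLOOM_INDEX_NR];
        int offset_bits = offset * 8;
        int full_bit;
        void *addr;
        bool hit;
        int i;

        get_bloom_indices(sb, key, inds);

        for (i = 0; i < SCOUTFS_BLOOM_INDEX_NR; i++) {
                full_bit = offset_bits + inds[i];
                addr = kmap_atomic(pages[full_bit / PAGE_CACHE_PAGE_BITS]);
                hit = test_bit_le(full_bit % PAGE_CACHE_PAGE_BITS, addr);
                kunmap_atomic(addr);

                if (!hit)
                        return false;
        }

        return true;
}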

kmod/src/lsm.h (new file, 6 lines)

@@ -0,0 +1,6 @@
#ifndef _SCOUTFS_LSM_H_
#define _SCOUTFS_LSM_H_
int scoutfs_sync_fs(struct super_block *sb, int wait);
#endif

kmod/src/mkfs.c (new file, 52 lines)

@@ -0,0 +1,52 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/random.h>
#include "super.h"
#include "item.h"
#include "key.h"
#include "mkfs.h"
/*
* For now a file system only exists in the item cache for the
* duration of the mount. This "mkfs" hack creates a root dir inode in
* the item cache on mount so that we can run tests in memory and not
* worry about user space or persistent storage.
*/
int scoutfs_mkfs(struct super_block *sb)
{
const struct timespec ts = current_kernel_time();
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_inode *cinode;
struct scoutfs_item *item;
struct scoutfs_key key;
int i;
atomic64_set(&sbi->next_ino, SCOUTFS_ROOT_INO + 1);
atomic64_set(&sbi->next_blkno, 2);
for (i = 0; i < ARRAY_SIZE(sbi->bloom_hash_keys); i++) {
get_random_bytes(&sbi->bloom_hash_keys[i],
sizeof(sbi->bloom_hash_keys[i]));
}
scoutfs_set_key(&key, SCOUTFS_ROOT_INO, SCOUTFS_INODE_KEY, 0);
item = scoutfs_item_create(sb, &key, sizeof(struct scoutfs_inode));
if (IS_ERR(item))
return PTR_ERR(item);
cinode = item->val;
memset(cinode, 0, sizeof(struct scoutfs_inode));
cinode->nlink = cpu_to_le32(2);
cinode->mode = cpu_to_le32(S_IFDIR | 0755);
cinode->atime.sec = cpu_to_le64(ts.tv_sec);
cinode->atime.nsec = cpu_to_le32(ts.tv_nsec);
cinode->ctime = cinode->atime;
cinode->mtime = cinode->atime;
get_random_bytes(&cinode->salt, sizeof(cinode->salt));
scoutfs_item_put(item);
return 0;
}
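
Worth noting: because scoutfs_mkfs() runs from scoutfs_fill_super() on every mount and nothing is read back from the device, each mount starts from a fresh, empty root directory; sync writes level 0 blocks but no read path consumes them yet.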

kmod/src/mkfs.h (new file, 6 lines)

@@ -0,0 +1,6 @@
#ifndef _SCOUTFS_MKFS_H_
#define _SCOUTFS_MKFS_H_
int scoutfs_mkfs(struct super_block *sb);
#endif

kmod/src/super.c (new file, 103 lines)

@@ -0,0 +1,103 @@
/*
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include "super.h"
#include "format.h"
#include "mkfs.h"
#include "inode.h"
#include "dir.h"
#include "lsm.h"
static const struct super_operations scoutfs_super_ops = {
.alloc_inode = scoutfs_alloc_inode,
.destroy_inode = scoutfs_destroy_inode,
.sync_fs = scoutfs_sync_fs,
};
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct scoutfs_sb_info *sbi;
struct inode *inode;
int ret;
sb->s_magic = SCOUTFS_SUPER_MAGIC;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &scoutfs_super_ops;
sbi = kzalloc(sizeof(struct scoutfs_sb_info), GFP_KERNEL);
sb->s_fs_info = sbi;
if (!sbi)
return -ENOMEM;
spin_lock_init(&sbi->item_lock);
sbi->item_root = RB_ROOT;
sbi->dirty_item_root = RB_ROOT;
ret = scoutfs_mkfs(sb);
if (ret)
return ret;
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO);
if (IS_ERR(inode))
return PTR_ERR(inode);
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;
return 0;
}
static struct dentry *scoutfs_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
return mount_bdev(fs_type, flags, dev_name, data, scoutfs_fill_super);
}
static void scoutfs_kill_sb(struct super_block *sb)
{
kill_block_super(sb);
kfree(sb->s_fs_info);
}
static struct file_system_type scoutfs_fs_type = {
.owner = THIS_MODULE,
.name = "scoutfs",
.mount = scoutfs_mount,
.kill_sb = scoutfs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
static int __init scoutfs_module_init(void)
{
return scoutfs_inode_init() ?:
scoutfs_dir_init() ?:
register_filesystem(&scoutfs_fs_type);
}
module_init(scoutfs_module_init)
static void __exit scoutfs_module_exit(void)
{
unregister_filesystem(&scoutfs_fs_type);
scoutfs_dir_exit();
scoutfs_inode_exit();
}
module_exit(scoutfs_module_exit)
MODULE_AUTHOR("Zach Brown <zab@versity.com>");
MODULE_LICENSE("GPL");
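
One detail of the ?: chain in scoutfs_module_init(): it doesn't unwind earlier steps when a later one fails, so the inode cache is leaked if, say, register_filesystem() fails. A more conventional shape, sketched:

static int __init scoutfs_module_init(void)
{
        int ret;

        ret = scoutfs_inode_init();
        if (ret)
                return ret;

        ret = scoutfs_dir_init();
        if (ret)
                goto out_inode;

        ret = register_filesystem(&scoutfs_fs_type);
        if (ret)
                goto out_dir;

        return 0;

out_dir:
        scoutfs_dir_exit();
out_inode:
        scoutfs_inode_exit();
        return ret;
}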

kmod/src/super.h (new file, 22 lines)

@@ -0,0 +1,22 @@
#ifndef _SCOUTFS_SUPER_H_
#define _SCOUTFS_SUPER_H_
#include <linux/rbtree.h>
struct scoutfs_sb_info {
atomic64_t next_ino;
atomic64_t next_blkno;
__le64 bloom_hash_keys[6]; /* XXX */
spinlock_t item_lock;
struct rb_root item_root;
struct rb_root dirty_item_root;
};
static inline struct scoutfs_sb_info *SCOUTFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
}
#endif