From 25a1e8d1b71d1f877bec09cb92af0caf7c28db2a Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 5 Feb 2016 13:15:41 -0800 Subject: [PATCH] Initial commit This is the initial commit of the repo that will track development against distro kernels. This is an import of a prototype branch in the upstream kernel that only had a few initial commits. It needed to move to the old readdir interface and use find_or_create_page() instead of pagecache_get_page() to build in older distro kernels. --- kmod/.gitignore | 7 + kmod/Makefile | 4 + kmod/src/Kconfig | 10 + kmod/src/Makefile | 3 + kmod/src/dir.c | 551 ++++++++++++++++++++++++++++++++++++++++++++++ kmod/src/dir.h | 10 + kmod/src/format.h | 122 ++++++++++ kmod/src/inode.c | 272 +++++++++++++++++++++++ kmod/src/inode.h | 32 +++ kmod/src/item.c | 423 +++++++++++++++++++++++++++++++++++ kmod/src/item.h | 37 ++++ kmod/src/key.h | 43 ++++ kmod/src/lsm.c | 330 +++++++++++++++++++++++++++ kmod/src/lsm.h | 6 + kmod/src/mkfs.c | 52 +++++ kmod/src/mkfs.h | 6 + kmod/src/super.c | 103 +++++++++ kmod/src/super.h | 22 ++ 18 files changed, 2033 insertions(+) create mode 100644 kmod/.gitignore create mode 100644 kmod/Makefile create mode 100644 kmod/src/Kconfig create mode 100644 kmod/src/Makefile create mode 100644 kmod/src/dir.c create mode 100644 kmod/src/dir.h create mode 100644 kmod/src/format.h create mode 100644 kmod/src/inode.c create mode 100644 kmod/src/inode.h create mode 100644 kmod/src/item.c create mode 100644 kmod/src/item.h create mode 100644 kmod/src/key.h create mode 100644 kmod/src/lsm.c create mode 100644 kmod/src/lsm.h create mode 100644 kmod/src/mkfs.c create mode 100644 kmod/src/mkfs.h create mode 100644 kmod/src/super.c create mode 100644 kmod/src/super.h diff --git a/kmod/.gitignore b/kmod/.gitignore new file mode 100644 index 00000000..9d66c4e8 --- /dev/null +++ b/kmod/.gitignore @@ -0,0 +1,7 @@ +src/*.o +src/*.ko +src/*.mod.c +src/*.cmd +src/.tmp_versions/ +src/Module.symvers +src/modules.order diff --git a/kmod/Makefile b/kmod/Makefile new file mode 100644 index 00000000..07fbc001 --- /dev/null +++ b/kmod/Makefile @@ -0,0 +1,4 @@ +ALL: module + +module: + make CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(PWD)/src diff --git a/kmod/src/Kconfig b/kmod/src/Kconfig new file mode 100644 index 00000000..eb097405 --- /dev/null +++ b/kmod/src/Kconfig @@ -0,0 +1,10 @@ +config SCOUTFS_FS + tristate "scoutfs filesystem" + help + scoutfs is a clustered file system that stores data in large + blocks in shared block storage. + + To compile this file system support as a module, choose M here. The + module will be called scoutfs. + + If unsure, say N. diff --git a/kmod/src/Makefile b/kmod/src/Makefile new file mode 100644 index 00000000..239e8aef --- /dev/null +++ b/kmod/src/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o + +scoutfs-y += dir.o inode.o item.o lsm.o mkfs.o super.o diff --git a/kmod/src/dir.c b/kmod/src/dir.c new file mode 100644 index 00000000..744759a9 --- /dev/null +++ b/kmod/src/dir.c @@ -0,0 +1,551 @@ +/* + * Copyright (C) 2016 Versity Software, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include +#include +#include +#include +#include + +#include "format.h" +#include "dir.h" +#include "inode.h" +#include "key.h" +#include "item.h" +#include "super.h" + +/* + * Directory entries are stored in items whose offset is determined by + * the hash of the entry's name. This was primarily chosen to minimize + * the amount of data stored for each entry. + * + * Because we're hashing the name we need to worry about collisions. We + * store all the entries with the same hash value in the item. This was + * done so that create works with one specific item. + * + * readdir iterates over these items in hash order. The high bits of + * the entry's readdir f_pos come from the item offset while the low + * bits come from a collision number in the entry. + * + * The full readdir position, and thus the absolute max number of + * entries in a directory, is limited to 2^31 to avoid the risk of + * breaking legacy environments. Even with a relatively small 27bit + * item offset allowing 16 colliding entries gets well into hundreds of + * millions of entries before an item fills up and we return a premature + * ENOSPC. Hundreds of millions in a single dir ought to be, wait for + * it, good enough for anybody. + * + * Each item's contents are protected by the dir inode's i_mutex that + * callers acquire before calling our dir operations. If we wanted more + * fine grained concurrency, and we might, we'd have to be careful to + * manage the shared items. + */ + +static unsigned int mode_to_type(umode_t mode) +{ +#define S_SHIFT 12 + static unsigned char mode_types[S_IFMT >> S_SHIFT] = { + [S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO, + [S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR, + [S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR, + [S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK, + [S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG, + [S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK, + [S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK, + }; + + return mode_types[(mode & S_IFMT) >> S_SHIFT]; +#undef S_SHIFT +} + +#if 0 +static unsigned int dentry_type(unsigned int type) +{ + static unsigned char types[] = { + [SCOUTFS_DT_FIFO] = DT_FIFO, + [SCOUTFS_DT_CHR] = DT_CHR, + [SCOUTFS_DT_DIR] = DT_DIR, + [SCOUTFS_DT_BLK] = DT_BLK, + [SCOUTFS_DT_REG] = DT_REG, + [SCOUTFS_DT_LNK] = DT_LNK, + [SCOUTFS_DT_SOCK] = DT_SOCK, + [SCOUTFS_DT_WHT] = DT_WHT, + }; + + if (type < ARRAY_SIZE(types)) + return types[type]; + + return DT_UNKNOWN; +} +#endif + +static int names_equal(const char *name_a, int len_a, const char *name_b, + int len_b) +{ + return (len_a == len_b) && !memcmp(name_a, name_b, len_a); +} + +/* + * Return the offset portion of a dirent key from the hash of the name. + * + * XXX This crc nonsense is a quick hack. We'll want something a + * lot stronger like siphash. 
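+ *
+ * The 32 bit crc is shifted down to SCOUTFS_DIRENT_OFF_BITS so that
+ * the result always fits in the hash offset of a readdir position:
+ *
+ *   off = crc32c(salt, name, len) >> (32 - SCOUTFS_DIRENT_OFF_BITS);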
+ */ +static u32 name_hash(struct inode *dir, const char *name, unsigned int len) +{ + struct scoutfs_inode_info *ci = SCOUTFS_I(dir); + + return crc32c(ci->salt, name, len) >> (32 - SCOUTFS_DIRENT_OFF_BITS); +} + +static unsigned int dent_bytes(unsigned int name_len) +{ + return sizeof(struct scoutfs_dirent) + name_len; +} + +static unsigned int dent_val_off(struct scoutfs_item *item, + struct scoutfs_dirent *dent) +{ + return (char *)dent - (char *)item->val; +} + +static inline struct scoutfs_dirent *next_dent(struct scoutfs_item *item, + struct scoutfs_dirent *dent) +{ + unsigned int next_off; + + next_off = dent_val_off(item, dent) + dent_bytes(dent->name_len); + if (next_off == item->val_len) + return NULL; + + return item->val + next_off; +} + +#define for_each_item_dent(item, dent) \ + for (dent = item->val; dent; dent = next_dent(item, dent)) + +struct dentry_info { + /* + * The key offset and collision nr are stored so that we don't + * have to either hash the name to find the item or compare + * names to find the dirent in the item. + */ + u32 key_offset; + u8 coll_nr; +}; + +static struct kmem_cache *scoutfs_dentry_cachep; + +static struct dentry_info *alloc_dentry_info(struct dentry *dentry) +{ + struct dentry_info *di; + + /* XXX read mb? */ + if (dentry->d_fsdata) + return dentry->d_fsdata; + + di = kmem_cache_zalloc(scoutfs_dentry_cachep, GFP_NOFS); + if (!di) + return ERR_PTR(-ENOMEM); + + spin_lock(&dentry->d_lock); + if (!dentry->d_fsdata) + dentry->d_fsdata = di; + spin_unlock(&dentry->d_lock); + + if (di != dentry->d_fsdata) + kmem_cache_free(scoutfs_dentry_cachep, di); + + return dentry->d_fsdata; +} + +/* + * Lookup searches for an entry for the given name amongst the entries + * stored in the item at the name's hash. + */ +static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct super_block *sb = dir->i_sb; + struct scoutfs_dirent *dent; + struct scoutfs_item *item; + struct dentry_info *di; + struct scoutfs_key key; + struct inode *inode; + u64 ino = 0; + u32 h = 0; + u32 nr = 0; + int ret; + + di = alloc_dentry_info(dentry); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + if (dentry->d_name.len > SCOUTFS_NAME_LEN) { + ret = -ENAMETOOLONG; + goto out; + } + + h = name_hash(dir, dentry->d_name.name, dentry->d_name.len); + scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h); + + item = scoutfs_item_lookup(sb, &key); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + goto out; + } + + ret = -ENOENT; + for_each_item_dent(item, dent) { + if (names_equal(dentry->d_name.name, dentry->d_name.len, + dent->name, dent->name_len)) { + ino = le64_to_cpu(dent->ino); + nr = dent->coll_nr; + ret = 0; + break; + } + } + + scoutfs_item_put(item); +out: + if (ret == -ENOENT) { + inode = NULL; + } else if (ret) { + inode = ERR_PTR(ret); + } else { + di->key_offset = h; + di->coll_nr = nr; + inode = scoutfs_iget(sb, ino); + } + + return d_splice_alias(inode, dentry); +} + +/* this exists upstream so we can just delete it in a forward port */ +static int dir_emit_dots(struct file *file, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct inode *parent = dentry->d_parent->d_inode; + + if (file->f_pos == 0) { + if (!filldir(dirent, ".", 1, 1, scoutfs_ino(inode), DT_DIR)) + return 0; + file->f_pos = 1; + } + + if (file->f_pos == 1) { + if (!filldir(dirent, "..", 2, 1, scoutfs_ino(parent), DT_DIR)) + return 0; + file->f_pos = 2; + } + + 
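+ /* both dots have been emitted, tell the caller to keep going */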
return 1; +} + +/* + * readdir finds the next entry at or past the hash|coll_nr stored in + * the ctx->pos (f_pos). + * + * It will need to be careful not to read past the region of the dirent + * hash offset keys that it has access to. + */ +static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct scoutfs_dirent *dent; + struct scoutfs_key last_key; + struct scoutfs_item *item; + struct scoutfs_key key; + u32 nr; + u32 off; + u64 pos; + int ret = 0; + + if (!dir_emit_dots(file, dirent, filldir)) + return 0; + + scoutfs_set_key(&last_key, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY, + SCOUTFS_DIRENT_OFF_MASK); + + do { + off = file->f_pos >> SCOUTFS_DIRENT_COLL_BITS; + nr = file->f_pos & SCOUTFS_DIRENT_COLL_MASK; + + scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY, + off); + item = scoutfs_item_next(sb, &key); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT) + ret = 0; + break; + } + + if (scoutfs_key_cmp(&item->key, &last_key) > 0) { + scoutfs_item_put(item); + break; + } + + /* reset nr to 0 if we found the next item */ + if (scoutfs_key_offset(&item->key) != off) + nr = 0; + + pos = scoutfs_key_offset(&item->key) + << SCOUTFS_DIRENT_COLL_BITS; + for_each_item_dent(item, dent) { + if (dent->coll_nr < nr) + continue; + + if (!filldir(dirent, dent->name, dent->name_len, pos, + le64_to_cpu(dent->ino), dent->type)) + break; + + file->f_pos = (pos | dent->coll_nr) + 1; + } + + scoutfs_item_put(item); + + /* advance to the next hash value if we finished item */ + if (dent == NULL) + file->f_pos = pos + (1 << SCOUTFS_DIRENT_COLL_BITS); + + } while (dent == NULL); + + return ret; +} + +static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = NULL; + struct scoutfs_dirent *dent; + struct scoutfs_item *item; + struct dentry_info *di; + struct scoutfs_key key; + int bytes; + int ret; + int off; + u64 nr; + u64 h; + + di = alloc_dentry_info(dentry); + if (IS_ERR(di)) + return PTR_ERR(di); + + if (dentry->d_name.len > SCOUTFS_NAME_LEN) + return -ENAMETOOLONG; + + inode = scoutfs_new_inode(sb, dir, mode, rdev); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + h = name_hash(dir, dentry->d_name.name, dentry->d_name.len); + scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h); + bytes = dent_bytes(dentry->d_name.len); + + item = scoutfs_item_lookup(sb, &key); + if (item == ERR_PTR(-ENOENT)) { + item = scoutfs_item_create(sb, &key, bytes); + if (!IS_ERR(item)) { + /* mark a newly created item */ + dent = item->val; + dent->name_len = 0; + } + } + if (IS_ERR(item)) { + ret = PTR_ERR(item); + goto out; + } + + ret = 0; + nr = 0; + for_each_item_dent(item, dent) { + /* the common case of a newly created item */ + if (!dent->name_len) + break; + + /* XXX check for eexist? can't happen? 
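+ * lookup under the dir's i_mutex already came up empty, so a
+ * matching name here would point at a corrupt item.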
*/ + + /* found a free coll nr, insert here */ + if (nr < dent->coll_nr) { + off = dent_val_off(item, dent); + ret = scoutfs_item_expand(item, off, bytes); + if (!ret) + dent = item->val + off; + break; + } + + /* the item's full */ + if (nr++ == SCOUTFS_DIRENT_COLL_MASK) { + ret = -ENOSPC; + break; + } + } + + if (!ret) { + dent->ino = cpu_to_le64(scoutfs_ino(inode)); + dent->type = mode_to_type(inode->i_mode); + dent->coll_nr = nr; + dent->name_len = dentry->d_name.len; + memcpy(dent->name, dentry->d_name.name, dent->name_len); + di->key_offset = h; + di->coll_nr = nr; + } + + scoutfs_item_put(item); + + if (ret) + goto out; + + i_size_write(dir, i_size_read(dir) + dentry->d_name.len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + + if (S_ISDIR(mode)) { + inc_nlink(inode); + inc_nlink(dir); + } + + mark_inode_dirty(inode); + mark_inode_dirty(dir); + + insert_inode_hash(inode); + d_instantiate(dentry, inode); +out: + /* XXX delete the inode item here */ + if (ret && !IS_ERR_OR_NULL(inode)) + iput(inode); + return ret; +} + +/* XXX hmm, do something with excl? */ +static int scoutfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return scoutfs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int scoutfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return scoutfs_mknod(dir, dentry, mode | S_IFDIR, 0); +} + +/* + * Unlink removes the entry from its item and removes the item if ours + * was the only remaining entry. + */ +static int scoutfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = dentry->d_inode; + struct timespec ts = current_kernel_time(); + struct scoutfs_dirent *dent; + struct scoutfs_item *item; + struct dentry_info *di; + struct scoutfs_key key; + int ret = 0; + + if (WARN_ON_ONCE(!dentry->d_fsdata)) + return -EINVAL; + di = dentry->d_fsdata; + + trace_printk("dir size %llu entry k_off nr %u %u\n", + i_size_read(inode), di->key_offset, di->coll_nr); + + if (S_ISDIR(inode->i_mode) && i_size_read(inode)) + return -ENOTEMPTY; + + scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, + di->key_offset); + + item = scoutfs_item_lookup(sb, &key); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + goto out; + } + + /* XXX error to not find the coll nr we were looking for? */ + for_each_item_dent(item, dent) { + if (dent->coll_nr != di->coll_nr) + continue; + + /* XXX compare names and eio? 
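+ * the coll_nr saved at lookup should be enough to find our entry;
+ * comparing the name as well would catch a mangled item and let
+ * us return -EIO instead of deleting the wrong entry.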
*/ + + if (item->val_len == dent_bytes(dent->name_len)) { + scoutfs_item_delete(sb, item); + ret = 0; + } else { + ret = scoutfs_item_shrink(item, + dent_val_off(item, dent), + dent_bytes(dent->name_len)); + } + dent = NULL; + break; + } + + scoutfs_item_put(item); + + if (ret) + goto out; + + dir->i_ctime = ts; + dir->i_mtime = ts; + i_size_write(dir, i_size_read(dir) - dentry->d_name.len); + + inode->i_ctime = ts; + drop_nlink(inode); + if (S_ISDIR(inode->i_mode)) { + drop_nlink(dir); + drop_nlink(inode); + } + mark_inode_dirty(inode); + mark_inode_dirty(dir); + +out: + return ret; +} + +const struct file_operations scoutfs_dir_fops = { + .readdir = scoutfs_readdir, +}; + +const struct inode_operations scoutfs_dir_iops = { + .lookup = scoutfs_lookup, + .mknod = scoutfs_mknod, + .create = scoutfs_create, + .mkdir = scoutfs_mkdir, + .unlink = scoutfs_unlink, + .rmdir = scoutfs_unlink, +}; + +void scoutfs_dir_exit(void) +{ + if (scoutfs_dentry_cachep) { + kmem_cache_destroy(scoutfs_dentry_cachep); + scoutfs_dentry_cachep = NULL; + } +} + +int scoutfs_dir_init(void) +{ + scoutfs_dentry_cachep = kmem_cache_create("scoutfs_dentry_info", + sizeof(struct dentry_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!scoutfs_dentry_cachep) + return -ENOMEM; + + return 0; +} diff --git a/kmod/src/dir.h b/kmod/src/dir.h new file mode 100644 index 00000000..3ee15f0f --- /dev/null +++ b/kmod/src/dir.h @@ -0,0 +1,10 @@ +#ifndef _SCOUTFS_DIR_H_ +#define _SCOUTFS_DIR_H_ + +extern const struct file_operations scoutfs_dir_fops; +extern const struct inode_operations scoutfs_dir_iops; + +int scoutfs_dir_init(void); +void scoutfs_dir_exit(void); + +#endif diff --git a/kmod/src/format.h b/kmod/src/format.h new file mode 100644 index 00000000..67958f12 --- /dev/null +++ b/kmod/src/format.h @@ -0,0 +1,122 @@ +#ifndef _SCOUTFS_FORMAT_H_ +#define _SCOUTFS_FORMAT_H_ + +#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */ + +#define SCOUTFS_BLOCK_SHIFT 22 +#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT) + +/* + * This bloom size is chosen to have a roughly 1% false positive rate + * for ~90k items which is roughly the worst case for a block full of + * dirents with reasonably small names. Pathologically smaller items + * could be even more dense. + */ +#define SCOUTFS_BLOOM_FILTER_BYTES (128 * 1024) +#define SCOUTFS_BLOOM_FILTER_BITS (SCOUTFS_BLOOM_FILTER_BYTES * 8) +#define SCOUTFS_BLOOM_INDEX_BITS (ilog2(SCOUTFS_BLOOM_FILTER_BITS)) +#define SCOUTFS_BLOOM_INDEX_MASK ((1 << SCOUTFS_BLOOM_INDEX_BITS) - 1) +#define SCOUTFS_BLOOM_INDEX_NR 7 + +/* + * We should be able to make the offset smaller if neither dirents nor + * data items use the full 64 bits. + */ +struct scoutfs_key { + __le64 inode; + u8 type; + __le64 offset; +} __packed; + +#define SCOUTFS_INODE_KEY 128 +#define SCOUTFS_DIRENT_KEY 192 + +struct scoutfs_lsm_block { + struct scoutfs_key first; + struct scoutfs_key last; + __le32 nr_items; + /* u8 bloom[SCOUTFS_BLOOM_BYTES]; */ + /* struct scoutfs_item_header items[0] .. */ +} __packed; + +struct scoutfs_item_header { + struct scoutfs_key key; + __le16 val_len; +} __packed; + + +struct scoutfs_timespec { + __le64 sec; + __le32 nsec; +} __packed; + +/* + * XXX + * - otime? + * - compat flags? + * - version? + * - generation? + * - be more careful with rdev? 
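+ *
+ * All on-disk fields are packed fixed-width little-endian values so
+ * that images are portable across hosts.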
+ */ +struct scoutfs_inode { + __le64 size; + __le64 blocks; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le32 rdev; + __le32 salt; + struct scoutfs_timespec atime; + struct scoutfs_timespec ctime; + struct scoutfs_timespec mtime; +} __packed; + +#define SCOUTFS_ROOT_INO 1 + +/* + * Dirents are stored in items with an offset of the hash of their name. + * Colliding names are packed into the value. + */ +struct scoutfs_dirent { + __le64 ino; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 type:4, + coll_nr:4; +#else + __u8 coll_nr:4, + type:4; +#endif + __u8 name_len; + __u8 name[0]; +} __packed; + +#define SCOUTFS_NAME_LEN 255 + +/* + * We only use 31 bits for readdir positions so that we don't confuse + * old signed 32bit f_pos applications or those on the other side of + * network protocols that have limited readir positions. + */ + +#define SCOUTFS_DIRENT_OFF_BITS 27 +#define SCOUTFS_DIRENT_OFF_MASK ((1 << SCOUTFS_DIRENT_OFF_BITS) - 1) +#define SCOUTFS_DIRENT_COLL_BITS 4 +#define SCOUTFS_DIRENT_COLL_MASK ((1 << SCOUTFS_DIRENT_COLL_BITS) - 1) + +/* getdents returns the *next* pos with each entry. so we can't return ~0 */ +#define SCOUTFS_DIRENT_MAX_POS \ + (((1 << (SCOUTFS_DIRENT_OFF_BITS + SCOUTFS_DIRENT_COLL_BITS)) - 1) - 1) + +enum { + SCOUTFS_DT_FIFO = 0, + SCOUTFS_DT_CHR, + SCOUTFS_DT_DIR, + SCOUTFS_DT_BLK, + SCOUTFS_DT_REG, + SCOUTFS_DT_LNK, + SCOUTFS_DT_SOCK, + SCOUTFS_DT_WHT, +}; + +#endif diff --git a/kmod/src/inode.c b/kmod/src/inode.c new file mode 100644 index 00000000..02446332 --- /dev/null +++ b/kmod/src/inode.c @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2015 Versity Software, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include + +#include "format.h" +#include "super.h" +#include "key.h" +#include "inode.h" +#include "item.h" +#include "dir.h" + +/* + * XXX + * - worry about i_ino trunctation, not sure if we do anything + * - use inode item value lengths for forward/back compat + */ + +static struct kmem_cache *scoutfs_inode_cachep; + +static void scoutfs_inode_ctor(void *obj) +{ + struct scoutfs_inode_info *ci = obj; + + inode_init_once(&ci->inode); +} + +struct inode *scoutfs_alloc_inode(struct super_block *sb) +{ + struct scoutfs_inode_info *ci; + + ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS); + if (!ci) + return NULL; + + return &ci->inode; +} + +static void scoutfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + trace_printk("freeing inode %p\n", inode); + kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode)); +} + +void scoutfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, scoutfs_i_callback); +} + +/* + * Called once new inode allocation or inode reading has initialized + * enough of the inode for us to set the ops based on the mode. 
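+ *
+ * Only the dir ops are wired up so far; the file, symlink, and
+ * special ops assignments are commented out until they exist.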
+ */ +static void set_inode_ops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFREG: +// inode->i_mapping->a_ops = &scoutfs_file_aops; +// inode->i_op = &scoutfs_file_iops; +// inode->i_fop = &scoutfs_file_fops; + break; + case S_IFDIR: + inode->i_op = &scoutfs_dir_iops; + inode->i_fop = &scoutfs_dir_fops; + break; + case S_IFLNK: +// inode->i_op = &scoutfs_symlink_iops; + break; + default: +// inode->i_op = &scoutfs_special_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } +} + +static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) +{ + struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + + i_size_write(inode, le64_to_cpu(cinode->size)); + set_nlink(inode, le32_to_cpu(cinode->nlink)); + i_uid_write(inode, le32_to_cpu(cinode->uid)); + i_gid_write(inode, le32_to_cpu(cinode->gid)); + inode->i_mode = le32_to_cpu(cinode->mode); + inode->i_rdev = le32_to_cpu(cinode->rdev); + inode->i_atime.tv_sec = le64_to_cpu(cinode->atime.sec); + inode->i_atime.tv_nsec = le32_to_cpu(cinode->atime.nsec); + inode->i_mtime.tv_sec = le64_to_cpu(cinode->mtime.sec); + inode->i_mtime.tv_nsec = le32_to_cpu(cinode->mtime.nsec); + inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec); + inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec); + + ci->salt = le32_to_cpu(cinode->salt); +} + +static int scoutfs_read_locked_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct scoutfs_item *item; + struct scoutfs_key key; + + scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0); + + item = scoutfs_item_lookup(sb, &key); + if (IS_ERR(item)) + return PTR_ERR(item); + + load_inode(inode, item->val); + scoutfs_item_put(item); + + return 0; +} + +static int scoutfs_iget_test(struct inode *inode, void *arg) +{ + struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + u64 *ino = arg; + + return ci->ino == *ino; +} + +static int scoutfs_iget_set(struct inode *inode, void *arg) +{ + struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + u64 *ino = arg; + + inode->i_ino = *ino; + ci->ino = *ino; + + return 0; +} + +struct inode *scoutfs_iget(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int ret; + + inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, + &ino); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + ret = scoutfs_read_locked_inode(inode); + if (ret) { + iget_failed(inode); + inode = ERR_PTR(ret); + } else { + set_inode_ops(inode); + unlock_new_inode(inode); + } + } + + return inode; +} + +static void store_inode(struct scoutfs_inode *cinode, struct inode *inode) +{ + struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + + cinode->size = cpu_to_le64(i_size_read(inode)); + cinode->nlink = cpu_to_le32(inode->i_nlink); + cinode->uid = cpu_to_le32(i_uid_read(inode)); + cinode->gid = cpu_to_le32(i_gid_read(inode)); + cinode->mode = cpu_to_le32(inode->i_mode); + cinode->rdev = cpu_to_le32(inode->i_rdev); + cinode->atime.sec = cpu_to_le64(inode->i_atime.tv_sec); + cinode->atime.nsec = cpu_to_le32(inode->i_atime.tv_nsec); + cinode->ctime.sec = cpu_to_le64(inode->i_ctime.tv_sec); + cinode->ctime.nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec); + cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + cinode->salt = cpu_to_le32(ci->salt); +} + +/* + * Every time we modify the inode in memory we copy it to its inode + * item. 
This lets us write out blocks of items without having to track + * down dirty vfs inodes and safely copy them into items before writing. + */ +int scoutfs_inode_update(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct scoutfs_item *item; + struct scoutfs_key key; + + scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0); + + item = scoutfs_item_lookup(sb, &key); + if (IS_ERR(item)) + return PTR_ERR(item); + + store_inode(item->val, inode); + scoutfs_item_put(item); + + return 0; +} + +/* + * Allocate and initialize a new inode. The caller is responsible for + * creating links to it and updating it. @dir can be null. + */ +struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode, dev_t rdev) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_inode_info *ci; + struct scoutfs_item *item; + struct scoutfs_key key; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + ci = SCOUTFS_I(inode); + ci->ino = atomic64_inc_return(&sbi->next_ino); + get_random_bytes(&ci->salt, sizeof(ci->salt)); + + inode->i_ino = ci->ino; + inode_init_owner(inode, dir, mode); + inode_set_bytes(inode, 0); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_rdev = rdev; + set_inode_ops(inode); + + scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0); + + item = scoutfs_item_create(inode->i_sb, &key, + sizeof(struct scoutfs_inode)); + if (IS_ERR(item)) { + iput(inode); + inode = ERR_CAST(item); + } + return inode; +} + +void scoutfs_inode_exit(void) +{ + if (scoutfs_inode_cachep) { + rcu_barrier(); + kmem_cache_destroy(scoutfs_inode_cachep); + scoutfs_inode_cachep = NULL; + } +} + +int scoutfs_inode_init(void) +{ + scoutfs_inode_cachep = kmem_cache_create("scoutfs_inode_info", + sizeof(struct scoutfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, + scoutfs_inode_ctor); + if (!scoutfs_inode_cachep) + return -ENOMEM; + + return 0; +} diff --git a/kmod/src/inode.h b/kmod/src/inode.h new file mode 100644 index 00000000..bb9a6149 --- /dev/null +++ b/kmod/src/inode.h @@ -0,0 +1,32 @@ +#ifndef _SCOUTFS_INODE_H_ +#define _SCOUTFS_INODE_H_ + +struct scoutfs_inode_info { + u64 ino; + u32 salt; + + struct inode inode; +}; + +static inline struct scoutfs_inode_info *SCOUTFS_I(struct inode *inode) +{ + return container_of(inode, struct scoutfs_inode_info, inode); +} + +static inline u64 scoutfs_ino(struct inode *inode) +{ + return SCOUTFS_I(inode)->ino; +} + +struct inode *scoutfs_alloc_inode(struct super_block *sb); +void scoutfs_destroy_inode(struct inode *inode); + +struct inode *scoutfs_iget(struct super_block *sb, u64 ino); +int scoutfs_inode_update(struct inode *inode); +struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode, dev_t rdev); + +void scoutfs_inode_exit(void); +int scoutfs_inode_init(void); + +#endif diff --git a/kmod/src/item.c b/kmod/src/item.c new file mode 100644 index 00000000..d5c8f204 --- /dev/null +++ b/kmod/src/item.c @@ -0,0 +1,423 @@ +/* + * Copyright (C) 2015 Versity Software, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include + +#include "super.h" +#include "key.h" +#include "item.h" + +/* + * describe: + * - tracks per-item dirty state for writing + * - decouples vfs cache lifetimes from item lifetimes + * - item-granular cache for things vfs doesn't cache (readdir, xattr) + * + * XXX: + * - warnings for invalid keys/lens + * - memory pressure + */ + +enum { + ITW_NEXT = 1, + ITW_PREV, +}; + +static inline struct scoutfs_item *node_item(struct super_block *sb, + struct rb_root *root, + struct rb_node *node) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + unsigned long off; + + if (root == &sbi->item_root) + off = offsetof(struct scoutfs_item, node); + else + off = offsetof(struct scoutfs_item, dirty_node); + + return (void *)((char *)node - off); +} + +static inline struct rb_node *item_node(struct super_block *sb, + struct rb_root *root, + struct scoutfs_item *item) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + unsigned long off; + + if (root == &sbi->item_root) + off = offsetof(struct scoutfs_item, node); + else + off = offsetof(struct scoutfs_item, dirty_node); + + return (void *)((char *)item + off); +} + +/* + * Insert a new item in the tree. The caller must have done a lookup to + * ensure that the key is not already present. + */ +static void insert_item(struct super_block *sb, struct rb_root *root, + struct scoutfs_item *ins) +{ + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct scoutfs_item *item; + int cmp; + + while (*node) { + parent = *node; + item = node_item(sb, root, *node); + + cmp = scoutfs_key_cmp(&ins->key, &item->key); + BUG_ON(cmp == 0); + if (cmp < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + + rb_link_node(item_node(sb, root, ins), parent, node); + rb_insert_color(item_node(sb, root, ins), root); +} + +enum { + FI_NEXT = 1, + FI_PREV, +}; + +/* + * Walk the tree looking for an item. + * + * If NEXT or PREV are specified then those will be returned + * if the specific item isn't found. + */ +static struct scoutfs_item *find_item(struct super_block *sb, + struct rb_root *root, + struct scoutfs_key *key, int np) +{ + struct rb_node *node = root->rb_node; + struct scoutfs_item *found = NULL; + struct scoutfs_item *item; + int cmp; + + while (node) { + item = node_item(sb, root, node); + + cmp = scoutfs_key_cmp(key, &item->key); + if (cmp < 0) { + if (np == FI_NEXT) + found = item; + node = node->rb_left; + } else if (cmp > 0) { + if (np == FI_PREV) + found = item; + node = node->rb_right; + } else { + found = item; + break; + } + } + + return found; +} + +static struct scoutfs_item *alloc_item(struct scoutfs_key *key, + unsigned int val_len) +{ + struct scoutfs_item *item; + void *val; + + item = kmalloc(sizeof(struct scoutfs_item), GFP_NOFS); + val = kmalloc(val_len, GFP_NOFS); + if (!item || !val) { + kfree(item); + kfree(val); + return ERR_PTR(-ENOMEM); + } + + RB_CLEAR_NODE(&item->node); + RB_CLEAR_NODE(&item->dirty_node); + atomic_set(&item->refcount, 1); + item->key = *key; + item->val_len = val_len; + item->val = val; + + return item; +} + +/* + * Create a new item stored at the given key. Return it with a reference. + * return an ERR_PTR with ENOMEM or EEXIST. + * + * The caller is responsible for initializing the item's value. 
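+ *
+ * scoutfs_mknod() shows the usual pattern: look the key up first and
+ * only create the item when lookup returns -ENOENT.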
+ */ +struct scoutfs_item *scoutfs_item_create(struct super_block *sb, + struct scoutfs_key *key, + unsigned int val_len) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_item *existing; + struct scoutfs_item *item; + unsigned long flags; + + item = alloc_item(key, val_len); + if (IS_ERR(item)) + return item; + + spin_lock_irqsave(&sbi->item_lock, flags); + + existing = find_item(sb, &sbi->item_root, key, 0); + if (!existing) { + insert_item(sb, &sbi->item_root, item); + insert_item(sb, &sbi->dirty_item_root, item); + atomic_add(2, &item->refcount); + } + spin_unlock_irqrestore(&sbi->item_lock, flags); + + if (existing) { + scoutfs_item_put(item); + item = ERR_PTR(-EEXIST); + } + + trace_printk("item %p key "CKF" val_len %d\n", item, CKA(key), val_len); + + return item; +} + +/* + * The caller is still responsible for unlocking and putting the item. + * + * We don't try and optimize away the lock for items that are already + * removed from the tree. The caller's locking and item behaviour means + * that racing to remove an item is extremely rare. + * + * XXX for now we're just removing it from the rbtree. We'd need to leave + * behind a deletion record for lsm. + */ +void scoutfs_item_delete(struct super_block *sb, struct scoutfs_item *item) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + unsigned long flags; + + spin_lock_irqsave(&sbi->item_lock, flags); + + if (!RB_EMPTY_NODE(&item->dirty_node)) { + rb_erase(&item->dirty_node, &sbi->dirty_item_root); + RB_CLEAR_NODE(&item->dirty_node); + scoutfs_item_put(item); + } + + if (!RB_EMPTY_NODE(&item->node)) { + rb_erase(&item->node, &sbi->item_root); + RB_CLEAR_NODE(&item->node); + scoutfs_item_put(item); + } + + spin_unlock_irqrestore(&sbi->item_lock, flags); +} + +static struct scoutfs_item *item_lookup(struct super_block *sb, + struct scoutfs_key *key, int np) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_item *item; + unsigned long flags; + + spin_lock_irqsave(&sbi->item_lock, flags); + + item = find_item(sb, &sbi->item_root, key, np); + if (item) + atomic_inc(&item->refcount); + else + item = ERR_PTR(-ENOENT); + + spin_unlock_irqrestore(&sbi->item_lock, flags); + + return item; +} + +struct scoutfs_item *scoutfs_item_lookup(struct super_block *sb, + struct scoutfs_key *key) +{ + return item_lookup(sb, key, 0); +} + +struct scoutfs_item *scoutfs_item_next(struct super_block *sb, + struct scoutfs_key *key) +{ + return item_lookup(sb, key, FI_NEXT); +} + +struct scoutfs_item *scoutfs_item_prev(struct super_block *sb, + struct scoutfs_key *key) +{ + return item_lookup(sb, key, FI_PREV); +} + +/* + * Expand the item's value by inserting bytes at the given offset. The + * new bytes are not initialized. + */ +int scoutfs_item_expand(struct scoutfs_item *item, int off, int bytes) +{ + void *val; + + /* XXX bytes too big */ + if (WARN_ON_ONCE(off < 0 || off > item->val_len)) + return -EINVAL; + + val = kmalloc(item->val_len + bytes, GFP_NOFS); + if (!val) + return -ENOMEM; + + memcpy(val, item->val, off); + memcpy(val + off + bytes, item->val + off, item->val_len - off); + + kfree(item->val); + item->val = val; + item->val_len += bytes; + + return 0; +} + +/* + * Shrink the item's value by remove bytes at the given offset. 
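+ *
+ * A value can't be shrunk to nothing; when the last entry goes the
+ * caller deletes the whole item instead, as scoutfs_unlink() does
+ * for a lone dirent.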
+ */ +int scoutfs_item_shrink(struct scoutfs_item *item, int off, int bytes) +{ + void *val; + + if (WARN_ON_ONCE(off < 0 || off >= item->val_len || + bytes <= 0 || (off + bytes) > item->val_len || + bytes == item->val_len)) + return -EINVAL; + + val = kmalloc(item->val_len - bytes, GFP_NOFS); + if (!val) + return -ENOMEM; + + memcpy(val, item->val, off); + memcpy(val + off, item->val + off + bytes, + item->val_len - (off + bytes)); + + kfree(item->val); + item->val = val; + item->val_len -= bytes; + + return 0; +} + +void scoutfs_item_mark_dirty(struct super_block *sb, struct scoutfs_item *item) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + unsigned long flags; + + spin_lock_irqsave(&sbi->item_lock, flags); + + if (RB_EMPTY_NODE(&item->dirty_node)) { + insert_item(sb, &sbi->dirty_item_root, item); + atomic_inc(&item->refcount); + } + + spin_unlock_irqrestore(&sbi->item_lock, flags); +} + +/* + * Mark all the dirty items clean by emptying the dirty rbtree. The + * caller should be preventing writes from dirtying new items. + * + * We erase leaf nodes with no children to minimize rotation + * overhead during erase. Dirty items must be in the main rbtree if + * they're in the dirty rbtree so the puts here shouldn't free the + * items. + */ +void scoutfs_item_all_clean(struct super_block *sb) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct rb_root *root = &sbi->dirty_item_root; + struct scoutfs_item *item; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&sbi->item_lock, flags); + + node = sbi->dirty_item_root.rb_node; + while (node) { + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + else { + item = node_item(sb, root, node); + node = rb_parent(node); + + trace_printk("item %p key "CKF"\n", + item, CKA(&item->key)); + rb_erase(&item->dirty_node, root); + RB_CLEAR_NODE(&item->dirty_node); + scoutfs_item_put(item); + } + } + + spin_unlock_irqrestore(&sbi->item_lock, flags); +} + +/* + * If the item is null then the first dirty item is returned. If an + * item is given then the next dirty item is returned. NULL is returned + * if there are no more dirty items. + * + * The caller is given a reference that it has to put. The given item + * will always have its item dropped including if it returns NULL. 
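+ *
+ * So a caller can sweep every dirty item with:
+ *
+ *   item = scoutfs_item_next_dirty(sb, NULL);
+ *   while (item)
+ *           item = scoutfs_item_next_dirty(sb, item);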
+ */ +struct scoutfs_item *scoutfs_item_next_dirty(struct super_block *sb, + struct scoutfs_item *item) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_item *next_item; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&sbi->item_lock, flags); + + if (item) + node = rb_next(&item->dirty_node); + else + node = rb_first(&sbi->dirty_item_root); + + if (node) { + next_item = node_item(sb, &sbi->dirty_item_root, node); + atomic_inc(&next_item->refcount); + } else { + next_item = NULL; + } + + spin_unlock_irqrestore(&sbi->item_lock, flags); + + scoutfs_item_put(item); + + return next_item; +} + +void scoutfs_item_put(struct scoutfs_item *item) +{ + if (!IS_ERR_OR_NULL(item) && atomic_dec_and_test(&item->refcount)) { + WARN_ON_ONCE(!RB_EMPTY_NODE(&item->node)); + WARN_ON_ONCE(!RB_EMPTY_NODE(&item->dirty_node)); + kfree(item); + } +} diff --git a/kmod/src/item.h b/kmod/src/item.h new file mode 100644 index 00000000..27c8fe0d --- /dev/null +++ b/kmod/src/item.h @@ -0,0 +1,37 @@ +#ifndef _SCOUTFS_ITEM_H_ +#define _SCOUTFS_ITEM_H_ + +#include "format.h" + +struct scoutfs_item { + struct rb_node node; + struct rb_node dirty_node; + atomic_t refcount; + + /* the key is constant for the life of the item */ + struct scoutfs_key key; + + /* the value can be changed by expansion or shrinking */ + unsigned int val_len; + void *val; +}; + +struct scoutfs_item *scoutfs_item_create(struct super_block *sb, + struct scoutfs_key *key, + unsigned int val_len); +struct scoutfs_item *scoutfs_item_lookup(struct super_block *sb, + struct scoutfs_key *key); +struct scoutfs_item *scoutfs_item_next(struct super_block *sb, + struct scoutfs_key *key); +struct scoutfs_item *scoutfs_item_prev(struct super_block *sb, + struct scoutfs_key *key); +int scoutfs_item_expand(struct scoutfs_item *item, int off, int bytes); +int scoutfs_item_shrink(struct scoutfs_item *item, int off, int bytes); +void scoutfs_item_delete(struct super_block *sb, struct scoutfs_item *item); +void scoutfs_item_mark_dirty(struct super_block *sb, struct scoutfs_item *item); +struct scoutfs_item *scoutfs_item_next_dirty(struct super_block *sb, + struct scoutfs_item *item); +void scoutfs_item_all_clean(struct super_block *sb); +void scoutfs_item_put(struct scoutfs_item *item); + +#endif diff --git a/kmod/src/key.h b/kmod/src/key.h new file mode 100644 index 00000000..342a0529 --- /dev/null +++ b/kmod/src/key.h @@ -0,0 +1,43 @@ +#ifndef _SCOUTFS_KEY_H_ +#define _SCOUTFS_KEY_H_ + +#include +#include "format.h" + +#define CKF "%llu.%u.%llu" +#define CKA(key) \ + le64_to_cpu((key)->inode), (key)->type, le64_to_cpu((key)->offset) + +static inline u64 scoutfs_key_inode(struct scoutfs_key *key) +{ + return le64_to_cpu(key->inode); +} + +static inline u64 scoutfs_key_offset(struct scoutfs_key *key) +{ + return le64_to_cpu(key->offset); +} + +static inline int le64_cmp(__le64 a, __le64 b) +{ + return le64_to_cpu(a) < le64_to_cpu(b) ? -1 : + le64_to_cpu(a) > le64_to_cpu(b) ? 
1 : 0; +} + +static inline int scoutfs_key_cmp(struct scoutfs_key *a, struct scoutfs_key *b) +{ + return le64_cmp(a->inode, b->inode) ?: + ((short)a->type - (short)b->type) ?: + le64_cmp(a->offset, b->offset); +} + + +static inline void scoutfs_set_key(struct scoutfs_key *key, u64 inode, u8 type, + u64 offset) +{ + key->inode = cpu_to_le64(inode); + key->type = type; + key->offset = cpu_to_le64(offset); +} + +#endif diff --git a/kmod/src/lsm.c b/kmod/src/lsm.c new file mode 100644 index 00000000..da1758bf --- /dev/null +++ b/kmod/src/lsm.c @@ -0,0 +1,330 @@ +/* + * Copyright (C) 2016 Versity Software, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include + +#include "format.h" +#include "dir.h" +#include "inode.h" +#include "key.h" +#include "item.h" +#include "super.h" +#include "lsm.h" + +#define PAGE_CACHE_PAGE_BITS (PAGE_CACHE_SIZE * 8) + +/* XXX garbage hack until we have siphash */ +static u64 bloom_hash(struct scoutfs_key *key, __le64 *hash_key) +{ + __le32 *salts = (void *)hash_key; + + return ((u64)crc32c(le32_to_cpu(salts[0]), key, sizeof(*key)) << 32) | + crc32c(le32_to_cpu(salts[1]), key, sizeof(*key)); +} + +/* + * Set the caller's bloom indices for their item key. + */ +static void get_bloom_indices(struct super_block *sb, + struct scoutfs_key *key, u32 *ind) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + __le64 *hash_key = sbi->bloom_hash_keys; + u64 hash; + int h; + int i; + + for (i = 0; ; ) { + hash = bloom_hash(key, hash_key); + hash_key += 2; + + for (h = 0; h < 64 / SCOUTFS_BLOOM_INDEX_BITS; h++) { + ind[i++] = hash & SCOUTFS_BLOOM_INDEX_MASK; + if (i == SCOUTFS_BLOOM_INDEX_NR) + return; + + hash >>= SCOUTFS_BLOOM_INDEX_BITS; + } + } +} + +struct pages { + /* fixed for the group of pages */ + struct address_space *mapping; + struct page **pages; + pgoff_t pgoff; + + /* number of pages stored in the pages array */ + int nr; + /* byte offset of the free space at end of current page */ + int off; + /* bytes remaining in the ovarall large block */ + int remaining; +}; + +/* + * The caller has our fixed-size bloom filter in the locked pages + * starting at the given byte offset in the first page. Our job is to + * hash the key and set its bits in the bloom filter. + */ +static void set_bloom_bits(struct super_block *sb, struct page **pages, + unsigned int offset, struct scoutfs_key *key) +{ + u32 inds[SCOUTFS_BLOOM_INDEX_NR]; + struct page *page; + int offset_bits = offset * 8; + int full_bit; + int page_bit; + void *addr; + int i; + + get_bloom_indices(sb, key, inds); + + for (i = 0; i < SCOUTFS_BLOOM_INDEX_NR; i++) { + full_bit = offset_bits + inds[i]; + page = pages[full_bit / PAGE_CACHE_PAGE_BITS]; + page_bit = full_bit % PAGE_CACHE_PAGE_BITS; + + addr = kmap_atomic(page); + set_bit_le(page_bit, addr); + kunmap_atomic(addr); + } +} + +/* + * XXX the zeroing here is unreliable. We'll want to zero the bloom but + * not all the pages that are about to be overwritten. Bleh. + * + * Returns the number of bytes copied if there was room. Returns 0 if + * there wasn't. Returns -errno on a hard failure. 
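+ *
+ * Callers chain copies and treat 0 as a full block:
+ *
+ *   ret = copy_to_pages(&pgs, &ihdr, sizeof(ihdr));
+ *   if (ret > 0)
+ *           ret = copy_to_pages(&pgs, item->val, item->val_len);
+ *   if (ret <= 0)
+ *           goto out;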
+ */ +static int copy_to_pages(struct pages *pgs, void *ptr, size_t count) +{ + struct page *page; + int ret = count; + void *addr; + int bytes; + + if (count > pgs->remaining) + return 0; + + while (count) { + if (pgs->off == PAGE_CACHE_SIZE) { + page = find_or_create_page(pgs->mapping, + pgs->pgoff + pgs->nr, + GFP_NOFS | __GFP_ZERO); + trace_printk("page %p\n", page); + if (!page) { + ret = -ENOMEM; + break; + } + + pgs->pages[pgs->nr++] = page; + pgs->off = 0; + } else { + page = pgs->pages[pgs->nr - 1]; + } + + bytes = min(PAGE_CACHE_SIZE - pgs->off, count); + + trace_printk("page %p off %d ptr %p count %zu bytes %d remaining %d\n", + page, pgs->off, ptr, count, bytes, pgs->remaining); + + if (ptr) { + addr = kmap_atomic(page); + memcpy(addr + pgs->off, ptr, bytes); + kunmap_atomic(addr); + ptr += bytes; + } + count -= bytes; + pgs->off += bytes; + pgs->remaining -= bytes; + } + + return ret; +} + +static void drop_pages(struct pages *pgs, bool dirty) +{ + struct page *page; + int i; + + if (!pgs->pages) + return; + + for (i = 0; i < pgs->nr; i++) { + page = pgs->pages[i]; + + SetPageUptodate(page); + if (dirty) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } +} + +/* + * Write dirty items from the given item into dirty page cache pages in + * the block device at the given large block number. + * + * All the page cache pages are locked and pinned while they're being + * dirtied. The intent is to have a single large IO leave once they're + * all ready. This is an easy way to do that while maintaining + * consistency with the block device page cache. But it might not work :). + * + * We do one sweep over the items. The item's aren't indexed. We might + * want to change that. + * + * Even though we're doing one sweep over the items we're holding the + * bloom filter and header pinned until the items are done. If we didn't + * mind the risk of the blocks going out of order we wouldn't need the + * allocated array of page pointers. 
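+ *
+ * The finished block is laid out as sketched in format.h:
+ *
+ *   [ scoutfs_lsm_block | bloom filter | item header + value ... ]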
+ */ +static struct scoutfs_item *dirty_block_pages(struct super_block *sb, + struct scoutfs_item *item, u64 blkno) +{ + struct scoutfs_item_header ihdr; + struct scoutfs_lsm_block lblk; + struct pages pgs; + void *addr; + int ret; + + /* assuming header starts page, and pgoff shift calculation */ + BUILD_BUG_ON(SCOUTFS_BLOCK_SHIFT < PAGE_CACHE_SHIFT); + + if (WARN_ON_ONCE(!item)) + return item; + + /* XXX not super thrilled with this allocation */ + pgs.pages = kmalloc_array(SCOUTFS_BLOCK_SIZE / PAGE_CACHE_SIZE, + sizeof(struct page *), GFP_NOFS); + if (!pgs.pages) { + ret = -ENOMEM; + goto out; + } + + pgs.mapping = sb->s_bdev->bd_inode->i_mapping; + pgs.pgoff = blkno >> (SCOUTFS_BLOCK_SHIFT - PAGE_CACHE_SHIFT); + pgs.nr = 0; + pgs.off = PAGE_CACHE_SIZE, + pgs.remaining = SCOUTFS_BLOCK_SIZE; + + /* reserve space at the start of the block for header and bloom */ + ret = copy_to_pages(&pgs, NULL, sizeof(lblk)); + if (ret > 0) + ret = copy_to_pages(&pgs, NULL, SCOUTFS_BLOOM_FILTER_BYTES); + if (ret <= 0) + goto out; + + lblk.first = item->key; + lblk.nr_items = 0; + do { + trace_printk("item %p key "CKF"\n", item, CKA(&item->key)); + + ihdr.key = item->key; + ihdr.val_len = cpu_to_le16(item->val_len); + ret = copy_to_pages(&pgs, &ihdr, sizeof(ihdr)); + if (ret > 0) + ret = copy_to_pages(&pgs, item->val, item->val_len); + if (ret <= 0) + goto out; + + lblk.last = item->key; + le32_add_cpu(&lblk.nr_items, 1); + + /* set each item's bloom bits */ + set_bloom_bits(sb, pgs.pages, sizeof(lblk), &item->key); + + item = scoutfs_item_next_dirty(sb, item); + } while (item); + + /* copy the filled in header to the start of the block */ + addr = kmap_atomic(pgs.pages[0]); + memcpy(addr, &lblk, sizeof(lblk)); + kunmap_atomic(addr); + +out: + /* dirty if no error (null ok!), unlock, and release */ + drop_pages(&pgs, !IS_ERR(item)); + kfree(pgs.pages); + if (ret < 0) { + scoutfs_item_put(item); + item = ERR_PTR(ret); + } + return item; +} + +/* + * Sync dirty data by writing all the dirty items into a series of level + * 0 blocks. 
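+ * Each pass of the loop below fills one block from the dirty item
+ * sweep and starts its IO with filemap_flush() before building the
+ * next.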
+ * + * This is an initial first pass, the full method will need to: + * - wait for pending writers + * - block future writers + * - update our manifest regardless of server communication + * - communicate blocks and key ranges to server + * - ensure that racing sync/dirty don't livelock + */ +int scoutfs_sync_fs(struct super_block *sb, int wait) +{ + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_item *item; + u64 blknos[16]; /* XXX */ + u64 blkno; + int ret = 0; + int i; + + item = scoutfs_item_next_dirty(sb, NULL); + if (!item) + return 0; + + for (i = 0; i < ARRAY_SIZE(blknos); i++) { + blkno = atomic64_inc_return(&sbi->next_blkno); + + item = dirty_block_pages(sb, item, blkno); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + goto out; + } + + /* start each block's IO */ + ret = filemap_flush(mapping); + if (ret) + goto out; + + if (!item) + break; + } + /* dirty items should have been limited */ + WARN_ON_ONCE(i >= ARRAY_SIZE(blknos)); + + /* then wait for all block IO to finish */ + if (wait) { + ret = filemap_write_and_wait(mapping); + if (ret) + goto out; + } + + /* mark everything clean */ + scoutfs_item_all_clean(sb); + ret = 0; +out: + trace_printk("ret %d\n", ret); + WARN_ON_ONCE(ret); + return ret; +} diff --git a/kmod/src/lsm.h b/kmod/src/lsm.h new file mode 100644 index 00000000..efed64e9 --- /dev/null +++ b/kmod/src/lsm.h @@ -0,0 +1,6 @@ +#ifndef _SCOUTFS_LSM_H_ +#define _SCOUTFS_LSM_H_ + +int scoutfs_sync_fs(struct super_block *sb, int wait); + +#endif diff --git a/kmod/src/mkfs.c b/kmod/src/mkfs.c new file mode 100644 index 00000000..2a1df169 --- /dev/null +++ b/kmod/src/mkfs.c @@ -0,0 +1,52 @@ +#include +#include +#include +#include + +#include "super.h" +#include "item.h" +#include "key.h" +#include "mkfs.h" + +/* + * For now a file system system only exists in the item cache for the + * duration of the mount. This "mkfs" hack creates a root dir inode in + * the item cache on mount so that we can run tests in memory and not + * worry about user space or persistent storage. 
+ */ +int scoutfs_mkfs(struct super_block *sb) +{ + const struct timespec ts = current_kernel_time(); + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_inode *cinode; + struct scoutfs_item *item; + struct scoutfs_key key; + int i; + + atomic64_set(&sbi->next_ino, SCOUTFS_ROOT_INO + 1); + atomic64_set(&sbi->next_blkno, 2); + + for (i = 0; i < ARRAY_SIZE(sbi->bloom_hash_keys); i++) { + get_random_bytes(&sbi->bloom_hash_keys[i], + sizeof(sbi->bloom_hash_keys[i])); + } + + scoutfs_set_key(&key, SCOUTFS_ROOT_INO, SCOUTFS_INODE_KEY, 0); + + item = scoutfs_item_create(sb, &key, sizeof(struct scoutfs_inode)); + if (IS_ERR(item)) + return PTR_ERR(item); + + cinode = item->val; + memset(cinode, 0, sizeof(struct scoutfs_inode)); + cinode->nlink = cpu_to_le32(2); + cinode->mode = cpu_to_le32(S_IFDIR | 0755); + cinode->atime.sec = cpu_to_le64(ts.tv_sec); + cinode->atime.nsec = cpu_to_le32(ts.tv_nsec); + cinode->ctime = cinode->atime; + cinode->mtime = cinode->atime; + get_random_bytes(&cinode->salt, sizeof(cinode->salt)); + + scoutfs_item_put(item); + return 0; +} diff --git a/kmod/src/mkfs.h b/kmod/src/mkfs.h new file mode 100644 index 00000000..51679417 --- /dev/null +++ b/kmod/src/mkfs.h @@ -0,0 +1,6 @@ +#ifndef _SCOUTFS_MKFS_H_ +#define _SCOUTFS_MKFS_H_ + +int scoutfs_mkfs(struct super_block *sb); + +#endif diff --git a/kmod/src/super.c b/kmod/src/super.c new file mode 100644 index 00000000..27336d03 --- /dev/null +++ b/kmod/src/super.c @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2015 Versity Software, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include +#include +#include +#include +#include + +#include "super.h" +#include "format.h" +#include "mkfs.h" +#include "inode.h" +#include "dir.h" +#include "lsm.h" + +static const struct super_operations scoutfs_super_ops = { + .alloc_inode = scoutfs_alloc_inode, + .destroy_inode = scoutfs_destroy_inode, + .sync_fs = scoutfs_sync_fs, +}; + +static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct scoutfs_sb_info *sbi; + struct inode *inode; + int ret; + + sb->s_magic = SCOUTFS_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &scoutfs_super_ops; + + sbi = kzalloc(sizeof(struct scoutfs_sb_info), GFP_KERNEL); + sb->s_fs_info = sbi; + if (!sbi) + return -ENOMEM; + + spin_lock_init(&sbi->item_lock); + sbi->item_root = RB_ROOT; + sbi->dirty_item_root = RB_ROOT; + + ret = scoutfs_mkfs(sb); + if (ret) + return ret; + + inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + return 0; +} + +static struct dentry *scoutfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, scoutfs_fill_super); +} + +static void scoutfs_kill_sb(struct super_block *sb) +{ + kill_block_super(sb); + kfree(sb->s_fs_info); +} + +static struct file_system_type scoutfs_fs_type = { + .owner = THIS_MODULE, + .name = "scoutfs", + .mount = scoutfs_mount, + .kill_sb = scoutfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init scoutfs_module_init(void) +{ + return scoutfs_inode_init() ?: + scoutfs_dir_init() ?: + register_filesystem(&scoutfs_fs_type); +} +module_init(scoutfs_module_init) + +static void __exit scoutfs_module_exit(void) +{ + unregister_filesystem(&scoutfs_fs_type); + scoutfs_dir_exit(); + scoutfs_inode_exit(); +} +module_exit(scoutfs_module_exit) + +MODULE_AUTHOR("Zach Brown "); +MODULE_LICENSE("GPL"); diff --git a/kmod/src/super.h b/kmod/src/super.h new file mode 100644 index 00000000..1b6f0be0 --- /dev/null +++ b/kmod/src/super.h @@ -0,0 +1,22 @@ +#ifndef _SCOUTFS_SUPER_H_ +#define _SCOUTFS_SUPER_H_ + +#include + +struct scoutfs_sb_info { + atomic64_t next_ino; + atomic64_t next_blkno; + + __le64 bloom_hash_keys[6]; /* XXX */ + + spinlock_t item_lock; + struct rb_root item_root; + struct rb_root dirty_item_root; +}; + +static inline struct scoutfs_sb_info *SCOUTFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +#endif