Initial commit

This is the initial commit of the repo that will track development
against distro kernels.

This is an import of a prototype branch from the upstream kernel that
only had a few initial commits.  It needed to move to the old readdir
interface and to use find_or_create_page() instead of
pagecache_get_page() to build against older distro kernels.
commit 25a1e8d1b7
Author: Zach Brown
Date: 2016-02-05 13:15:41 -08:00
18 changed files with 2033 additions and 0 deletions
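
For context on the pagecache change: on kernels that have it, pagecache_get_page() with FGP_LOCK | FGP_ACCESSED | FGP_CREAT is roughly equivalent to the older find_or_create_page(), so a small compat wrapper can straddle both interfaces. A sketch, where HAVE_PAGECACHE_GET_PAGE is a hypothetical build flag that this commit does not define:

#include <linux/pagemap.h>

/* hedged compat sketch; the code in this commit simply calls find_or_create_page() */
static inline struct page *
compat_find_or_create_page(struct address_space *mapping, pgoff_t index,
                           gfp_t gfp)
{
#ifdef HAVE_PAGECACHE_GET_PAGE
        return pagecache_get_page(mapping, index,
                                  FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
#else
        return find_or_create_page(mapping, index, gfp);
#endif
}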

kmod/.gitignore (vendored, new file, 7 lines)

@@ -0,0 +1,7 @@
src/*.o
src/*.ko
src/*.mod.c
src/*.cmd
src/.tmp_versions/
src/Module.symvers
src/modules.order

kmod/Makefile (new file, 4 lines)

@@ -0,0 +1,4 @@
ALL: module

module:
	make CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(PWD)/src
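
SK_KSRC isn't documented here; from its use above it should point at the build tree of the target distro kernel, conventionally something like /lib/modules/$(uname -r)/build (an assumption, the commit doesn't say).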

kmod/src/Kconfig (new file, 10 lines)

@@ -0,0 +1,10 @@
config SCOUTFS_FS
	tristate "scoutfs filesystem"
	help
	  scoutfs is a clustered file system that stores data in large
	  blocks in shared block storage.

	  To compile this file system support as a module, choose M here. The
	  module will be called scoutfs.

	  If unsure, say N.

kmod/src/Makefile (new file, 3 lines)

@@ -0,0 +1,3 @@
obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o
scoutfs-y += dir.o inode.o item.o lsm.o mkfs.o super.o

kmod/src/dir.c (new file, 551 lines)

@@ -0,0 +1,551 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/uio.h>
#include "format.h"
#include "dir.h"
#include "inode.h"
#include "key.h"
#include "item.h"
#include "super.h"
/*
* Directory entries are stored in items whose offset is determined by
* the hash of the entry's name. This was primarily chosen to minimize
* the amount of data stored for each entry.
*
* Because we're hashing the name we need to worry about collisions. We
* store all the entries with the same hash value in the item. This was
* done so that create works with one specific item.
*
* readdir iterates over these items in hash order. The high bits of
* the entry's readdir f_pos come from the item offset while the low
* bits come from a collision number in the entry.
*
* The full readdir position, and thus the absolute max number of
* entries in a directory, is limited to 2^31 to avoid the risk of
* breaking legacy environments. Even with a relatively small 27-bit
* item offset, allowing 16 colliding entries gets well into hundreds of
* millions of entries before an item fills up and we return a premature
* ENOSPC. Hundreds of millions in a single dir ought to be, wait for
* it, good enough for anybody.
*
* Each item's contents are protected by the dir inode's i_mutex that
* callers acquire before calling our dir operations. If we wanted more
* fine-grained concurrency, and we might, we'd have to be careful to
* manage the shared items.
*/
static unsigned int mode_to_type(umode_t mode)
{
#define S_SHIFT 12
static unsigned char mode_types[S_IFMT >> S_SHIFT] = {
[S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO,
[S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR,
[S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR,
[S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK,
[S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG,
[S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK,
[S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK,
};
return mode_types[(mode & S_IFMT) >> S_SHIFT];
#undef S_SHIFT
}
#if 0
static unsigned int dentry_type(unsigned int type)
{
static unsigned char types[] = {
[SCOUTFS_DT_FIFO] = DT_FIFO,
[SCOUTFS_DT_CHR] = DT_CHR,
[SCOUTFS_DT_DIR] = DT_DIR,
[SCOUTFS_DT_BLK] = DT_BLK,
[SCOUTFS_DT_REG] = DT_REG,
[SCOUTFS_DT_LNK] = DT_LNK,
[SCOUTFS_DT_SOCK] = DT_SOCK,
[SCOUTFS_DT_WHT] = DT_WHT,
};
if (type < ARRAY_SIZE(types))
return types[type];
return DT_UNKNOWN;
}
#endif
static int names_equal(const char *name_a, int len_a, const char *name_b,
int len_b)
{
return (len_a == len_b) && !memcmp(name_a, name_b, len_a);
}
/*
* Return the offset portion of a dirent key from the hash of the name.
*
* XXX This crc nonsense is a quick hack. We'll want something a
* lot stronger like siphash.
*/
static u32 name_hash(struct inode *dir, const char *name, unsigned int len)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(dir);
return crc32c(ci->salt, name, len) >> (32 - SCOUTFS_DIRENT_OFF_BITS);
}
static unsigned int dent_bytes(unsigned int name_len)
{
return sizeof(struct scoutfs_dirent) + name_len;
}
static unsigned int dent_val_off(struct scoutfs_item *item,
struct scoutfs_dirent *dent)
{
return (char *)dent - (char *)item->val;
}
static inline struct scoutfs_dirent *next_dent(struct scoutfs_item *item,
struct scoutfs_dirent *dent)
{
unsigned int next_off;
next_off = dent_val_off(item, dent) + dent_bytes(dent->name_len);
if (next_off == item->val_len)
return NULL;
return item->val + next_off;
}
#define for_each_item_dent(item, dent) \
for (dent = item->val; dent; dent = next_dent(item, dent))
struct dentry_info {
/*
* The key offset and collision nr are stored so that we don't
* have to either hash the name to find the item or compare
* names to find the dirent in the item.
*/
u32 key_offset;
u8 coll_nr;
};
static struct kmem_cache *scoutfs_dentry_cachep;
static struct dentry_info *alloc_dentry_info(struct dentry *dentry)
{
struct dentry_info *di;
/* XXX read mb? */
if (dentry->d_fsdata)
return dentry->d_fsdata;
di = kmem_cache_zalloc(scoutfs_dentry_cachep, GFP_NOFS);
if (!di)
return ERR_PTR(-ENOMEM);
spin_lock(&dentry->d_lock);
if (!dentry->d_fsdata)
dentry->d_fsdata = di;
spin_unlock(&dentry->d_lock);
if (di != dentry->d_fsdata)
kmem_cache_free(scoutfs_dentry_cachep, di);
return dentry->d_fsdata;
}
/*
* Lookup searches for an entry for the given name amongst the entries
* stored in the item at the name's hash.
*/
static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct super_block *sb = dir->i_sb;
struct scoutfs_dirent *dent;
struct scoutfs_item *item;
struct dentry_info *di;
struct scoutfs_key key;
struct inode *inode;
u64 ino = 0;
u32 h = 0;
u32 nr = 0;
int ret;
di = alloc_dentry_info(dentry);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
}
if (dentry->d_name.len > SCOUTFS_NAME_LEN) {
ret = -ENAMETOOLONG;
goto out;
}
h = name_hash(dir, dentry->d_name.name, dentry->d_name.len);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
item = scoutfs_item_lookup(sb, &key);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
ret = -ENOENT;
for_each_item_dent(item, dent) {
if (names_equal(dentry->d_name.name, dentry->d_name.len,
dent->name, dent->name_len)) {
ino = le64_to_cpu(dent->ino);
nr = dent->coll_nr;
ret = 0;
break;
}
}
scoutfs_item_put(item);
out:
if (ret == -ENOENT) {
inode = NULL;
} else if (ret) {
inode = ERR_PTR(ret);
} else {
di->key_offset = h;
di->coll_nr = nr;
inode = scoutfs_iget(sb, ino);
}
return d_splice_alias(inode, dentry);
}
/* this exists upstream so we can just delete it in a forward port */
static int dir_emit_dots(struct file *file, void *dirent, filldir_t filldir)
{
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct inode *parent = dentry->d_parent->d_inode;
if (file->f_pos == 0) {
if (!filldir(dirent, ".", 1, 1, scoutfs_ino(inode), DT_DIR))
return 0;
file->f_pos = 1;
}
if (file->f_pos == 1) {
if (!filldir(dirent, "..", 2, 1, scoutfs_ino(parent), DT_DIR))
return 0;
file->f_pos = 2;
}
return 1;
}
/*
* readdir finds the next entry at or past the hash|coll_nr stored in
* f_pos (ctx->pos in the upstream readdir interface).
*
* It will need to be careful not to read past the region of the dirent
* hash offset keys that it has access to.
*/
static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct scoutfs_dirent *dent;
struct scoutfs_key last_key;
struct scoutfs_item *item;
struct scoutfs_key key;
u32 nr;
u32 off;
u64 pos;
int ret = 0;
if (!dir_emit_dots(file, dirent, filldir))
return 0;
scoutfs_set_key(&last_key, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
SCOUTFS_DIRENT_OFF_MASK);
do {
off = file->f_pos >> SCOUTFS_DIRENT_COLL_BITS;
nr = file->f_pos & SCOUTFS_DIRENT_COLL_MASK;
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
off);
item = scoutfs_item_next(sb, &key);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
if (ret == -ENOENT)
ret = 0;
break;
}
if (scoutfs_key_cmp(&item->key, &last_key) > 0) {
scoutfs_item_put(item);
break;
}
/* reset nr to 0 if we found the next item */
if (scoutfs_key_offset(&item->key) != off)
nr = 0;
pos = scoutfs_key_offset(&item->key)
<< SCOUTFS_DIRENT_COLL_BITS;
for_each_item_dent(item, dent) {
if (dent->coll_nr < nr)
continue;
if (!filldir(dirent, dent->name, dent->name_len, pos,
le64_to_cpu(dent->ino), dent->type))
break;
file->f_pos = (pos | dent->coll_nr) + 1;
}
scoutfs_item_put(item);
/* advance to the next hash value if we finished the item */
if (dent == NULL)
file->f_pos = pos + (1 << SCOUTFS_DIRENT_COLL_BITS);
} while (dent == NULL);
return ret;
}
static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = NULL;
struct scoutfs_dirent *dent;
struct scoutfs_item *item;
struct dentry_info *di;
struct scoutfs_key key;
int bytes;
int ret;
int off;
u64 nr;
u64 h;
di = alloc_dentry_info(dentry);
if (IS_ERR(di))
return PTR_ERR(di);
if (dentry->d_name.len > SCOUTFS_NAME_LEN)
return -ENAMETOOLONG;
inode = scoutfs_new_inode(sb, dir, mode, rdev);
if (IS_ERR(inode))
return PTR_ERR(inode);
h = name_hash(dir, dentry->d_name.name, dentry->d_name.len);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
bytes = dent_bytes(dentry->d_name.len);
item = scoutfs_item_lookup(sb, &key);
if (item == ERR_PTR(-ENOENT)) {
item = scoutfs_item_create(sb, &key, bytes);
if (!IS_ERR(item)) {
/* mark a newly created item */
dent = item->val;
dent->name_len = 0;
}
}
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
ret = 0;
nr = 0;
for_each_item_dent(item, dent) {
/* the common case of a newly created item */
if (!dent->name_len)
break;
/* XXX check for eexist? can't happen? */
/* found a free coll nr, insert here */
if (nr < dent->coll_nr) {
off = dent_val_off(item, dent);
ret = scoutfs_item_expand(item, off, bytes);
if (!ret)
dent = item->val + off;
break;
}
/* the item's full */
if (nr++ == SCOUTFS_DIRENT_COLL_MASK) {
ret = -ENOSPC;
break;
}
}
if (!ret) {
dent->ino = cpu_to_le64(scoutfs_ino(inode));
dent->type = mode_to_type(inode->i_mode);
dent->coll_nr = nr;
dent->name_len = dentry->d_name.len;
memcpy(dent->name, dentry->d_name.name, dent->name_len);
di->key_offset = h;
di->coll_nr = nr;
}
scoutfs_item_put(item);
if (ret)
goto out;
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
if (S_ISDIR(mode)) {
inc_nlink(inode);
inc_nlink(dir);
}
mark_inode_dirty(inode);
mark_inode_dirty(dir);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
out:
/* XXX delete the inode item here */
if (ret && !IS_ERR_OR_NULL(inode))
iput(inode);
return ret;
}
/* XXX hmm, do something with excl? */
static int scoutfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
return scoutfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
static int scoutfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return scoutfs_mknod(dir, dentry, mode | S_IFDIR, 0);
}
/*
* Unlink removes the entry from its item and removes the item if ours
* was the only remaining entry.
*/
static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct timespec ts = current_kernel_time();
struct scoutfs_dirent *dent;
struct scoutfs_item *item;
struct dentry_info *di;
struct scoutfs_key key;
int ret = 0;
if (WARN_ON_ONCE(!dentry->d_fsdata))
return -EINVAL;
di = dentry->d_fsdata;
trace_printk("dir size %llu entry k_off nr %u %u\n",
i_size_read(inode), di->key_offset, di->coll_nr);
if (S_ISDIR(inode->i_mode) && i_size_read(inode))
return -ENOTEMPTY;
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY,
di->key_offset);
item = scoutfs_item_lookup(sb, &key);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
/* XXX error to not find the coll nr we were looking for? */
for_each_item_dent(item, dent) {
if (dent->coll_nr != di->coll_nr)
continue;
/* XXX compare names and eio? */
if (item->val_len == dent_bytes(dent->name_len)) {
scoutfs_item_delete(sb, item);
ret = 0;
} else {
ret = scoutfs_item_shrink(item,
dent_val_off(item, dent),
dent_bytes(dent->name_len));
}
dent = NULL;
break;
}
scoutfs_item_put(item);
if (ret)
goto out;
dir->i_ctime = ts;
dir->i_mtime = ts;
i_size_write(dir, i_size_read(dir) - dentry->d_name.len);
inode->i_ctime = ts;
drop_nlink(inode);
if (S_ISDIR(inode->i_mode)) {
drop_nlink(dir);
drop_nlink(inode);
}
mark_inode_dirty(inode);
mark_inode_dirty(dir);
out:
return ret;
}
const struct file_operations scoutfs_dir_fops = {
.readdir = scoutfs_readdir,
};
const struct inode_operations scoutfs_dir_iops = {
.lookup = scoutfs_lookup,
.mknod = scoutfs_mknod,
.create = scoutfs_create,
.mkdir = scoutfs_mkdir,
.unlink = scoutfs_unlink,
.rmdir = scoutfs_unlink,
};
void scoutfs_dir_exit(void)
{
if (scoutfs_dentry_cachep) {
kmem_cache_destroy(scoutfs_dentry_cachep);
scoutfs_dentry_cachep = NULL;
}
}
int scoutfs_dir_init(void)
{
scoutfs_dentry_cachep = kmem_cache_create("scoutfs_dentry_info",
sizeof(struct dentry_info), 0,
SLAB_RECLAIM_ACCOUNT, NULL);
if (!scoutfs_dentry_cachep)
return -ENOMEM;
return 0;
}
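
To make the f_pos packing from the comment at the top of dir.c concrete, here is an illustrative sketch (not code from the commit) using the format.h constants; with 27 offset bits and 4 collision bits the largest encodable position is 2^31 - 1:

/* illustrative only: readdir positions pack a hash offset over a collision nr */
static inline u64 dirent_pos(u32 hash_off, u8 coll_nr)
{
        return ((u64)hash_off << SCOUTFS_DIRENT_COLL_BITS) | coll_nr;
}

static inline u32 dirent_pos_off(u64 pos)
{
        return pos >> SCOUTFS_DIRENT_COLL_BITS;
}

static inline u8 dirent_pos_nr(u64 pos)
{
        return pos & SCOUTFS_DIRENT_COLL_MASK;
}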

kmod/src/dir.h (new file, 10 lines)

@@ -0,0 +1,10 @@
#ifndef _SCOUTFS_DIR_H_
#define _SCOUTFS_DIR_H_
extern const struct file_operations scoutfs_dir_fops;
extern const struct inode_operations scoutfs_dir_iops;
int scoutfs_dir_init(void);
void scoutfs_dir_exit(void);
#endif

kmod/src/format.h (new file, 122 lines)

@@ -0,0 +1,122 @@
#ifndef _SCOUTFS_FORMAT_H_
#define _SCOUTFS_FORMAT_H_
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */
#define SCOUTFS_BLOCK_SHIFT 22
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
/*
* This bloom size is chosen to have a roughly 1% false positive rate
* for ~90k items, which is about the worst case for a block full of
* dirents with reasonably small names. Pathologically smaller items
* could be even more dense.
*/
#define SCOUTFS_BLOOM_FILTER_BYTES (128 * 1024)
#define SCOUTFS_BLOOM_FILTER_BITS (SCOUTFS_BLOOM_FILTER_BYTES * 8)
#define SCOUTFS_BLOOM_INDEX_BITS (ilog2(SCOUTFS_BLOOM_FILTER_BITS))
#define SCOUTFS_BLOOM_INDEX_MASK ((1 << SCOUTFS_BLOOM_INDEX_BITS) - 1)
#define SCOUTFS_BLOOM_INDEX_NR 7
/*
* We should be able to make the offset smaller if neither dirents nor
* data items use the full 64 bits.
*/
struct scoutfs_key {
__le64 inode;
u8 type;
__le64 offset;
} __packed;
#define SCOUTFS_INODE_KEY 128
#define SCOUTFS_DIRENT_KEY 192
struct scoutfs_lsm_block {
struct scoutfs_key first;
struct scoutfs_key last;
__le32 nr_items;
/* u8 bloom[SCOUTFS_BLOOM_BYTES]; */
/* struct scoutfs_item_header items[0] .. */
} __packed;
struct scoutfs_item_header {
struct scoutfs_key key;
__le16 val_len;
} __packed;
struct scoutfs_timespec {
__le64 sec;
__le32 nsec;
} __packed;
/*
* XXX
* - otime?
* - compat flags?
* - version?
* - generation?
* - be more careful with rdev?
*/
struct scoutfs_inode {
__le64 size;
__le64 blocks;
__le32 nlink;
__le32 uid;
__le32 gid;
__le32 mode;
__le32 rdev;
__le32 salt;
struct scoutfs_timespec atime;
struct scoutfs_timespec ctime;
struct scoutfs_timespec mtime;
} __packed;
#define SCOUTFS_ROOT_INO 1
/*
* Dirents are stored in items with an offset of the hash of their name.
* Colliding names are packed into the value.
*/
struct scoutfs_dirent {
__le64 ino;
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 type:4,
coll_nr:4;
#else
__u8 coll_nr:4,
type:4;
#endif
__u8 name_len;
__u8 name[0];
} __packed;
#define SCOUTFS_NAME_LEN 255
/*
* We only use 31 bits for readdir positions so that we don't confuse
* old signed 32-bit f_pos applications or those on the other side of
* network protocols that have limited readdir positions.
*/
#define SCOUTFS_DIRENT_OFF_BITS 27
#define SCOUTFS_DIRENT_OFF_MASK ((1 << SCOUTFS_DIRENT_OFF_BITS) - 1)
#define SCOUTFS_DIRENT_COLL_BITS 4
#define SCOUTFS_DIRENT_COLL_MASK ((1 << SCOUTFS_DIRENT_COLL_BITS) - 1)
/* getdents returns the *next* pos with each entry, so we can't return ~0 */
#define SCOUTFS_DIRENT_MAX_POS \
(((1 << (SCOUTFS_DIRENT_OFF_BITS + SCOUTFS_DIRENT_COLL_BITS)) - 1) - 1)
enum {
SCOUTFS_DT_FIFO = 0,
SCOUTFS_DT_CHR,
SCOUTFS_DT_DIR,
SCOUTFS_DT_BLK,
SCOUTFS_DT_REG,
SCOUTFS_DT_LNK,
SCOUTFS_DT_SOCK,
SCOUTFS_DT_WHT,
};
#endif
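
The bloom sizing comment in this header can be sanity-checked with the standard estimate. With m = SCOUTFS_BLOOM_FILTER_BITS = 1,048,576 bits, k = SCOUTFS_BLOOM_INDEX_NR = 7 probes, and n = 90,000 items:

p ≈ (1 - e^(-k*n/m))^k = (1 - e^(-0.60))^7 ≈ 0.45^7 ≈ 0.4%

which is comfortably under the 1% false positive rate the comment aims for.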

kmod/src/inode.c (new file, 272 lines)

@@ -0,0 +1,272 @@
/*
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/random.h>
#include "format.h"
#include "super.h"
#include "key.h"
#include "inode.h"
#include "item.h"
#include "dir.h"
/*
* XXX
* - worry about i_ino truncation, not sure if we do anything
* - use inode item value lengths for forward/back compat
*/
static struct kmem_cache *scoutfs_inode_cachep;
static void scoutfs_inode_ctor(void *obj)
{
struct scoutfs_inode_info *ci = obj;
inode_init_once(&ci->inode);
}
struct inode *scoutfs_alloc_inode(struct super_block *sb)
{
struct scoutfs_inode_info *ci;
ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
return &ci->inode;
}
static void scoutfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
trace_printk("freeing inode %p\n", inode);
kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
}
void scoutfs_destroy_inode(struct inode *inode)
{
call_rcu(&inode->i_rcu, scoutfs_i_callback);
}
/*
* Called once new inode allocation or inode reading has initialized
* enough of the inode for us to set the ops based on the mode.
*/
static void set_inode_ops(struct inode *inode)
{
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
// inode->i_mapping->a_ops = &scoutfs_file_aops;
// inode->i_op = &scoutfs_file_iops;
// inode->i_fop = &scoutfs_file_fops;
break;
case S_IFDIR:
inode->i_op = &scoutfs_dir_iops;
inode->i_fop = &scoutfs_dir_fops;
break;
case S_IFLNK:
// inode->i_op = &scoutfs_symlink_iops;
break;
default:
// inode->i_op = &scoutfs_special_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
break;
}
}
static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
i_size_write(inode, le64_to_cpu(cinode->size));
set_nlink(inode, le32_to_cpu(cinode->nlink));
i_uid_write(inode, le32_to_cpu(cinode->uid));
i_gid_write(inode, le32_to_cpu(cinode->gid));
inode->i_mode = le32_to_cpu(cinode->mode);
inode->i_rdev = le32_to_cpu(cinode->rdev);
inode->i_atime.tv_sec = le64_to_cpu(cinode->atime.sec);
inode->i_atime.tv_nsec = le32_to_cpu(cinode->atime.nsec);
inode->i_mtime.tv_sec = le64_to_cpu(cinode->mtime.sec);
inode->i_mtime.tv_nsec = le32_to_cpu(cinode->mtime.nsec);
inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec);
inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);
ci->salt = le32_to_cpu(cinode->salt);
}
static int scoutfs_read_locked_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_item *item;
struct scoutfs_key key;
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
item = scoutfs_item_lookup(sb, &key);
if (IS_ERR(item))
return PTR_ERR(item);
load_inode(inode, item->val);
scoutfs_item_put(item);
return 0;
}
static int scoutfs_iget_test(struct inode *inode, void *arg)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
u64 *ino = arg;
return ci->ino == *ino;
}
static int scoutfs_iget_set(struct inode *inode, void *arg)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
u64 *ino = arg;
inode->i_ino = *ino;
ci->ino = *ino;
return 0;
}
struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
{
struct inode *inode;
int ret;
inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set,
&ino);
if (!inode)
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
ret = scoutfs_read_locked_inode(inode);
if (ret) {
iget_failed(inode);
inode = ERR_PTR(ret);
} else {
set_inode_ops(inode);
unlock_new_inode(inode);
}
}
return inode;
}
static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
cinode->size = cpu_to_le64(i_size_read(inode));
cinode->nlink = cpu_to_le32(inode->i_nlink);
cinode->uid = cpu_to_le32(i_uid_read(inode));
cinode->gid = cpu_to_le32(i_gid_read(inode));
cinode->mode = cpu_to_le32(inode->i_mode);
cinode->rdev = cpu_to_le32(inode->i_rdev);
cinode->atime.sec = cpu_to_le64(inode->i_atime.tv_sec);
cinode->atime.nsec = cpu_to_le32(inode->i_atime.tv_nsec);
cinode->ctime.sec = cpu_to_le64(inode->i_ctime.tv_sec);
cinode->ctime.nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec);
cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
cinode->salt = cpu_to_le32(ci->salt);
}
/*
* Every time we modify the inode in memory we copy it to its inode
* item. This lets us write out blocks of items without having to track
* down dirty vfs inodes and safely copy them into items before writing.
*/
int scoutfs_inode_update(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_item *item;
struct scoutfs_key key;
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
item = scoutfs_item_lookup(sb, &key);
if (IS_ERR(item))
return PTR_ERR(item);
store_inode(item->val, inode);
scoutfs_item_put(item);
return 0;
}
/*
* Allocate and initialize a new inode. The caller is responsible for
* creating links to it and updating it. @dir can be null.
*/
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode, dev_t rdev)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_inode_info *ci;
struct scoutfs_item *item;
struct scoutfs_key key;
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
ci = SCOUTFS_I(inode);
ci->ino = atomic64_inc_return(&sbi->next_ino);
get_random_bytes(&ci->salt, sizeof(ci->salt));
inode->i_ino = ci->ino;
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_rdev = rdev;
set_inode_ops(inode);
scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
item = scoutfs_item_create(inode->i_sb, &key,
sizeof(struct scoutfs_inode));
if (IS_ERR(item)) {
iput(inode);
inode = ERR_CAST(item);
}
return inode;
}
void scoutfs_inode_exit(void)
{
if (scoutfs_inode_cachep) {
rcu_barrier();
kmem_cache_destroy(scoutfs_inode_cachep);
scoutfs_inode_cachep = NULL;
}
}
int scoutfs_inode_init(void)
{
scoutfs_inode_cachep = kmem_cache_create("scoutfs_inode_info",
sizeof(struct scoutfs_inode_info), 0,
SLAB_RECLAIM_ACCOUNT,
scoutfs_inode_ctor);
if (!scoutfs_inode_cachep)
return -ENOMEM;
return 0;
}
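
Nothing in this commit calls scoutfs_inode_update() yet, but the comment above it implies a call pattern along these lines (an illustrative sketch, not committed code):

/* illustrative: mutate the vfs inode, then mirror it into its inode item */
static int touch_ctime(struct inode *inode)
{
        inode->i_ctime = CURRENT_TIME;
        mark_inode_dirty(inode);

        /* copy the fields into the item so sync writes them with other items */
        return scoutfs_inode_update(inode);
}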

kmod/src/inode.h (new file, 32 lines)

@@ -0,0 +1,32 @@
#ifndef _SCOUTFS_INODE_H_
#define _SCOUTFS_INODE_H_
struct scoutfs_inode_info {
u64 ino;
u32 salt;
struct inode inode;
};
static inline struct scoutfs_inode_info *SCOUTFS_I(struct inode *inode)
{
return container_of(inode, struct scoutfs_inode_info, inode);
}
static inline u64 scoutfs_ino(struct inode *inode)
{
return SCOUTFS_I(inode)->ino;
}
struct inode *scoutfs_alloc_inode(struct super_block *sb);
void scoutfs_destroy_inode(struct inode *inode);
struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
int scoutfs_inode_update(struct inode *inode);
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode, dev_t rdev);
void scoutfs_inode_exit(void);
int scoutfs_inode_init(void);
#endif

kmod/src/item.c (new file, 423 lines)

@@ -0,0 +1,423 @@
/*
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include "super.h"
#include "key.h"
#include "item.h"
/*
* The item cache:
* - tracks per-item dirty state for writing
* - decouples vfs cache lifetimes from item lifetimes
* - item-granular cache for things vfs doesn't cache (readdir, xattr)
*
* XXX:
* - warnings for invalid keys/lens
* - memory pressure
*/
enum {
ITW_NEXT = 1,
ITW_PREV,
};
static inline struct scoutfs_item *node_item(struct super_block *sb,
struct rb_root *root,
struct rb_node *node)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned long off;
if (root == &sbi->item_root)
off = offsetof(struct scoutfs_item, node);
else
off = offsetof(struct scoutfs_item, dirty_node);
return (void *)((char *)node - off);
}
static inline struct rb_node *item_node(struct super_block *sb,
struct rb_root *root,
struct scoutfs_item *item)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned long off;
if (root == &sbi->item_root)
off = offsetof(struct scoutfs_item, node);
else
off = offsetof(struct scoutfs_item, dirty_node);
return (void *)((char *)item + off);
}
/*
* Insert a new item in the tree. The caller must have done a lookup to
* ensure that the key is not already present.
*/
static void insert_item(struct super_block *sb, struct rb_root *root,
struct scoutfs_item *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_item *item;
int cmp;
while (*node) {
parent = *node;
item = node_item(sb, root, *node);
cmp = scoutfs_key_cmp(&ins->key, &item->key);
BUG_ON(cmp == 0);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
}
rb_link_node(item_node(sb, root, ins), parent, node);
rb_insert_color(item_node(sb, root, ins), root);
}
enum {
FI_NEXT = 1,
FI_PREV,
};
/*
* Walk the tree looking for an item.
*
* If NEXT or PREV are specified then those will be returned
* if the specific item isn't found.
*/
static struct scoutfs_item *find_item(struct super_block *sb,
struct rb_root *root,
struct scoutfs_key *key, int np)
{
struct rb_node *node = root->rb_node;
struct scoutfs_item *found = NULL;
struct scoutfs_item *item;
int cmp;
while (node) {
item = node_item(sb, root, node);
cmp = scoutfs_key_cmp(key, &item->key);
if (cmp < 0) {
if (np == FI_NEXT)
found = item;
node = node->rb_left;
} else if (cmp > 0) {
if (np == FI_PREV)
found = item;
node = node->rb_right;
} else {
found = item;
break;
}
}
return found;
}
static struct scoutfs_item *alloc_item(struct scoutfs_key *key,
unsigned int val_len)
{
struct scoutfs_item *item;
void *val;
item = kmalloc(sizeof(struct scoutfs_item), GFP_NOFS);
val = kmalloc(val_len, GFP_NOFS);
if (!item || !val) {
kfree(item);
kfree(val);
return ERR_PTR(-ENOMEM);
}
RB_CLEAR_NODE(&item->node);
RB_CLEAR_NODE(&item->dirty_node);
atomic_set(&item->refcount, 1);
item->key = *key;
item->val_len = val_len;
item->val = val;
return item;
}
/*
* Create a new item stored at the given key. Return it with a reference,
* or an ERR_PTR with -ENOMEM or -EEXIST.
*
* The caller is responsible for initializing the item's value.
*/
struct scoutfs_item *scoutfs_item_create(struct super_block *sb,
struct scoutfs_key *key,
unsigned int val_len)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_item *existing;
struct scoutfs_item *item;
unsigned long flags;
item = alloc_item(key, val_len);
if (IS_ERR(item))
return item;
spin_lock_irqsave(&sbi->item_lock, flags);
existing = find_item(sb, &sbi->item_root, key, 0);
if (!existing) {
insert_item(sb, &sbi->item_root, item);
insert_item(sb, &sbi->dirty_item_root, item);
atomic_add(2, &item->refcount);
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
if (existing) {
scoutfs_item_put(item);
item = ERR_PTR(-EEXIST);
}
trace_printk("item %p key "CKF" val_len %d\n", item, CKA(key), val_len);
return item;
}
/*
* The caller is still responsible for unlocking and putting the item.
*
* We don't try to optimize away the lock for items that are already
* removed from the tree. The caller's locking and item behaviour mean
* that racing to remove an item is extremely rare.
*
* XXX for now we're just removing it from the rbtree. We'd need to leave
* behind a deletion record for lsm.
*/
void scoutfs_item_delete(struct super_block *sb, struct scoutfs_item *item)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
if (!RB_EMPTY_NODE(&item->dirty_node)) {
rb_erase(&item->dirty_node, &sbi->dirty_item_root);
RB_CLEAR_NODE(&item->dirty_node);
scoutfs_item_put(item);
}
if (!RB_EMPTY_NODE(&item->node)) {
rb_erase(&item->node, &sbi->item_root);
RB_CLEAR_NODE(&item->node);
scoutfs_item_put(item);
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
}
static struct scoutfs_item *item_lookup(struct super_block *sb,
struct scoutfs_key *key, int np)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_item *item;
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
item = find_item(sb, &sbi->item_root, key, np);
if (item)
atomic_inc(&item->refcount);
else
item = ERR_PTR(-ENOENT);
spin_unlock_irqrestore(&sbi->item_lock, flags);
return item;
}
struct scoutfs_item *scoutfs_item_lookup(struct super_block *sb,
struct scoutfs_key *key)
{
return item_lookup(sb, key, 0);
}
struct scoutfs_item *scoutfs_item_next(struct super_block *sb,
struct scoutfs_key *key)
{
return item_lookup(sb, key, FI_NEXT);
}
struct scoutfs_item *scoutfs_item_prev(struct super_block *sb,
struct scoutfs_key *key)
{
return item_lookup(sb, key, FI_PREV);
}
/*
* Expand the item's value by inserting bytes at the given offset. The
* new bytes are not initialized.
*/
int scoutfs_item_expand(struct scoutfs_item *item, int off, int bytes)
{
void *val;
/* XXX bytes too big */
if (WARN_ON_ONCE(off < 0 || off > item->val_len))
return -EINVAL;
val = kmalloc(item->val_len + bytes, GFP_NOFS);
if (!val)
return -ENOMEM;
memcpy(val, item->val, off);
memcpy(val + off + bytes, item->val + off, item->val_len - off);
kfree(item->val);
item->val = val;
item->val_len += bytes;
return 0;
}
/*
* Shrink the item's value by removing bytes at the given offset.
*/
int scoutfs_item_shrink(struct scoutfs_item *item, int off, int bytes)
{
void *val;
if (WARN_ON_ONCE(off < 0 || off >= item->val_len ||
bytes <= 0 || (off + bytes) > item->val_len ||
bytes == item->val_len))
return -EINVAL;
val = kmalloc(item->val_len - bytes, GFP_NOFS);
if (!val)
return -ENOMEM;
memcpy(val, item->val, off);
memcpy(val + off, item->val + off + bytes,
item->val_len - (off + bytes));
kfree(item->val);
item->val = val;
item->val_len -= bytes;
return 0;
}
void scoutfs_item_mark_dirty(struct super_block *sb, struct scoutfs_item *item)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
if (RB_EMPTY_NODE(&item->dirty_node)) {
insert_item(sb, &sbi->dirty_item_root, item);
atomic_inc(&item->refcount);
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
}
/*
* Mark all the dirty items clean by emptying the dirty rbtree. The
* caller should be preventing writes from dirtying new items.
*
* We erase leaf nodes with no children to minimize rotation
* overhead during erase. Dirty items must be in the main rbtree if
* they're in the dirty rbtree so the puts here shouldn't free the
* items.
*/
void scoutfs_item_all_clean(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct rb_root *root = &sbi->dirty_item_root;
struct scoutfs_item *item;
struct rb_node *node;
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
node = sbi->dirty_item_root.rb_node;
while (node) {
if (node->rb_left)
node = node->rb_left;
else if (node->rb_right)
node = node->rb_right;
else {
item = node_item(sb, root, node);
node = rb_parent(node);
trace_printk("item %p key "CKF"\n",
item, CKA(&item->key));
rb_erase(&item->dirty_node, root);
RB_CLEAR_NODE(&item->dirty_node);
scoutfs_item_put(item);
}
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
}
/*
* If the item is null then the first dirty item is returned. If an
* item is given then the next dirty item is returned. NULL is returned
* if there are no more dirty items.
*
* The caller is given a reference that it has to put. The given item
* always has its reference dropped, including when NULL is returned.
*/
struct scoutfs_item *scoutfs_item_next_dirty(struct super_block *sb,
struct scoutfs_item *item)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_item *next_item;
struct rb_node *node;
unsigned long flags;
spin_lock_irqsave(&sbi->item_lock, flags);
if (item)
node = rb_next(&item->dirty_node);
else
node = rb_first(&sbi->dirty_item_root);
if (node) {
next_item = node_item(sb, &sbi->dirty_item_root, node);
atomic_inc(&next_item->refcount);
} else {
next_item = NULL;
}
spin_unlock_irqrestore(&sbi->item_lock, flags);
scoutfs_item_put(item);
return next_item;
}
void scoutfs_item_put(struct scoutfs_item *item)
{
if (!IS_ERR_OR_NULL(item) && atomic_dec_and_test(&item->refcount)) {
WARN_ON_ONCE(!RB_EMPTY_NODE(&item->node));
WARN_ON_ONCE(!RB_EMPTY_NODE(&item->dirty_node));
kfree(item);
}
}
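
A minimal usage sketch of the item API (hypothetical helper, not from the commit); note that scoutfs_item_create() inserts the new item into both the item and dirty trees, so it is born dirty:

/* illustrative: create an item holding a single little-endian u64 value */
static int store_u64_item(struct super_block *sb, struct scoutfs_key *key,
                          u64 v)
{
        struct scoutfs_item *item;

        item = scoutfs_item_create(sb, key, sizeof(__le64));
        if (IS_ERR(item))
                return PTR_ERR(item);

        *(__le64 *)item->val = cpu_to_le64(v);

        /* the item is already dirty; just drop our reference */
        scoutfs_item_put(item);
        return 0;
}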

kmod/src/item.h (new file, 37 lines)

@@ -0,0 +1,37 @@
#ifndef _SCOUTFS_ITEM_H_
#define _SCOUTFS_ITEM_H_
#include "format.h"
struct scoutfs_item {
struct rb_node node;
struct rb_node dirty_node;
atomic_t refcount;
/* the key is constant for the life of the item */
struct scoutfs_key key;
/* the value can be changed by expansion or shrinking */
unsigned int val_len;
void *val;
};
struct scoutfs_item *scoutfs_item_create(struct super_block *sb,
struct scoutfs_key *key,
unsigned int val_len);
struct scoutfs_item *scoutfs_item_lookup(struct super_block *sb,
struct scoutfs_key *key);
struct scoutfs_item *scoutfs_item_next(struct super_block *sb,
struct scoutfs_key *key);
struct scoutfs_item *scoutfs_item_prev(struct super_block *sb,
struct scoutfs_key *key);
int scoutfs_item_expand(struct scoutfs_item *item, int off, int bytes);
int scoutfs_item_shrink(struct scoutfs_item *item, int off, int bytes);
void scoutfs_item_delete(struct super_block *sb, struct scoutfs_item *item);
void scoutfs_item_mark_dirty(struct super_block *sb, struct scoutfs_item *item);
struct scoutfs_item *scoutfs_item_next_dirty(struct super_block *sb,
struct scoutfs_item *item);
void scoutfs_item_all_clean(struct super_block *sb);
void scoutfs_item_put(struct scoutfs_item *item);
#endif

kmod/src/key.h (new file, 43 lines)

@@ -0,0 +1,43 @@
#ifndef _SCOUTFS_KEY_H_
#define _SCOUTFS_KEY_H_
#include <linux/types.h>
#include "format.h"
#define CKF "%llu.%u.%llu"
#define CKA(key) \
le64_to_cpu((key)->inode), (key)->type, le64_to_cpu((key)->offset)
static inline u64 scoutfs_key_inode(struct scoutfs_key *key)
{
return le64_to_cpu(key->inode);
}
static inline u64 scoutfs_key_offset(struct scoutfs_key *key)
{
return le64_to_cpu(key->offset);
}
static inline int le64_cmp(__le64 a, __le64 b)
{
return le64_to_cpu(a) < le64_to_cpu(b) ? -1 :
le64_to_cpu(a) > le64_to_cpu(b) ? 1 : 0;
}
static inline int scoutfs_key_cmp(struct scoutfs_key *a, struct scoutfs_key *b)
{
return le64_cmp(a->inode, b->inode) ?:
((short)a->type - (short)b->type) ?:
le64_cmp(a->offset, b->offset);
}
static inline void scoutfs_set_key(struct scoutfs_key *key, u64 inode, u8 type,
u64 offset)
{
key->inode = cpu_to_le64(inode);
key->type = type;
key->offset = cpu_to_le64(offset);
}
#endif
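
A small example of the helpers above, showing that keys sort by inode, then type, then offset (illustrative, not committed code):

/* illustrative: type takes precedence over offset in the sort order */
static void key_order_example(void)
{
        struct scoutfs_key a, b;

        scoutfs_set_key(&a, 1, SCOUTFS_INODE_KEY, ~0ULL);
        scoutfs_set_key(&b, 1, SCOUTFS_DIRENT_KEY, 0);

        /* SCOUTFS_INODE_KEY (128) sorts before SCOUTFS_DIRENT_KEY (192) */
        WARN_ON_ONCE(scoutfs_key_cmp(&a, &b) >= 0);
}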

kmod/src/lsm.c (new file, 330 lines)

@@ -0,0 +1,330 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include "format.h"
#include "dir.h"
#include "inode.h"
#include "key.h"
#include "item.h"
#include "super.h"
#include "lsm.h"
#define PAGE_CACHE_PAGE_BITS (PAGE_CACHE_SIZE * 8)
/* XXX garbage hack until we have siphash */
static u64 bloom_hash(struct scoutfs_key *key, __le64 *hash_key)
{
__le32 *salts = (void *)hash_key;
return ((u64)crc32c(le32_to_cpu(salts[0]), key, sizeof(*key)) << 32) |
crc32c(le32_to_cpu(salts[1]), key, sizeof(*key));
}
/*
* Set the caller's bloom indices for their item key.
*/
static void get_bloom_indices(struct super_block *sb,
struct scoutfs_key *key, u32 *ind)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
__le64 *hash_key = sbi->bloom_hash_keys;
u64 hash;
int h;
int i;
for (i = 0; ; ) {
hash = bloom_hash(key, hash_key);
hash_key += 2;
for (h = 0; h < 64 / SCOUTFS_BLOOM_INDEX_BITS; h++) {
ind[i++] = hash & SCOUTFS_BLOOM_INDEX_MASK;
if (i == SCOUTFS_BLOOM_INDEX_NR)
return;
hash >>= SCOUTFS_BLOOM_INDEX_BITS;
}
}
}
struct pages {
/* fixed for the group of pages */
struct address_space *mapping;
struct page **pages;
pgoff_t pgoff;
/* number of pages stored in the pages array */
int nr;
/* byte offset of the free space at end of current page */
int off;
/* bytes remaining in the overall large block */
int remaining;
};
/*
* The caller has our fixed-size bloom filter in the locked pages
* starting at the given byte offset in the first page. Our job is to
* hash the key and set its bits in the bloom filter.
*/
static void set_bloom_bits(struct super_block *sb, struct page **pages,
unsigned int offset, struct scoutfs_key *key)
{
u32 inds[SCOUTFS_BLOOM_INDEX_NR];
struct page *page;
int offset_bits = offset * 8;
int full_bit;
int page_bit;
void *addr;
int i;
get_bloom_indices(sb, key, inds);
for (i = 0; i < SCOUTFS_BLOOM_INDEX_NR; i++) {
full_bit = offset_bits + inds[i];
page = pages[full_bit / PAGE_CACHE_PAGE_BITS];
page_bit = full_bit % PAGE_CACHE_PAGE_BITS;
addr = kmap_atomic(page);
set_bit_le(page_bit, addr);
kunmap_atomic(addr);
}
}
/*
* XXX the zeroing here is unreliable. We'll want to zero the bloom but
* not all the pages that are about to be overwritten. Bleh.
*
* Returns the number of bytes copied if there was room. Returns 0 if
* there wasn't. Returns -errno on a hard failure.
*/
static int copy_to_pages(struct pages *pgs, void *ptr, size_t count)
{
struct page *page;
int ret = count;
void *addr;
int bytes;
if (count > pgs->remaining)
return 0;
while (count) {
if (pgs->off == PAGE_CACHE_SIZE) {
page = find_or_create_page(pgs->mapping,
pgs->pgoff + pgs->nr,
GFP_NOFS | __GFP_ZERO);
trace_printk("page %p\n", page);
if (!page) {
ret = -ENOMEM;
break;
}
pgs->pages[pgs->nr++] = page;
pgs->off = 0;
} else {
page = pgs->pages[pgs->nr - 1];
}
bytes = min(PAGE_CACHE_SIZE - pgs->off, count);
trace_printk("page %p off %d ptr %p count %zu bytes %d remaining %d\n",
page, pgs->off, ptr, count, bytes, pgs->remaining);
if (ptr) {
addr = kmap_atomic(page);
memcpy(addr + pgs->off, ptr, bytes);
kunmap_atomic(addr);
ptr += bytes;
}
count -= bytes;
pgs->off += bytes;
pgs->remaining -= bytes;
}
return ret;
}
static void drop_pages(struct pages *pgs, bool dirty)
{
struct page *page;
int i;
if (!pgs->pages)
return;
for (i = 0; i < pgs->nr; i++) {
page = pgs->pages[i];
SetPageUptodate(page);
if (dirty)
set_page_dirty(page);
unlock_page(page);
page_cache_release(page);
}
}
/*
* Write dirty items from the given item into dirty page cache pages in
* the block device at the given large block number.
*
* All the page cache pages are locked and pinned while they're being
* dirtied. The intent is for a single large IO to go out once they're
* all ready. This is an easy way to do that while maintaining
* consistency with the block device page cache. But it might not work :).
*
* We do one sweep over the items. The items aren't indexed. We might
* want to change that.
*
* Even though we're doing one sweep over the items we're holding the
* bloom filter and header pinned until the items are done. If we didn't
* mind the risk of the blocks going out of order we wouldn't need the
* allocated array of page pointers.
*/
static struct scoutfs_item *dirty_block_pages(struct super_block *sb,
struct scoutfs_item *item, u64 blkno)
{
struct scoutfs_item_header ihdr;
struct scoutfs_lsm_block lblk;
struct pages pgs;
void *addr;
int ret;
/* assumes the header starts a page and that the pgoff shift calculation holds */
BUILD_BUG_ON(SCOUTFS_BLOCK_SHIFT < PAGE_CACHE_SHIFT);
if (WARN_ON_ONCE(!item))
return item;
/* XXX not super thrilled with this allocation */
pgs.pages = kmalloc_array(SCOUTFS_BLOCK_SIZE / PAGE_CACHE_SIZE,
sizeof(struct page *), GFP_NOFS);
if (!pgs.pages) {
ret = -ENOMEM;
goto out;
}
pgs.mapping = sb->s_bdev->bd_inode->i_mapping;
pgs.pgoff = blkno >> (SCOUTFS_BLOCK_SHIFT - PAGE_CACHE_SHIFT);
pgs.nr = 0;
pgs.off = PAGE_CACHE_SIZE;
pgs.remaining = SCOUTFS_BLOCK_SIZE;
/* reserve space at the start of the block for header and bloom */
ret = copy_to_pages(&pgs, NULL, sizeof(lblk));
if (ret > 0)
ret = copy_to_pages(&pgs, NULL, SCOUTFS_BLOOM_FILTER_BYTES);
if (ret <= 0)
goto out;
lblk.first = item->key;
lblk.nr_items = 0;
do {
trace_printk("item %p key "CKF"\n", item, CKA(&item->key));
ihdr.key = item->key;
ihdr.val_len = cpu_to_le16(item->val_len);
ret = copy_to_pages(&pgs, &ihdr, sizeof(ihdr));
if (ret > 0)
ret = copy_to_pages(&pgs, item->val, item->val_len);
if (ret <= 0)
goto out;
lblk.last = item->key;
le32_add_cpu(&lblk.nr_items, 1);
/* set each item's bloom bits */
set_bloom_bits(sb, pgs.pages, sizeof(lblk), &item->key);
item = scoutfs_item_next_dirty(sb, item);
} while (item);
/* copy the filled in header to the start of the block */
addr = kmap_atomic(pgs.pages[0]);
memcpy(addr, &lblk, sizeof(lblk));
kunmap_atomic(addr);
out:
/* dirty if no error (null ok!), unlock, and release */
drop_pages(&pgs, !IS_ERR(item));
kfree(pgs.pages);
if (ret < 0) {
scoutfs_item_put(item);
item = ERR_PTR(ret);
}
return item;
}
/*
* Sync dirty data by writing all the dirty items into a series of level
* 0 blocks.
*
* This is an initial first pass, the full method will need to:
* - wait for pending writers
* - block future writers
* - update our manifest regardless of server communication
* - communicate blocks and key ranges to server
* - ensure that racing sync/dirty don't livelock
*/
int scoutfs_sync_fs(struct super_block *sb, int wait)
{
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_item *item;
u64 blknos[16]; /* XXX */
u64 blkno;
int ret = 0;
int i;
item = scoutfs_item_next_dirty(sb, NULL);
if (!item)
return 0;
for (i = 0; i < ARRAY_SIZE(blknos); i++) {
blkno = atomic64_inc_return(&sbi->next_blkno);
item = dirty_block_pages(sb, item, blkno);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
/* start each block's IO */
ret = filemap_flush(mapping);
if (ret)
goto out;
if (!item)
break;
}
/* dirty items should have been limited */
WARN_ON_ONCE(i >= ARRAY_SIZE(blknos));
/* then wait for all block IO to finish */
if (wait) {
ret = filemap_write_and_wait(mapping);
if (ret)
goto out;
}
/* mark everything clean */
scoutfs_item_all_clean(sb);
ret = 0;
out:
trace_printk("ret %d\n", ret);
WARN_ON_ONCE(ret);
return ret;
}
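
The read side isn't in this commit; a hypothetical lookup-time check mirroring set_bloom_bits() in the same file, using test_bit_le() as the counterpart of the set_bit_le() above, might look like this sketch:

/* illustrative: all seven bits set means the key may be in the block */
static bool bloom_may_contain(struct super_block *sb, struct page **pages,
                              unsigned int offset, struct scoutfs_key *key)
{
        u32 inds[SCOUTFS_BLOOM_INDEX_NR];
        int offset_bits = offset * 8;
        int full_bit;
        void *addr;
        bool hit;
        int i;

        get_bloom_indices(sb, key, inds);

        for (i = 0; i < SCOUTFS_BLOOM_INDEX_NR; i++) {
                full_bit = offset_bits + inds[i];
                addr = kmap_atomic(pages[full_bit / PAGE_CACHE_PAGE_BITS]);
                hit = test_bit_le(full_bit % PAGE_CACHE_PAGE_BITS, addr);
                kunmap_atomic(addr);

                if (!hit)
                        return false;
        }

        return true;
}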

kmod/src/lsm.h (new file, 6 lines)

@@ -0,0 +1,6 @@
#ifndef _SCOUTFS_LSM_H_
#define _SCOUTFS_LSM_H_
int scoutfs_sync_fs(struct super_block *sb, int wait);
#endif

kmod/src/mkfs.c (new file, 52 lines)

@@ -0,0 +1,52 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/random.h>
#include "super.h"
#include "item.h"
#include "key.h"
#include "mkfs.h"
/*
* For now a file system only exists in the item cache for the
* duration of the mount. This "mkfs" hack creates a root dir inode in
* the item cache on mount so that we can run tests in memory and not
* worry about user space or persistent storage.
*/
int scoutfs_mkfs(struct super_block *sb)
{
const struct timespec ts = current_kernel_time();
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_inode *cinode;
struct scoutfs_item *item;
struct scoutfs_key key;
int i;
atomic64_set(&sbi->next_ino, SCOUTFS_ROOT_INO + 1);
atomic64_set(&sbi->next_blkno, 2);
for (i = 0; i < ARRAY_SIZE(sbi->bloom_hash_keys); i++) {
get_random_bytes(&sbi->bloom_hash_keys[i],
sizeof(sbi->bloom_hash_keys[i]));
}
scoutfs_set_key(&key, SCOUTFS_ROOT_INO, SCOUTFS_INODE_KEY, 0);
item = scoutfs_item_create(sb, &key, sizeof(struct scoutfs_inode));
if (IS_ERR(item))
return PTR_ERR(item);
cinode = item->val;
memset(cinode, 0, sizeof(struct scoutfs_inode));
cinode->nlink = cpu_to_le32(2);
cinode->mode = cpu_to_le32(S_IFDIR | 0755);
cinode->atime.sec = cpu_to_le64(ts.tv_sec);
cinode->atime.nsec = cpu_to_le32(ts.tv_nsec);
cinode->ctime = cinode->atime;
cinode->mtime = cinode->atime;
get_random_bytes(&cinode->salt, sizeof(cinode->salt));
scoutfs_item_put(item);
return 0;
}
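
Worth noting: because scoutfs_mkfs() runs from scoutfs_fill_super() on every mount and nothing is read back from the device, each mount starts from a fresh, empty root directory; sync writes level 0 blocks but no read path consumes them yet.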

kmod/src/mkfs.h (new file, 6 lines)

@@ -0,0 +1,6 @@
#ifndef _SCOUTFS_MKFS_H_
#define _SCOUTFS_MKFS_H_
int scoutfs_mkfs(struct super_block *sb);
#endif

kmod/src/super.c (new file, 103 lines)

@@ -0,0 +1,103 @@
/*
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include "super.h"
#include "format.h"
#include "mkfs.h"
#include "inode.h"
#include "dir.h"
#include "lsm.h"
static const struct super_operations scoutfs_super_ops = {
.alloc_inode = scoutfs_alloc_inode,
.destroy_inode = scoutfs_destroy_inode,
.sync_fs = scoutfs_sync_fs,
};
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct scoutfs_sb_info *sbi;
struct inode *inode;
int ret;
sb->s_magic = SCOUTFS_SUPER_MAGIC;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &scoutfs_super_ops;
sbi = kzalloc(sizeof(struct scoutfs_sb_info), GFP_KERNEL);
sb->s_fs_info = sbi;
if (!sbi)
return -ENOMEM;
spin_lock_init(&sbi->item_lock);
sbi->item_root = RB_ROOT;
sbi->dirty_item_root = RB_ROOT;
ret = scoutfs_mkfs(sb);
if (ret)
return ret;
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO);
if (IS_ERR(inode))
return PTR_ERR(inode);
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;
return 0;
}
static struct dentry *scoutfs_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
return mount_bdev(fs_type, flags, dev_name, data, scoutfs_fill_super);
}
static void scoutfs_kill_sb(struct super_block *sb)
{
kill_block_super(sb);
kfree(sb->s_fs_info);
}
static struct file_system_type scoutfs_fs_type = {
.owner = THIS_MODULE,
.name = "scoutfs",
.mount = scoutfs_mount,
.kill_sb = scoutfs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
static int __init scoutfs_module_init(void)
{
return scoutfs_inode_init() ?:
scoutfs_dir_init() ?:
register_filesystem(&scoutfs_fs_type);
}
module_init(scoutfs_module_init)
static void __exit scoutfs_module_exit(void)
{
unregister_filesystem(&scoutfs_fs_type);
scoutfs_dir_exit();
scoutfs_inode_exit();
}
module_exit(scoutfs_module_exit)
MODULE_AUTHOR("Zach Brown <zab@versity.com>");
MODULE_LICENSE("GPL");
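
One detail of the ?: chain in scoutfs_module_init(): it doesn't unwind earlier steps when a later one fails, so the inode cache is leaked if, say, register_filesystem() fails. A more conventional shape, sketched:

static int __init scoutfs_module_init(void)
{
        int ret;

        ret = scoutfs_inode_init();
        if (ret)
                return ret;

        ret = scoutfs_dir_init();
        if (ret)
                goto out_inode;

        ret = register_filesystem(&scoutfs_fs_type);
        if (ret)
                goto out_dir;

        return 0;

out_dir:
        scoutfs_dir_exit();
out_inode:
        scoutfs_inode_exit();
        return ret;
}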

kmod/src/super.h (new file, 22 lines)

@@ -0,0 +1,22 @@
#ifndef _SCOUTFS_SUPER_H_
#define _SCOUTFS_SUPER_H_
#include <linux/rbtree.h>
struct scoutfs_sb_info {
atomic64_t next_ino;
atomic64_t next_blkno;
__le64 bloom_hash_keys[6]; /* XXX */
spinlock_t item_lock;
struct rb_root item_root;
struct rb_root dirty_item_root;
};
static inline struct scoutfs_sb_info *SCOUTFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
}
#endif