Add initial LSM implementation

Add the initial core components of the LSM implementation needed to read
the root inode (a rough sketch of how they fit together follows the list):

 - bio.c: read big block regions
 - seg.c: cache logical segments
 - ring.c: read the manifest from storage
 - manifest.c: organize segments into an LSM
 - kvec.c: work with arbitrary memory vectors
 - item.c: cache fs metadata items read from segments
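
Roughly, these fit together like so (a sketch for orientation, not an
exact call graph):

  mount:
    scoutfs_fill_super()
      scoutfs_ring_read()             ring.c: replay the on-disk manifest log
        scoutfs_manifest_add()        manifest.c: index segments by level

  reading the root inode:
    scoutfs_read_locked_inode()
      scoutfs_item_lookup_exact()     item.c: miss in the cached item rbtree
        scoutfs_manifest_read_items() manifest.c: find the overlapping segments
          scoutfs_seg_submit_read()   seg.c: read and cache a whole segment
            scoutfs_bio_submit()      bio.c: 4k blocks submitted in page bios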

Signed-off-by: Zach Brown <zab@versity.com>
commit 43d0d44e48
parent c6b688c2bf
Author: Zach Brown
Date: 2016-12-02 20:37:44 -08:00
18 changed files with 1915 additions and 16 deletions

kmod/src/Makefile

@@ -2,6 +2,6 @@ obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o
CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
-scoutfs-y += block.o btree.o buddy.o counters.o crc.o dir.o filerw.o \
-inode.o ioctl.o msg.o name.o scoutfs_trace.o super.o trans.o \
-xattr.o
+scoutfs-y += bio.o block.o btree.o buddy.o counters.o crc.o dir.o filerw.o \
+kvec.o inode.o ioctl.o item.o manifest.o msg.o name.o ring.o \
+seg.o scoutfs_trace.o super.o trans.o xattr.o

kmod/src/bio.c

@@ -0,0 +1,169 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "super.h"
#include "format.h"
#include "bio.h"
struct bio_end_io_args {
struct super_block *sb;
atomic_t bytes_in_flight;
int err;
scoutfs_bio_end_io_t end_io;
void *data;
};
static void dec_end_io(struct bio_end_io_args *args, size_t bytes, int err)
{
if (err && !args->err)
args->err = err;
if (atomic_sub_return(bytes, &args->bytes_in_flight) == 0) {
args->end_io(args->sb, args->data, args->err);
kfree(args);
}
}
static void bio_end_io(struct bio *bio, int err)
{
struct bio_end_io_args *args = bio->bi_private;
dec_end_io(args, bio->bi_size, err);
bio_put(bio);
}
/*
* Read or write the given number of 4k blocks from the front of the
* pages provided by the caller. We translate the block count into a
* page count and fill bios a page at a time.
*
* The caller is responsible for ensuring that the pages aren't freed
* while bios are in flight.
*
* The end_io function is always called once with the error result of
* the IO. It can be called before _submit returns.
*/
void scoutfs_bio_submit(struct super_block *sb, int rw, struct page **pages,
u64 blkno, unsigned int nr_blocks,
scoutfs_bio_end_io_t end_io, void *data)
{
unsigned int nr_pages = DIV_ROUND_UP(nr_blocks,
SCOUTFS_BLOCKS_PER_PAGE);
struct bio_end_io_args *args;
struct blk_plug plug;
unsigned int bytes;
struct page *page;
struct bio *bio = NULL;
int ret = 0;
int i;
args = kmalloc(sizeof(struct bio_end_io_args), GFP_NOFS);
if (!args) {
end_io(sb, data, -ENOMEM);
return;
}
args->sb = sb;
atomic_set(&args->bytes_in_flight, 1);
args->err = 0;
args->end_io = end_io;
args->data = data;
blk_start_plug(&plug);
for (i = 0; i < nr_pages; i++) {
page = pages[i];
if (!bio) {
bio = bio_alloc(GFP_NOFS, nr_pages - i);
if (!bio)
bio = bio_alloc(GFP_NOFS, 1);
if (!bio) {
ret = -ENOMEM;
break;
}
bio->bi_sector = blkno << (SCOUTFS_BLOCK_SHIFT - 9);
bio->bi_bdev = sb->s_bdev;
bio->bi_end_io = bio_end_io;
bio->bi_private = args;
}
bytes = min_t(int, nr_blocks << SCOUTFS_BLOCK_SHIFT, PAGE_SIZE);
if (bio_add_page(bio, page, bytes, 0) != bytes) {
/* submit the full bio and retry this page */
atomic_add(bio->bi_size, &args->bytes_in_flight);
submit_bio(rw, bio);
bio = NULL;
i--;
continue;
}
blkno += SCOUTFS_BLOCKS_PER_PAGE;
nr_blocks -= SCOUTFS_BLOCKS_PER_PAGE;
}
if (bio) {
atomic_add(bio->bi_size, &args->bytes_in_flight);
submit_bio(rw, bio);
}
blk_finish_plug(&plug);
dec_end_io(args, 1, ret);
}
struct end_io_completion {
struct completion comp;
int err;
};
static void end_io_complete(struct super_block *sb, void *data, int err)
{
struct end_io_completion *comp = data;
comp->err = err;
complete(&comp->comp);
}
/*
* A synchronous read of the given blocks.
*
* XXX we could make this interruptible.
*/
int scoutfs_bio_read(struct super_block *sb, struct page **pages,
u64 blkno, unsigned int nr_blocks)
{
struct end_io_completion comp = {
.comp = COMPLETION_INITIALIZER(comp.comp),
};
scoutfs_bio_submit(sb, READ, pages, blkno, nr_blocks,
end_io_complete, &comp);
wait_for_completion(&comp.comp);
return comp.err;
}
/* return a pointer to the blk'th 4k block within the array of pages */
void *scoutfs_page_block_address(struct page **pages, unsigned int blk)
{
unsigned int i = blk / SCOUTFS_BLOCKS_PER_PAGE;
unsigned int off = (blk % SCOUTFS_BLOCKS_PER_PAGE) <<
SCOUTFS_BLOCK_SHIFT;
return page_address(pages[i]) + off;
}

kmod/src/bio.h

@@ -0,0 +1,23 @@
#ifndef _SCOUTFS_BIO_H_
#define _SCOUTFS_BIO_H_
/*
* Our little block IO layer is a convenience wrapper that takes our
* block size units and tracks multiple bios per larger IO.
*
* If bios could hold an unlimited number of pages instead of
* BIO_MAX_PAGES then this would just use a single bio directly.
*/
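/*
 * For example, ring.c reads the ring synchronously a chunk at a time:
 *
 *	ret = scoutfs_bio_read(sb, pages, blkno, nr);
 *
 * while seg.c submits an async read of a whole segment and has its
 * end_io function wake waiters:
 *
 *	scoutfs_bio_submit(sb, READ, seg->pages, segno_to_blkno(seg->segno),
 *			   SCOUTFS_SEGMENT_BLOCKS, seg_end_io, seg);
 */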
typedef void (*scoutfs_bio_end_io_t)(struct super_block *sb, void *data,
int err);
void scoutfs_bio_submit(struct super_block *sb, int rw, struct page **pages,
u64 blkno, unsigned int nr_blocks,
scoutfs_bio_end_io_t end_io, void *data);
int scoutfs_bio_read(struct super_block *sb, struct page **pages,
u64 blkno, unsigned int nr_blocks);
void *scoutfs_page_block_address(struct page **pages, unsigned int blk);
#endif

kmod/src/format.h

@@ -6,9 +6,23 @@
/* super block id */
#define SCOUTFS_SUPER_ID 0x2e736674756f6373ULL /* "scoutfs." */
/*
* The super block and ring blocks are fixed 4k.
*/
#define SCOUTFS_BLOCK_SHIFT 12
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
#define SCOUTFS_BLOCKS_PER_PAGE (PAGE_SIZE / SCOUTFS_BLOCK_SIZE)
/*
* FS data is stored in segments; for now they're a fixed size, but
* eventually they'll be dynamic.
*/
#define SCOUTFS_SEGMENT_SHIFT 20
#define SCOUTFS_SEGMENT_SIZE (1 << SCOUTFS_SEGMENT_SHIFT)
#define SCOUTFS_SEGMENT_MASK (SCOUTFS_SEGMENT_SIZE - 1)
#define SCOUTFS_SEGMENT_PAGES (SCOUTFS_SEGMENT_SIZE / PAGE_SIZE)
#define SCOUTFS_SEGMENT_BLOCKS (SCOUTFS_SEGMENT_SIZE / SCOUTFS_BLOCK_SIZE)
#define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE)
#define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT)
@@ -37,6 +51,67 @@ struct scoutfs_block_header {
__le64 blkno;
} __packed;
struct scoutfs_ring_entry_header {
__u8 type;
__le16 len;
} __packed;
#define SCOUTFS_RING_ADD_MANIFEST 1
struct scoutfs_ring_add_manifest {
struct scoutfs_ring_entry_header eh;
__le64 segno;
__le64 seq;
__le16 first_key_len;
__le16 last_key_len;
__u8 level;
/* first and last key bytes */
} __packed;
/*
* This is absurdly huge. If there were only ever 1 item per segment and
* 2^64 items the tree could get this deep.
*/
#define SCOUTFS_MANIFEST_MAX_LEVEL 20
struct scoutfs_ring_block {
struct scoutfs_block_header hdr;
__le32 nr_entries;
struct scoutfs_ring_entry_header entries[0];
} __packed;
struct scoutfs_segment_item {
__le64 seq;
__le32 key_off;
__le32 val_off;
__le16 key_len;
__le16 val_len;
} __packed;
/*
* Each large segment starts with a segment block that describes the
* rest of the blocks that make up the segment.
*/
struct scoutfs_segment_block {
__le32 crc;
__le32 _padding;
__le64 segno;
__le64 max_seq;
__le32 nr_items;
/* item array with gaps so they don't cross 4k blocks */
/* packed keys */
/* packed vals */
} __packed;
/* the first block in the segment has the header and items */
#define SCOUTFS_SEGMENT_FIRST_BLOCK_ITEMS \
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_segment_block)) / \
sizeof(struct scoutfs_segment_item))
/* the rest of the header blocks are full of items */
#define SCOUTFS_SEGMENT_ITEMS_PER_BLOCK \
(SCOUTFS_BLOCK_SIZE / sizeof(struct scoutfs_segment_item))
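/*
 * For example, with 4k blocks and the packed structs above (a 28 byte
 * scoutfs_segment_block and 20 byte scoutfs_segment_items) the first
 * block holds (4096 - 28) / 20 = 203 item structs and each following
 * item block holds 4096 / 20 = 204.
 */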
/*
* Block references include the sequence number so that we can detect
* readers racing with writers and so that we can tell that we don't
@@ -118,6 +193,11 @@ struct scoutfs_key {
#define SCOUTFS_MAX_ITEM_LEN 512
struct scoutfs_inode_key {
__u8 type;
__be64 ino;
} __packed;
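/*
 * The inode number is stored big-endian so that the byte-wise memcmp
 * used on keys (see scoutfs_kvec_memcmp()) sorts inodes in numeric
 * order; set_inode_key() in inode.c builds this key for lookups.
 */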
struct scoutfs_btree_root {
u8 height;
struct scoutfs_block_ref ref;
@@ -180,6 +260,11 @@ struct scoutfs_btree_item {
#define SCOUTFS_UUID_BYTES 16
/*
* The ring fields describe the statically allocated ring log. The
* head and tail indexes are logical 4k block offsets inside the ring.
* The head block should contain the seq.
*/
struct scoutfs_super_block {
struct scoutfs_block_header hdr;
__le64 id;
@@ -187,6 +272,11 @@ struct scoutfs_super_block {
__le64 next_ino;
__le64 total_blocks;
__le64 free_blocks;
__le64 ring_blkno;
__le64 ring_blocks;
__le64 ring_head_index;
__le64 ring_tail_index;
__le64 ring_head_seq;
__le64 buddy_blocks;
struct scoutfs_buddy_root buddy_root;
struct scoutfs_btree_root btree_root;

kmod/src/inode.c

@@ -29,6 +29,8 @@
#include "trans.h"
#include "btree.h"
#include "msg.h"
#include "kvec.h"
#include "item.h"
/*
* XXX
@@ -126,25 +128,28 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
ci->data_version = le64_to_cpu(cinode->data_version);
}
+static void set_inode_key(struct scoutfs_inode_key *ikey, u64 ino)
+{
+ikey->type = SCOUTFS_INODE_KEY;
+ikey->ino = cpu_to_be64(ino);
+}
static int scoutfs_read_locked_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
-struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
-struct scoutfs_btree_val val;
+struct scoutfs_inode_key ikey;
struct scoutfs_inode sinode;
-struct scoutfs_key key;
+SCOUTFS_DECLARE_KVEC(key);
+SCOUTFS_DECLARE_KVEC(val);
int ret;
-scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
-scoutfs_btree_init_val(&val, &sinode, sizeof(sinode));
+set_inode_key(&ikey, scoutfs_ino(inode));
+scoutfs_kvec_init(key, &ikey, sizeof(ikey));
+scoutfs_kvec_init(val, &sinode, sizeof(sinode));
-ret = scoutfs_btree_lookup(sb, meta, &key, &val);
-if (ret == sizeof(sinode)) {
+ret = scoutfs_item_lookup_exact(sb, key, val, sizeof(sinode));
+if (ret == 0)
load_inode(inode, &sinode);
-ret = 0;
-} else if (ret >= 0) {
-ret = -EIO;
-}
return ret;
}

kmod/src/inode.h

@@ -49,4 +49,7 @@ u64 scoutfs_last_ino(struct super_block *sb);
void scoutfs_inode_exit(void);
int scoutfs_inode_init(void);
int scoutfs_item_setup(struct super_block *sb);
void scoutfs_item_destroy(struct super_block *sb);
#endif

kmod/src/item.c

@@ -0,0 +1,217 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include "super.h"
#include "format.h"
#include "kvec.h"
#include "manifest.h"
#include "item.h"
struct item_cache {
spinlock_t lock;
struct rb_root root;
};
struct cached_item {
struct rb_node node;
SCOUTFS_DECLARE_KVEC(key);
SCOUTFS_DECLARE_KVEC(val);
};
static struct cached_item *find_item(struct rb_root *root, struct kvec *key)
{
struct rb_node *node = root->rb_node;
struct rb_node *parent = NULL;
struct cached_item *item;
int cmp;
while (node) {
parent = node;
item = container_of(node, struct cached_item, node);
cmp = scoutfs_kvec_memcmp(key, item->key);
if (cmp < 0)
node = node->rb_left;
else if (cmp > 0)
node = node->rb_right;
else
return item;
}
return NULL;
}
static struct cached_item *insert_item(struct rb_root *root,
struct cached_item *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct cached_item *found = NULL;
struct cached_item *item;
int cmp;
while (*node) {
parent = *node;
item = container_of(*node, struct cached_item, node);
cmp = scoutfs_kvec_memcmp(ins->key, item->key);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
rb_replace_node(&item->node, &ins->node, root);
found = item;
break;
}
}
if (!found) {
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, root);
}
return found;
}
/*
* Find an item with the given key and copy its value into the caller's
* value vector. The number of bytes copied is returned; it can be 0 or
* truncated if the caller's buffer isn't big enough.
*/
int scoutfs_item_lookup(struct super_block *sb, struct kvec *key,
struct kvec *val)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct item_cache *cac = sbi->item_cache;
struct cached_item *item;
unsigned long flags;
int ret;
do {
spin_lock_irqsave(&cac->lock, flags);
item = find_item(&cac->root, key);
if (item)
ret = scoutfs_kvec_memcpy(val, item->val);
else
ret = -ENOENT;
spin_unlock_irqrestore(&cac->lock, flags);
} while (!item && ((ret = scoutfs_manifest_read_items(sb, key)) == 0));
return ret;
}
/*
* This requires that the item at the specified key has a value of the
* same length as the specified value. Callers are asserting that
* mismatched sizes are corruption, so it returns -EIO if the sizes don't
* match. This isn't the fast path so we don't mind the copying
* overhead that comes from only detecting the size mismatch after the
* copy by reusing the more permissive _lookup().
*/
int scoutfs_item_lookup_exact(struct super_block *sb, struct kvec *key,
struct kvec *val, int size)
{
int ret;
ret = scoutfs_item_lookup(sb, key, val);
if (ret >= 0 && ret != size)
ret = -EIO;
return ret;
}
static void free_item(struct cached_item *item)
{
if (!IS_ERR_OR_NULL(item)) {
scoutfs_kvec_kfree(item->val);
scoutfs_kvec_kfree(item->key);
kfree(item);
}
}
/*
* Add an item with the key and value to the item cache. The new item
* is clean. Any existing item at the key will be removed and freed.
*/
int scoutfs_item_insert(struct super_block *sb, struct kvec *key,
struct kvec *val)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct item_cache *cac = sbi->item_cache;
struct cached_item *found;
struct cached_item *item;
unsigned long flags;
int ret;
item = kmalloc(sizeof(struct cached_item), GFP_NOFS);
if (!item)
return -ENOMEM;
ret = scoutfs_kvec_dup_flatten(item->key, key) ?:
scoutfs_kvec_dup_flatten(item->val, val);
if (ret) {
free_item(item);
return ret;
}
spin_lock_irqsave(&cac->lock, flags);
found = insert_item(&cac->root, item);
spin_unlock_irqrestore(&cac->lock, flags);
free_item(found);
return 0;
}
int scoutfs_item_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct item_cache *cac;
cac = kzalloc(sizeof(struct item_cache), GFP_KERNEL);
if (!cac)
return -ENOMEM;
sbi->item_cache = cac;
spin_lock_init(&cac->lock);
cac->root = RB_ROOT;
return 0;
}
void scoutfs_item_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct item_cache *cac = sbi->item_cache;
struct rb_node *node;
struct cached_item *item;
if (cac) {
for (node = rb_first(&cac->root); node; ) {
item = container_of(node, struct cached_item, node);
node = rb_next(node);
free_item(item);
}
kfree(cac);
}
}

kmod/src/item.h

@@ -0,0 +1,16 @@
#ifndef _SCOUTFS_ITEM_H_
#define _SCOUTFS_ITEM_H_
#include <linux/uio.h>
int scoutfs_item_lookup(struct super_block *sb, struct kvec *key,
struct kvec *val);
int scoutfs_item_lookup_exact(struct super_block *sb, struct kvec *key,
struct kvec *val, int size);
int scoutfs_item_insert(struct super_block *sb, struct kvec *key,
struct kvec *val);
int scoutfs_item_setup(struct super_block *sb);
void scoutfs_item_destroy(struct super_block *sb);
#endif

kmod/src/kvec.c

@@ -0,0 +1,141 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/random.h>
#include <linux/statfs.h>
#include "super.h"
#include "format.h"
#include "inode.h"
#include "dir.h"
#include "xattr.h"
#include "msg.h"
#include "block.h"
#include "counters.h"
#include "trans.h"
#include "buddy.h"
#include "kvec.h"
#include "scoutfs_trace.h"
/*
* Return the result of memcmp over the shorter of the two total lengths.
* If the vectors are equal over that length then the shorter vector is
* considered smaller than the longer.
*/
int scoutfs_kvec_memcmp(struct kvec *a, struct kvec *b)
{
int b_off = 0;
int a_off = 0;
int len;
int ret;
while (a->iov_base && b->iov_base) {
len = min(a->iov_len - a_off, b->iov_len - b_off);
ret = memcmp(a->iov_base + a_off, b->iov_base + b_off, len);
if (ret)
return ret;
b_off += len;
if (b_off == b->iov_len)
b++;
a_off += len;
if (a_off == a->iov_len)
a++;
}
return a->iov_base ? 1 : b->iov_base ? -1 : 0;
}
/*
* Returns 0 if [a,b] overlaps with [c,d]. Returns -1 if a < c and
* 1 if b > d.
*/
int scoutfs_kvec_cmp_overlap(struct kvec *a, struct kvec *b,
struct kvec *c, struct kvec *d)
{
return scoutfs_kvec_memcmp(a, c) < 0 ? -1 :
scoutfs_kvec_memcmp(b, d) > 0 ? 1 : 0;
}
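/*
 * For example, the manifest looks up a single key by passing it as both
 * ends of a degenerate range: scoutfs_kvec_cmp_overlap(key, key, first,
 * last) returns 0 exactly when first <= key <= last, which the rbtree
 * walk treats as a match.
 */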
/*
* Set just the pointers and length fields in the dst vector to point to
* the source vector.
*/
void scoutfs_kvec_clone(struct kvec *dst, struct kvec *src)
{
int i;
for (i = 0; i < SCOUTFS_KVEC_NR; i++)
*(dst++) = *(src++);
}
/*
* Copy as much of src as fits in dst. Null base pointers terminate the
* copy. The number of bytes copied is returned. Only the buffers
* pointed to by dst are changed, the kvec elements are not changed.
*/
int scoutfs_kvec_memcpy(struct kvec *dst, struct kvec *src)
{
int src_off = 0;
int dst_off = 0;
int copied = 0;
int len;
while (dst->iov_base && src->iov_base) {
len = min(dst->iov_len - dst_off, src->iov_len - src_off);
memcpy(dst->iov_base + dst_off, src->iov_base + src_off, len);
copied += len;
src_off += len;
if (src_off == src->iov_len)
src++;
dst_off += len;
if (dst_off == dst->iov_len)
dst++;
}
return copied;
}
/*
* Copy the src key vector into one new allocation in the dst. The existing
* dst is clobbered. The source isn't changed.
*/
int scoutfs_kvec_dup_flatten(struct kvec *dst, struct kvec *src)
{
void *ptr;
size_t len = scoutfs_kvec_length(src);
ptr = kmalloc(len, GFP_NOFS);
if (!ptr)
return -ENOMEM;
scoutfs_kvec_init(dst, ptr, len);
scoutfs_kvec_memcpy(dst, src);
return 0;
}
/*
* Free all the set pointers in the kvec. The pointer values themselves
* aren't cleared after being freed.
*/
void scoutfs_kvec_kfree(struct kvec *kvec)
{
while (kvec->iov_base)
kfree((kvec++)->iov_base);
}

kmod/src/kvec.h

@@ -0,0 +1,67 @@
#ifndef _SCOUTFS_KVEC_H_
#define _SCOUTFS_KVEC_H_
#include <linux/uio.h>
/*
* The item APIs use kvecs to represent variable size item keys and
* values.
*/
/*
* This ends up defining the max item size as (SCOUTFS_KVEC_NR - 1) * PAGE_SIZE.
*/
#define SCOUTFS_KVEC_NR 4
#define SCOUTFS_DECLARE_KVEC(name) \
struct kvec name[SCOUTFS_KVEC_NR]
static inline void scoutfs_kvec_init_all(struct kvec *kvec,
void *ptr0, size_t len0,
void *ptr1, size_t len1,
void *ptr2, size_t len2,
void *ptr3, size_t len3, ...)
{
BUG_ON(ptr3 != NULL);
kvec[0].iov_base = ptr0;
kvec[0].iov_len = len0;
kvec[1].iov_base = ptr1;
kvec[1].iov_len = len1;
kvec[2].iov_base = ptr2;
kvec[2].iov_len = len2;
kvec[3].iov_base = ptr3;
kvec[3].iov_len = len3;
}
/*
* Provide a nice variadic initialization function without having to
* iterate over the callers arg types. We play some macro games to pad
* out the callers ptr/len pairs to the full possible number. This will
* produce confusing errors if an odd number of arguments is given and
* the padded ptr/length types aren't compatible with the fixed
* arguments in the static inline.
*/
#define scoutfs_kvec_init(val, ...) \
scoutfs_kvec_init_all(val, __VA_ARGS__, NULL, 0, NULL, 0, NULL, 0)
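/*
 * For example, initializing a key vector from a single struct, as the
 * inode code does:
 *
 *	SCOUTFS_DECLARE_KVEC(key);
 *	struct scoutfs_inode_key ikey;
 *
 *	scoutfs_kvec_init(key, &ikey, sizeof(ikey));
 *
 * expands to
 *
 *	scoutfs_kvec_init_all(key, &ikey, sizeof(ikey),
 *			      NULL, 0, NULL, 0, NULL, 0);
 */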
static inline int scoutfs_kvec_length(struct kvec *kvec)
{
BUILD_BUG_ON(sizeof(struct kvec) != sizeof(struct iovec));
BUILD_BUG_ON(offsetof(struct kvec, iov_len) !=
offsetof(struct iovec, iov_len));
BUILD_BUG_ON(member_sizeof(struct kvec, iov_len) !=
member_sizeof(struct iovec, iov_len));
return iov_length((struct iovec *)kvec, SCOUTFS_KVEC_NR);
}
void scoutfs_kvec_clone(struct kvec *dst, struct kvec *src);
int scoutfs_kvec_memcmp(struct kvec *a, struct kvec *b);
int scoutfs_kvec_cmp_overlap(struct kvec *a, struct kvec *b,
struct kvec *c, struct kvec *d);
int scoutfs_kvec_memcpy(struct kvec *dst, struct kvec *src);
int scoutfs_kvec_dup_flatten(struct kvec *dst, struct kvec *src);
void scoutfs_kvec_kfree(struct kvec *kvec);
#endif

kmod/src/manifest.c

@@ -0,0 +1,449 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include "super.h"
#include "format.h"
#include "kvec.h"
#include "seg.h"
#include "item.h"
#include "manifest.h"
struct manifest {
spinlock_t lock;
struct list_head level0_list;
unsigned int level0_nr;
u8 last_level;
struct rb_root level_roots[SCOUTFS_MANIFEST_MAX_LEVEL + 1];
};
#define DECLARE_MANIFEST(sb, name) \
struct manifest *name = SCOUTFS_SB(sb)->manifest
struct manifest_entry {
union {
struct list_head level0_entry;
struct rb_node node;
};
struct kvec *first;
struct kvec *last;
u64 segno;
u64 seq;
u8 level;
};
/*
* A set of read refs tracks all the segments, from level 0 down to the
* last level, that overlap with the search key.
*/
struct manifest_ref {
u64 segno;
u64 seq;
struct scoutfs_segment *seg;
int pos;
u8 level;
};
static struct manifest_entry *find_ment(struct rb_root *root, struct kvec *key)
{
struct rb_node *node = root->rb_node;
struct manifest_entry *ment;
int cmp;
while (node) {
ment = container_of(node, struct manifest_entry, node);
cmp = scoutfs_kvec_cmp_overlap(key, key,
ment->first, ment->last);
if (cmp < 0)
node = node->rb_left;
else if (cmp > 0)
node = node->rb_right;
else
return ment;
}
return NULL;
}
/*
* Insert a new entry into one of the L1+ trees. There should never be
* entries that overlap.
*/
static int insert_ment(struct rb_root *root, struct manifest_entry *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct manifest_entry *ment;
int cmp;
while (*node) {
parent = *node;
ment = container_of(*node, struct manifest_entry, node);
cmp = scoutfs_kvec_cmp_overlap(ins->first, ins->last,
ment->first, ment->last);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
return -EEXIST;
}
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, root);
return 0;
}
static void free_ment(struct manifest_entry *ment)
{
if (!IS_ERR_OR_NULL(ment)) {
scoutfs_kvec_kfree(ment->first);
scoutfs_kvec_kfree(ment->last);
kfree(ment);
}
}
static int add_ment(struct manifest *mani, struct manifest_entry *ment)
{
int ret;
if (ment->level) {
ret = insert_ment(&mani->level_roots[ment->level], ment);
if (!ret)
mani->last_level = max(mani->last_level, ment->level);
} else {
list_add_tail(&ment->level0_entry, &mani->level0_list);
mani->level0_nr++;
ret = 0;
}
return ret;
}
static void update_last_level(struct manifest *mani)
{
int i;
for (i = mani->last_level;
i > 0 && RB_EMPTY_ROOT(&mani->level_roots[i]); i--)
;
mani->last_level = i;
}
static void remove_ment(struct manifest *mani, struct manifest_entry *ment)
{
if (ment->level) {
rb_erase(&ment->node, &mani->level_roots[ment->level]);
update_last_level(mani);
} else {
list_del_init(&ment->level0_entry);
mani->level0_nr--;
}
}
int scoutfs_manifest_add(struct super_block *sb, struct kvec *first,
struct kvec *last, u64 segno, u64 seq, u8 level)
{
DECLARE_MANIFEST(sb, mani);
struct manifest_entry *ment;
unsigned long flags;
int ret;
ment = kmalloc(sizeof(struct manifest_entry), GFP_NOFS);
if (!ment)
return -ENOMEM;
ret = scoutfs_kvec_dup_flatten(ment->first, first) ?:
scoutfs_kvec_dup_flatten(ment->last, last);
if (ret) {
free_ment(ment);
return -ENOMEM;
}
ment->segno = segno;
ment->seq = seq;
ment->level = level;
/* XXX think about where to insert level 0 */
spin_lock_irqsave(&mani->lock, flags);
ret = add_ment(mani, ment);
spin_unlock_irqrestore(&mani->lock, flags);
if (WARN_ON_ONCE(ret)) /* XXX can this happen? ring corruption? */
free_ment(ment);
return ret;
}
static void set_ref(struct manifest_ref *ref, struct manifest_entry *mani)
{
ref->segno = mani->segno;
ref->seq = mani->seq;
ref->level = mani->level;
}
/*
* Returns refs if intersecting segments are found, NULL if none intersect,
* and PTR_ERR on failure.
*/
static struct manifest_ref *get_key_refs(struct manifest *mani,
struct kvec *key,
unsigned int *nr_ret)
{
struct manifest_ref *refs = NULL;
struct manifest_entry *ment;
struct rb_root *root;
unsigned long flags;
unsigned int total;
unsigned int nr = 0;
int i;
spin_lock_irqsave(&mani->lock, flags);
total = mani->level0_nr + mani->last_level;
while (nr != total) {
nr = total;
spin_unlock_irqrestore(&mani->lock, flags);
kfree(refs);
refs = kcalloc(total, sizeof(struct manifest_ref), GFP_NOFS);
if (!refs)
return ERR_PTR(-ENOMEM);
spin_lock_irqsave(&mani->lock, flags);
/* re-check the count now that the lock has been retaken */
total = mani->level0_nr + mani->last_level;
}
nr = 0;
list_for_each_entry(ment, &mani->level0_list, level0_entry) {
if (scoutfs_kvec_cmp_overlap(key, key,
ment->first, ment->last))
continue;
set_ref(&refs[nr++], ment);
}
for (i = 1; i <= mani->last_level; i++) {
root = &mani->level_roots[i];
if (RB_EMPTY_ROOT(root))
continue;
ment = find_ment(root, key);
if (ment)
set_ref(&refs[nr++], ment);
}
spin_unlock_irqrestore(&mani->lock, flags);
*nr_ret = nr;
if (!nr) {
kfree(refs);
refs = NULL;
}
return refs;
}
/*
* The caller didn't find an item for the given key in the item cache
* and wants us to search for it in the lsm segments. We search the
* manifest for all the segments that contain the key. We then read the
* segments and iterate over their items looking for ours. We insert it
* and some number of other surrounding items to amortize the relatively
* expensive multi-segment searches.
*
* This is asking the seg code to read each entire segment. The seg
* code could give us helpers to submit and wait on blocks within the
* segment so that we don't have wild bandwidth amplification in the
* cold random read case.
*
* The segments are immutable at this point so we can use their contents
* as long as we hold refs.
*/
int scoutfs_manifest_read_items(struct super_block *sb, struct kvec *key)
{
DECLARE_MANIFEST(sb, mani);
SCOUTFS_DECLARE_KVEC(item_key);
SCOUTFS_DECLARE_KVEC(item_val);
SCOUTFS_DECLARE_KVEC(found_key);
SCOUTFS_DECLARE_KVEC(found_val);
struct scoutfs_segment *seg;
struct manifest_ref *refs;
unsigned long had_found;
bool found;
int ret = 0;
int err;
int nr_refs;
int cmp;
int i;
int n;
refs = get_key_refs(mani, key, &nr_refs);
if (IS_ERR(refs))
return PTR_ERR(refs);
if (!refs)
return -ENOENT;
/* submit reads for all the segments */
for (i = 0; i < nr_refs; i++) {
seg = scoutfs_seg_submit_read(sb, refs[i].segno);
if (IS_ERR(seg)) {
ret = PTR_ERR(seg);
break;
}
refs[i].seg = seg;
}
/* wait for submitted segments and search if we haven't seen failure */
for (n = 0; n < i; n++) {
seg = refs[n].seg;
err = scoutfs_seg_wait(sb, seg);
if (err && !ret)
ret = err;
if (!ret)
refs[n].pos = scoutfs_seg_find_pos(seg, key);
}
/* done if we saw errors */
if (ret)
goto out;
/* walk sorted items, resolving across segments, and insert */
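/*
 * For example, if the search key is present in both a level 0 segment
 * and a level 1 segment, the level 0 ref comes first in the array so
 * its copy of the item is the one remembered and inserted; both refs
 * are marked in had_found and advanced past that key, and the next
 * pass picks up whichever following key sorts lowest.
 */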
for (n = 0; n < 16; n++) {
found = false;
/* find the most recent least key */
for (i = 0; i < nr_refs; i++) {
seg = refs[i].seg;
if (!seg)
continue;
/* get kvecs, removing if we ran out of items */
ret = scoutfs_seg_item_kvecs(seg, refs[i].pos,
item_key, item_val);
if (ret < 0) {
scoutfs_seg_put(seg);
refs[i].seg = NULL;
continue;
}
if (found) {
cmp = scoutfs_kvec_memcmp(item_key, found_key);
if (cmp >= 0) {
if (cmp == 0)
set_bit(i, &had_found);
continue;
}
}
/* remember new least key */
scoutfs_kvec_clone(found_key, item_key);
scoutfs_kvec_clone(found_val, item_val);
found = true;
had_found = 0;
set_bit(i, &had_found);
}
/* return -ENOENT if we didn't find any or the callers item */
if (n == 0 &&
(!found || scoutfs_kvec_memcmp(key, found_key))) {
ret = -ENOENT;
break;
}
if (!found) {
ret = 0;
break;
}
ret = scoutfs_item_insert(sb, item_key, item_val);
if (ret)
break;
/* advance all the positions past the found key */
for_each_set_bit(i, &had_found, BITS_PER_LONG)
refs[i].pos++;
}
out:
for (i = 0; i < nr_refs; i++)
scoutfs_seg_put(refs[i].seg);
kfree(refs);
return ret;
}
int scoutfs_manifest_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct manifest *mani;
int i;
mani = kzalloc(sizeof(struct manifest), GFP_KERNEL);
if (!mani)
return -ENOMEM;
sbi->manifest = mani;
spin_lock_init(&mani->lock);
INIT_LIST_HEAD(&mani->level0_list);
for (i = 0; i < ARRAY_SIZE(mani->level_roots); i++)
mani->level_roots[i] = RB_ROOT;
return 0;
}
void scoutfs_manifest_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct manifest *mani = sbi->manifest;
struct manifest_entry *ment;
struct manifest_entry *tmp;
struct rb_node *node;
struct rb_root *root;
int i;
if (!mani)
return;
for (i = 1; i <= mani->last_level; i++) {
root = &mani->level_roots[i];
for (node = rb_first(root); node; ) {
ment = container_of(node, struct manifest_entry, node);
node = rb_next(node);
remove_ment(mani, ment);
free_ment(ment);
}
}
list_for_each_entry_safe(ment, tmp, &mani->level0_list, level0_entry) {
remove_ment(mani, ment);
free_ment(ment);
}
kfree(mani);
}

kmod/src/manifest.h

@@ -0,0 +1,11 @@
#ifndef _SCOUTFS_MANIFEST_H_
#define _SCOUTFS_MANIFEST_H_
int scoutfs_manifest_add(struct super_block *sb, struct kvec *first,
struct kvec *last, u64 segno, u64 seq, u8 level);
int scoutfs_manifest_read_items(struct super_block *sb, struct kvec *key);
int scoutfs_manifest_setup(struct super_block *sb);
void scoutfs_manifest_destroy(struct super_block *sb);
#endif

kmod/src/ring.c

@@ -0,0 +1,263 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include "super.h"
#include "format.h"
#include "kvec.h"
#include "bio.h"
#include "manifest.h"
#include "ring.h"
/*
* OK, log:
* - big preallocated ring of variable length entries
* - entries are rounded to 4k blocks
* - entire thing is read and indexed in rbtree
* - static allocated page is kept around to record and write entries
* - indexes have cursor that points to next node to migrate
* - any time an entry is written an entry is migrated
* - allocate room for 4x (maybe including worst case rounding)
* - mount does binary search looking for newest entry
* - newest entry describes block where we started migrating
* - replay then walks from oldest to newest replaying
* - entries are marked with migration so we know where to set cursor after
*
* XXX
* - verify blocks
* - could compress
*/
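/*
 * A ring block with a single manifest addition looks roughly like:
 *
 *	struct scoutfs_ring_block (hdr, nr_entries = 1)
 *	struct scoutfs_ring_add_manifest (eh.type, eh.len, segno, seq, ...)
 *	first key bytes immediately after the struct
 *	last key bytes immediately after the first key
 *
 * read_entries() below steps to the next entry by adding eh.len, so an
 * entry's len has to cover its header, fixed fields and trailing keys.
 */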
/* read in a meg at a time */
#define NR_PAGES DIV_ROUND_UP(1024 * 1024, PAGE_SIZE)
#define NR_BLOCKS (NR_PAGES * SCOUTFS_BLOCKS_PER_PAGE)
#if 0
#define BLOCKS_PER_PAGE (PAGE_SIZE / SCOUTFS_BLOCK_SIZE)
static void read_page_end_io(struct bio *bio, int err)
{
struct bio_vec *bvec;
struct page *page;
unsigned long i;
for_each_bio_segment(bio, bvec, i) {
page = bvec->bv_page;
if (err)
SetPageError(page);
else
SetPageUptodate(page);
unlock_page(page);
}
bio_put(bio);
}
/*
* Read the given number of 4k blocks into the pages provided by the
* caller. We translate the block count into a page count and fill
* bios a page at a time.
*/
static int read_blocks(struct super_block *sb, struct page **pages,
u64 blkno, unsigned int nr_blocks)
{
unsigned int nr_pages = DIV_ROUND_UP(nr_blocks, PAGES_PER_BLOCK);
unsigned int bytes;
struct bio *bio;
int ret = 0;
for (i = 0; i < nr_pages; i++) {
page = pages[i];
if (!bio) {
bio = bio_alloc(GFP_NOFS, nr_pages - i);
if (!bio)
bio = bio_alloc(GFP_NOFS, 1);
if (!bio) {
ret = -ENOMEM;
break;
}
bio->bi_sector = blkno << (SCOUTFS_BLOCK_SHIFT - 9);
bio->bi_bdev = sb->s_bdev;
bio->bi_end_io = read_pages_end_io;
}
lock_page(page);
ClearPageError(page);
ClearPageUptodate(page);
bytes = min(nr_blocks << SCOUTFS_BLOCK_SHIFT, PAGE_SIZE);
if (bio_add_page(bio, page, bytes, 0) != bytes) {
/* submit the full bio and retry this page */
submit_bio(READ, bio);
bio = NULL;
unlock_page(page);
i--;
continue;
}
blkno += BLOCKS_PER_PAGE;
nr_blocks -= BLOCKS_PER_PAGE;
}
if (bio)
submit_bio(READ, bio);
for (i = 0; i < nr_pages; i++) {
page = pages[i];
wait_on_page_locked(page);
if (!ret && (!PageUptodate(page) || PageError(page)))
ret = -EIO;
}
return ret;
}
#endif
static int read_one_entry(struct super_block *sb,
struct scoutfs_ring_entry_header *eh)
{
struct scoutfs_ring_add_manifest *am;
SCOUTFS_DECLARE_KVEC(first);
SCOUTFS_DECLARE_KVEC(last);
int ret;
switch(eh->type) {
case SCOUTFS_RING_ADD_MANIFEST:
am = container_of(eh, struct scoutfs_ring_add_manifest, eh);
scoutfs_kvec_init(first, am + 1,
le16_to_cpu(am->first_key_len));
scoutfs_kvec_init(last,
first[0].iov_base + first[0].iov_len,
le16_to_cpu(am->last_key_len));
ret = scoutfs_manifest_add(sb, first, last,
le64_to_cpu(am->segno),
le64_to_cpu(am->seq), am->level);
break;
default:
ret = -EINVAL;
}
return ret;
}
static int read_entries(struct super_block *sb,
struct scoutfs_ring_block *ring)
{
struct scoutfs_ring_entry_header *eh;
int ret = 0;
int i;
eh = ring->entries;
for (i = 0; i < le32_to_cpu(ring->nr_entries); i++) {
ret = read_one_entry(sb, eh);
if (ret)
break;
eh = (void *)eh + le16_to_cpu(eh->len);
}
return ret;
}
#if 0
/* return pointer to the blk 4k block offset amongst the pages */
static void *page_block_address(struct page **pages, unsigned int blk)
{
unsigned int i = blk / BLOCKS_PER_PAGE;
unsigned int off = (blk % BLOCKS_PER_PAGE) << SCOUTFS_BLOCK_SHIFT;
return page_address(pages[i]) + off;
}
#endif
int scoutfs_ring_read(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_ring_block *ring;
struct page **pages;
struct page *page;
u64 index;
u64 blkno;
u64 tail;
u64 seq;
int ret;
int nr;
int i;
/* nr_blocks/pages calc doesn't handle multiple pages per block */
BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SIZE);
pages = kcalloc(NR_PAGES, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
for (i = 0; i < NR_PAGES; i++) {
page = alloc_page(GFP_NOFS);
if (!page) {
ret = -ENOMEM;
goto out;
}
pages[i] = page;
}
index = le64_to_cpu(super->ring_head_index);
tail = le64_to_cpu(super->ring_tail_index);
seq = le64_to_cpu(super->ring_head_seq);
do {
blkno = le64_to_cpu(super->ring_blkno) + index;
if (index <= tail)
nr = tail - index + 1;
else
nr = le64_to_cpu(super->ring_blocks) - index;
nr = min_t(int, nr, NR_BLOCKS);
ret = scoutfs_bio_read(sb, pages, blkno, nr);
if (ret)
goto out;
/* XXX verify block header */
for (i = 0; i < nr; i++) {
ring = scoutfs_page_block_address(pages, i);
ret = read_entries(sb, ring);
if (ret)
goto out;
}
index += nr;
if (index == le64_to_cpu(super->ring_blocks))
index = 0;
} while (index != tail);
out:
for (i = 0; i < NR_PAGES && pages && pages[i]; i++)
__free_page(pages[i]);
kfree(pages);
return ret;
}

kmod/src/ring.h

@@ -0,0 +1,8 @@
#ifndef _SCOUTFS_RING_H_
#define _SCOUTFS_RING_H_
#include <linux/uio.h>
int scoutfs_ring_read(struct super_block *sb);
#endif

kmod/src/seg.c

@@ -0,0 +1,399 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include "super.h"
#include "format.h"
#include "seg.h"
#include "bio.h"
#include "kvec.h"
/*
* seg.c should just be about the cache and io, and maybe
* iteration and stuff.
*
* XXX:
* - lru and shrinker
* - verify csum
* - make sure item headers don't cross page boundaries
* - just wait on pages instead of weird flags?
*/
struct segment_cache {
spinlock_t lock;
struct rb_root root;
wait_queue_head_t waitq;
};
struct scoutfs_segment {
struct rb_node node;
atomic_t refcount;
u64 segno;
unsigned long flags;
int err;
struct page *pages[SCOUTFS_SEGMENT_PAGES];
};
enum {
SF_END_IO = 0,
};
static struct scoutfs_segment *alloc_seg(u64 segno)
{
struct scoutfs_segment *seg;
struct page *page;
int i;
/* don't waste the tail of pages */
BUILD_BUG_ON(SCOUTFS_SEGMENT_SIZE % PAGE_SIZE);
seg = kzalloc(sizeof(struct scoutfs_segment), GFP_NOFS);
if (!seg)
return seg;
RB_CLEAR_NODE(&seg->node);
atomic_set(&seg->refcount, 1);
seg->segno = segno;
for (i = 0; i < SCOUTFS_SEGMENT_PAGES; i++) {
page = alloc_page(GFP_NOFS);
if (!page) {
scoutfs_seg_put(seg);
return ERR_PTR(-ENOMEM);
}
seg->pages[i] = page;
}
return seg;
}
void scoutfs_seg_put(struct scoutfs_segment *seg)
{
int i;
if (!IS_ERR_OR_NULL(seg) && atomic_dec_and_test(&seg->refcount)) {
WARN_ON_ONCE(!RB_EMPTY_NODE(&seg->node));
for (i = 0; i < SCOUTFS_SEGMENT_PAGES; i++)
if (seg->pages[i])
__free_page(seg->pages[i]);
kfree(seg);
}
}
static int cmp_u64s(u64 a, u64 b)
{
return a < b ? -1 : a > b ? 1 : 0;
}
static struct scoutfs_segment *find_seg(struct rb_root *root, u64 segno)
{
struct rb_node *node = root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_segment *seg;
int cmp;
while (node) {
parent = node;
seg = container_of(node, struct scoutfs_segment, node);
cmp = cmp_u64s(segno, seg->segno);
if (cmp < 0)
node = node->rb_left;
else if (cmp > 0)
node = node->rb_right;
else
return seg;
}
return NULL;
}
/*
* This always inserts the segment into the rbtree. If there's already
* a segment at the given segno then it is removed and returned. The caller
* doesn't have to erase it from the tree if it's returned.
*/
static struct scoutfs_segment *replace_seg(struct rb_root *root,
struct scoutfs_segment *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_segment *seg;
struct scoutfs_segment *found = NULL;
int cmp;
while (*node) {
parent = *node;
seg = container_of(*node, struct scoutfs_segment, node);
cmp = cmp_u64s(ins->segno, seg->segno);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
rb_replace_node(&seg->node, &ins->node, root);
found = seg;
break;
}
}
if (!found) {
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, root);
}
return found;
}
static bool erase_seg(struct rb_root *root, struct scoutfs_segment *seg)
{
if (!RB_EMPTY_NODE(&seg->node)) {
rb_erase(&seg->node, root);
RB_CLEAR_NODE(&seg->node);
return true;
}
return false;
}
static void seg_end_io(struct super_block *sb, void *data, int err)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct segment_cache *cac = sbi->segment_cache;
struct scoutfs_segment *seg = data;
unsigned long flags;
bool erased;
if (err) {
seg->err = err;
spin_lock_irqsave(&cac->lock, flags);
erased = erase_seg(&cac->root, seg);
spin_unlock_irqrestore(&cac->lock, flags);
if (erased)
scoutfs_seg_put(seg);
}
set_bit(SF_END_IO, &seg->flags);
smp_mb__after_atomic();
if (waitqueue_active(&cac->waitq))
wake_up(&cac->waitq);
scoutfs_seg_put(seg);
}
static u64 segno_to_blkno(u64 blkno)
{
return blkno << (SCOUTFS_SEGMENT_SHIFT - SCOUTFS_BLOCK_SHIFT);
}
/*
* The bios submitted by this don't have page references themselves. If
* this succeeds then the caller must call _wait before putting their
* seg ref.
*/
struct scoutfs_segment *scoutfs_seg_submit_read(struct super_block *sb,
u64 segno)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct segment_cache *cac = sbi->segment_cache;
struct scoutfs_segment *existing;
struct scoutfs_segment *seg;
unsigned long flags;
spin_lock_irqsave(&cac->lock, flags);
seg = find_seg(&cac->root, segno);
if (seg)
atomic_inc(&seg->refcount);
spin_unlock_irqrestore(&cac->lock, flags);
if (seg)
return seg;
seg = alloc_seg(segno);
if (IS_ERR(seg))
return seg;
/* always drop existing segs, could compare seqs */
spin_lock_irqsave(&cac->lock, flags);
atomic_inc(&seg->refcount);
existing = replace_seg(&cac->root, seg);
spin_unlock_irqrestore(&cac->lock, flags);
if (existing)
scoutfs_seg_put(existing);
atomic_inc(&seg->refcount);
scoutfs_bio_submit(sb, READ, seg->pages, segno_to_blkno(seg->segno),
SCOUTFS_SEGMENT_BLOCKS, seg_end_io, seg);
return seg;
}
int scoutfs_seg_wait(struct super_block *sb, struct scoutfs_segment *seg)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct segment_cache *cac = sbi->segment_cache;
int ret;
ret = wait_event_interruptible(cac->waitq,
test_bit(SF_END_IO, &seg->flags));
if (!ret)
ret = seg->err;
return ret;
}
static void *off_ptr(struct scoutfs_segment *seg, u32 off)
{
unsigned int pg = off >> PAGE_SHIFT;
unsigned int pg_off = off & ~PAGE_MASK;
return page_address(seg->pages[pg]) + pg_off;
}
/*
* Return a pointer to the item in the array at the given position.
*
* The item structs fill the first block in the segment after the
* initial segment block struct. Item structs don't cross block
* boundaries so the final bytes that would make up a partial item
* struct are skipped.
*/
static struct scoutfs_segment_item *pos_item(struct scoutfs_segment *seg,
int pos)
{
u32 off;
if (pos < SCOUTFS_SEGMENT_FIRST_BLOCK_ITEMS) {
off = sizeof(struct scoutfs_segment_block);
} else {
pos -= SCOUTFS_SEGMENT_FIRST_BLOCK_ITEMS;
off = (1 + (pos / SCOUTFS_SEGMENT_ITEMS_PER_BLOCK)) *
SCOUTFS_BLOCK_SIZE;
pos %= SCOUTFS_SEGMENT_ITEMS_PER_BLOCK;
}
return off_ptr(seg, off + (pos * sizeof(struct scoutfs_segment_item)));
}
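/*
 * For example, with 4k blocks and the packed format.h structs (28 byte
 * segment block, 20 byte items) the first block holds 203 items, so
 * item 250 is item 47 of the second block and lives at byte offset
 * 4096 + 47 * 20 = 5036 in the segment.
 */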
static void kvec_from_pages(struct scoutfs_segment *seg,
struct kvec *kvec, u32 off, u16 len)
{
u32 first;
first = min_t(int, len, PAGE_SIZE - (off & ~PAGE_MASK));
if (first == len)
scoutfs_kvec_init(kvec, off_ptr(seg, off), len);
else
scoutfs_kvec_init(kvec, off_ptr(seg, off), first,
off_ptr(seg, off + first), len - first);
}
int scoutfs_seg_item_kvecs(struct scoutfs_segment *seg, int pos,
struct kvec *key, struct kvec *val)
{
struct scoutfs_segment_block *sblk = off_ptr(seg, 0);
struct scoutfs_segment_item *item;
if (pos < 0 || pos >= le32_to_cpu(sblk->nr_items))
return -ENOENT;
item = pos_item(seg, pos);
if (key)
kvec_from_pages(seg, key, le32_to_cpu(item->key_off),
le16_to_cpu(item->key_len));
if (val)
kvec_from_pages(seg, val, le32_to_cpu(item->val_off),
le16_to_cpu(item->val_len));
return 0;
}
/*
* Find the first item array position whose key is >= the search key.
* This can return the number of positions if the key is greater than
* all the keys.
*/
static int find_key_pos(struct scoutfs_segment *seg, struct kvec *search)
{
struct scoutfs_segment_block *sblk = off_ptr(seg, 0);
SCOUTFS_DECLARE_KVEC(key);
unsigned int start = 0;
unsigned int end = le32_to_cpu(sblk->nr_items);
unsigned int pos = 0;
int cmp;
while (start < end) {
pos = start + (end - start) / 2;
scoutfs_seg_item_kvecs(seg, pos, key, NULL);
cmp = scoutfs_kvec_memcmp(search, key);
if (cmp < 0)
end = pos;
else if (cmp > 0)
start = ++pos;
else
break;
}
return pos;
}
int scoutfs_seg_find_pos(struct scoutfs_segment *seg, struct kvec *key)
{
return find_key_pos(seg, key);
}
int scoutfs_seg_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct segment_cache *cac;
cac = kzalloc(sizeof(struct segment_cache), GFP_KERNEL);
if (!cac)
return -ENOMEM;
sbi->segment_cache = cac;
spin_lock_init(&cac->lock);
cac->root = RB_ROOT;
init_waitqueue_head(&cac->waitq);
return 0;
}
void scoutfs_seg_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct segment_cache *cac = sbi->segment_cache;
struct scoutfs_segment *seg;
struct rb_node *node;
if (cac) {
for (node = rb_first(&cac->root); node; ) {
seg = container_of(node, struct scoutfs_segment, node);
node = rb_next(node);
erase_seg(&cac->root, seg);
scoutfs_seg_put(seg);
}
kfree(cac);
}
}

kmod/src/seg.h

@@ -0,0 +1,20 @@
#ifndef _SCOUTFS_SEG_H_
#define _SCOUTFS_SEG_H_
struct scoutfs_segment;
struct kvec;
struct scoutfs_segment *scoutfs_seg_submit_read(struct super_block *sb,
u64 segno);
int scoutfs_seg_wait(struct super_block *sb, struct scoutfs_segment *seg);
int scoutfs_seg_find_pos(struct scoutfs_segment *seg, struct kvec *key);
int scoutfs_seg_item_kvecs(struct scoutfs_segment *seg, int pos,
struct kvec *key, struct kvec *val);
void scoutfs_seg_put(struct scoutfs_segment *seg);
int scoutfs_seg_setup(struct super_block *sb);
void scoutfs_seg_destroy(struct super_block *sb);
#endif

kmod/src/super.c

@@ -28,6 +28,10 @@
#include "counters.h"
#include "trans.h"
#include "buddy.h"
#include "ring.h"
#include "item.h"
#include "manifest.h"
#include "seg.h"
#include "scoutfs_trace.h"
static struct kset *scoutfs_kset;
@@ -212,7 +216,11 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
ret = scoutfs_setup_counters(sb) ?:
read_supers(sb) ?:
-scoutfs_buddy_setup(sb) ?:
+scoutfs_seg_setup(sb) ?:
+scoutfs_manifest_setup(sb) ?:
+scoutfs_item_setup(sb) ?:
+scoutfs_ring_read(sb) ?:
+// scoutfs_buddy_setup(sb) ?:
scoutfs_setup_trans(sb);
if (ret)
return ret;
@@ -227,7 +235,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb->s_root)
return -ENOMEM;
-scoutfs_scan_orphans(sb);
+// scoutfs_scan_orphans(sb);
return 0;
}
@@ -248,6 +256,9 @@ static void scoutfs_kill_sb(struct super_block *sb)
scoutfs_buddy_destroy(sb);
if (sbi->block_shrinker.shrink == scoutfs_block_shrink)
unregister_shrinker(&sbi->block_shrinker);
scoutfs_item_destroy(sb);
scoutfs_manifest_destroy(sb);
scoutfs_seg_destroy(sb);
scoutfs_block_destroy(sb);
scoutfs_destroy_counters(sb);
if (sbi->kset)

kmod/src/super.h

@@ -9,6 +9,9 @@
struct scoutfs_counters;
struct buddy_info;
struct item_cache;
struct manifest;
struct segment_cache;
struct scoutfs_sb_info {
struct super_block *sb;
@@ -28,6 +31,10 @@ struct scoutfs_sb_info {
struct list_head block_lru_list;
unsigned long block_lru_nr;
struct manifest *manifest;
struct item_cache *item_cache;
struct segment_cache *segment_cache;
struct buddy_info *buddy_info;
struct rw_semaphore btree_rwsem;