scoutfs: reference file data with extent items

Our first attempt at storing file data put it in items.  This was easy
to implement but won't be acceptable in the long term.  LSM indexing's
power comes at the cost of compaction overhead.  That's acceptable for
fine-grained metadata but totally unacceptable for bulk file data.

This switches to storing file data in separate block allocations which
are referenced by extent items.

The bulk of the change is the mechanics of working with extents.  We
have high-level callers which add or remove logical extents and then
underlying mechanisms that insert, merge, or split the items that the
extents are stored in.
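
As a sketch of the merge test the item layer needs (the struct and
helper here are hypothetical, not the patch's actual interfaces): a
neighbouring extent can only be absorbed when it continues the current
one in both the logical and physical spaces.

struct extent {
	u64 blk_off;	/* logical starting block */
	u64 blkno;	/* physical starting block */
	u64 blocks;	/* length in blocks */
};

/* true when "right" directly continues "left" logically and physically */
static bool extents_mergeable(struct extent *left, struct extent *right)
{
	return left->blk_off + left->blocks == right->blk_off &&
	       left->blkno + left->blocks == right->blkno;
}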

We have three types of extent items.  The primary type maps logical file
regions to physical block extents.  The next two store free extents
per-node so that clients don't create lock and LSM contention as they
try to allocate extents.
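
The big-endian key fields mean items sort by memcmp() of their packed
keys in numeric field order, so the same free extent is indexed twice:
the _BLKNO_ key sorts by position for merging with neighbours, and the
_BLOCKS_ key sorts by size so allocation can seek to the first free
extent with enough blocks.  A sketch of filling both keys for one free
extent, using the structs from format.h below (fill_free_keys() itself
is hypothetical):

static void fill_free_keys(u64 node_id, u64 last_blkno, u64 blocks,
			   struct scoutfs_free_extent_blkno_key *pos,
			   struct scoutfs_free_extent_blocks_key *len)
{
	pos->type = SCOUTFS_FREE_EXTENT_BLKNO_KEY;
	pos->node_id = cpu_to_be64(node_id);
	pos->last_blkno = cpu_to_be64(last_blkno);
	pos->blocks = cpu_to_be64(blocks);

	len->type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY;
	len->node_id = cpu_to_be64(node_id);
	len->blocks = cpu_to_be64(blocks);	/* size sorts first */
	len->last_blkno = cpu_to_be64(last_blkno);
}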

To fill those per-node free extents we add messages that communicate free
extents in the form of lists of segment allocations from the server.
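
On the client, each returned segno becomes free extent items; a minimal
sketch of consuming the reply, assuming a hypothetical
record_free_extent() helper (scoutfs_net_bulk_alloc() below really does
return an allocated 0-terminated array or an ERR_PTR):

static int refill_node_free_extents(struct super_block *sb)
{
	u64 *segnos;
	int ret = 0;
	int i;

	segnos = scoutfs_net_bulk_alloc(sb);
	if (IS_ERR(segnos))
		return PTR_ERR(segnos);

	/* the array is 0-terminated and freed by the caller */
	for (i = 0; segnos[i] && ret == 0; i++)
		ret = record_free_extent(sb, segnos[i]);

	kfree(segnos);
	return ret;
}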

We don't do any fancy multi-block allocation yet.  We only allocate
blocks in get_blocks as writes find unmapped blocks.  We do use some
per-task cursors to cache block allocation positions so that these
single-block allocations are very likely to merge into larger extents
as tasks stream writes.
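
The cursor idea in miniature (both the structure and the allocation
helper below are hypothetical stand-ins, not the patch's actual code):
if a task's next allocation starts where its last one ended,
consecutive get_blocks calls from a streaming writer return adjacent
blocks that merge into a single extent item.

struct task_cursor {
	struct task_struct *task;
	u64 blkno;		/* next physical block to try */
};

static int cursor_alloc_block(struct super_block *sb,
			      struct task_cursor *curs, u64 *blkno)
{
	int ret;

	/* alloc_block_near() stands in for the free extent search */
	ret = alloc_block_near(sb, curs->blkno, blkno);
	if (ret == 0)
		curs->blkno = *blkno + 1;
	return ret;
}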

This is just the first chunk of the extent work that's coming.  A later
patch adds offline flags and fixes up the change nonsense that seemed
like a good idea here.

The final moving part is that we initiate writeback on all newly
allocated extents before we commit the metadata that references the new
blocks.  We do this with our own dirty inode tracking because the
high-level vfs methods are unusably slow in some upstream kernels (they
walk all inodes, not just dirty inodes).
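
The resulting ordering, condensed from the trans.c hunk below (the
segment write is collapsed into a hypothetical write_dirty_segment()
and error handling is elided):

ret = scoutfs_inode_walk_writeback(sb, true) ?:   /* start data IO */
      write_dirty_segment(sb, &seg) ?:            /* metadata segment */
      scoutfs_inode_walk_writeback(sb, false) ?:  /* wait on data */
      scoutfs_net_record_segment(sb, seg, 0);     /* publish */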

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date:   2017-05-01 13:57:59 -07:00
Parent: 6719733ddc
Commit: 6afeb97802

10 changed files with 1224 additions and 436 deletions

(File diff suppressed because it is too large.)

data.h

@@ -6,7 +6,6 @@ extern const struct file_operations scoutfs_file_fops;
int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock,
u64 len, bool offline);
void scoutfs_data_end_writeback(struct super_block *sb, int err);
int scoutfs_data_setup(struct super_block *sb);
void scoutfs_data_destroy(struct super_block *sb);

format.h

@@ -156,9 +156,10 @@ struct scoutfs_segment_block {
#define SCOUTFS_READDIR_KEY 6
#define SCOUTFS_LINK_BACKREF_KEY 7
#define SCOUTFS_SYMLINK_KEY 8
#define SCOUTFS_EXTENT_KEY 9
#define SCOUTFS_FILE_EXTENT_KEY 9
#define SCOUTFS_ORPHAN_KEY 10
#define SCOUTFS_DATA_KEY 11
#define SCOUTFS_FREE_EXTENT_BLKNO_KEY 11
#define SCOUTFS_FREE_EXTENT_BLOCKS_KEY 12
/* not found in the fs */
#define SCOUTFS_MAX_UNUSED_KEY 253
#define SCOUTFS_NET_ADDR_KEY 254
@@ -198,11 +199,28 @@ struct scoutfs_orphan_key {
__be64 ino;
} __packed;
/* value is data payload bytes */
struct scoutfs_data_key {
/* no value */
struct scoutfs_file_extent_key {
__u8 type;
__be64 ino;
__be64 block;
__be64 last_blk_off;
__be64 last_blkno;
__be64 blocks;
} __packed;
/* no value */
struct scoutfs_free_extent_blkno_key {
__u8 type;
__be64 node_id;
__be64 last_blkno;
__be64 blocks;
} __packed;
struct scoutfs_free_extent_blocks_key {
__u8 type;
__be64 node_id;
__be64 blocks;
__be64 last_blkno;
} __packed;
/* value is each item's part of the full xattr value for the off/len */
@@ -384,6 +402,11 @@ struct scoutfs_net_manifest_entries {
struct scoutfs_manifest_entry ments[0];
} __packed;
struct scoutfs_net_segnos {
__le16 nr;
__le64 segnos[0];
} __packed;
enum {
/* sends and receives a struct scoutfs_timeval */
SCOUTFS_NET_TRADE_TIME = 0,
@@ -391,6 +414,7 @@ enum {
SCOUTFS_NET_MANIFEST_RANGE_ENTRIES,
SCOUTFS_NET_ALLOC_SEGNO,
SCOUTFS_NET_RECORD_SEGMENT,
SCOUTFS_NET_BULK_ALLOC,
SCOUTFS_NET_UNKNOWN,
};

inode.c

@@ -47,6 +47,16 @@ struct free_ino_pool {
bool in_flight;
};
struct inode_sb_info {
struct free_ino_pool pool;
spinlock_t writeback_lock;
struct rb_root writeback_inodes;
};
#define DECLARE_INODE_SB_INFO(sb, name) \
struct inode_sb_info *name = SCOUTFS_SB(sb)->inode_sb_info
static struct kmem_cache *scoutfs_inode_cachep;
/*
@@ -61,6 +71,7 @@ static void scoutfs_inode_ctor(void *obj)
seqcount_init(&ci->seqcount);
ci->staging = false;
init_rwsem(&ci->xattr_rwsem);
RB_CLEAR_NODE(&ci->writeback_node);
inode_init_once(&ci->inode);
}
@@ -84,8 +95,48 @@ static void scoutfs_i_callback(struct rcu_head *head)
kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
}
static void insert_writeback_inode(struct inode_sb_info *inf,
struct scoutfs_inode_info *ins)
{
struct rb_root *root = &inf->writeback_inodes;
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_inode_info *si;
while (*node) {
parent = *node;
si = container_of(*node, struct scoutfs_inode_info,
writeback_node);
if (ins->ino < si->ino)
node = &(*node)->rb_left;
else if (ins->ino > si->ino)
node = &(*node)->rb_right;
else
BUG();
}
rb_link_node(&ins->writeback_node, parent, node);
rb_insert_color(&ins->writeback_node, root);
}
static void remove_writeback_inode(struct inode_sb_info *inf,
struct scoutfs_inode_info *si)
{
if (!RB_EMPTY_NODE(&si->writeback_node)) {
rb_erase(&si->writeback_node, &inf->writeback_inodes);
RB_CLEAR_NODE(&si->writeback_node);
}
}
void scoutfs_destroy_inode(struct inode *inode)
{
DECLARE_INODE_SB_INFO(inode->i_sb, inf);
spin_lock(&inf->writeback_lock);
remove_writeback_inode(inf, SCOUTFS_I(inode));
spin_unlock(&inf->writeback_lock);
call_rcu(&inode->i_rcu, scoutfs_i_callback);
}
@@ -393,7 +444,7 @@ u64 scoutfs_last_ino(struct super_block *sb)
*/
void scoutfs_inode_fill_pool(struct super_block *sb, u64 ino, u64 nr)
{
struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool;
struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool;
trace_printk("filling ino %llu nr %llu\n", ino, nr);
@@ -427,7 +478,7 @@ static bool pool_in_flight(struct free_ino_pool *pool)
*/
static int alloc_ino(struct super_block *sb, u64 *ino)
{
struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool;
struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool;
bool request;
int ret;
@@ -733,28 +784,121 @@ int scoutfs_orphan_inode(struct inode *inode)
return ret;
}
/*
* Track an inode that could have dirty pages. Used to kick off writeback
* on all dirty pages during transaction commit without tying ourselves in
* knots trying to call through the high level vfs sync methods.
*/
void scoutfs_inode_queue_writeback(struct inode *inode)
{
DECLARE_INODE_SB_INFO(inode->i_sb, inf);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
spin_lock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node))
insert_writeback_inode(inf, si);
spin_unlock(&inf->writeback_lock);
}
/*
* Walk our dirty inodes in ino order and either start dirty page
* writeback or wait for writeback to complete.
*
* This is called by transaction committing so other writers are
* excluded. We're still very careful to iterate over the tree while it
* and the inodes could be changing.
*
* Because writes are excluded we know there are no remaining dirty
* pages once waiting returns successfully.
*
* XXX not sure what to do about retrying io errors.
*/
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
{
DECLARE_INODE_SB_INFO(sb, inf);
struct scoutfs_inode_info *si;
struct rb_node *node;
struct inode *inode;
struct inode *defer_iput = NULL;
int ret;
spin_lock(&inf->writeback_lock);
node = rb_first(&inf->writeback_inodes);
while (node) {
si = container_of(node, struct scoutfs_inode_info,
writeback_node);
node = rb_next(node);
inode = igrab(&si->inode);
if (!inode)
continue;
spin_unlock(&inf->writeback_lock);
if (defer_iput) {
iput(defer_iput);
defer_iput = NULL;
}
if (write)
ret = filemap_fdatawrite(inode->i_mapping);
else
ret = filemap_fdatawait(inode->i_mapping);
trace_printk("ino %llu write %d ret %d\n",
scoutfs_ino(inode), write, ret);
if (ret) {
iput(inode);
goto out;
}
spin_lock(&inf->writeback_lock);
if (WARN_ON_ONCE(RB_EMPTY_NODE(&si->writeback_node)))
node = rb_first(&inf->writeback_inodes);
else
node = rb_next(&si->writeback_node);
if (!write)
remove_writeback_inode(inf, si);
/* avoid iput->destroy lock deadlock */
defer_iput = inode;
}
spin_unlock(&inf->writeback_lock);
out:
if (defer_iput)
iput(defer_iput);
return ret;
}
int scoutfs_inode_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct free_ino_pool *pool;
struct inode_sb_info *inf;
pool = kzalloc(sizeof(struct free_ino_pool), GFP_KERNEL);
if (!pool)
inf = kzalloc(sizeof(struct inode_sb_info), GFP_KERNEL);
if (!inf)
return -ENOMEM;
pool = &inf->pool;
init_waitqueue_head(&pool->waitq);
spin_lock_init(&pool->lock);
sbi->free_ino_pool = pool;
spin_lock_init(&inf->writeback_lock);
inf->writeback_inodes = RB_ROOT;
sbi->inode_sb_info = inf;
return 0;
}
void scoutfs_inode_destroy(struct super_block *sb)
{
struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool;
struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;
kfree(pool);
kfree(inf);
}
void scoutfs_inode_exit(void)

inode.h

@@ -13,6 +13,7 @@ struct scoutfs_inode_info {
seqcount_t seqcount;
bool staging; /* holder of i_mutex is staging */
struct rw_semaphore xattr_rwsem;
struct rb_node writeback_node;
struct inode inode;
};
@@ -48,6 +49,9 @@ u64 scoutfs_inode_get_data_version(struct inode *inode);
int scoutfs_scan_orphans(struct super_block *sb);
void scoutfs_inode_queue_writeback(struct inode *inode);
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
u64 scoutfs_last_ino(struct super_block *sb);
void scoutfs_inode_exit(void);

net.c

@@ -18,6 +18,7 @@
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/sort.h>
#include "format.h"
#include "net.h"
@@ -363,6 +364,61 @@ static struct send_buf *alloc_sbuf(unsigned data_len)
return sbuf;
}
/* XXX I dunno, totally made up */
#define BULK_COUNT 32
static struct send_buf *process_bulk_alloc(struct super_block *sb, void *req,
int req_len)
{
DECLARE_NET_INFO(sb, nti);
struct scoutfs_net_segnos *ns;
struct commit_waiter cw;
struct send_buf *sbuf;
u64 segno;
int ret;
int i;
if (req_len != 0)
return ERR_PTR(-EINVAL);
sbuf = alloc_sbuf(offsetof(struct scoutfs_net_segnos,
segnos[BULK_COUNT]));
if (!sbuf)
return ERR_PTR(-ENOMEM);
ns = (void *)sbuf->nh->data;
ns->nr = cpu_to_le16(BULK_COUNT);
down_read(&nti->ring_commit_rwsem);
for (i = 0; i < BULK_COUNT; i++) {
ret = scoutfs_alloc_segno(sb, &segno);
if (ret) {
while (i-- > 0)
scoutfs_alloc_free(sb,
le64_to_cpu(ns->segnos[i]));
break;
}
ns->segnos[i] = cpu_to_le64(segno);
}
if (ret == 0)
queue_commit_work(nti, &cw);
up_read(&nti->ring_commit_rwsem);
if (ret == 0)
ret = wait_for_commit(&cw);
if (ret)
sbuf->nh->status = SCOUTFS_NET_STATUS_ERROR;
else
sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS;
return sbuf;
}
static struct send_buf *process_record_segment(struct super_block *sb,
void *req, int req_len)
{
@@ -616,6 +672,7 @@ static proc_func_t type_proc_func(u8 type)
process_manifest_range_entries,
[SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno,
[SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment,
[SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc,
};
return type < SCOUTFS_NET_UNKNOWN ? funcs[type] : NULL;
@@ -1100,6 +1157,113 @@ static int add_send_buf(struct super_block *sb, int type, void *data,
return 0;
}
struct bulk_alloc_args {
struct completion comp;
u64 *segnos;
int ret;
};
static int sort_cmp_u64s(const void *A, const void *B)
{
const u64 *a = A;
const u64 *b = B;
return *a < *b ? -1 : *a > *b ? 1 : 0;
}
static void sort_swap_u64s(void *A, void *B, int size)
{
u64 *a = A;
u64 *b = B;
swap(*a, *b);
}
static int bulk_alloc_reply(struct super_block *sb, void *reply, int ret,
void *arg)
{
struct bulk_alloc_args *args = arg;
struct scoutfs_net_segnos *ns = reply;
u16 nr;
int i;
if (ret < sizeof(struct scoutfs_net_segnos) ||
ret != offsetof(struct scoutfs_net_segnos,
segnos[le16_to_cpu(ns->nr)])) {
ret = -EINVAL;
goto out;
}
nr = le16_to_cpu(ns->nr);
args->segnos = kmalloc((nr + 1) * sizeof(args->segnos[0]), GFP_NOFS);
if (args->segnos == NULL) {
ret = -ENOMEM; /* XXX hmm. */
goto out;
}
for (i = 0; i < nr; i++) {
args->segnos[i] = le64_to_cpu(ns->segnos[i]);
/* make sure they're all non-zero */
if (args->segnos[i] == 0) {
ret = -EINVAL;
goto out;
}
}
sort(args->segnos, nr, sizeof(args->segnos[0]),
sort_cmp_u64s, sort_swap_u64s);
/* make sure they're all unique */
for (i = 1; i < nr; i++) {
if (args->segnos[i] == args->segnos[i - 1]) {
ret = -EINVAL;
goto out;
}
}
args->segnos[nr] = 0;
ret = 0;
out:
if (ret && args->segnos) {
kfree(args->segnos);
args->segnos = NULL;
}
args->ret = ret;
complete(&args->comp);
return args->ret;
}
/*
* Returns a 0-terminated allocated array of segnos, the caller is
* responsible for freeing it.
*/
u64 *scoutfs_net_bulk_alloc(struct super_block *sb)
{
struct bulk_alloc_args args;
int ret;
args.segnos = NULL;
init_completion(&args.comp);
ret = add_send_buf(sb, SCOUTFS_NET_BULK_ALLOC, NULL, 0,
bulk_alloc_reply, &args);
if (ret == 0) {
wait_for_completion(&args.comp);
ret = args.ret;
if (ret == 0 && (args.segnos == NULL || args.segnos[0] == 0))
ret = -ENOSPC;
}
if (ret) {
kfree(args.segnos);
args.segnos = ERR_PTR(ret);
}
return args.segnos;
}
/*
* Eventually we're going to have messages that control compaction.
* Each client mount would have long-lived work that sends requests

net.h

@@ -13,6 +13,7 @@ int scoutfs_net_manifest_range_entries(struct super_block *sb,
int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno);
int scoutfs_net_record_segment(struct super_block *sb,
struct scoutfs_segment *seg, u8 level);
u64 *scoutfs_net_bulk_alloc(struct super_block *sb);
int scoutfs_net_get_compaction(struct super_block *sb, void *curs);
int scoutfs_net_finish_compaction(struct super_block *sb, void *curs,

super.c

@@ -204,6 +204,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi)
return -ENOMEM;
/*
* XXX this is random today for initial testing, but we'll want
* it to be assigned by the server.
*/
get_random_bytes_arch(&sbi->node_id, sizeof(sbi->node_id));
spin_lock_init(&sbi->next_ino_lock);
atomic_set(&sbi->trans_holds, 0);
init_waitqueue_head(&sbi->trans_hold_wq);

super.h

@@ -14,11 +14,13 @@ struct compact_info;
struct data_info;
struct lock_info;
struct net_info;
struct free_ino_pool;
struct inode_sb_info;
struct scoutfs_sb_info {
struct super_block *sb;
u64 node_id;
struct scoutfs_super_block super;
spinlock_t next_ino_lock;
@@ -29,7 +31,7 @@ struct scoutfs_sb_info {
struct seg_alloc *seg_alloc;
struct compact_info *compact_info;
struct data_info *data_info;
struct free_ino_pool *free_ino_pool;
struct inode_sb_info *inode_sb_info;
atomic_t trans_holds;
wait_queue_head_t trans_hold_wq;

trans.c

@@ -26,6 +26,7 @@
#include "seg.h"
#include "counters.h"
#include "net.h"
#include "inode.h"
#include "scoutfs_trace.h"
/*
@@ -97,10 +98,12 @@ void scoutfs_trans_write_func(struct work_struct *work)
* about leaking segnos nor duplicate manifest entries
* on crashes between us and the server.
*/
ret = scoutfs_net_alloc_segno(sb, &segno) ?:
ret = scoutfs_inode_walk_writeback(sb, true) ?:
scoutfs_net_alloc_segno(sb, &segno) ?:
scoutfs_seg_alloc(sb, segno, &seg) ?:
scoutfs_item_dirty_seg(sb, seg) ?:
scoutfs_seg_submit_write(sb, seg, &comp) ?:
scoutfs_inode_walk_writeback(sb, false) ?:
scoutfs_bio_wait_comp(sb, &comp) ?:
scoutfs_net_record_segment(sb, seg, 0);
if (ret)
@@ -112,9 +115,6 @@ out:
/* XXX this all needs serious work for dealing with errors */
WARN_ON_ONCE(ret);
/* must be done before waking waiting trans holders who might dirty */
scoutfs_data_end_writeback(sb, ret);
spin_lock(&sbi->trans_write_lock);
sbi->trans_write_count++;
sbi->trans_write_ret = ret;