Files
scoutfs/kmod/src/data.c
Zach Brown 5f11cdbfe5 scoutfs: add and index inode meta and data seqs
For each transaction we send a message to the server asking for a
unique sequence number to associate with the transaction.  When we
change metadata or data of an inode we store the current transaction seq
in the inode and we index it with index items like the other inode
fields.

The server remembers the sequences it gives out.  When we go to walk the
inode sequence indexes we ask the server for the largest stable seq and
limit results to that seq.  This ensures that we never return seqs that
are past dirty items so never have inodes and seqs appear in the past.

Nodes use the sync timer to regularly cycle through seqs and ensure that
inode seq index walks don't get stuck on their otherwise idle seq.

Signed-off-by: Zach Brown <zab@versity.com>
2017-05-23 12:12:24 -07:00

1270 lines
33 KiB
C

/*
* Copyright (C) 2017 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/rhashtable.h>
#include <linux/sched.h>
#include <linux/buffer_head.h>
#include <linux/hash.h>
#include "format.h"
#include "super.h"
#include "inode.h"
#include "key.h"
#include "data.h"
#include "trans.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "item.h"
#include "ioctl.h"
#include "net.h"
/*
 * EXTF is a printf-style format for one extent and EXTA(ne) expands to
 * the matching argument list taken from the struct native_extent
 * pointer 'ne'.  They are used together in the trace_printk() calls.
 */
#define EXTF "[off %llu bno %llu bks %llu fl %x]"
#define EXTA(ne) (ne)->blk_off, (ne)->blkno, (ne)->blocks, (ne)->flags
/*
* scoutfs uses extent items to reference file data.
*
* The extent items map logical file regions to device blocks at 4K
* block granularity. File data isn't overwritten so that overwriting
* doesn't generate extent item locking and modification.
*
* Nodes have their own free extent items stored at their node id to
* avoid lock contention during allocation and freeing. These pools are
* filled and drained with messages to the server who allocates
* segment-sized regions.
*
* Block allocation maintains a fixed number of allocation cursors that
* remember the position of tasks within free regions. This is very
* simple and maintains decent extents for simple streaming writes. It
* eventually won't be good enough and we'll spend complexity on
* delalloc but we want to put that off as long as possible.
*
* There are no unwritten extents. As we dirty file data pages, possibly
* allocating extents for the first time, we track their inodes. Before
* we commit dirty metadata we write out all tracked inodes. This
* ensures that data is persistent before the metadata that references
* it is visible.
*
* Weirdly, the extents are indexed by the *final* logical block and
* blkno of the extent. This lets us search for neighbouring previous
* extents with a _next() call and avoids having to implement item
* reading that iterates backwards through the manifest and segments.
*
* There are two items that track free extents, one indexed by the block
* location of the free extent and one indexed by the size of the free
* extent. This means that one allocation can update a great number of
* items throughout the tree as items are created and deleted as extents
* are split and merged. This can introduce inconsistent failure
* states. We'll some day address that with preallocation and pinning.
*
* XXX
* - truncate
* - mmap
* - better io error propagation
* - forced unmount with dirty data
* - direct IO
*/
/*
 * Per-super-block data allocation state, stored in
 * SCOUTFS_SB(sb)->data_info by scoutfs_data_setup().
 */
struct data_info {
/* held for write across allocate_block() and for read around the extent lookup in scoutfs_get_block() */
struct rw_semaphore alloc_rwsem;
/* blkno from which allocate_block() starts its next search for a large free extent */
u64 next_large_blkno;
/* task_cursor structs hashed by their cursor_id (see cursor_hash_params) */
struct rhashtable cursors;
/* every cursor; get_cursor() moves used cursors to the head and recycles from the tail */
struct list_head cursor_lru;
};
/* Declare a local 'name' pointing at the super block's struct data_info. */
#define DECLARE_DATA_INFO(sb, name) \
struct data_info *name = SCOUTFS_SB(sb)->data_info
/*
 * Number of task cursors that scoutfs_data_setup() allocates up front.
 * More than enough for a few tasks per core on moderate hardware.
 */
#define NR_CURSORS 4096
/*
 * This is the size, in blocks, of the free region that a cursor tracks
 * and so ends up being the largest file item extent length given
 * concurrent streaming writes.
 *
 * XXX We probably want this to be a bit larger to further reduce the
 * amount of item churn involved in truncating tremendous files.
 */
#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS
/*
 * Hash key that identifies the task that owns a cursor.  get_cursor()
 * fills it from 'current' and 'current->pid'.
 */
struct cursor_id {
struct task_struct *task;
pid_t pid;
} __packed; /* rhashtable_lookup() always memcmp()s, avoid padding */
/*
 * A per-task allocation cursor.  It remembers the free region that a
 * task is allocating single blocks from in allocate_block().
 */
struct task_cursor {
/* next block to search from within the region */
u64 blkno;
/* blocks remaining in the region; 0 means the cursor has no region */
u64 blocks;
/* linkage in data_info.cursors, keyed by 'id' */
struct rhash_head hash_head;
/* linkage in data_info.cursor_lru */
struct list_head list_head;
/* the task that currently owns this cursor */
struct cursor_id id;
};
/*
 * Both file extent and free extent keys are converted into this native
 * (cpu-endian, first-block based) form for manipulation. The free
 * extents set blk_off to blkno.
 */
struct native_extent {
/* first logical block of the extent */
u64 blk_off;
/* first device block of the extent */
u64 blkno;
/* length of the extent in blocks */
u64 blocks;
/* key flags, e.g. SCOUTFS_FILE_EXTENT_OFFLINE; always 0 for free extents */
u8 flags;
};
/*
 * A union of every extent key struct so that one fixed-size on-stack
 * buffer of MAX_KEY_BYTES can hold any of them.  This avoids dynamic
 * on-stack array initializers :/
 */
union extent_key_union {
struct scoutfs_file_extent_key file;
struct scoutfs_free_extent_blkno_key blkno;
struct scoutfs_free_extent_blocks_key blocks;
} __packed;
#define MAX_KEY_BYTES sizeof(union extent_key_union)
/*
 * Build the file extent key for 'ext' in the inode 'arg' inside
 * key_bytes and point 'key' at it.  The key records the final logical
 * block and final blkno of the extent rather than the first.
 */
static void init_file_extent_key(struct scoutfs_key_buf *key, void *key_bytes,
				 struct native_extent *ext, u64 arg)
{
	struct scoutfs_file_extent_key *fkey = key_bytes;
	/* distance from the first block of the extent to its last */
	u64 last_delta = ext->blocks - 1;

	fkey->type = SCOUTFS_FILE_EXTENT_KEY;
	fkey->ino = cpu_to_be64(arg);
	fkey->last_blk_off = cpu_to_be64(ext->blk_off + last_delta);
	fkey->last_blkno = cpu_to_be64(ext->blkno + last_delta);
	fkey->blocks = cpu_to_be64(ext->blocks);
	fkey->flags = ext->flags;

	scoutfs_key_init(key, fkey, sizeof(*fkey));
}
/*
 * Build a free extent key of struct 'which_type' for 'ext' inside
 * key_bytes and point 'key' at it.  'arg' is stored as the node_id and
 * 'type' as the key type.  Like file extent keys, the key records the
 * final blkno of the extent.  The two free extent key structs are
 * filled identically, which is why this is a macro over the type.
 */
#define INIT_FREE_EXTENT_KEY(which_type, key, key_bytes, ext, arg, type) \
do { \
struct which_type *fkey = key_bytes; \
\
fkey->type = type; \
fkey->node_id = cpu_to_be64(arg); \
fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); \
fkey->blocks = cpu_to_be64(ext->blocks); \
\
scoutfs_key_init(key, fkey, sizeof(struct which_type)); \
} while (0)
/*
 * Build the key of the given type for 'ext' in key_bytes and point
 * 'key' at it.  'arg' is the inode number for file extent keys and the
 * node id for free extent keys.  Any type other than the file extent
 * and free-by-blkno types is built as a free-by-blocks key.
 */
static void init_extent_key(struct scoutfs_key_buf *key, void *key_bytes,
			    struct native_extent *ext, u64 arg, u8 type)
{
	switch (type) {
	case SCOUTFS_FILE_EXTENT_KEY:
		init_file_extent_key(key, key_bytes, ext, arg);
		break;
	case SCOUTFS_FREE_EXTENT_BLKNO_KEY:
		INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blkno_key,
				     key, key_bytes, ext, arg, type);
		break;
	default:
		INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blocks_key,
				     key, key_bytes, ext, arg, type);
		break;
	}
}
/*
 * Convert a file extent key back into a native extent.  The key stores
 * the last logical block and last blkno, so the first ones are derived
 * by backing up over the length.
 *
 * XXX could have some sanity checks
 */
static void load_file_extent(struct native_extent *ext,
			     struct scoutfs_key_buf *key)
{
	struct scoutfs_file_extent_key *fkey = key->data;

	ext->flags = fkey->flags;
	/* the length must be loaded first, the starts are derived from it */
	ext->blocks = be64_to_cpu(fkey->blocks);
	ext->blkno = be64_to_cpu(fkey->last_blkno) - ext->blocks + 1;
	ext->blk_off = be64_to_cpu(fkey->last_blk_off) - ext->blocks + 1;
}
/*
 * Convert a free extent key of struct 'which_type' back into the native
 * extent 'ext'.  The first blkno is derived from the stored last blkno
 * and length, blk_off mirrors blkno, and flags are always cleared.
 */
#define LOAD_FREE_EXTENT(which_type, ext, key) \
do { \
struct which_type *fkey = key->data; \
\
ext->blkno = be64_to_cpu(fkey->last_blkno) - \
be64_to_cpu(fkey->blocks) + 1; \
ext->blk_off = ext->blkno; \
ext->blocks = be64_to_cpu(fkey->blocks); \
ext->flags = 0; \
} while (0)
/*
 * Convert any of the extent keys into a native extent by dispatching on
 * the type stored in the key.  The build-time check makes sure the
 * type field sits at the same offset in all three key structs so it
 * can be read through any one of them.
 */
static void load_extent(struct native_extent *ext, struct scoutfs_key_buf *key)
{
	struct scoutfs_free_extent_blocks_key *hdr = key->data;

	BUILD_BUG_ON(offsetof(struct scoutfs_file_extent_key, type) !=
		     offsetof(struct scoutfs_free_extent_blkno_key, type) ||
		     offsetof(struct scoutfs_file_extent_key, type) !=
		     offsetof(struct scoutfs_free_extent_blocks_key, type));

	switch (hdr->type) {
	case SCOUTFS_FILE_EXTENT_KEY:
		load_file_extent(ext, key);
		break;
	case SCOUTFS_FREE_EXTENT_BLKNO_KEY:
		LOAD_FREE_EXTENT(scoutfs_free_extent_blkno_key, ext, key);
		break;
	default:
		LOAD_FREE_EXTENT(scoutfs_free_extent_blocks_key, ext, key);
		break;
	}
}
/*
 * Merge 'ext' into 'mod' if the two are adjacent both logically and on
 * the device and carry the same flags.  Returns 1 and updates 'mod' to
 * cover both extents on a merge, returns 0 and changes nothing
 * otherwise.  'ext' is never modified.
 */
static int merge_extents(struct native_extent *mod,
			 struct native_extent *ext)
{
	struct native_extent *left = ext;
	struct native_extent *right = mod;

	/* order the pair so only one pair of adjoining ends is tested */
	if (mod->blk_off < ext->blk_off) {
		left = mod;
		right = ext;
	}

	if (left->blk_off + left->blocks != right->blk_off ||
	    left->blkno + left->blocks != right->blkno ||
	    left->flags != right->flags)
		return 0;

	/*
	 * mod aliases either left or right.  The length is summed in a
	 * single statement before the starts are copied so no field is
	 * read after it has been overwritten.
	 */
	mod->blocks = left->blocks + right->blocks;
	mod->blk_off = left->blk_off;
	mod->blkno = left->blkno;
	return 1;
}
/*
 * The caller has ensured that the inner extent is entirely within the
 * outer extent.  Fill out the left and right regions of outer that
 * don't overlap with inner.  Either region can come out with zero
 * blocks when inner touches that edge of outer.
 */
static void trim_extents(struct native_extent *left,
			 struct native_extent *right,
			 struct native_extent *outer,
			 struct native_extent *inner)
{
	/* both remainders inherit the flags of the extent they came from */
	left->flags = outer->flags;
	right->flags = outer->flags;

	/* left is whatever precedes inner */
	left->blk_off = outer->blk_off;
	left->blkno = outer->blkno;
	left->blocks = inner->blk_off - outer->blk_off;

	/* right starts just past inner and runs to the end of outer */
	right->blk_off = inner->blk_off + inner->blocks;
	right->blkno = inner->blkno + inner->blocks;
	right->blocks = (outer->blk_off + outer->blocks) - right->blk_off;
}
/*
 * Return true if inner is fully contained by outer: inner must not
 * start before outer and must not end after it.  Only the logical
 * block offsets are compared.
 */
static bool extents_within(struct native_extent *outer,
			   struct native_extent *inner)
{
	u64 outer_end = outer->blk_off + outer->blocks - 1;
	u64 inner_end = inner->blk_off + inner->blocks - 1;

	/*
	 * The lower bound has to be tested against inner's first block.
	 * Testing it against inner's last block would accept an inner
	 * extent that begins before outer.
	 */
	return outer->blk_off <= inner->blk_off && outer_end >= inner_end;
}
/*
 * Look for an existing extent next to 'cur' in the direction given by
 * the sign of 'delta' and merge it into 'cur' if the two are adjacent.
 *
 * 'nei' is always cleared first.  It is set to the neighbour only when
 * a merge happened, so nei->blocks == 0 tells the caller that nothing
 * was merged.  Returns 0 on success, including when no neighbour
 * exists, or a negative errno from the item lookup.
 */
static int try_merge(struct super_block *sb, struct native_extent *cur,
		     s64 delta, struct native_extent *nei, u64 arg, u8 type)
{
	u8 last_bytes[MAX_KEY_BYTES];
	u8 key_bytes[MAX_KEY_BYTES];
	struct scoutfs_key_buf last;
	struct scoutfs_key_buf key;
	struct native_extent ext;
	int ret;

	memset(nei, 0, sizeof(struct native_extent));

	/* short circuit prev search for common first block alloc */
	if (delta < 0 && cur->blk_off == 0)
		return 0;

	/* an all-ones extent gives the largest key, bounding the search */
	memset(&ext, ~0, sizeof(ext));
	init_extent_key(&last, last_bytes, &ext, arg, type);

	/* start the search at the single block just beyond cur */
	ext.flags = 0;
	ext.blocks = 1;
	ext.blkno = cur->blkno + delta;
	ext.blk_off = cur->blk_off + delta;
	init_extent_key(&key, key_bytes, &ext, arg, type);

	ret = scoutfs_item_next_same(sb, &key, &last, NULL);
	if (ret == -ENOENT)
		return 0;
	if (ret < 0)
		return ret;

	load_extent(&ext, &key);
	trace_printk("merge nei "EXTF"\n", EXTA(&ext));

	if (merge_extents(cur, &ext))
		*nei = ext;

	return 0;
}
/*
 * Create or delete the item(s) that describe one extent.
 *
 * We have two item types for indexing free extents by either the
 * location of the extent or the size of the extent.  Callers only ever
 * work in terms of the location (blkno) items or file extent items.
 * When the type is the free blkno type this mirrors the modification
 * to the matching size (blocks) item as well.
 *
 * If this returns an error then nothing will have changed: a failure
 * on the second item undoes the first, and we BUG if that undo fails.
 */
static int modify_items(struct super_block *sb, struct native_extent *ext,
			u64 arg, u8 type, bool create)
{
	u8 key_bytes[MAX_KEY_BYTES];
	struct scoutfs_key_buf key;
	int ret;
	int err;

	trace_printk("mod cre %u "EXTF"\n", create, EXTA(ext));

	BUG_ON(type != SCOUTFS_FILE_EXTENT_KEY &&
	       type != SCOUTFS_FREE_EXTENT_BLKNO_KEY);

	init_extent_key(&key, key_bytes, ext, arg, type);
	if (create)
		ret = scoutfs_item_create(sb, &key, NULL);
	else
		ret = scoutfs_item_delete(sb, &key);

	/* file extents have a single item, and a failure changed nothing */
	if (ret || type != SCOUTFS_FREE_EXTENT_BLKNO_KEY)
		return ret;

	/* mirror the change into the size-indexed free extent item */
	init_extent_key(&key, key_bytes, ext, arg,
			SCOUTFS_FREE_EXTENT_BLOCKS_KEY);
	if (create)
		ret = scoutfs_item_create(sb, &key, NULL);
	else
		ret = scoutfs_item_delete(sb, &key);
	if (!ret)
		return 0;

	/* put the location item back the way it was */
	init_extent_key(&key, key_bytes, ext, arg, type);
	if (create)
		err = scoutfs_item_delete(sb, &key);
	else
		err = scoutfs_item_create(sb, &key, NULL);
	BUG_ON(err);

	return ret;
}
/*
 * Insert a new extent.  We see if it can be merged with adjacent
 * existing extents on either side and, if so, insert one extent that
 * covers them all and delete the neighbours that were absorbed.
 *
 * 'arg' is the inode number for file extents or the node id for free
 * extents and 'type' is the key type that modify_items() accepts.  The
 * caller's extent is not modified.
 *
 * Returns 0 or a negative errno.  If this returns an error then the
 * existing extents will not have changed; we BUG if undoing a partial
 * modification fails.
 */
static int insert_extent(struct super_block *sb,
			 struct native_extent *caller_ins,
			 u64 arg, u8 type)
{
	struct native_extent left;
	struct native_extent right;
	struct native_extent ins = *caller_ins;
	bool del_ins = false;
	bool ins_left = false;
	int err;
	int ret;

	trace_printk("inserting "EXTF"\n", EXTA(caller_ins));

	/*
	 * Find neighbours that might be adjacent.  An error from either
	 * search has to stop the insertion.  Ignoring a failed search
	 * would insert an unmerged extent as though no neighbour
	 * existed.
	 */
	ret = try_merge(sb, &ins, -1, &left, arg, type);
	if (ret == 0)
		ret = try_merge(sb, &ins, 1, &right, arg, type);
	if (ret < 0)
		goto out;

	trace_printk("merge left "EXTF"\n", EXTA(&left));
	trace_printk("merge right "EXTF"\n", EXTA(&right));

	/* create the (possibly merged) extent first */
	ret = modify_items(sb, &ins, arg, type, true);
	if (ret)
		goto out;
	del_ins = true;

	/* then delete each neighbour that the new extent absorbed */
	if (left.blocks) {
		ret = modify_items(sb, &left, arg, type, false);
		if (ret)
			goto undo;
		ins_left = true;
	}

	if (right.blocks)
		ret = modify_items(sb, &right, arg, type, false);

undo:
	if (ret) {
		/* restore the deleted left neighbour, then drop our insertion */
		if (ins_left) {
			err = modify_items(sb, &left, arg, type, true);
			BUG_ON(err);
		}
		if (del_ins) {
			err = modify_items(sb, &ins, arg, type, false);
			BUG_ON(err);
		}
	}
out:
	return ret;
}
/*
 * Remove a portion of an existing extent.  The extent that contains
 * 'rem' is looked up, the parts of it on either side of 'rem' are
 * created as new extents, and the original is deleted.
 *
 * Returns 0, a non-zero result from the item lookup, or -EIO if the
 * extent that was found doesn't contain 'rem' or has different flags.
 * If this returns an error then the existing extent will not have
 * changed; we BUG if removing a partially created edge fails.
 */
static int remove_extent(struct super_block *sb,
			 struct native_extent *rem, u64 arg, u8 type)
{
	u8 last_bytes[MAX_KEY_BYTES];
	u8 key_bytes[MAX_KEY_BYTES];
	struct scoutfs_key_buf last;
	struct scoutfs_key_buf key;
	struct native_extent left = {0,};
	struct native_extent right = {0,};
	struct native_extent outer;
	bool made_left = false;
	bool made_right = false;
	int err = 0;
	int ret;

	trace_printk("removing "EXTF"\n", EXTA(rem));

	/* an all-ones extent gives the largest key, bounding the search */
	memset(&outer, ~0, sizeof(outer));
	init_extent_key(&last, last_bytes, &outer, arg, type);

	/* find outer existing extent that contains removal extent */
	init_extent_key(&key, key_bytes, rem, arg, type);
	ret = scoutfs_item_next_same(sb, &key, &last, NULL);
	if (ret)
		goto out;

	load_extent(&outer, &key);

	trace_printk("outer "EXTF"\n", EXTA(&outer));

	if (outer.flags != rem->flags || !extents_within(&outer, rem)) {
		ret = -EIO;
		goto out;
	}

	trim_extents(&left, &right, &outer, rem);

	trace_printk("trim left "EXTF"\n", EXTA(&left));
	trace_printk("trim right "EXTF"\n", EXTA(&right));

	/* create the surviving edges before deleting the original */
	if (left.blocks) {
		ret = modify_items(sb, &left, arg, type, true);
		if (ret)
			goto out;
		made_left = true;
	}

	if (right.blocks) {
		ret = modify_items(sb, &right, arg, type, true);
		if (ret)
			goto out;
		made_right = true;
	}

	ret = modify_items(sb, &outer, arg, type, false);

out:
	/* on failure remove whichever edges were created, newest first */
	if (ret && made_right) {
		err = modify_items(sb, &right, arg, type, false);
		BUG_ON(err);
	}
	if (ret && made_left) {
		err = modify_items(sb, &left, arg, type, false);
		BUG_ON(err);
	}

	trace_printk("ret %d\n", ret);
	return ret;
}
/*
 * Free extents whose blocks fall inside the specified logical block
 * range of 'len' blocks starting at 'iblock' in inode 'ino'.
 *
 * Each file extent that intersects the range has its intersecting
 * portion removed, and the blocks of an allocated portion are inserted
 * into this node's free extents.  If 'offline' is given then blocks
 * are freed but file extent items are left behind for the range with
 * their _OFFLINE flag set and no block allocation; portions that are
 * already offline are skipped.
 *
 * Returns 0 or a negative errno; -EIO is returned for an extent that
 * is both allocated and offline, or neither.  Extents processed by
 * earlier loop iterations stay modified on error, only the changes
 * made for the failing extent are undone (and we BUG if that fails).
 *
 * This is the low level extent item manipulation code.  Callers manage
 * higher order locking and transactional consistency.
 */
int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock,
				u64 len, bool offline)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	u8 last_bytes[MAX_KEY_BYTES];
	u8 key_bytes[MAX_KEY_BYTES];
	struct scoutfs_key_buf last;
	struct scoutfs_key_buf key;
	struct native_extent found;
	struct native_extent srch;
	struct native_extent rng;
	struct native_extent ext;
	struct native_extent ofl;
	struct native_extent fr;
	bool rem_fr = false;
	bool ins_ext = false;
	int ret = 0;
	int err;

	trace_printk("iblock %llu len %llu offline %u\n",
		     iblock, len, offline);

	/* an all-ones extent gives the largest key, bounding the search */
	memset(&ext, ~0, sizeof(ext));
	init_extent_key(&last, last_bytes, &ext, ino, SCOUTFS_FILE_EXTENT_KEY);

	/* rng tracks the part of the range that is still to be processed */
	rng.blk_off = iblock;
	rng.blocks = len;
	rng.blkno = 0;
	rng.flags = 0;

	while (rng.blocks) {
		/*
		 * Find the next extent that could include our first
		 * block.  Keys are indexed by an extent's final block
		 * so the search key must describe only the first block
		 * that remains.  A key built from the whole remaining
		 * range would sit at the range's last block and skip
		 * every extent that ends before it.
		 */
		srch.blk_off = rng.blk_off;
		srch.blocks = 1;
		srch.blkno = 0;
		srch.flags = 0;
		init_extent_key(&key, key_bytes, &srch, ino,
				SCOUTFS_FILE_EXTENT_KEY);

		ret = scoutfs_item_next_same(sb, &key, &last, NULL);
		if (ret < 0) {
			if (ret == -ENOENT)
				ret = 0;
			break;
		}

		load_extent(&found, &key);
		trace_printk("found "EXTF"\n", EXTA(&found));

		/* XXX corruption: offline and allocation are exclusive */
		if (!!found.blkno ==
		    !!(found.flags & SCOUTFS_FILE_EXTENT_OFFLINE)) {
			ret = -EIO;
			break;
		}

		/* we're done if the found extent is past us */
		if (found.blk_off >= rng.blk_off + rng.blocks) {
			ret = 0;
			break;
		}

		/* find the intersection */
		ext.blk_off = max(rng.blk_off, found.blk_off);
		ext.blocks = min(rng.blk_off + rng.blocks,
				 found.blk_off + found.blocks) - ext.blk_off;
		ext.blkno = found.blkno + (ext.blk_off - found.blk_off);
		ext.flags = found.flags;

		/* next search will be past the extent we truncate */
		rng.blk_off = ext.blk_off + ext.blocks;
		if (rng.blk_off < iblock + len)
			rng.blocks = (iblock + len) - rng.blk_off;
		else
			rng.blocks = 0;

		/* done if already offline */
		if (offline && (ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE))
			continue;

		/* free the old extent if it was allocated */
		if (ext.blkno) {
			fr = ext;
			fr.blk_off = fr.blkno;
			ret = insert_extent(sb, &fr, sbi->node_id,
					    SCOUTFS_FREE_EXTENT_BLKNO_KEY);
			if (ret)
				break;
			rem_fr = true;
		}

		/* always remove the overlapping file extent */
		ret = remove_extent(sb, &ext, ino, SCOUTFS_FILE_EXTENT_KEY);
		if (ret)
			break;
		ins_ext = true;

		/*
		 * Maybe add new file extents with the offline flag set.
		 * File extent keys are indexed by inode number, so the
		 * replacement has to be inserted under ino, not under
		 * our node id, or it would land in some other inode.
		 */
		if (offline) {
			ofl = ext;
			ofl.blkno = 0;
			ofl.flags = SCOUTFS_FILE_EXTENT_OFFLINE;
			ret = insert_extent(sb, &ofl, ino,
					    SCOUTFS_FILE_EXTENT_KEY);
			if (ret)
				break;
		}

		/* this extent is fully processed, nothing left to undo */
		rem_fr = false;
		ins_ext = false;
	}

	if (ret) {
		/* restore the file extent and take back the freed blocks */
		if (ins_ext) {
			err = insert_extent(sb, &ext, ino,
					    SCOUTFS_FILE_EXTENT_KEY);
			BUG_ON(err);
		}
		if (rem_fr) {
			err = remove_extent(sb, &fr, sbi->node_id,
					    SCOUTFS_FREE_EXTENT_BLKNO_KEY);
			BUG_ON(err);
		}
	}

	return ret;
}
/*
 * Return the allocation cursor for the current task, recycling the
 * least recently used cursor (with its region reset) if the task
 * doesn't have one yet.  The returned cursor is moved to the head of
 * the LRU list.
 *
 * These cheesy cursors are only meant to encourage nice IO patterns for
 * concurrent tasks either streaming large file writes or creating lots
 * of small files.  It will do very poorly in many other situations.  To
 * do better we'd need to go further down the road to delalloc and take
 * more surrounding context into account.
 */
static struct task_cursor *get_cursor(struct data_info *datinf)
{
	struct cursor_id id = {
		.task = current,
		.pid = current->pid,
	};
	struct task_cursor *curs;

	curs = rhashtable_lookup(&datinf->cursors, &id);
	if (curs)
		goto found;

	/* steal the cursor at the tail of the lru and rehash it under our id */
	curs = list_last_entry(&datinf->cursor_lru,
			       struct task_cursor, list_head);
	trace_printk("resetting curs %p was task %p pid %u\n",
		     curs, curs->id.task, curs->id.pid);
	rhashtable_remove(&datinf->cursors, &curs->hash_head, GFP_NOFS);
	curs->id = id;
	rhashtable_insert(&datinf->cursors, &curs->hash_head, GFP_NOFS);
	curs->blkno = 0;
	curs->blocks = 0;

found:
	list_move(&curs->list_head, &datinf->cursor_lru);

	return curs;
}
/*
 * Ask the server for a batch of free segments and insert them into
 * this node's free extents.  The reply is an array of segment numbers
 * terminated by a zero entry.  Runs of consecutive segment numbers are
 * coalesced so that each run is inserted as one free extent.
 *
 * Returns 0 on success, including when the server returned an empty
 * list, or a negative errno from the request or from the insertion.
 */
static int bulk_alloc(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct native_extent ext;
	u64 *segnos = NULL;
	/* stays 0 if the list is empty and the loop body never runs */
	int ret = 0;
	int i;

	segnos = scoutfs_net_bulk_alloc(sb);
	if (IS_ERR(segnos)) {
		ret = PTR_ERR(segnos);
		goto out;
	}

	for (i = 0; segnos[i]; i++) {

		/* merge or set this one */
		if (i > 0 && (segnos[i] == segnos[i - 1] + 1)) {
			ext.blocks += SCOUTFS_SEGMENT_BLOCKS;
			trace_printk("merged segno [%u] %llu blocks %llu\n",
				     i, segnos[i], ext.blocks);
		} else {
			ext.blkno = segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT;
			ext.blocks = SCOUTFS_SEGMENT_BLOCKS;
			trace_printk("set extent segno [%u] %llu blkno %llu\n",
				     i, segnos[i], ext.blkno);
		}

		/*
		 * Don't write if we merge with the next one.  Reading
		 * [i + 1] is safe because of the zero terminator.
		 */
		if ((segnos[i] + 1) == segnos[i + 1])
			continue;

		trace_printk("inserting [%u] "EXTF"\n", i, EXTA(&ext));

		ext.blk_off = ext.blkno;
		ext.flags = 0;
		ret = insert_extent(sb, &ext, sbi->node_id,
				    SCOUTFS_FREE_EXTENT_BLKNO_KEY);
		if (ret)
			break;
	}

out:
	if (!IS_ERR_OR_NULL(segnos))
		kfree(segnos);

	/* XXX don't orphan segnos on error, crash recovery with server */

	return ret;
}
/*
 * Allocate a single block for the logical block offset in the file.
 *
 * We try to merge single block allocations into large extents by using
 * per-task cursors. Each cursor tracks a block region that should be
 * searched for free extents. If we don't have a cursor, or we find
 * free space outside of our cursor, then we look for the next large
 * free extent.
 *
 * On success 0 is returned and *blkno is set to the allocated block.
 * If was_offline is set then the single offline file extent block at
 * iblock is removed before the new file extent is inserted.  A
 * negative errno is returned on failure, -ENOSPC when no free block
 * could be found even after asking the server for more.  On failure
 * the file extent changes made here are undone, and we BUG if the undo
 * itself fails.
 *
 * alloc_rwsem is held for write for the whole call.
 */
static int allocate_block(struct inode *inode, sector_t iblock, u64 *blkno,
bool was_offline)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_DATA_INFO(sb, datinf);
u8 last_bytes[MAX_KEY_BYTES];
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf last;
struct scoutfs_key_buf key;
struct native_extent last_ext;
struct native_extent found;
struct native_extent ext;
struct native_extent ofl;
struct native_extent fr;
struct task_cursor *curs;
bool alloced = false;
const u64 ino = scoutfs_ino(inode);
bool rem_ext = false;
bool ins_ofl = false;
u8 type;
int err;
int ret;
/* an all-ones extent gives the largest key, bounding every search below */
memset(&last_ext, ~0, sizeof(last_ext));
down_write(&datinf->alloc_rwsem);
curs = get_cursor(datinf);
/* start from the cursor or look for the next large extent */
reset_cursor:
if (curs->blocks) {
/* blocks == 0 marks this as a by-location search from the cursor */
ext.blkno = curs->blkno;
ext.blocks = 0;
type = SCOUTFS_FREE_EXTENT_BLKNO_KEY;
} else {
/* search the by-size items for a large enough free extent */
ext.blkno = datinf->next_large_blkno;
ext.blocks = LARGE_EXTENT_BLOCKS;
type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY;
}
ext.flags = 0;
retry:
trace_printk("searching %llu,%llu curs %p task %p pid %u %llu,%llu\n",
ext.blkno, ext.blocks, curs, curs->id.task, curs->id.pid,
curs->blkno, curs->blocks);
ext.blk_off = ext.blkno;
init_extent_key(&key, key_bytes, &ext, sbi->node_id, type);
init_extent_key(&last, last_bytes, &last_ext, sbi->node_id, type);
ret = scoutfs_item_next_same(sb, &key, &last, NULL);
if (ret < 0) {
/*
 * Nothing was found.  Fall through progressively weaker searches,
 * each one jumping back to retry or reset_cursor, before giving up.
 */
if (ret == -ENOENT) {
/* if the cursor's empty fall back to next large */
if (ext.blkno && ext.blocks == 0) {
curs->blkno = 0;
curs->blocks = 0;
goto reset_cursor;
}
/* wrap the search for large extents */
if (ext.blkno > LARGE_EXTENT_BLOCKS && ext.blocks) {
datinf->next_large_blkno = LARGE_EXTENT_BLOCKS;
ext.blkno = datinf->next_large_blkno;
goto retry;
}
/* ask the server for more extents */
if (ext.blocks && !alloced) {
ret = bulk_alloc(sb);
if (ret < 0)
goto out;
/* only ask the server once per call */
alloced = true;
goto retry;
}
/* finally look for any free block at all */
if (ext.blocks) {
ext.blkno = 0;
ext.blocks = 0;
type = SCOUTFS_FREE_EXTENT_BLKNO_KEY;
goto retry;
}
/* after all that return -ENOSPC */
ret = -ENOSPC;
}
goto out;
}
load_extent(&found, &key);
trace_printk("found nei "EXTF"\n", EXTA(&found));
/* look for a new large extent if found is outside cursor */
if (curs->blocks &&
(found.blkno + found.blocks <= curs->blkno ||
found.blkno >= curs->blkno + curs->blocks)) {
curs->blkno = 0;
curs->blocks = 0;
goto reset_cursor;
}
/*
* Set the cursor if:
* - we didn't already have one
* - it's large enough for a large extent with alignment padding
* - the sufficiently large free region is past next large
*/
if (!curs->blocks &&
found.blocks >= (2 * LARGE_EXTENT_BLOCKS) &&
(found.blkno + found.blocks - (2 * LARGE_EXTENT_BLOCKS) >=
datinf->next_large_blkno)) {
curs->blkno = ALIGN(max(found.blkno, datinf->next_large_blkno),
LARGE_EXTENT_BLOCKS);
curs->blocks = LARGE_EXTENT_BLOCKS;
/* allocate from the start of the newly claimed cursor region */
found.blkno = curs->blkno;
found.blocks = curs->blocks;
datinf->next_large_blkno = curs->blkno + LARGE_EXTENT_BLOCKS;
}
trace_printk("using %llu,%llu curs %llu,%llu\n",
found.blkno, found.blocks, curs->blkno, curs->blocks);
/* remove old offline block if we're staging */
if (was_offline) {
ofl.blk_off = iblock;
ofl.blkno = 0;
ofl.blocks = 1;
ofl.flags = SCOUTFS_FILE_EXTENT_OFFLINE;
ret = remove_extent(sb, &ofl, ino, SCOUTFS_FILE_EXTENT_KEY);
if (ret < 0)
goto out;
/* the offline extent must be re-inserted if we fail from here on */
ins_ofl = true;
}
/* insert new file extent */
*blkno = found.blkno;
ext.blk_off = iblock;
ext.blkno = found.blkno;
ext.blocks = 1;
ext.flags = 0;
ret = insert_extent(sb, &ext, ino, SCOUTFS_FILE_EXTENT_KEY);
if (ret < 0)
goto out;
/* the new file extent must be removed if we fail from here on */
rem_ext = true;
/* and remove free extents */
fr = ext;
fr.blk_off = ext.blkno;
ret = remove_extent(sb, &fr, sbi->node_id,
SCOUTFS_FREE_EXTENT_BLKNO_KEY);
if (ret)
goto out;
/* advance cursor if we're using it */
if (curs->blocks) {
if (--curs->blocks == 0)
curs->blkno = 0;
else
curs->blkno++;
}
ret = 0;
out:
/* undo the file extent changes in the reverse order they were made */
if (ret) {
if (rem_ext) {
err = remove_extent(sb, &ext, ino,
SCOUTFS_FILE_EXTENT_KEY);
BUG_ON(err);
}
if (ins_ofl) {
err = insert_extent(sb, &ofl, ino,
SCOUTFS_FILE_EXTENT_KEY);
BUG_ON(err);
}
}
up_write(&datinf->alloc_rwsem);
trace_printk("ret %d\n", ret);
return ret;
}
/*
 * The get_block callback handed to the generic mpage and buffer_head
 * helpers.  It maps the logical block 'iblock' of the inode to a
 * device block in 'bh'.
 *
 * The file extent that could contain iblock is looked up.  If an
 * online extent contains it then bh is mapped to the block and b_size
 * is set to the number of bytes left in the extent.  If nothing maps
 * the block and 'create' is set then a single block is allocated and
 * mapped as a new buffer.  Otherwise bh is left unmapped with a zero
 * b_size.
 *
 * Returns 0 or a negative errno.  -EINVAL is returned when iblock
 * falls inside an offline extent and the inode isn't being staged.
 */
static int scoutfs_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh, int create)
{
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	DECLARE_DATA_INFO(sb, datinf);
	u8 last_bytes[MAX_KEY_BYTES];
	u8 key_bytes[MAX_KEY_BYTES];
	struct scoutfs_key_buf last;
	struct scoutfs_key_buf key;
	struct native_extent ext;
	bool was_offline = false;
	u64 blkno;
	u64 off;
	int ret;

	bh->b_blocknr = 0;
	bh->b_size = 0;

	/* search from the single block at iblock ... */
	ext.blk_off = iblock;
	ext.blocks = 1;
	ext.blkno = 0;
	ext.flags = 0;
	init_extent_key(&key, key_bytes, &ext, scoutfs_ino(inode),
			SCOUTFS_FILE_EXTENT_KEY);

	/* ... up to the largest possible key for this inode */
	memset(&ext, ~0, sizeof(ext));
	init_extent_key(&last, last_bytes, &ext, scoutfs_ino(inode),
			SCOUTFS_FILE_EXTENT_KEY);

	/*
	 * XXX think about how far this next can go, given locking and
	 * item consistency.
	 */
	down_read(&datinf->alloc_rwsem);
	ret = scoutfs_item_next_same(sb, &key, &last, NULL);
	up_read(&datinf->alloc_rwsem);
	if (ret < 0) {
		/* no extent at all is an empty extent that can't intersect */
		if (ret == -ENOENT)
			memset(&ext, 0, sizeof(ext));
		else
			goto out;
	} else {
		load_extent(&ext, &key);
		trace_printk("found nei "EXTF"\n", EXTA(&ext));
	}

	/*
	 * Use the extent if it intersects.  The search can return an
	 * extent that starts after iblock, so the offline flag is only
	 * meaningful once we know that the extent contains iblock.  A
	 * hole that precedes an offline extent is still just a hole.
	 */
	if (iblock >= ext.blk_off && iblock < (ext.blk_off + ext.blocks)) {

		if (ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE) {
			/* non-stage can't write to offline */
			if (!si->staging) {
				ret = -EINVAL;
				goto out;
			}
			was_offline = true;
		} else {
			/* found online extent */
			off = iblock - ext.blk_off;
			map_bh(bh, inode->i_sb, ext.blkno + off);
			bh->b_size = min_t(u64, SIZE_MAX,
				(ext.blocks - off) << SCOUTFS_BLOCK_SHIFT);
		}
	}

	if (!buffer_mapped(bh) && create) {
		ret = allocate_block(inode, iblock, &blkno, was_offline);
		if (ret)
			goto out;

		map_bh(bh, inode->i_sb, blkno);
		/* b_size is in bytes: exactly one block was allocated */
		bh->b_size = (size_t)1 << SCOUTFS_BLOCK_SHIFT;
		set_buffer_new(bh);
	}

	ret = 0;
out:
	trace_printk("ino %llu iblock %llu create %d ret %d bnr %llu size %zu\n",
		     scoutfs_ino(inode), (u64)iblock, create, ret,
		     (u64)bh->b_blocknr, bh->b_size);

	return ret;
}
/* ->readpage: read a single page with the mpage helper and our block mapping. */
static int scoutfs_readpage(struct file *file, struct page *page)
{
	int ret;

	ret = mpage_readpage(page, scoutfs_get_block);

	return ret;
}
/* ->readpages: read a list of pages with the mpage helper and our block mapping. */
static int scoutfs_readpages(struct file *file, struct address_space *mapping,
			     struct list_head *pages, unsigned nr_pages)
{
	int ret;

	ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block);

	return ret;
}
/* ->writepage: write a single page with the buffer_head helper and our block mapping. */
static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
{
	int ret;

	ret = block_write_full_page(page, scoutfs_get_block, wbc);

	return ret;
}
/* ->writepages: write back a mapping's dirty pages with the mpage helper. */
static int scoutfs_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	int ret;

	ret = mpage_writepages(mapping, wbc, scoutfs_get_block);

	return ret;
}
/*
 * ->write_begin: hold the transaction, dirty the inode item, and then
 * let the generic helper prepare the page for the write.
 *
 * On success the transaction is left held for scoutfs_write_end() to
 * release.  On any failure it is released here and the error is
 * returned.
 */
static int scoutfs_write_begin(struct file *file,
			       struct address_space *mapping, loff_t pos,
			       unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	int ret;

	trace_printk("ino %llu pos %llu len %u\n",
		     scoutfs_ino(inode), (u64)pos, len);

	ret = scoutfs_hold_trans(sb);
	if (ret)
		return ret;

	/* can't re-enter fs, have trans */
	flags |= AOP_FLAG_NOFS;

	/* generic write_end updates i_size and calls dirty_inode */
	ret = scoutfs_dirty_inode_item(inode);
	if (!ret)
		ret = block_write_begin(mapping, pos, len, flags, pagep,
					scoutfs_get_block);

	/* nobody will call write_end for us after a failure */
	if (ret)
		scoutfs_release_trans(sb);

	return ret;
}
/*
 * ->write_end: finish the write with the generic helper and, if any
 * bytes were copied, update the inode item and queue the inode for
 * writeback.  Writes that aren't staging also set the data seq and
 * bump the data version.  The transaction that write_begin held is
 * always released.
 *
 * Returns whatever generic_write_end() returned.
 */
static int scoutfs_write_end(struct file *file, struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	int ret;

	trace_printk("ino %llu pgind %lu pos %llu len %u copied %d\n",
		     scoutfs_ino(inode), page->index, (u64)pos, len, copied);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (ret <= 0)
		goto release;

	if (!si->staging) {
		scoutfs_inode_set_data_seq(inode);
		scoutfs_inode_inc_data_version(inode);
	}

	/* XXX kind of a big hammer, inode life cycle needs work */
	scoutfs_update_inode_item(inode);
	scoutfs_inode_queue_writeback(inode);

release:
	scoutfs_release_trans(sb);
	return ret;
}
/*
 * Return the extents that intersect with the given byte range. It doesn't
 * trim the returned extents to the byte range.
 *
 * Each extent is reported one loop iteration after it was found so that
 * FIEMAP_EXTENT_LAST can be set on it once we know that nothing
 * follows it within the range.  Offline extents are reported with
 * FIEMAP_EXTENT_UNKNOWN.
 *
 * Returns 0 or a negative errno from the flag check, the item lookup,
 * or from filling the caller's extent array.
 */
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
struct super_block *sb = inode->i_sb;
const u8 type = SCOUTFS_FILE_EXTENT_KEY;
const u64 ino = scoutfs_ino(inode);
u8 last_bytes[MAX_KEY_BYTES];
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf last;
struct scoutfs_key_buf key;
struct native_extent ext;
u64 logical;
u64 blk_off;
u64 final;
u64 phys;
u64 size;
u32 flags;
int ret = 0;
ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
if (ret)
goto out;
/* an all-ones extent gives the largest key, bounding the search */
memset(&ext, ~0, sizeof(ext));
init_extent_key(&last, last_bytes, &ext, ino, type);
/* first and last logical blocks covered by the byte range */
blk_off = start >> SCOUTFS_BLOCK_SHIFT;
final = (start + len - 1) >> SCOUTFS_BLOCK_SHIFT;
/* size == 0 means there is no pending extent to report yet */
size = 0;
flags = 0;
/* XXX overkill? */
mutex_lock(&inode->i_mutex);
for (;;) {
/* search for the extent that could contain blk_off */
ext.blk_off = blk_off;
ext.blkno = 0;
ext.blocks = 1;
ext.flags = 0;
init_extent_key(&key, key_bytes, &ext, ino, type);
ret = scoutfs_item_next_same(sb, &key, &last, NULL);
if (ret < 0) {
if (ret != -ENOENT)
break;
/* no more extents: the pending one, if any, is the last */
flags |= FIEMAP_EXTENT_LAST;
ret = 0;
}
load_extent(&ext, &key);
/* an extent that starts past the range also ends the walk */
if (ext.blk_off > final)
flags |= FIEMAP_EXTENT_LAST;
/* report the extent remembered by the previous iteration */
if (size) {
ret = fiemap_fill_next_extent(fieinfo, logical, phys,
size, flags);
if (ret != 0) {
/* 1 is not an error; presumably the caller's extent array is full */
if (ret == 1)
ret = 0;
break;
}
}
if (flags & FIEMAP_EXTENT_LAST)
break;
/* remember this extent so the next iteration can report it */
logical = ext.blk_off << SCOUTFS_BLOCK_SHIFT;
phys = ext.blkno << SCOUTFS_BLOCK_SHIFT;
size = ext.blocks << SCOUTFS_BLOCK_SHIFT;
flags = ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE ?
FIEMAP_EXTENT_UNKNOWN : 0;
/* continue the search just past this extent */
blk_off = ext.blk_off + ext.blocks;
}
mutex_unlock(&inode->i_mutex);
out:
return ret;
}
/*
 * Address space operations for file data.  Every one of them maps
 * blocks through scoutfs_get_block().
 */
const struct address_space_operations scoutfs_file_aops = {
.readpage = scoutfs_readpage,
.readpages = scoutfs_readpages,
.writepage = scoutfs_writepage,
.writepages = scoutfs_writepages,
.write_begin = scoutfs_write_begin,
.write_end = scoutfs_write_end,
};
/*
 * File operations for regular files.  Reads and writes use the generic
 * helpers; the ioctl and fsync handlers are defined outside this file.
 */
const struct file_operations scoutfs_file_fops = {
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
.unlocked_ioctl = scoutfs_ioctl,
.fsync = scoutfs_file_fsync,
};
/*
 * The mutex_is_held callback for the cursor rhashtable.  It
 * unconditionally reports that the mutex is held.
 * NOTE(review): presumably acceptable because the table is only
 * modified under alloc_rwsem or during setup -- confirm.
 */
static int derpy_global_mutex_is_held(void)
{
return 1;
}
/*
 * Parameters for the cursor rhashtable: task_cursor structs are keyed
 * by the bytes of their embedded cursor_id.
 */
static struct rhashtable_params cursor_hash_params = {
.key_len = member_sizeof(struct task_cursor, id),
.key_offset = offsetof(struct task_cursor, id),
.head_offset = offsetof(struct task_cursor, hash_head),
.hashfn = arch_fast_hash,
.grow_decision = rht_grow_above_75,
.shrink_decision = rht_shrink_below_30,
.mutex_is_held = derpy_global_mutex_is_held,
};
/*
 * Free every cursor on the LRU list and then tear down the hash table
 * that indexed them.
 */
static void destroy_cursors(struct data_info *datinf)
{
	struct task_cursor *victim;
	struct task_cursor *next;

	/* the _safe walk lets us free each entry as we go */
	list_for_each_entry_safe(victim, next, &datinf->cursor_lru, list_head) {
		list_del_init(&victim->list_head);
		kfree(victim);
	}

	rhashtable_destroy(&datinf->cursors);
}
/*
 * Allocate and initialize the per-super data allocation state: the
 * allocation rwsem, the cursor hash table, and all NR_CURSORS cursors,
 * which are hashed and put on the LRU list.  The result is stored in
 * the super block info only once everything has succeeded.
 *
 * Returns 0 on success or -ENOMEM on any failure, after freeing
 * whatever was allocated.
 */
int scoutfs_data_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct data_info *datinf;
	struct task_cursor *curs;
	int ret;
	int i;

	datinf = kzalloc(sizeof(*datinf), GFP_KERNEL);
	if (!datinf)
		return -ENOMEM;

	init_rwsem(&datinf->alloc_rwsem);
	INIT_LIST_HEAD(&datinf->cursor_lru);
	/* always search for large aligned extents */
	datinf->next_large_blkno = LARGE_EXTENT_BLOCKS;

	ret = rhashtable_init(&datinf->cursors, &cursor_hash_params);
	if (ret) {
		/* any init failure is reported as -ENOMEM */
		ret = -ENOMEM;
		goto free_datinf;
	}

	/* just allocate all of these up front */
	for (i = 0; i < NR_CURSORS; i++) {
		curs = kzalloc(sizeof(*curs), GFP_KERNEL);
		if (!curs) {
			/* frees the cursors made so far and the hash table */
			destroy_cursors(datinf);
			ret = -ENOMEM;
			goto free_datinf;
		}

		/* a distinct placeholder pid gives each cursor a unique key */
		curs->id.pid = i;
		rhashtable_insert(&datinf->cursors, &curs->hash_head,
				  GFP_KERNEL);
		list_add(&curs->list_head, &datinf->cursor_lru);
	}

	sbi->data_info = datinf;
	return 0;

free_datinf:
	kfree(datinf);
	return ret;
}
void scoutfs_data_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct data_info *datinf = sbi->data_info;
if (datinf) {
destroy_cursors(datinf);
kfree(datinf);
}
}