scoutfs/kmod/src/data.c
Mark Fasheh d1ae486d83 scoutfs: provide ->llseek
Without this we return -ESPIPE when a process tries to seek on a regular
file.

Signed-off-by: Mark Fasheh <mfasheh@versity.com>
2017-08-14 19:57:13 -07:00


/*
* Copyright (C) 2017 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
#include <linux/buffer_head.h>
#include <linux/hash.h>
#include "format.h"
#include "super.h"
#include "inode.h"
#include "key.h"
#include "data.h"
#include "trans.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "item.h"
#include "ioctl.h"
#include "client.h"
#include "lock.h"
#define EXTF "[off %llu bno %llu bks %llu fl %x]"
#define EXTA(ne) (ne)->blk_off, (ne)->blkno, (ne)->blocks, (ne)->flags
/*
* scoutfs uses extent items to reference file data.
*
* The extent items map logical file regions to device blocks at 4K
* block granularity. File data isn't overwritten in place, so
* overwrites don't generate extent item locking and modification.
*
* Nodes have their own free extent items stored at their node id to
* avoid lock contention during allocation and freeing. These pools are
* filled and drained with messages to the server, which allocates
* segment-sized regions.
*
* Block allocation maintains a fixed number of allocation cursors that
* remember the position of tasks within free regions. This is very
* simple and maintains decent extents for simple streaming writes. It
* eventually won't be good enough and we'll spend complexity on
* delalloc but we want to put that off as long as possible.
*
* There are no unwritten extents. As we dirty file data pages, possibly
* allocating extents for the first time, we track their inodes. Before
* we commit dirty metadata we write out all tracked inodes. This
* ensures that data is persistent before the metadata that references
* it is visible.
*
* Files can have offline extents. They have no allocated file data but
* the offline status represents file data that can be recalled through
* staging. While offline the extents have their physical blkno set to
* the logical blk_off so that all the usual block extent calculations
* still hold. It's mapped back to phys == 0 for fiemap.
*
* Weirdly, the extents are indexed by the *final* logical block and
* blkno of the extent. This lets us search for neighbouring previous
* extents with a _next() call and avoids having to implement item
* reading that iterates backwards through the manifest and segments.
*
* There are two items that track free extents, one indexed by the block
* location of the free extent and one indexed by the size of the free
* extent. This means that one allocation can update a great number of
* items throughout the tree as extents are split and merged and their
* items are created and deleted. This can introduce inconsistent
* failure states. We'll some day address that with preallocation and
* pinning.
*
* XXX
* - truncate
* - mmap
* - better io error propagation
* - forced unmount with dirty data
* - direct IO
*/
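/*
 * An illustrative example (not from the original source): a file
 * extent mapping logical blocks 100-107 to device blocks 5000-5007 is
 * stored under last_blk_off == 107, last_blkno == 5007, blocks == 8.
 * A _next() search from the key for logical block 100 lands on this
 * item precisely because it's indexed by its final block, which is how
 * the code finds the extent that covers or follows a given block
 * without iterating backwards.
 */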
/* more than enough for a few tasks per core on moderate hardware */
#define NR_CURSORS 4096
#define CURSOR_HASH_HEADS (PAGE_SIZE / sizeof(void *) / 2)
#define CURSOR_HASH_BITS ilog2(CURSOR_HASH_HEADS)
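/*
 * Per-super allocation state: the allocation lock, the hint for the
 * next large free extent search, and the pool of task cursors.
 */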
struct data_info {
struct rw_semaphore alloc_rwsem;
u64 next_large_blkno;
struct list_head cursor_lru;
struct hlist_head cursor_hash[CURSOR_HASH_HEADS];
};
#define DECLARE_DATA_INFO(sb, name) \
struct data_info *name = SCOUTFS_SB(sb)->data_info
/*
* This is the size of the extents tracked by a cursor, and so ends up
* being the largest file extent item length produced by concurrent
* streaming writes.
*
* XXX We probably want this to be a bit larger to further reduce the
* amount of item churn involved in truncating tremendous files.
*/
#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS
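/*
 * A cursor tracks a task's position within a free region so that its
 * repeated single-block allocations are merged into large extents.
 */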
struct task_cursor {
u64 blkno;
u64 blocks;
struct hlist_node hnode;
struct list_head list_head;
struct task_struct *task;
pid_t pid;
};
/*
* Both file extent and free extent keys are converted into this native
* form for manipulation. The free extents set blk_off to blkno.
*/
struct native_extent {
u64 blk_off;
u64 blkno;
u64 blocks;
u8 flags;
};
/* avoiding dynamic on-stack array initializers :/ */
union extent_key_union {
struct scoutfs_file_extent_key file;
struct scoutfs_free_extent_blkno_key blkno;
struct scoutfs_free_extent_blocks_key blocks;
} __packed;
#define MAX_KEY_BYTES sizeof(union extent_key_union)
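/*
 * Pack a file extent key from the native form.  Fields are stored
 * big-endian so the packed keys sort in logical order, and the key is
 * indexed by the *last* block of the extent as described above.
 */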
static void init_file_extent_key(struct scoutfs_key_buf *key, void *key_bytes,
struct native_extent *ext, u64 arg)
{
struct scoutfs_file_extent_key *fkey = key_bytes;
fkey->zone = SCOUTFS_FS_ZONE;
fkey->ino = cpu_to_be64(arg);
fkey->type = SCOUTFS_FILE_EXTENT_TYPE;
fkey->last_blk_off = cpu_to_be64(ext->blk_off + ext->blocks - 1);
fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1);
fkey->blocks = cpu_to_be64(ext->blocks);
fkey->flags = ext->flags;
scoutfs_key_init(key, fkey, sizeof(struct scoutfs_file_extent_key));
}
#define INIT_FREE_EXTENT_KEY(which_type, key, key_bytes, ext, arg, type) \
do { \
struct which_type *fkey = key_bytes; \
\
fkey->zone = SCOUTFS_NODE_ZONE; \
fkey->node_id = cpu_to_be64(arg); \
fkey->type = type; \
fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); \
fkey->blocks = cpu_to_be64(ext->blocks); \
\
scoutfs_key_init(key, fkey, sizeof(struct which_type)); \
} while (0)
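/* Initialize a key of the given extent item type from the native form. */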
static void init_extent_key(struct scoutfs_key_buf *key, void *key_bytes,
struct native_extent *ext, u64 arg, u8 type)
{
if (type == SCOUTFS_FILE_EXTENT_TYPE)
init_file_extent_key(key, key_bytes, ext, arg);
else if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blkno_key,
key, key_bytes, ext, arg, type);
else
INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blocks_key,
key, key_bytes, ext, arg, type);
}
/* XXX could have some sanity checks */
static void load_file_extent(struct native_extent *ext,
struct scoutfs_key_buf *key)
{
struct scoutfs_file_extent_key *fkey = key->data;
ext->blocks = be64_to_cpu(fkey->blocks);
ext->blk_off = be64_to_cpu(fkey->last_blk_off) - ext->blocks + 1;
ext->blkno = be64_to_cpu(fkey->last_blkno) - ext->blocks + 1;
ext->flags = fkey->flags;
}
#define LOAD_FREE_EXTENT(which_type, ext, key) \
do { \
struct which_type *fkey = key->data; \
\
ext->blkno = be64_to_cpu(fkey->last_blkno) - \
be64_to_cpu(fkey->blocks) + 1; \
ext->blk_off = ext->blkno; \
ext->blocks = be64_to_cpu(fkey->blocks); \
ext->flags = 0; \
} while (0)
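/*
 * Load any of the three extent key formats into the common native
 * form, dispatching on the type byte that all three structs store at
 * the same offset (enforced by the BUILD_BUG_ON below).
 */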
static void load_extent(struct native_extent *ext, struct scoutfs_key_buf *key)
{
struct scoutfs_free_extent_blocks_key *fkey = key->data;
BUILD_BUG_ON(offsetof(struct scoutfs_file_extent_key, type) !=
offsetof(struct scoutfs_free_extent_blkno_key, type) ||
offsetof(struct scoutfs_file_extent_key, type) !=
offsetof(struct scoutfs_free_extent_blocks_key, type));
if (fkey->type == SCOUTFS_FILE_EXTENT_TYPE)
load_file_extent(ext, key);
else if (fkey->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
LOAD_FREE_EXTENT(scoutfs_free_extent_blkno_key, ext, key);
else
LOAD_FREE_EXTENT(scoutfs_free_extent_blocks_key, ext, key);
}
/*
* Merge two extents if they're adjacent. First we arrange them to
* only test their adjoining endpoints, then we're careful not to
* reference fields after we've modified them.
*/
static int merge_extents(struct native_extent *mod,
struct native_extent *ext)
{
struct native_extent *left;
struct native_extent *right;
if (mod->blk_off < ext->blk_off) {
left = mod;
right = ext;
} else {
left = ext;
right = mod;
}
if (left->blk_off + left->blocks == right->blk_off &&
left->blkno + left->blocks == right->blkno &&
left->flags == right->flags) {
mod->blk_off = left->blk_off;
mod->blkno = left->blkno;
mod->blocks = left->blocks + right->blocks;
return 1;
}
return 0;
}
/*
* The caller has ensured that the inner extent is entirely within
* the outer extent. Fill out the left and right regions of outer
* that don't overlap with inner.
*/
static void trim_extents(struct native_extent *left,
struct native_extent *right,
struct native_extent *outer,
struct native_extent *inner)
{
left->blk_off = outer->blk_off;
left->blkno = outer->blkno;
left->blocks = inner->blk_off - outer->blk_off;
left->flags = outer->flags;
right->blk_off = inner->blk_off + inner->blocks;
right->blkno = inner->blkno + inner->blocks;
right->blocks = (outer->blk_off + outer->blocks) - right->blk_off;
right->flags = outer->flags;
}
/* return true if inner is fully contained by outer */
static bool extents_within(struct native_extent *outer,
struct native_extent *inner)
{
u64 outer_end = outer->blk_off + outer->blocks - 1;
u64 inner_end = inner->blk_off + inner->blocks - 1;
return outer->blk_off <= inner->blk_off && outer_end >= inner_end;
}
/*
* Find an adjacent extent in the direction of the delta. If we can
* merge with it then we modify the incoming cur extent. nei is set to
* the neighbour we found. If we didn't merge then nei's blocks is set
* to 0.
*/
static int try_merge(struct super_block *sb, struct native_extent *cur,
s64 delta, struct native_extent *nei, u64 arg, u8 type)
{
u8 last_bytes[MAX_KEY_BYTES];
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf last;
struct scoutfs_key_buf key;
struct native_extent ext;
int ret;
memset(nei, 0, sizeof(struct native_extent));
/* short circuit prev search for common first block alloc */
if (cur->blk_off == 0 && delta < 0)
return 0;
memset(&ext, ~0, sizeof(ext));
init_extent_key(&last, last_bytes, &ext, arg, type);
ext.blk_off = cur->blk_off + delta;
ext.blkno = cur->blkno + delta;
ext.blocks = 1;
ext.flags = 0;
init_extent_key(&key, key_bytes, &ext, arg, type);
ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
goto out;
}
load_extent(&ext, &key);
trace_printk("merge nei "EXTF"\n", EXTA(&ext));
if (merge_extents(cur, &ext))
*nei = ext;
ret = 0;
out:
return ret;
}
/*
* We have two item types for indexing free extents by either the
* location of the extent or the size of the extent. When we create
* logical extents we might be finding neighbouring extents that could
* be merged. We can only search for neighbours in the location items.
* Once we find them we mirror the item modifications for both the
* location and size items.
*
* If this returns an error then nothing will have changed.
*/
static int modify_items(struct super_block *sb, struct native_extent *ext,
u64 arg, u8 type, bool create)
{
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf key;
int ret;
int err;
trace_printk("mod cre %u "EXTF"\n", create, EXTA(ext));
BUG_ON(type != SCOUTFS_FILE_EXTENT_TYPE &&
type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
init_extent_key(&key, key_bytes, ext, arg, type);
ret = create ? scoutfs_item_create(sb, &key, NULL) :
scoutfs_item_delete(sb, &key, NULL);
if (ret == 0 && type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
init_extent_key(&key, key_bytes, ext, arg,
SCOUTFS_FREE_EXTENT_BLOCKS_TYPE);
ret = create ? scoutfs_item_create(sb, &key, NULL) :
scoutfs_item_delete(sb, &key, NULL);
if (ret) {
init_extent_key(&key, key_bytes, ext, arg, type);
err = create ? scoutfs_item_delete(sb, &key, NULL) :
scoutfs_item_create(sb, &key, NULL);
BUG_ON(err);
}
}
return ret;
}
/*
* Insert a new extent. We see if it can be merged with adjacent
* existing extents. If this returns an error then the existing extents
* will not have changed.
*/
static int insert_extent(struct super_block *sb,
struct native_extent *caller_ins,
u64 arg, u8 type)
{
struct native_extent left;
struct native_extent right;
struct native_extent ins = *caller_ins;
bool del_ins = false;
bool ins_left = false;
int err;
int ret;
trace_printk("inserting "EXTF"\n", EXTA(caller_ins));
/* find previous that might be adjacent */
ret = try_merge(sb, &ins, -1, &left, arg, type) ?:
try_merge(sb, &ins, 1, &right, arg, type);
if (ret < 0)
goto out;
trace_printk("merge left "EXTF"\n", EXTA(&left));
trace_printk("merge right "EXTF"\n", EXTA(&right));
ret = modify_items(sb, &ins, arg, type, true);
if (ret)
goto out;
del_ins = true;
if (left.blocks) {
ret = modify_items(sb, &left, arg, type, false);
if (ret)
goto undo;
ins_left = true;
}
if (right.blocks)
ret = modify_items(sb, &right, arg, type, false);
undo:
if (ret) {
if (ins_left) {
err = modify_items(sb, &left, arg, type, true);
BUG_ON(err);
}
if (del_ins) {
err = modify_items(sb, &ins, arg, type, false);
BUG_ON(err);
}
}
out:
return ret;
}
/*
* Remove a portion of an existing extent. The removal might leave
* behind non-overlapping edges of the existing extent. If this returns
* an error then the existing extent will not have changed.
*/
static int remove_extent(struct super_block *sb,
struct native_extent *rem, u64 arg, u8 type)
{
u8 last_bytes[MAX_KEY_BYTES];
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf last;
struct scoutfs_key_buf key;
struct native_extent left = {0,};
struct native_extent right = {0,};
struct native_extent outer;
bool rem_left = false;
bool rem_right = false;
int err = 0;
int ret;
trace_printk("removing "EXTF"\n", EXTA(rem));
memset(&outer, ~0, sizeof(outer));
init_extent_key(&last, last_bytes, &outer, arg, type);
/* find outer existing extent that contains removal extent */
init_extent_key(&key, key_bytes, rem, arg, type);
ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
if (ret)
goto out;
load_extent(&outer, &key);
trace_printk("outer "EXTF"\n", EXTA(&outer));
if (!extents_within(&outer, rem) || outer.flags != rem->flags) {
ret = -EIO;
goto out;
}
trim_extents(&left, &right, &outer, rem);
trace_printk("trim left "EXTF"\n", EXTA(&left));
trace_printk("trim right "EXTF"\n", EXTA(&right));
if (left.blocks) {
ret = modify_items(sb, &left, arg, type, true);
if (ret)
goto out;
rem_left = true;
}
if (right.blocks) {
ret = modify_items(sb, &right, arg, type, true);
if (ret)
goto out;
rem_right = true;
}
ret = modify_items(sb, &outer, arg, type, false);
out:
if (ret) {
if (rem_right) {
err = modify_items(sb, &right, arg, type, false);
BUG_ON(err);
}
if (rem_left) {
err = modify_items(sb, &left, arg, type, false);
BUG_ON(err);
}
}
trace_printk("ret %d\n", ret);
return ret;
}
/*
* Free extents whose blocks fall inside the specified logical block
* range.
*
* If 'offline' is given then blocks are freed but the extent items are
* left behind and their _OFFLINE flag is set.
*
* This is the low level extent item manipulation code. We hold and
* release the transaction so the caller doesn't have to deal with
* partial progress.
*/
int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock,
u64 len, bool offline)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
u8 last_bytes[MAX_KEY_BYTES];
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf last;
struct scoutfs_key_buf key;
struct native_extent found;
struct native_extent rng;
struct native_extent ext;
struct native_extent ofl;
struct native_extent fr;
DECLARE_ITEM_COUNT(cnt);
bool rem_fr = false;
bool ins_ext = false;
bool holding = false;
int ret = 0;
int err;
trace_printk("iblock %llu len %llu offline %u\n",
iblock, len, offline);
memset(&ext, ~0, sizeof(ext));
init_extent_key(&last, last_bytes, &ext, ino, SCOUTFS_FILE_EXTENT_TYPE);
rng.blk_off = iblock;
rng.blocks = len;
rng.blkno = 0;
rng.flags = 0;
while (rng.blocks) {
/* find the next extent that could include our first block */
init_extent_key(&key, key_bytes, &rng, ino,
SCOUTFS_FILE_EXTENT_TYPE);
ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
break;
}
load_extent(&found, &key);
trace_printk("found "EXTF"\n", EXTA(&found));
/* XXX corruption: offline has phys == log */
if ((found.flags & SCOUTFS_FILE_EXTENT_OFFLINE) &&
found.blkno != found.blk_off) {
ret = -EIO;
break;
}
/* we're done if the found extent is past us */
if (found.blk_off >= rng.blk_off + rng.blocks) {
ret = 0;
break;
}
/* find the intersection */
ext.blk_off = max(rng.blk_off, found.blk_off);
ext.blocks = min(rng.blk_off + rng.blocks,
found.blk_off + found.blocks) - ext.blk_off;
ext.blkno = found.blkno + (ext.blk_off - found.blk_off);
ext.flags = found.flags;
/* next search will be past the extent we truncate */
rng.blk_off = ext.blk_off + ext.blocks;
if (rng.blk_off < iblock + len)
rng.blocks = (iblock + len) - rng.blk_off;
else
rng.blocks = 0;
/* done if already offline */
if (offline && (ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE))
continue;
scoutfs_count_trunc_block(&cnt);
ret = scoutfs_hold_trans(sb, &cnt);
if (ret)
break;
holding = true;
/* free the old extent if it was allocated */
if (ext.blkno) {
fr = ext;
fr.blk_off = fr.blkno;
ret = insert_extent(sb, &fr, sbi->node_id,
SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
if (ret)
break;
rem_fr = true;
}
/* always remove the overlapping file extent */
ret = remove_extent(sb, &ext, ino, SCOUTFS_FILE_EXTENT_TYPE);
if (ret)
break;
ins_ext = true;
/* maybe add new file extents with the offline flag set */
if (offline) {
ofl = ext;
ofl.blkno = ofl.blk_off;
ofl.flags = SCOUTFS_FILE_EXTENT_OFFLINE;
ret = insert_extent(sb, &ofl, ino,
SCOUTFS_FILE_EXTENT_TYPE);
if (ret)
break;
}
rem_fr = false;
ins_ext = false;
scoutfs_release_trans(sb);
holding = false;
}
if (ret) {
if (ins_ext) {
err = insert_extent(sb, &ext, ino,
SCOUTFS_FILE_EXTENT_TYPE);
BUG_ON(err);
}
if (rem_fr) {
err = remove_extent(sb, &fr, sbi->node_id,
SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
BUG_ON(err);
}
}
if (holding)
scoutfs_release_trans(sb);
return ret;
}
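/*
 * Cursors are preallocated with NULL tasks and fake pids, so both the
 * task pointer and the pid are mixed into the bucket hash.
 */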
static inline struct hlist_head *cursor_head(struct data_info *datinf,
struct task_struct *task,
pid_t pid)
{
unsigned h = hash_ptr(task, CURSOR_HASH_BITS) ^
hash_long(pid, CURSOR_HASH_BITS);
return &datinf->cursor_hash[h];
}
static struct task_cursor *search_head(struct hlist_head *head,
struct task_struct *task, pid_t pid)
{
struct task_cursor *curs;
hlist_for_each_entry(curs, head, hnode) {
if (curs->task == task && curs->pid == pid)
return curs;
}
return NULL;
}
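/* Free all the cursors; called on setup failure and at unmount. */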
static void destroy_cursors(struct data_info *datinf)
{
struct task_cursor *curs;
struct hlist_node *tmp;
int i;
for (i = 0; i < CURSOR_HASH_HEADS; i++) {
hlist_for_each_entry_safe(curs, tmp, &datinf->cursor_hash[i],
hnode) {
hlist_del_init(&curs->hnode);
kfree(curs);
}
}
}
/*
* These cheesy cursors are only meant to encourage nice IO patterns for
* concurrent tasks either streaming large file writes or creating lots
* of small files. It will do very poorly in many other situations. To
* do better we'd need to go further down the road to delalloc and take
* more surrounding context into account.
*/
static struct task_cursor *get_cursor(struct data_info *datinf)
{
struct task_struct *task = current;
pid_t pid = current->pid;
struct hlist_head *head;
struct task_cursor *curs;
head = cursor_head(datinf, task, pid);
curs = search_head(head, task, pid);
if (!curs) {
curs = list_last_entry(&datinf->cursor_lru,
struct task_cursor, list_head);
trace_printk("resetting curs %p was task %p pid %u\n",
curs, task, pid);
hlist_del_init(&curs->hnode);
curs->task = task;
curs->pid = pid;
hlist_add_head(&curs->hnode, head);
curs->blkno = 0;
curs->blocks = 0;
}
list_move(&curs->list_head, &datinf->cursor_lru);
return curs;
}
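/*
 * Ask the server for a bulk allocation of segments and record them as
 * local free extents.  The returned segno array is 0-terminated and
 * runs of adjacent segnos are merged into single larger extents before
 * they're inserted.
 */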
static int bulk_alloc(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct native_extent ext;
u64 *segnos = NULL;
int ret = 0;
int i;
segnos = scoutfs_client_bulk_alloc(sb);
if (IS_ERR(segnos)) {
ret = PTR_ERR(segnos);
goto out;
}
for (i = 0; segnos[i]; i++) {
/* merge or set this one */
if (i > 0 && (segnos[i] == segnos[i - 1] + 1)) {
ext.blocks += SCOUTFS_SEGMENT_BLOCKS;
trace_printk("merged segno [%u] %llu blocks %llu\n",
i, segnos[i], ext.blocks);
} else {
ext.blkno = segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT;
ext.blocks = SCOUTFS_SEGMENT_BLOCKS;
trace_printk("set extent segno [%u] %llu blkno %llu\n",
i, segnos[i], ext.blkno);
}
/* don't write if we merge with the next one */
if ((segnos[i] + 1) == segnos[i + 1])
continue;
trace_printk("inserting [%u] "EXTF"\n", i, EXTA(&ext));
ext.blk_off = ext.blkno;
ext.flags = 0;
ret = insert_extent(sb, &ext, sbi->node_id,
SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
if (ret)
break;
}
out:
if (!IS_ERR_OR_NULL(segnos))
kfree(segnos);
/* XXX don't orphan segnos on error, crash recovery with server */
return ret;
}
/*
* Allocate a single block for the logical block offset in the file.
*
* We try to merge single block allocations into large extents by using
* per-task cursors. Each cursor tracks a block region that should be
* searched for free extents. If we don't have a cursor, or we find
* free space outside of our cursor, then we look for the next large
* free extent.
*/
static int allocate_block(struct inode *inode, sector_t iblock, u64 *blkno,
bool was_offline)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_DATA_INFO(sb, datinf);
u8 last_bytes[MAX_KEY_BYTES];
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf last;
struct scoutfs_key_buf key;
struct native_extent last_ext;
struct native_extent found;
struct native_extent ext;
struct native_extent ofl;
struct native_extent fr;
struct task_cursor *curs;
bool alloced = false;
const u64 ino = scoutfs_ino(inode);
bool rem_ext = false;
bool ins_ofl = false;
u8 type;
int err;
int ret;
memset(&last_ext, ~0, sizeof(last_ext));
down_write(&datinf->alloc_rwsem);
curs = get_cursor(datinf);
/* start from the cursor or look for the next large extent */
reset_cursor:
if (curs->blocks) {
ext.blkno = curs->blkno;
ext.blocks = 0;
type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
} else {
ext.blkno = datinf->next_large_blkno;
ext.blocks = LARGE_EXTENT_BLOCKS;
type = SCOUTFS_FREE_EXTENT_BLOCKS_TYPE;
}
ext.flags = 0;
retry:
trace_printk("searching %llu,%llu curs %p task %p pid %u %llu,%llu\n",
ext.blkno, ext.blocks, curs, curs->task, curs->pid,
curs->blkno, curs->blocks);
ext.blk_off = ext.blkno;
init_extent_key(&key, key_bytes, &ext, sbi->node_id, type);
init_extent_key(&last, last_bytes, &last_ext, sbi->node_id, type);
ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
if (ret < 0) {
if (ret == -ENOENT) {
/* if the cursor's empty fall back to next large */
if (ext.blkno && ext.blocks == 0) {
curs->blkno = 0;
curs->blocks = 0;
goto reset_cursor;
}
/* wrap the search for large extents */
if (ext.blkno > LARGE_EXTENT_BLOCKS && ext.blocks) {
datinf->next_large_blkno = LARGE_EXTENT_BLOCKS;
ext.blkno = datinf->next_large_blkno;
goto retry;
}
/* ask the server for more extents */
if (ext.blocks && !alloced) {
ret = bulk_alloc(sb);
if (ret < 0)
goto out;
alloced = true;
goto retry;
}
/* finally look for any free block at all */
if (ext.blocks) {
ext.blkno = 0;
ext.blocks = 0;
type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
goto retry;
}
/* after all that return -ENOSPC */
ret = -ENOSPC;
}
goto out;
}
load_extent(&found, &key);
trace_printk("found nei "EXTF"\n", EXTA(&found));
/* look for a new large extent if found is outside cursor */
if (curs->blocks &&
(found.blkno + found.blocks <= curs->blkno ||
found.blkno >= curs->blkno + curs->blocks)) {
curs->blkno = 0;
curs->blocks = 0;
goto reset_cursor;
}
/*
* Set the cursor if:
* - we didn't already have one
* - it's large enough for a large extent with alignment padding
* - the sufficiently large free region is past next large
*/
if (!curs->blocks &&
found.blocks >= (2 * LARGE_EXTENT_BLOCKS) &&
(found.blkno + found.blocks - (2 * LARGE_EXTENT_BLOCKS) >=
datinf->next_large_blkno)) {
curs->blkno = ALIGN(max(found.blkno, datinf->next_large_blkno),
LARGE_EXTENT_BLOCKS);
curs->blocks = LARGE_EXTENT_BLOCKS;
found.blkno = curs->blkno;
found.blocks = curs->blocks;
datinf->next_large_blkno = curs->blkno + LARGE_EXTENT_BLOCKS;
}
trace_printk("using %llu,%llu curs %llu,%llu\n",
found.blkno, found.blocks, curs->blkno, curs->blocks);
/* remove old offline block if we're staging */
if (was_offline) {
ofl.blk_off = iblock;
ofl.blkno = iblock;
ofl.blocks = 1;
ofl.flags = SCOUTFS_FILE_EXTENT_OFFLINE;
ret = remove_extent(sb, &ofl, ino, SCOUTFS_FILE_EXTENT_TYPE);
if (ret < 0)
goto out;
ins_ofl = true;
}
/* insert new file extent */
*blkno = found.blkno;
ext.blk_off = iblock;
ext.blkno = found.blkno;
ext.blocks = 1;
ext.flags = 0;
ret = insert_extent(sb, &ext, ino, SCOUTFS_FILE_EXTENT_TYPE);
if (ret < 0)
goto out;
rem_ext = true;
/* and remove free extents */
fr = ext;
fr.blk_off = ext.blkno;
ret = remove_extent(sb, &fr, sbi->node_id,
SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
if (ret)
goto out;
/* advance cursor if we're using it */
if (curs->blocks) {
if (--curs->blocks == 0)
curs->blkno = 0;
else
curs->blkno++;
}
ret = 0;
out:
if (ret) {
if (rem_ext) {
err = remove_extent(sb, &ext, ino,
SCOUTFS_FILE_EXTENT_TYPE);
BUG_ON(err);
}
if (ins_ofl) {
err = insert_extent(sb, &ofl, ino,
SCOUTFS_FILE_EXTENT_TYPE);
BUG_ON(err);
}
}
up_write(&datinf->alloc_rwsem);
trace_printk("ret %d\n", ret);
return ret;
}
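/*
 * Map the file extent that covers the logical block, if any, into the
 * buffer head.  With create set we allocate a new block for a hole, or
 * for an offline extent while staging.  Writes to offline extents
 * outside of staging are refused.
 */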
static int scoutfs_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
DECLARE_DATA_INFO(sb, datinf);
u8 last_bytes[MAX_KEY_BYTES];
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf last;
struct scoutfs_key_buf key;
struct native_extent ext;
bool was_offline = false;
u64 blkno;
u64 off;
int ret;
bh->b_blocknr = 0;
bh->b_size = 0;
ext.blk_off = iblock;
ext.blocks = 1;
ext.blkno = 0;
ext.flags = 0;
init_extent_key(&key, key_bytes, &ext, scoutfs_ino(inode),
SCOUTFS_FILE_EXTENT_TYPE);
memset(&ext, ~0, sizeof(ext));
init_extent_key(&last, last_bytes, &ext, scoutfs_ino(inode),
SCOUTFS_FILE_EXTENT_TYPE);
/*
* XXX think about how far this next can go, given locking and
* item consistency.
*/
down_read(&datinf->alloc_rwsem);
ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
up_read(&datinf->alloc_rwsem);
if (ret < 0) {
if (ret == -ENOENT)
memset(&ext, 0, sizeof(ext));
else
goto out;
} else {
load_extent(&ext, &key);
trace_printk("found nei "EXTF"\n", EXTA(&ext));
}
if ((ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE) && !si->staging) {
ret = -EINVAL;
goto out;
}
/* use the extent if it intersects */
if (iblock >= ext.blk_off && iblock < (ext.blk_off + ext.blocks)) {
if (ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE) {
/* non-stage can't write to offline */
if (!si->staging) {
ret = -EINVAL;
goto out;
}
was_offline = true;
} else {
/* found online extent */
off = iblock - ext.blk_off;
map_bh(bh, inode->i_sb, ext.blkno + off);
bh->b_size = min_t(u64, SIZE_MAX,
(ext.blocks - off) << SCOUTFS_BLOCK_SHIFT);
}
}
if (!buffer_mapped(bh) && create) {
ret = allocate_block(inode, iblock, &blkno, was_offline);
if (ret)
goto out;
map_bh(bh, inode->i_sb, blkno);
bh->b_size = SCOUTFS_BLOCK_SIZE;
set_buffer_new(bh);
}
ret = 0;
out:
trace_printk("ino %llu iblock %llu create %d ret %d bnr %llu size %zu\n",
scoutfs_ino(inode), (u64)iblock, create, ret,
(u64)bh->b_blocknr, bh->b_size);
return ret;
}
static int scoutfs_readpage(struct file *file, struct page *page)
{
return mpage_readpage(page, scoutfs_get_block);
}
static int scoutfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
return mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block);
}
static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, scoutfs_get_block, wbc);
}
static int scoutfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return mpage_writepages(mapping, wbc, scoutfs_get_block);
}
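/*
 * We hold a transaction and dirty the inode item before dirtying page
 * cache so that the inode update in write_end can't fail.
 */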
static int scoutfs_write_begin(struct file *file,
struct address_space *mapping, loff_t pos,
unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
DECLARE_ITEM_COUNT(cnt);
int ret;
trace_printk("ino %llu pos %llu len %u\n",
scoutfs_ino(inode), (u64)pos, len);
scoutfs_count_write_begin(&cnt);
ret = scoutfs_hold_trans(sb, &cnt);
if (ret)
goto out;
/* can't re-enter fs, have trans */
flags |= AOP_FLAG_NOFS;
/* generic write_end updates i_size and calls dirty_inode */
ret = scoutfs_dirty_inode_item(inode, NULL);
if (ret == 0)
ret = block_write_begin(mapping, pos, len, flags, pagep,
scoutfs_get_block);
if (ret)
scoutfs_release_trans(sb);
out:
return ret;
}
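/*
 * After generic code copies the data we bump the data version and seq,
 * unless we're staging, then update the inode item and release the
 * transaction that write_begin acquired.
 */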
static int scoutfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
int ret;
trace_printk("ino %llu pgind %lu pos %llu len %u copied %d\n",
scoutfs_ino(inode), page->index, (u64)pos, len, copied);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
if (ret > 0) {
if (!si->staging) {
scoutfs_inode_set_data_seq(inode);
scoutfs_inode_inc_data_version(inode);
}
/* XXX kind of a big hammer, inode life cycle needs work */
scoutfs_update_inode_item(inode);
scoutfs_inode_queue_writeback(inode);
}
scoutfs_release_trans(sb);
return ret;
}
/*
* Return the extents that intersect with the given byte range. It doesn't
* trim the returned extents to the byte range.
*/
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
struct super_block *sb = inode->i_sb;
const u8 type = SCOUTFS_FILE_EXTENT_TYPE;
const u64 ino = scoutfs_ino(inode);
u8 last_bytes[MAX_KEY_BYTES];
u8 key_bytes[MAX_KEY_BYTES];
struct scoutfs_key_buf last;
struct scoutfs_key_buf key;
struct native_extent ext;
struct scoutfs_lock *inode_lock = NULL;
u64 logical;
u64 blk_off;
u64 final;
u64 phys;
u64 size;
u32 flags;
int ret;
ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
if (ret)
return ret;
memset(&ext, ~0, sizeof(ext));
init_extent_key(&last, last_bytes, &ext, ino, type);
blk_off = start >> SCOUTFS_BLOCK_SHIFT;
final = (start + len - 1) >> SCOUTFS_BLOCK_SHIFT;
size = 0;
flags = 0;
/* XXX overkill? */
mutex_lock(&inode->i_mutex);
ret = scoutfs_lock_ino_group(sb, DLM_LOCK_PR, scoutfs_ino(inode),
&inode_lock);
if (ret)
goto out;
for (;;) {
ext.blk_off = blk_off;
ext.blkno = 0;
ext.blocks = 1;
ext.flags = 0;
init_extent_key(&key, key_bytes, &ext, ino, type);
ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
if (ret < 0) {
if (ret != -ENOENT)
break;
flags |= FIEMAP_EXTENT_LAST;
ret = 0;
}
load_extent(&ext, &key);
if (ext.blk_off > final)
flags |= FIEMAP_EXTENT_LAST;
if (size) {
ret = fiemap_fill_next_extent(fieinfo, logical, phys,
size, flags);
if (ret != 0) {
if (ret == 1)
ret = 0;
break;
}
}
if (flags & FIEMAP_EXTENT_LAST)
break;
logical = ext.blk_off << SCOUTFS_BLOCK_SHIFT;
phys = ext.blkno << SCOUTFS_BLOCK_SHIFT;
size = ext.blocks << SCOUTFS_BLOCK_SHIFT;
flags = 0;
if (ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE) {
phys = 0;
flags = FIEMAP_EXTENT_UNKNOWN;
}
blk_off = ext.blk_off + ext.blocks;
}
scoutfs_unlock(sb, inode_lock);
out:
mutex_unlock(&inode->i_mutex);
return ret;
}
const struct address_space_operations scoutfs_file_aops = {
.readpage = scoutfs_readpage,
.readpages = scoutfs_readpages,
.writepage = scoutfs_writepage,
.writepages = scoutfs_writepages,
.write_begin = scoutfs_write_begin,
.write_end = scoutfs_write_end,
};
const struct file_operations scoutfs_file_fops = {
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
.unlocked_ioctl = scoutfs_ioctl,
.fsync = scoutfs_file_fsync,
.llseek = generic_file_llseek,
};
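/*
 * Allocate the per-super data info and prepopulate the fixed pool of
 * task cursors with placeholder pids so that get_cursor() can always
 * recycle from the LRU instead of allocating.
 */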
int scoutfs_data_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct hlist_head *head;
struct data_info *datinf;
struct task_cursor *curs;
int i;
datinf = kzalloc(sizeof(struct data_info), GFP_KERNEL);
if (!datinf)
return -ENOMEM;
init_rwsem(&datinf->alloc_rwsem);
INIT_LIST_HEAD(&datinf->cursor_lru);
/* always search for large aligned extents */
datinf->next_large_blkno = LARGE_EXTENT_BLOCKS;
for (i = 0; i < CURSOR_HASH_HEADS; i++)
INIT_HLIST_HEAD(&datinf->cursor_hash[i]);
/* just allocate all of these up front */
for (i = 0; i < NR_CURSORS; i++) {
curs = kzalloc(sizeof(struct task_cursor), GFP_KERNEL);
if (!curs) {
destroy_cursors(datinf);
kfree(datinf);
return -ENOMEM;
}
curs->pid = i;
head = cursor_head(datinf, curs->task, curs->pid);
hlist_add_head(&curs->hnode, head);
list_add(&curs->list_head, &datinf->cursor_lru);
}
sbi->data_info = datinf;
return 0;
}
void scoutfs_data_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct data_info *datinf = sbi->data_info;
if (datinf) {
destroy_cursors(datinf);
kfree(datinf);
}
}