mirror of https://github.com/versity/scoutfs.git
synced 2026-02-09 20:20:08 +00:00

Without this we return -ESPIPE when a process tries to seek on a regular file.
Signed-off-by: Mark Fasheh <mfasheh@versity.com>

/*
 * Copyright (C) 2017 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
#include <linux/buffer_head.h>
#include <linux/hash.h>

#include "format.h"
#include "super.h"
#include "inode.h"
#include "key.h"
#include "data.h"
#include "trans.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "item.h"
#include "ioctl.h"
#include "client.h"
#include "lock.h"

#define EXTF "[off %llu bno %llu bks %llu fl %x]"
#define EXTA(ne) (ne)->blk_off, (ne)->blkno, (ne)->blocks, (ne)->flags

/*
 * scoutfs uses extent items to reference file data.
 *
 * The extent items map logical file regions to device blocks at 4K
 * block granularity. File data isn't overwritten in place, so
 * overwrites don't generate extent item locking and modification.
 *
 * Nodes have their own free extent items stored at their node id to
 * avoid lock contention during allocation and freeing. These pools are
 * filled and drained with messages to the server, which allocates
 * segment-sized regions.
 *
 * Block allocation maintains a fixed number of allocation cursors that
 * remember the position of tasks within free regions. This is very
 * simple and maintains decent extents for simple streaming writes. It
 * eventually won't be good enough and we'll spend complexity on
 * delalloc, but we want to put that off as long as possible.
 *
 * There are no unwritten extents. As we dirty file data pages, possibly
 * allocating extents for the first time, we track their inodes. Before
 * we commit dirty metadata we write out all tracked inodes. This
 * ensures that data is persistent before the metadata that references
 * it is visible.
 *
 * Files can have offline extents. They have no allocated file data but
 * the offline status represents file data that can be recalled through
 * staging. While offline the extents have their physical blkno set to
 * the logical blk_off so that all the usual block extent calculations
 * still hold. It's mapped back to phys == 0 for fiemap.
 *
 * Weirdly, the extents are indexed by the *final* logical block and
 * blkno of the extent. This lets us search for neighbouring previous
 * extents with a _next() call and avoids having to implement item
 * reading that iterates backwards through the manifest and segments.
 *
 * There are two items that track free extents, one indexed by the block
 * location of the free extent and one indexed by the size of the free
 * extent. This means that one allocation can update a great number of
 * items throughout the tree as items are created and deleted as extents
 * are split and merged. This can introduce inconsistent failure
 * states. We'll address that some day with preallocation and pinning.
 *
 * XXX
 *  - truncate
 *  - mmap
 *  - better io error propagation
 *  - forced unmount with dirty data
 *  - direct IO
 */
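
/*
 * As an example of the final-block indexing: a file extent with
 * blk_off 8, blkno 100, and blocks 4 is keyed by last_blk_off 11 and
 * last_blkno 103. A _next() search from a key built with
 * (blk_off = iblock, blocks = 1) then lands on the first extent whose
 * final block is at or after iblock, which is how the code below finds
 * a possibly overlapping previous extent without iterating backwards.
 */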

/* more than enough for a few tasks per core on moderate hardware */
#define NR_CURSORS 4096
#define CURSOR_HASH_HEADS (PAGE_SIZE / sizeof(void *) / 2)
#define CURSOR_HASH_BITS ilog2(CURSOR_HASH_HEADS)
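
/*
 * With 4K pages and 8 byte pointers, for example, CURSOR_HASH_HEADS
 * works out to 4096 / 8 / 2 == 256 heads, so CURSOR_HASH_BITS is 8 and
 * the hash table consumes half a page.
 */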

struct data_info {
        struct rw_semaphore alloc_rwsem;
        u64 next_large_blkno;
        struct list_head cursor_lru;
        struct hlist_head cursor_hash[CURSOR_HASH_HEADS];
};

#define DECLARE_DATA_INFO(sb, name) \
        struct data_info *name = SCOUTFS_SB(sb)->data_info

/*
 * This is the size of extents that are tracked by a cursor and so end
 * up being the largest file item extent length given concurrent
 * streaming writes.
 *
 * XXX We probably want this to be a bit larger to further reduce the
 * amount of item churn involved in truncating tremendous files.
 */
#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS

struct task_cursor {
        u64 blkno;
        u64 blocks;
        struct hlist_node hnode;
        struct list_head list_head;
        struct task_struct *task;
        pid_t pid;
};

/*
 * Both file extent and free extent keys are converted into this native
 * form for manipulation. The free extents set blk_off to blkno.
 */
struct native_extent {
        u64 blk_off;
        u64 blkno;
        u64 blocks;
        u8 flags;
};

/* avoiding dynamic on-stack array initializers :/ */
union extent_key_union {
        struct scoutfs_file_extent_key file;
        struct scoutfs_free_extent_blkno_key blkno;
        struct scoutfs_free_extent_blocks_key blocks;
} __packed;
#define MAX_KEY_BYTES sizeof(union extent_key_union)

static void init_file_extent_key(struct scoutfs_key_buf *key, void *key_bytes,
                                 struct native_extent *ext, u64 arg)
{
        struct scoutfs_file_extent_key *fkey = key_bytes;

        fkey->zone = SCOUTFS_FS_ZONE;
        fkey->ino = cpu_to_be64(arg);
        fkey->type = SCOUTFS_FILE_EXTENT_TYPE;
        fkey->last_blk_off = cpu_to_be64(ext->blk_off + ext->blocks - 1);
        fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1);
        fkey->blocks = cpu_to_be64(ext->blocks);
        fkey->flags = ext->flags;

        scoutfs_key_init(key, fkey, sizeof(struct scoutfs_file_extent_key));
}

#define INIT_FREE_EXTENT_KEY(which_type, key, key_bytes, ext, arg, type) \
do { \
        struct which_type *fkey = key_bytes; \
 \
        fkey->zone = SCOUTFS_NODE_ZONE; \
        fkey->node_id = cpu_to_be64(arg); \
        fkey->type = type; \
        fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); \
        fkey->blocks = cpu_to_be64(ext->blocks); \
 \
        scoutfs_key_init(key, fkey, sizeof(struct which_type)); \
} while (0)

static void init_extent_key(struct scoutfs_key_buf *key, void *key_bytes,
                            struct native_extent *ext, u64 arg, u8 type)
{
        if (type == SCOUTFS_FILE_EXTENT_TYPE)
                init_file_extent_key(key, key_bytes, ext, arg);
        else if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
                INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blkno_key,
                                     key, key_bytes, ext, arg, type);
        else
                INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blocks_key,
                                     key, key_bytes, ext, arg, type);
}

/* XXX could have some sanity checks */
static void load_file_extent(struct native_extent *ext,
                             struct scoutfs_key_buf *key)
{
        struct scoutfs_file_extent_key *fkey = key->data;

        ext->blocks = be64_to_cpu(fkey->blocks);
        ext->blk_off = be64_to_cpu(fkey->last_blk_off) - ext->blocks + 1;
        ext->blkno = be64_to_cpu(fkey->last_blkno) - ext->blocks + 1;
        ext->flags = fkey->flags;
}

#define LOAD_FREE_EXTENT(which_type, ext, key) \
do { \
        struct which_type *fkey = key->data; \
 \
        ext->blkno = be64_to_cpu(fkey->last_blkno) - \
                     be64_to_cpu(fkey->blocks) + 1; \
        ext->blk_off = ext->blkno; \
        ext->blocks = be64_to_cpu(fkey->blocks); \
        ext->flags = 0; \
} while (0)

static void load_extent(struct native_extent *ext, struct scoutfs_key_buf *key)
{
        struct scoutfs_free_extent_blocks_key *fkey = key->data;

        BUILD_BUG_ON(offsetof(struct scoutfs_file_extent_key, type) !=
                     offsetof(struct scoutfs_free_extent_blkno_key, type) ||
                     offsetof(struct scoutfs_file_extent_key, type) !=
                     offsetof(struct scoutfs_free_extent_blocks_key, type));

        if (fkey->type == SCOUTFS_FILE_EXTENT_TYPE)
                load_file_extent(ext, key);
        else if (fkey->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
                LOAD_FREE_EXTENT(scoutfs_free_extent_blkno_key, ext, key);
        else
                LOAD_FREE_EXTENT(scoutfs_free_extent_blocks_key, ext, key);
}

/*
 * Merge two extents if they're adjacent. First we arrange them to
 * only test their adjoining endpoints, then we're careful not to
 * reference fields after we've modified them.
 */
static int merge_extents(struct native_extent *mod,
                         struct native_extent *ext)
{
        struct native_extent *left;
        struct native_extent *right;

        if (mod->blk_off < ext->blk_off) {
                left = mod;
                right = ext;
        } else {
                left = ext;
                right = mod;
        }

        if (left->blk_off + left->blocks == right->blk_off &&
            left->blkno + left->blocks == right->blkno &&
            left->flags == right->flags) {
                mod->blk_off = left->blk_off;
                mod->blkno = left->blkno;
                mod->blocks = left->blocks + right->blocks;
                return 1;
        }

        return 0;
}
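
/*
 * For example, [off 0 bno 100 bks 4 fl 0] and [off 4 bno 104 bks 2 fl 0]
 * adjoin both logically and physically with equal flags, so they merge
 * into [off 0 bno 100 bks 6 fl 0]. A right extent at bno 200 would not
 * merge because the device blocks aren't contiguous.
 */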

/*
 * The caller has ensured that the inner extent is entirely within
 * the outer extent. Fill out the left and right regions of outer
 * that don't overlap with inner.
 */
static void trim_extents(struct native_extent *left,
                         struct native_extent *right,
                         struct native_extent *outer,
                         struct native_extent *inner)
{
        left->blk_off = outer->blk_off;
        left->blkno = outer->blkno;
        left->blocks = inner->blk_off - outer->blk_off;
        left->flags = outer->flags;

        right->blk_off = inner->blk_off + inner->blocks;
        right->blkno = inner->blkno + inner->blocks;
        right->blocks = (outer->blk_off + outer->blocks) - right->blk_off;
        right->flags = outer->flags;
}

/* return true if inner is fully contained by outer */
static bool extents_within(struct native_extent *outer,
                           struct native_extent *inner)
{
        u64 outer_end = outer->blk_off + outer->blocks - 1;
        u64 inner_end = inner->blk_off + inner->blocks - 1;

        return outer->blk_off <= inner->blk_off && outer_end >= inner_end;
}
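
/*
 * For example, trimming inner [off 2 bks 2] out of outer [off 0 bks 6]
 * fills left with [off 0 bks 2] and right with [off 4 bks 2]. If inner
 * shares an edge with outer then that side comes back with blocks == 0,
 * which callers test before creating an item for it.
 */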

/*
 * Find an adjacent extent in the direction of the delta. If we can
 * merge with it then we modify the incoming cur extent. nei is set to
 * the neighbour we found. If we didn't merge then nei's blocks is set
 * to 0.
 */
static int try_merge(struct super_block *sb, struct native_extent *cur,
                     s64 delta, struct native_extent *nei, u64 arg, u8 type)
{
        u8 last_bytes[MAX_KEY_BYTES];
        u8 key_bytes[MAX_KEY_BYTES];
        struct scoutfs_key_buf last;
        struct scoutfs_key_buf key;
        struct native_extent ext;
        int ret;

        memset(nei, 0, sizeof(struct native_extent));

        /* short circuit prev search for common first block alloc */
        if (cur->blk_off == 0 && delta < 0)
                return 0;

        memset(&ext, ~0, sizeof(ext));
        init_extent_key(&last, last_bytes, &ext, arg, type);

        ext.blk_off = cur->blk_off + delta;
        ext.blkno = cur->blkno + delta;
        ext.blocks = 1;
        ext.flags = 0;
        init_extent_key(&key, key_bytes, &ext, arg, type);

        ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
        if (ret < 0) {
                if (ret == -ENOENT)
                        ret = 0;
                goto out;
        }

        load_extent(&ext, &key);
        trace_printk("merge nei "EXTF"\n", EXTA(&ext));

        if (merge_extents(cur, &ext))
                *nei = ext;
        ret = 0;
out:
        return ret;
}

/*
 * We have two item types for indexing free extents by either the
 * location of the extent or the size of the extent. When we create
 * logical extents we might be finding neighbouring extents that could
 * be merged. We can only search for neighbours in the location items.
 * Once we find them we mirror the item modifications for both the
 * location and size items.
 *
 * If this returns an error then nothing will have changed.
 */
static int modify_items(struct super_block *sb, struct native_extent *ext,
                        u64 arg, u8 type, bool create)
{
        u8 key_bytes[MAX_KEY_BYTES];
        struct scoutfs_key_buf key;
        int ret;
        int err;

        trace_printk("mod cre %u "EXTF"\n", create, EXTA(ext));

        BUG_ON(type != SCOUTFS_FILE_EXTENT_TYPE &&
               type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE);

        init_extent_key(&key, key_bytes, ext, arg, type);
        ret = create ? scoutfs_item_create(sb, &key, NULL) :
                       scoutfs_item_delete(sb, &key, NULL);

        if (ret == 0 && type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
                init_extent_key(&key, key_bytes, ext, arg,
                                SCOUTFS_FREE_EXTENT_BLOCKS_TYPE);
                ret = create ? scoutfs_item_create(sb, &key, NULL) :
                               scoutfs_item_delete(sb, &key, NULL);
                if (ret) {
                        init_extent_key(&key, key_bytes, ext, arg, type);
                        err = create ? scoutfs_item_delete(sb, &key, NULL) :
                                       scoutfs_item_create(sb, &key, NULL);
                        BUG_ON(err);
                }
        }

        return ret;
}
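
/*
 * For example, creating the free extent [bno 100 bks 6] with the
 * _BLKNO type also creates its mirrored _BLOCKS item keyed by length.
 * If the second modification fails the first is undone, so the pair of
 * index items always changes together.
 */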

/*
 * Insert a new extent. We see if it can be merged with adjacent
 * existing extents. If this returns an error then the existing extents
 * will not have changed.
 */
static int insert_extent(struct super_block *sb,
                         struct native_extent *caller_ins,
                         u64 arg, u8 type)
{
        struct native_extent left;
        struct native_extent right;
        struct native_extent ins = *caller_ins;
        bool del_ins = false;
        bool ins_left = false;
        int err;
        int ret;

        trace_printk("inserting "EXTF"\n", EXTA(caller_ins));

        /* find previous that might be adjacent */
        ret = try_merge(sb, &ins, -1, &left, arg, type) ?:
              try_merge(sb, &ins, 1, &right, arg, type);
        if (ret < 0)
                goto out;

        trace_printk("merge left "EXTF"\n", EXTA(&left));
        trace_printk("merge right "EXTF"\n", EXTA(&right));

        ret = modify_items(sb, &ins, arg, type, true);
        if (ret)
                goto out;
        del_ins = true;

        if (left.blocks) {
                ret = modify_items(sb, &left, arg, type, false);
                if (ret)
                        goto undo;
                ins_left = true;
        }

        if (right.blocks)
                ret = modify_items(sb, &right, arg, type, false);

undo:
        if (ret) {
                if (ins_left) {
                        err = modify_items(sb, &left, arg, type, true);
                        BUG_ON(err);
                }
                if (del_ins) {
                        err = modify_items(sb, &ins, arg, type, false);
                        BUG_ON(err);
                }
        }

out:
        return ret;
}

/*
 * Remove a portion of an existing extent. The removal might leave
 * behind non-overlapping edges of the existing extent. If this returns
 * an error then the existing extent will not have changed.
 */
static int remove_extent(struct super_block *sb,
                         struct native_extent *rem, u64 arg, u8 type)
{
        u8 last_bytes[MAX_KEY_BYTES];
        u8 key_bytes[MAX_KEY_BYTES];
        struct scoutfs_key_buf last;
        struct scoutfs_key_buf key;
        struct native_extent left = {0,};
        struct native_extent right = {0,};
        struct native_extent outer;
        bool rem_left = false;
        bool rem_right = false;
        int err = 0;
        int ret;

        trace_printk("removing "EXTF"\n", EXTA(rem));

        memset(&outer, ~0, sizeof(outer));
        init_extent_key(&last, last_bytes, &outer, arg, type);

        /* find outer existing extent that contains removal extent */
        init_extent_key(&key, key_bytes, rem, arg, type);
        ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
        if (ret)
                goto out;

        load_extent(&outer, &key);

        trace_printk("outer "EXTF"\n", EXTA(&outer));

        if (!extents_within(&outer, rem) || outer.flags != rem->flags) {
                ret = -EIO;
                goto out;
        }

        trim_extents(&left, &right, &outer, rem);

        trace_printk("trim left "EXTF"\n", EXTA(&left));
        trace_printk("trim right "EXTF"\n", EXTA(&right));

        if (left.blocks) {
                ret = modify_items(sb, &left, arg, type, true);
                if (ret)
                        goto out;
                rem_left = true;
        }

        if (right.blocks) {
                ret = modify_items(sb, &right, arg, type, true);
                if (ret)
                        goto out;
                rem_right = true;
        }

        ret = modify_items(sb, &outer, arg, type, false);

out:
        if (ret) {
                if (rem_right) {
                        err = modify_items(sb, &right, arg, type, false);
                        BUG_ON(err);
                }
                if (rem_left) {
                        err = modify_items(sb, &left, arg, type, false);
                        BUG_ON(err);
                }
        }

        trace_printk("ret %d\n", ret);
        return ret;
}

/*
 * Free extents whose blocks fall inside the specified logical block
 * range.
 *
 * If 'offline' is given then blocks are freed but the extent items are
 * left behind and their _OFFLINE flag is set.
 *
 * This is the low level extent item manipulation code. We hold and
 * release the transaction so the caller doesn't have to deal with
 * partial progress.
 */
int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock,
                                u64 len, bool offline)
{
        struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
        u8 last_bytes[MAX_KEY_BYTES];
        u8 key_bytes[MAX_KEY_BYTES];
        struct scoutfs_key_buf last;
        struct scoutfs_key_buf key;
        struct native_extent found;
        struct native_extent rng;
        struct native_extent ext;
        struct native_extent ofl;
        struct native_extent fr;
        DECLARE_ITEM_COUNT(cnt);
        bool rem_fr = false;
        bool ins_ext = false;
        bool holding = false;
        int ret = 0;
        int err;

        trace_printk("iblock %llu len %llu offline %u\n",
                     iblock, len, offline);

        memset(&ext, ~0, sizeof(ext));
        init_extent_key(&last, last_bytes, &ext, ino, SCOUTFS_FILE_EXTENT_TYPE);

        rng.blk_off = iblock;
        rng.blocks = len;
        rng.blkno = 0;
        rng.flags = 0;

        while (rng.blocks) {
                /* find the next extent that could include our first block */
                init_extent_key(&key, key_bytes, &rng, ino,
                                SCOUTFS_FILE_EXTENT_TYPE);

                ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
                if (ret < 0) {
                        if (ret == -ENOENT)
                                ret = 0;
                        break;
                }

                load_extent(&found, &key);
                trace_printk("found "EXTF"\n", EXTA(&found));

                /* XXX corruption: offline has phys == log */
                if ((found.flags & SCOUTFS_FILE_EXTENT_OFFLINE) &&
                    found.blkno != found.blk_off) {
                        ret = -EIO;
                        break;
                }

                /* we're done if the found extent is past us */
                if (found.blk_off >= rng.blk_off + rng.blocks) {
                        ret = 0;
                        break;
                }

                /* find the intersection */
                ext.blk_off = max(rng.blk_off, found.blk_off);
                ext.blocks = min(rng.blk_off + rng.blocks,
                                 found.blk_off + found.blocks) - ext.blk_off;
                ext.blkno = found.blkno + (ext.blk_off - found.blk_off);
                ext.flags = found.flags;

                /* next search will be past the extent we truncate */
                rng.blk_off = ext.blk_off + ext.blocks;
                if (rng.blk_off < iblock + len)
                        rng.blocks = (iblock + len) - rng.blk_off;
                else
                        rng.blocks = 0;

                /* done if already offline */
                if (offline && (ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE))
                        continue;

                scoutfs_count_trunc_block(&cnt);
                ret = scoutfs_hold_trans(sb, &cnt);
                if (ret)
                        break;
                holding = true;

                /* free the old extent if it was allocated */
                if (ext.blkno) {
                        fr = ext;
                        fr.blk_off = fr.blkno;
                        ret = insert_extent(sb, &fr, sbi->node_id,
                                            SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
                        if (ret)
                                break;
                        rem_fr = true;
                }

                /* always remove the overlapping file extent */
                ret = remove_extent(sb, &ext, ino, SCOUTFS_FILE_EXTENT_TYPE);
                if (ret)
                        break;
                ins_ext = true;

                /* maybe add new file extents with the offline flag set */
                if (offline) {
                        ofl = ext;
                        ofl.blkno = ofl.blk_off;
                        ofl.flags = SCOUTFS_FILE_EXTENT_OFFLINE;
                        ret = insert_extent(sb, &ofl, ino,
                                            SCOUTFS_FILE_EXTENT_TYPE);
                        if (ret)
                                break;
                }

                rem_fr = false;
                ins_ext = false;
                scoutfs_release_trans(sb);
                holding = false;
        }

        if (ret) {
                if (ins_ext) {
                        err = insert_extent(sb, &ext, ino,
                                            SCOUTFS_FILE_EXTENT_TYPE);
                        BUG_ON(err);
                }
                if (rem_fr) {
                        err = remove_extent(sb, &fr, sbi->node_id,
                                            SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
                        BUG_ON(err);
                }
        }

        if (holding)
                scoutfs_release_trans(sb);

        return ret;
}
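
/*
 * For example, a caller truncating file data away entirely would pass
 * offline == false to free the blocks and drop the extent items, while
 * an archiving caller releasing file data would pass offline == true so
 * the items stay behind with _OFFLINE set and the data can be staged
 * back in later.
 */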

static inline struct hlist_head *cursor_head(struct data_info *datinf,
                                             struct task_struct *task,
                                             pid_t pid)
{
        unsigned h = hash_ptr(task, CURSOR_HASH_BITS) ^
                     hash_long(pid, CURSOR_HASH_BITS);

        return &datinf->cursor_hash[h];
}

static struct task_cursor *search_head(struct hlist_head *head,
                                       struct task_struct *task, pid_t pid)
{
        struct task_cursor *curs;

        hlist_for_each_entry(curs, head, hnode) {
                if (curs->task == task && curs->pid == pid)
                        return curs;
        }

        return NULL;
}

static void destroy_cursors(struct data_info *datinf)
{
        struct task_cursor *curs;
        struct hlist_node *tmp;
        int i;

        for (i = 0; i < CURSOR_HASH_HEADS; i++) {
                hlist_for_each_entry_safe(curs, tmp, &datinf->cursor_hash[i],
                                          hnode) {
                        hlist_del_init(&curs->hnode);
                        kfree(curs);
                }
        }
}

/*
 * These cheesy cursors are only meant to encourage nice IO patterns for
 * concurrent tasks either streaming large file writes or creating lots
 * of small files. They will do very poorly in many other situations. To
 * do better we'd need to go further down the road to delalloc and take
 * more surrounding context into account.
 */
static struct task_cursor *get_cursor(struct data_info *datinf)
{
        struct task_struct *task = current;
        pid_t pid = current->pid;
        struct hlist_head *head;
        struct task_cursor *curs;

        head = cursor_head(datinf, task, pid);
        curs = search_head(head, task, pid);
        if (!curs) {
                curs = list_last_entry(&datinf->cursor_lru,
                                       struct task_cursor, list_head);
                trace_printk("resetting curs %p was task %p pid %u\n",
                             curs, task, pid);
                hlist_del_init(&curs->hnode);
                curs->task = task;
                curs->pid = pid;
                hlist_add_head(&curs->hnode, head);
                curs->blkno = 0;
                curs->blocks = 0;
        }

        list_move(&curs->list_head, &datinf->cursor_lru);

        return curs;
}

static int bulk_alloc(struct super_block *sb)
{
        struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
        struct native_extent ext;
        u64 *segnos = NULL;
        int ret = 0;
        int i;

        segnos = scoutfs_client_bulk_alloc(sb);
        if (IS_ERR(segnos)) {
                ret = PTR_ERR(segnos);
                goto out;
        }

        for (i = 0; segnos[i]; i++) {

                /* merge or set this one */
                if (i > 0 && (segnos[i] == segnos[i - 1] + 1)) {
                        ext.blocks += SCOUTFS_SEGMENT_BLOCKS;
                        trace_printk("merged segno [%u] %llu blocks %llu\n",
                                     i, segnos[i], ext.blocks);
                } else {
                        ext.blkno = segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT;
                        ext.blocks = SCOUTFS_SEGMENT_BLOCKS;
                        trace_printk("set extent segno [%u] %llu blkno %llu\n",
                                     i, segnos[i], ext.blkno);
                }

                /* don't write if we merge with the next one */
                if ((segnos[i] + 1) == segnos[i + 1])
                        continue;

                trace_printk("inserting [%u] "EXTF"\n", i, EXTA(&ext));

                ext.blk_off = ext.blkno;
                ext.flags = 0;
                ret = insert_extent(sb, &ext, sbi->node_id,
                                    SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
                if (ret)
                        break;
        }

out:
        if (!IS_ERR_OR_NULL(segnos))
                kfree(segnos);

        /* XXX don't orphan segnos on error, crash recovery with server */

        return ret;
}
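
/*
 * For example, if the server handed back segnos 5, 6, and 9, the loop
 * above inserts two free extents: one covering segnos 5 and 6 (blkno
 * 5 << SCOUTFS_SEGMENT_BLOCK_SHIFT for 2 * SCOUTFS_SEGMENT_BLOCKS
 * blocks) and one covering segno 9 alone.
 */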

/*
 * Allocate a single block for the logical block offset in the file.
 *
 * We try to merge single block allocations into large extents by using
 * per-task cursors. Each cursor tracks a block region that should be
 * searched for free extents. If we don't have a cursor, or we find
 * free space outside of our cursor, then we look for the next large
 * free extent.
 */
static int allocate_block(struct inode *inode, sector_t iblock, u64 *blkno,
                          bool was_offline)
{
        struct super_block *sb = inode->i_sb;
        struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
        DECLARE_DATA_INFO(sb, datinf);
        u8 last_bytes[MAX_KEY_BYTES];
        u8 key_bytes[MAX_KEY_BYTES];
        struct scoutfs_key_buf last;
        struct scoutfs_key_buf key;
        struct native_extent last_ext;
        struct native_extent found;
        struct native_extent ext;
        struct native_extent ofl;
        struct native_extent fr;
        struct task_cursor *curs;
        bool alloced = false;
        const u64 ino = scoutfs_ino(inode);
        bool rem_ext = false;
        bool ins_ofl = false;
        u8 type;
        int err;
        int ret;

        memset(&last_ext, ~0, sizeof(last_ext));

        down_write(&datinf->alloc_rwsem);

        curs = get_cursor(datinf);

        /* start from the cursor or look for the next large extent */
reset_cursor:
        if (curs->blocks) {
                ext.blkno = curs->blkno;
                ext.blocks = 0;
                type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
        } else {
                ext.blkno = datinf->next_large_blkno;
                ext.blocks = LARGE_EXTENT_BLOCKS;
                type = SCOUTFS_FREE_EXTENT_BLOCKS_TYPE;
        }
        ext.flags = 0;

retry:
        trace_printk("searching %llu,%llu curs %p task %p pid %u %llu,%llu\n",
                     ext.blkno, ext.blocks, curs, curs->task, curs->pid,
                     curs->blkno, curs->blocks);

        ext.blk_off = ext.blkno;
        init_extent_key(&key, key_bytes, &ext, sbi->node_id, type);
        init_extent_key(&last, last_bytes, &last_ext, sbi->node_id, type);

        ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
        if (ret < 0) {
                if (ret == -ENOENT) {
                        /* if the cursor's empty fall back to next large */
                        if (ext.blkno && ext.blocks == 0) {
                                curs->blkno = 0;
                                curs->blocks = 0;
                                goto reset_cursor;
                        }

                        /* wrap the search for large extents */
                        if (ext.blkno > LARGE_EXTENT_BLOCKS && ext.blocks) {
                                datinf->next_large_blkno = LARGE_EXTENT_BLOCKS;
                                ext.blkno = datinf->next_large_blkno;
                                goto retry;
                        }

                        /* ask the server for more extents */
                        if (ext.blocks && !alloced) {
                                ret = bulk_alloc(sb);
                                if (ret < 0)
                                        goto out;
                                alloced = true;
                                goto retry;
                        }

                        /* finally look for any free block at all */
                        if (ext.blocks) {
                                ext.blkno = 0;
                                ext.blocks = 0;
                                type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
                                goto retry;
                        }

                        /* after all that return -ENOSPC */
                        ret = -ENOSPC;
                }
                goto out;
        }

        load_extent(&found, &key);
        trace_printk("found nei "EXTF"\n", EXTA(&found));

        /* look for a new large extent if found is outside cursor */
        if (curs->blocks &&
            (found.blkno + found.blocks <= curs->blkno ||
             found.blkno >= curs->blkno + curs->blocks)) {
                curs->blkno = 0;
                curs->blocks = 0;
                goto reset_cursor;
        }

        /*
         * Set the cursor if:
         *  - we didn't already have one
         *  - it's large enough for a large extent with alignment padding
         *  - the sufficiently large free region is past next large
         */
        if (!curs->blocks &&
            found.blocks >= (2 * LARGE_EXTENT_BLOCKS) &&
            (found.blkno + found.blocks - (2 * LARGE_EXTENT_BLOCKS) >=
             datinf->next_large_blkno)) {

                curs->blkno = ALIGN(max(found.blkno, datinf->next_large_blkno),
                                    LARGE_EXTENT_BLOCKS);
                curs->blocks = LARGE_EXTENT_BLOCKS;
                found.blkno = curs->blkno;
                found.blocks = curs->blocks;

                datinf->next_large_blkno = curs->blkno + LARGE_EXTENT_BLOCKS;
        }

        trace_printk("using %llu,%llu curs %llu,%llu\n",
                     found.blkno, found.blocks, curs->blkno, curs->blocks);

        /* remove old offline block if we're staging */
        if (was_offline) {
                ofl.blk_off = iblock;
                ofl.blkno = iblock;
                ofl.blocks = 1;
                ofl.flags = SCOUTFS_FILE_EXTENT_OFFLINE;
                ret = remove_extent(sb, &ofl, ino, SCOUTFS_FILE_EXTENT_TYPE);
                if (ret < 0)
                        goto out;
                ins_ofl = true;
        }

        /* insert new file extent */
        *blkno = found.blkno;
        ext.blk_off = iblock;
        ext.blkno = found.blkno;
        ext.blocks = 1;
        ext.flags = 0;
        ret = insert_extent(sb, &ext, ino, SCOUTFS_FILE_EXTENT_TYPE);
        if (ret < 0)
                goto out;
        rem_ext = true;

        /* and remove free extents */
        fr = ext;
        fr.blk_off = ext.blkno;
        ret = remove_extent(sb, &fr, sbi->node_id,
                            SCOUTFS_FREE_EXTENT_BLKNO_TYPE);
        if (ret)
                goto out;

        /* advance cursor if we're using it */
        if (curs->blocks) {
                if (--curs->blocks == 0)
                        curs->blkno = 0;
                else
                        curs->blkno++;
        }

        ret = 0;
out:
        if (ret) {
                if (rem_ext) {
                        err = remove_extent(sb, &ext, ino,
                                            SCOUTFS_FILE_EXTENT_TYPE);
                        BUG_ON(err);
                }
                if (ins_ofl) {
                        err = insert_extent(sb, &ofl, ino,
                                            SCOUTFS_FILE_EXTENT_TYPE);
                        BUG_ON(err);
                }
        }

        up_write(&datinf->alloc_rwsem);
        trace_printk("ret %d\n", ret);
        return ret;
}
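
/*
 * As a worked example with illustrative numbers: if LARGE_EXTENT_BLOCKS
 * were 2048, next_large_blkno were 2048, and the search found the free
 * extent [bno 3000 bks 8192], then the cursor above would be set to the
 * aligned region at blkno 4096 and next_large_blkno would advance to
 * 6144, leaving the next cursor-less task to claim its own aligned
 * large region.
 */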

static int scoutfs_get_block(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh, int create)
{
        struct scoutfs_inode_info *si = SCOUTFS_I(inode);
        struct super_block *sb = inode->i_sb;
        DECLARE_DATA_INFO(sb, datinf);
        u8 last_bytes[MAX_KEY_BYTES];
        u8 key_bytes[MAX_KEY_BYTES];
        struct scoutfs_key_buf last;
        struct scoutfs_key_buf key;
        struct native_extent ext;
        bool was_offline = false;
        u64 blkno;
        u64 off;
        int ret;

        bh->b_blocknr = 0;
        bh->b_size = 0;

        ext.blk_off = iblock;
        ext.blocks = 1;
        ext.blkno = 0;
        ext.flags = 0;
        init_extent_key(&key, key_bytes, &ext, scoutfs_ino(inode),
                        SCOUTFS_FILE_EXTENT_TYPE);

        memset(&ext, ~0, sizeof(ext));
        init_extent_key(&last, last_bytes, &ext, scoutfs_ino(inode),
                        SCOUTFS_FILE_EXTENT_TYPE);

        /*
         * XXX think about how far this next can go, given locking and
         * item consistency.
         */
        down_read(&datinf->alloc_rwsem);
        ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
        up_read(&datinf->alloc_rwsem);
        if (ret < 0) {
                if (ret == -ENOENT)
                        memset(&ext, 0, sizeof(ext));
                else
                        goto out;
        } else {
                load_extent(&ext, &key);
                trace_printk("found nei "EXTF"\n", EXTA(&ext));
        }

        if ((ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE) && !si->staging) {
                ret = -EINVAL;
                goto out;
        }

        /* use the extent if it intersects */
        if (iblock >= ext.blk_off && iblock < (ext.blk_off + ext.blocks)) {

                if (ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE) {
                        /* non-stage can't write to offline */
                        if (!si->staging) {
                                ret = -EINVAL;
                                goto out;
                        }
                        was_offline = true;
                } else {
                        /* found online extent */
                        off = iblock - ext.blk_off;
                        map_bh(bh, inode->i_sb, ext.blkno + off);
                        bh->b_size = min_t(u64, SIZE_MAX,
                                           (ext.blocks - off) << SCOUTFS_BLOCK_SHIFT);
                }
        }

        if (!buffer_mapped(bh) && create) {
                ret = allocate_block(inode, iblock, &blkno, was_offline);
                if (ret)
                        goto out;

                map_bh(bh, inode->i_sb, blkno);
                bh->b_size = 1 << SCOUTFS_BLOCK_SHIFT;
                set_buffer_new(bh);
        }

        ret = 0;
out:
        trace_printk("ino %llu iblock %llu create %d ret %d bnr %llu size %zu\n",
                     scoutfs_ino(inode), (u64)iblock, create, ret,
                     (u64)bh->b_blocknr, bh->b_size);

        return ret;
}
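
/*
 * Note that mapping a block in the middle of a found extent sets
 * bh->b_size to cover the rest of the extent: e.g. iblock 10 inside
 * [off 8 bno 100 bks 4] maps to blkno 102 with two blocks of b_size,
 * which lets the mpage paths below pull in multiple pages per
 * get_block call.
 */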

static int scoutfs_readpage(struct file *file, struct page *page)
{
        return mpage_readpage(page, scoutfs_get_block);
}

static int scoutfs_readpages(struct file *file, struct address_space *mapping,
                             struct list_head *pages, unsigned nr_pages)
{
        return mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block);
}

static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, scoutfs_get_block, wbc);
}

static int scoutfs_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
{
        return mpage_writepages(mapping, wbc, scoutfs_get_block);
}

static int scoutfs_write_begin(struct file *file,
                               struct address_space *mapping, loff_t pos,
                               unsigned len, unsigned flags,
                               struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        struct super_block *sb = inode->i_sb;
        DECLARE_ITEM_COUNT(cnt);
        int ret;

        trace_printk("ino %llu pos %llu len %u\n",
                     scoutfs_ino(inode), (u64)pos, len);

        scoutfs_count_write_begin(&cnt);
        ret = scoutfs_hold_trans(sb, &cnt);
        if (ret)
                goto out;

        /* can't re-enter fs, have trans */
        flags |= AOP_FLAG_NOFS;

        /* generic write_end updates i_size and calls dirty_inode */
        ret = scoutfs_dirty_inode_item(inode, NULL);
        if (ret == 0)
                ret = block_write_begin(mapping, pos, len, flags, pagep,
                                        scoutfs_get_block);
        if (ret)
                scoutfs_release_trans(sb);
out:
        return ret;
}

static int scoutfs_write_end(struct file *file, struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned copied,
                             struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        struct scoutfs_inode_info *si = SCOUTFS_I(inode);
        struct super_block *sb = inode->i_sb;
        int ret;

        trace_printk("ino %llu pgind %lu pos %llu len %u copied %d\n",
                     scoutfs_ino(inode), page->index, (u64)pos, len, copied);

        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
        if (ret > 0) {
                if (!si->staging) {
                        scoutfs_inode_set_data_seq(inode);
                        scoutfs_inode_inc_data_version(inode);
                }
                /* XXX kind of a big hammer, inode life cycle needs work */
                scoutfs_update_inode_item(inode);
                scoutfs_inode_queue_writeback(inode);
        }
        scoutfs_release_trans(sb);
        return ret;
}
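
/*
 * Note the transaction pairing above: scoutfs_write_begin() returns
 * with the transaction held on success and scoutfs_write_end() always
 * releases it, so the only early release is write_begin's own failure
 * path.
 */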

/*
 * Return the extents that intersect with the given byte range. It
 * doesn't trim the returned extents to the byte range.
 */
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        u64 start, u64 len)
{
        struct super_block *sb = inode->i_sb;
        const u8 type = SCOUTFS_FILE_EXTENT_TYPE;
        const u64 ino = scoutfs_ino(inode);
        u8 last_bytes[MAX_KEY_BYTES];
        u8 key_bytes[MAX_KEY_BYTES];
        struct scoutfs_key_buf last;
        struct scoutfs_key_buf key;
        struct native_extent ext;
        struct scoutfs_lock *inode_lock = NULL;
        u64 logical;
        u64 blk_off;
        u64 final;
        u64 phys;
        u64 size;
        u32 flags;
        int ret;

        ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
        if (ret)
                return ret;

        memset(&ext, ~0, sizeof(ext));
        init_extent_key(&last, last_bytes, &ext, ino, type);

        blk_off = start >> SCOUTFS_BLOCK_SHIFT;
        final = (start + len - 1) >> SCOUTFS_BLOCK_SHIFT;
        size = 0;
        flags = 0;

        /* XXX overkill? */
        mutex_lock(&inode->i_mutex);

        ret = scoutfs_lock_ino_group(sb, DLM_LOCK_PR, scoutfs_ino(inode),
                                     &inode_lock);
        if (ret)
                goto out;

        for (;;) {
                ext.blk_off = blk_off;
                ext.blkno = 0;
                ext.blocks = 1;
                ext.flags = 0;
                init_extent_key(&key, key_bytes, &ext, ino, type);

                ret = scoutfs_item_next_same(sb, &key, &last, NULL, NULL);
                if (ret < 0) {
                        if (ret != -ENOENT)
                                break;
                        flags |= FIEMAP_EXTENT_LAST;
                        ret = 0;
                }

                load_extent(&ext, &key);

                if (ext.blk_off > final)
                        flags |= FIEMAP_EXTENT_LAST;

                if (size) {
                        ret = fiemap_fill_next_extent(fieinfo, logical, phys,
                                                      size, flags);
                        if (ret != 0) {
                                if (ret == 1)
                                        ret = 0;
                                break;
                        }
                }

                if (flags & FIEMAP_EXTENT_LAST)
                        break;

                logical = ext.blk_off << SCOUTFS_BLOCK_SHIFT;
                phys = ext.blkno << SCOUTFS_BLOCK_SHIFT;
                size = ext.blocks << SCOUTFS_BLOCK_SHIFT;
                flags = 0;

                if (ext.flags & SCOUTFS_FILE_EXTENT_OFFLINE) {
                        phys = 0;
                        flags = FIEMAP_EXTENT_UNKNOWN;
                }

                blk_off = ext.blk_off + ext.blocks;
        }

        scoutfs_unlock(sb, inode_lock);
out:
        mutex_unlock(&inode->i_mutex);

        return ret;
}
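
/*
 * Note the loop above reports each extent one iteration behind: an
 * extent is only passed to fiemap_fill_next_extent() once the next
 * search has run, so the final fill can be flagged with
 * FIEMAP_EXTENT_LAST.
 */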

const struct address_space_operations scoutfs_file_aops = {
        .readpage = scoutfs_readpage,
        .readpages = scoutfs_readpages,
        .writepage = scoutfs_writepage,
        .writepages = scoutfs_writepages,
        .write_begin = scoutfs_write_begin,
        .write_end = scoutfs_write_end,
};

const struct file_operations scoutfs_file_fops = {
        .read = do_sync_read,
        .write = do_sync_write,
        .aio_read = generic_file_aio_read,
        .aio_write = generic_file_aio_write,
        .unlocked_ioctl = scoutfs_ioctl,
        .fsync = scoutfs_file_fsync,
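        /* without this we'd return -ESPIPE when seeking a regular file */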
        .llseek = generic_file_llseek,
};

int scoutfs_data_setup(struct super_block *sb)
{
        struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
        struct hlist_head *head;
        struct data_info *datinf;
        struct task_cursor *curs;
        int i;

        datinf = kzalloc(sizeof(struct data_info), GFP_KERNEL);
        if (!datinf)
                return -ENOMEM;

        init_rwsem(&datinf->alloc_rwsem);
        INIT_LIST_HEAD(&datinf->cursor_lru);
        /* always search for large aligned extents */
        datinf->next_large_blkno = LARGE_EXTENT_BLOCKS;

        for (i = 0; i < CURSOR_HASH_HEADS; i++)
                INIT_HLIST_HEAD(&datinf->cursor_hash[i]);

        /* just allocate all of these up front */
        for (i = 0; i < NR_CURSORS; i++) {
                curs = kzalloc(sizeof(struct task_cursor), GFP_KERNEL);
                if (!curs) {
                        destroy_cursors(datinf);
                        kfree(datinf);
                        return -ENOMEM;
                }

                curs->pid = i;

                head = cursor_head(datinf, curs->task, curs->pid);
                hlist_add_head(&curs->hnode, head);

                list_add(&curs->list_head, &datinf->cursor_lru);
        }

        sbi->data_info = datinf;

        return 0;
}

void scoutfs_data_destroy(struct super_block *sb)
{
        struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
        struct data_info *datinf = sbi->data_info;

        if (datinf) {
                destroy_cursors(datinf);
                kfree(datinf);
        }
}