scoutfs/kmod/src/data.c
Commit e2b06f2c92 by Auke Kok: mpage_readpage() is now replaced with
mpage_read_folio(). Folios are the new data type used for passing
pages. For now, folios only appear to have a single page. Future
kernels will change that.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2024-10-03 12:41:05 -07:00


/*
* Copyright (C) 2019 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
#include <linux/buffer_head.h>
#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/falloc.h>
#include <linux/fiemap.h>
#include <linux/writeback.h>
#include "format.h"
#include "super.h"
#include "inode.h"
#include "key.h"
#include "alloc.h"
#include "data.h"
#include "trans.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "item.h"
#include "ioctl.h"
#include "btree.h"
#include "lock.h"
#include "file.h"
#include "msg.h"
#include "ext.h"
#include "util.h"
/*
* We want to amortize work done after dirtying the shared transaction
* accounting, but we don't want to blow out dirty allocator btree
* blocks. Each allocation can dirty quite a few allocator btree blocks
* so we check in pretty often.
*/
#define EXTENTS_PER_HOLD 8
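/*
* Per-mount data allocation state: the metadata allocator and block
* writer used while modifying allocator btrees, the freed extent root
* that truncated data blocks are returned to, and the cached data
* allocator that new file extents are allocated from.
*/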
struct data_info {
struct super_block *sb;
struct mutex mutex;
struct scoutfs_alloc *alloc;
struct scoutfs_block_writer *wri;
struct scoutfs_alloc_root data_freed;
struct scoutfs_data_alloc dalloc;
};
#define DECLARE_DATA_INFO(sb, name) \
struct data_info *name = SCOUTFS_SB(sb)->data_info
struct data_ext_args {
u64 ino;
struct inode *inode;
struct scoutfs_lock *lock;
};
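/*
* Fill a persistent extent item key and value from an extent's fields.
* Keys are indexed by the last logical block of the extent so that a
* _next search from a block inside an extent finds the item that
* covers it.
*/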
static void item_from_extent(struct scoutfs_key *key,
struct scoutfs_data_extent_val *dv, u64 ino,
u64 start, u64 len, u64 map, u8 flags)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_FS_ZONE,
.skdx_ino = cpu_to_le64(ino),
.sk_type = SCOUTFS_DATA_EXTENT_TYPE,
.skdx_end = cpu_to_le64(start + len - 1),
.skdx_len = cpu_to_le64(len),
};
dv->blkno = cpu_to_le64(map);
dv->flags = flags;
}
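/* reconstruct the in-memory extent from an item's key and value */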
static void ext_from_item(struct scoutfs_extent *ext,
struct scoutfs_key *key,
struct scoutfs_data_extent_val *dv)
{
ext->start = le64_to_cpu(key->skdx_end) -
le64_to_cpu(key->skdx_len) + 1;
ext->len = le64_to_cpu(key->skdx_len);
ext->map = le64_to_cpu(dv->blkno);
ext->flags = dv->flags;
}
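/* extent item callbacks should be called with the inode's extent_sem held */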
static void data_ext_op_warn(struct inode *inode)
{
struct scoutfs_inode_info *si;
if (inode) {
si = SCOUTFS_I(inode);
WARN_ON_ONCE(!rwsem_is_locked(&si->extent_sem));
}
}
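/*
* The _next callback for the generic extent core: find the first
* extent item for this inode whose last block is at or after the
* caller's position and translate it into an extent.
*/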
static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len,
struct scoutfs_extent *ext)
{
struct data_ext_args *args = arg;
struct scoutfs_data_extent_val dv;
struct scoutfs_key key;
struct scoutfs_key last;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&last, &dv, args->ino, U64_MAX, 1, 0, 0);
item_from_extent(&key, &dv, args->ino, start, len, 0, 0);
ret = scoutfs_item_next(sb, &key, &last, &dv, sizeof(dv), args->lock);
if (ret == sizeof(dv)) {
ext_from_item(ext, &key, &dv);
ret = 0;
} else if (ret >= 0) {
ret = -EIO;
}
if (ret < 0)
memset(ext, 0, sizeof(struct scoutfs_extent));
return ret;
}
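/*
* Update the inode's online and offline block counts as a mapping of
* the given length is added (positive len) or removed (negative len).
*/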
static void add_onoff(struct inode *inode, u64 map, u8 flags, s64 len)
{
s64 on = 0;
s64 off = 0;
if (map && !(flags & SEF_UNWRITTEN))
on += len;
else if (flags & SEF_OFFLINE)
off += len;
scoutfs_inode_add_onoff(inode, on, off);
}
static int data_ext_insert(struct super_block *sb, void *arg, u64 start,
u64 len, u64 map, u8 flags)
{
struct data_ext_args *args = arg;
struct scoutfs_data_extent_val dv;
struct scoutfs_key key;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&key, &dv, args->ino, start, len, map, flags);
ret = scoutfs_item_create(sb, &key, &dv, sizeof(dv), args->lock);
if (ret == 0 && args->inode)
add_onoff(args->inode, map, flags, len);
return ret;
}
static int data_ext_remove(struct super_block *sb, void *arg, u64 start,
u64 len, u64 map, u8 flags)
{
struct data_ext_args *args = arg;
struct scoutfs_data_extent_val dv;
struct scoutfs_key key;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&key, &dv, args->ino, start, len, map, flags);
ret = scoutfs_item_delete(sb, &key, args->lock);
if (ret == 0 && args->inode)
add_onoff(args->inode, map, flags, -len);
return ret;
}
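/* the callbacks used by the generic extent core to read and modify our items */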
static struct scoutfs_ext_ops data_ext_ops = {
.next = data_ext_next,
.insert = data_ext_insert,
.remove = data_ext_remove,
};
/*
* Find and remove or mark offline the block mappings that intersect
* with the caller's range. The caller is responsible for transactions
* and locks.
*
* Returns:
* - -errno on errors
* - 0 when there are no more extents, which stops iteration
* - the positive iblock of the next logical block to continue
*   truncating from
*/
static s64 truncate_extents(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
{
DECLARE_DATA_INFO(sb, datinf);
struct data_ext_args args = {
.ino = ino,
.inode = inode,
.lock = lock,
};
struct scoutfs_extent ext;
struct scoutfs_extent tr;
u64 offset;
s64 ret;
u8 flags;
int err;
int i;
flags = offline ? SEF_OFFLINE : 0;
ret = 0;
for (i = 0; iblock <= last; i++) {
if (i == EXTENTS_PER_HOLD) {
ret = iblock;
break;
}
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
break;
}
/* done if we went past the region */
if (ext.start > last) {
ret = 0;
break;
}
/* nothing to do when already offline and unmapped */
if ((offline && (ext.flags & SEF_OFFLINE)) && !ext.map) {
iblock = ext.start + ext.len;
continue;
}
iblock = max(ext.start, iblock);
offset = iblock - ext.start;
tr.start = iblock;
tr.map = ext.map ? ext.map + offset : 0;
tr.len = min(ext.len - offset, last - iblock + 1);
tr.flags = ext.flags;
trace_scoutfs_data_extent_truncated(sb, ino, &tr);
ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
tr.start, tr.len, 0, flags);
if (ret < 0) {
if (WARN_ON_ONCE(ret == -EINVAL)) {
scoutfs_err(sb, "unexpected truncate inconsistency: ino %llu iblock %llu last %llu, start %llu len %llu",
ino, iblock, last, tr.start, tr.len);
}
break;
}
if (tr.map) {
mutex_lock(&datinf->mutex);
ret = scoutfs_free_data(sb, datinf->alloc,
datinf->wri,
&datinf->data_freed,
tr.map, tr.len);
mutex_unlock(&datinf->mutex);
if (ret < 0) {
err = scoutfs_ext_set(sb, &data_ext_ops, &args,
tr.start, tr.len, tr.map, tr.flags);
if (err < 0)
scoutfs_err(sb, "truncate err %d restoring extent after error %lld: ino %llu start %llu len %llu",
err, ret, ino, tr.start, tr.len);
break;
}
}
iblock += tr.len;
}
return ret;
}
/*
* Free blocks inside the logical block range from 'iblock' to 'last',
* inclusive.
*
* If 'offline' is given then blocks are freed and an offline mapping is
* left behind. Only blocks that have been allocated can be marked
* offline.
*
* If the inode is provided then we update its tracking of the online
* and offline blocks. If it's not provided then the inode is being
* destroyed and isn't reachable, so we don't need to update it.
*
* The caller is in charge of locking the inode and data, but we may
* have to modify far more items than fit in a transaction so we're in
* charge of batching updates into transactions. If the inode is
* provided then we're responsible for updating its item as we go.
*/
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = NULL;
LIST_HEAD(ind_locks);
s64 ret = 0;
WARN_ON_ONCE(inode && !inode_is_locked(inode));
/* clamp last to the last possible block? */
if (last > SCOUTFS_BLOCK_SM_MAX)
last = SCOUTFS_BLOCK_SM_MAX;
trace_scoutfs_data_truncate_items(sb, iblock, last, offline);
if (WARN_ON_ONCE(last < iblock))
return -EINVAL;
if (inode) {
si = SCOUTFS_I(inode);
down_write(&si->extent_sem);
}
while (iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
ret = scoutfs_hold_trans(sb, false);
if (ret)
break;
if (inode)
ret = scoutfs_dirty_inode_item(inode, lock);
else
ret = 0;
if (ret == 0)
ret = truncate_extents(sb, inode, ino, iblock, last,
offline, lock);
if (inode)
scoutfs_update_inode_item(inode, lock, &ind_locks);
scoutfs_release_trans(sb);
if (inode)
scoutfs_inode_index_unlock(sb, &ind_locks);
if (ret <= 0)
break;
iblock = ret;
ret = 0;
}
if (si)
up_write(&si->extent_sem);
return ret;
}
static inline u64 ext_last(struct scoutfs_extent *ext)
{
return ext->start + ext->len - 1;
}
/*
* The caller is writing to a logical iblock that doesn't have an
* allocated extent. The caller has searched for an extent containing
* iblock. If it already existed then it must be unallocated and
* offline.
*
* We implement two preallocation strategies. Typically we only
* preallocate for simple streaming writes and limit preallocation while
* the file is small. The largest efficient allocation size is
* typically large enough that it would be unreasonable to allocate that
* much for all small files.
*
* Optionally, we can simply preallocate large empty aligned regions.
* This can waste a lot of space for small or sparse files but is
* reasonable when a file population is known to be large and dense but
* is written with non-streaming write patterns.
*/
static int alloc_block(struct super_block *sb, struct inode *inode,
struct scoutfs_extent *ext, u64 iblock,
struct scoutfs_lock *lock)
{
DECLARE_DATA_INFO(sb, datinf);
struct scoutfs_mount_options opts;
const u64 ino = scoutfs_ino(inode);
struct data_ext_args args = {
.ino = ino,
.inode = inode,
.lock = lock,
};
struct scoutfs_extent found;
struct scoutfs_extent pre = {0,};
bool undo_pre = false;
u64 blkno = 0;
u64 online;
u64 offline;
u8 flags;
u64 start;
u64 count;
u64 rem;
int ret;
int err;
trace_scoutfs_data_alloc_block_enter(sb, ino, iblock, ext);
scoutfs_options_read(sb, &opts);
/* can only allocate over existing unallocated offline extent */
if (WARN_ON_ONCE(ext->len &&
!(iblock >= ext->start && iblock <= ext_last(ext) &&
ext->map == 0 && (ext->flags & SEF_OFFLINE))))
return -EINVAL;
mutex_lock(&datinf->mutex);
/* default to single allocation at the written block */
start = iblock;
count = 1;
/* copy existing flags for preallocated regions */
flags = ext->len ? ext->flags : 0;
if (ext->len) {
/*
* Assume that offline writers are going to be writing
* all the offline extents and try to preallocate the
* rest of the unwritten extent.
*/
count = ext->len - (iblock - ext->start);
} else if (opts.data_prealloc_contig_only) {
/*
* Only preallocate when a quick test of the online
* block counts looks like we're a simple streaming
* write. Try to write until the next extent but limit
* the preallocation size to the number of online
* blocks.
*/
scoutfs_inode_get_onoff(inode, &online, &offline);
if (iblock > 1 && iblock == online) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &found);
if (ret < 0 && ret != -ENOENT)
goto out;
if (found.len && found.start > iblock)
count = found.start - iblock;
else
count = opts.data_prealloc_blocks;
count = min(iblock, count);
}
} else {
/*
* Preallocation within aligned regions tries to
* allocate an extent to fill the hole in the region
* that contains iblock. We'd have to add a bit of plumbing
* to find previous extents so we only search for a next
* extent from the front of the region and from iblock.
*/
div64_u64_rem(iblock, opts.data_prealloc_blocks, &rem);
start = iblock - rem;
count = opts.data_prealloc_blocks;
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, start, 1, &found);
if (ret < 0 && ret != -ENOENT)
goto out;
/* trim count if there's an extent in the region before iblock */
if (found.len && found.start < iblock) {
count -= iblock - start;
start = iblock;
/* see if there's also an extent after iblock */
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &found);
if (ret < 0 && ret != -ENOENT)
goto out;
}
/* trim count by next extent after iblock */
if (found.len && found.start > start && found.start < start + count)
count = (found.start - start);
}
/* overall prealloc limit */
count = min_t(u64, count, opts.data_prealloc_blocks);
ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
&datinf->dalloc, count, &blkno, &count);
if (ret < 0)
goto out;
/*
* An aligned prealloc attempt that gets a smaller extent can
* fail to cover iblock, make sure that it does. This is a
* pathological case so we don't try to move the window past
* iblock. Just enough to cover it, which we know is safe.
*/
if (start + count <= iblock)
start += (iblock - (start + count) + 1);
if (count > 1) {
pre.start = start;
pre.len = count;
pre.map = blkno;
pre.flags = flags | SEF_UNWRITTEN;
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, pre.start,
pre.len, pre.map, pre.flags);
if (ret < 0)
goto out;
undo_pre = true;
}
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1, blkno + (iblock - start), 0);
if (ret < 0)
goto out;
/* tell the caller we have a single block, could check next? */
ext->start = iblock;
ext->len = 1;
ext->map = blkno + (iblock - start);
ext->flags = 0;
ret = 0;
out:
if (ret < 0 && blkno > 0) {
if (undo_pre) {
err = scoutfs_ext_set(sb, &data_ext_ops, &args,
pre.start, pre.len, 0, flags);
BUG_ON(err); /* leaked preallocated extent */
}
err = scoutfs_free_data(sb, datinf->alloc, datinf->wri,
&datinf->data_freed, blkno, count);
BUG_ON(err); /* leaked free blocks */
}
if (ret == 0) {
trace_scoutfs_data_alloc(sb, ino, ext);
trace_scoutfs_data_prealloc(sb, ino, &pre);
}
mutex_unlock(&datinf->mutex);
return ret;
}
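/*
* The core get_block used by buffered reads and writes. Find the
* extent containing the logical block and map the buffer head to it.
* When creating we convert a covering unwritten extent to written or
* allocate a new block, marking the buffer new in both cases. Holes
* and unwritten extents are left unmapped so reads see zeros.
*/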
static int scoutfs_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
const u64 ino = scoutfs_ino(inode);
struct super_block *sb = inode->i_sb;
struct data_ext_args args;
struct scoutfs_lock *lock = NULL;
struct scoutfs_extent ext = {0,};
struct scoutfs_extent un;
u64 offset;
int ret;
WARN_ON_ONCE(create && !inode_is_locked(inode));
/* make sure caller holds a cluster lock */
lock = scoutfs_per_task_get(&si->pt_data_lock);
if (WARN_ON_ONCE(!lock)) {
ret = -EINVAL;
goto out;
}
args.ino = ino;
args.inode = inode;
args.lock = lock;
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
if (ret == -ENOENT || (ret == 0 && ext.start > iblock))
memset(&ext, 0, sizeof(ext));
else if (ret < 0)
goto out;
if (ext.len)
trace_scoutfs_data_get_block_found(sb, ino, &ext);
/* non-staging callers should have waited on offline blocks */
if (WARN_ON_ONCE(ext.map && (ext.flags & SEF_OFFLINE) && !si->staging)) {
ret = -EIO;
goto out;
}
if (create && !si->staging) {
ret = scoutfs_inode_check_retention(inode);
if (ret < 0)
goto out;
}
/* convert unwritten to written, could be staging */
if (create && ext.map && (ext.flags & SEF_UNWRITTEN)) {
un.start = iblock;
un.len = 1;
un.map = ext.map + (iblock - ext.start);
un.flags = ext.flags & ~(SEF_OFFLINE|SEF_UNWRITTEN);
ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
un.start, un.len, un.map, un.flags);
if (ret == 0) {
ext = un;
set_buffer_new(bh);
}
goto out;
}
/* allocate and map blocks containing our logical block */
if (create && !ext.map) {
ret = alloc_block(sb, inode, &ext, iblock, lock);
if (ret == 0)
set_buffer_new(bh);
} else {
ret = 0;
}
out:
/* map usable extent, else leave bh unmapped for sparse reads */
if (ret == 0 && ext.map && !(ext.flags & SEF_UNWRITTEN)) {
offset = iblock - ext.start;
map_bh(bh, inode->i_sb, ext.map + offset);
bh->b_size = min_t(u64, bh->b_size,
(ext.len - offset) << SCOUTFS_BLOCK_SM_SHIFT);
trace_scoutfs_data_get_block_mapped(sb, ino, &ext);
}
trace_scoutfs_get_block(sb, scoutfs_ino(inode), iblock, create,
&ext, ret, bh->b_blocknr, bh->b_size);
return ret;
}
/*
* Typically extent item users are serialized by i_mutex. But page
* readers only hold the page lock and need to be protected from writers
* in other pages which can be manipulating neighbouring extents as
* they split and merge.
*/
static int scoutfs_get_block_read(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
int ret;
down_read(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create);
up_read(&si->extent_sem);
return ret;
}
int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh,
int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
int ret;
down_write(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create);
up_write(&si->extent_sem);
return ret;
}
/*
* This is almost never used. We can't block on a cluster lock while
* holding the page lock because lock invalidation gets the page lock
* while blocking locks. If a non blocking lock attempt fails we unlock
* the page and block acquiring the lock. We unlocked the page so it
* could have been truncated away, or whatever, so we return
* AOP_TRUNCATED_PAGE to have the caller try again.
*
* A similar process happens if we try to read from an offline extent
* that a caller hasn't already waited for. Instead of blocking
* acquiring the lock we block waiting for the offline extent. The page
* lock protects the page from release while we're checking and
* reading the extent.
*
* We can return errors from locking and checking offline extents. The
* page is unlocked if we return an error.
*/
#ifdef KC_MPAGE_READ_FOLIO
static int scoutfs_read_folio(struct file *file, struct folio *folio)
{
struct page *page = &folio->page;
#else
static int scoutfs_readpage(struct file *file, struct page *page)
{
#endif
struct inode *inode = file->f_inode;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
DECLARE_DATA_WAIT(dw);
int flags;
int ret;
flags = SCOUTFS_LKF_REFRESH_INODE | SCOUTFS_LKF_NONBLOCK;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, flags, inode,
&inode_lock);
if (ret < 0) {
unlock_page(page);
if (ret == -EAGAIN) {
flags &= ~SCOUTFS_LKF_NONBLOCK;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, flags,
inode, &inode_lock);
if (ret == 0) {
scoutfs_unlock(sb, inode_lock,
SCOUTFS_LOCK_READ);
ret = AOP_TRUNCATED_PAGE;
}
}
return ret;
}
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
ret = scoutfs_data_wait_check(inode, page_offset(page),
PAGE_SIZE, SEF_OFFLINE,
SCOUTFS_IOC_DWO_READ, &dw,
inode_lock);
if (ret != 0) {
unlock_page(page);
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
}
if (ret > 0) {
ret = scoutfs_data_wait(inode, &dw);
if (ret == 0)
ret = AOP_TRUNCATED_PAGE;
}
if (ret != 0)
return ret;
}
#ifdef KC_MPAGE_READ_FOLIO
ret = mpage_read_folio(folio, scoutfs_get_block_read);
#else
ret = mpage_readpage(page, scoutfs_get_block_read);
#endif
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
return ret;
}
#ifndef KC_FILE_AOPS_READAHEAD
/*
* This is used for opportunistic read-ahead which can throw the pages
* away if it needs to. If the caller didn't deal with offline extents
* then we drop those pages rather than trying to wait. Whoever is
* staging offline extents should be doing it in enormous chunks so that
* read-ahead can ramp up within each staged region. The check for
* offline extents is cheap when the inode has no offline extents.
*/
static int scoutfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = file->f_inode;
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
struct page *page;
struct page *tmp;
int ret;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
if (ret)
goto out;
list_for_each_entry_safe(page, tmp, pages, lru) {
ret = scoutfs_data_wait_check(inode, page_offset(page),
PAGE_SIZE, SEF_OFFLINE,
SCOUTFS_IOC_DWO_READ, NULL,
inode_lock);
if (ret < 0)
goto out;
if (ret > 0) {
list_del(&page->lru);
put_page(page);
if (--nr_pages == 0) {
ret = 0;
goto out;
}
}
}
ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block_read);
out:
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
BUG_ON(!list_empty(pages));
return ret;
}
#else
static void scoutfs_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->file->f_inode;
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
int ret;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
if (ret)
return;
ret = scoutfs_data_wait_check(inode, readahead_pos(rac),
readahead_length(rac), SEF_OFFLINE,
SCOUTFS_IOC_DWO_READ, NULL,
inode_lock);
if (ret == 0)
mpage_readahead(rac, scoutfs_get_block_read);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
}
#endif
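/*
* Writeback maps blocks with the _write get_block variant which takes
* the extent rwsem for writing in case mapping the page has to modify
* extent items.
*/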
static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, scoutfs_get_block_write, wbc);
}
static int scoutfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return mpage_writepages(mapping, wbc, scoutfs_get_block_write);
}
/* fsdata allocated in write_begin and freed in write_end */
struct write_begin_data {
struct list_head ind_locks;
struct scoutfs_lock *lock;
};
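/*
* write_begin enters a transaction, acquiring inode index locks and
* dirtying the inode item, before the generic block code allocates
* blocks and prepares the page. The transaction is held until
* write_end. If the transaction fills up (-ENOBUFS) we release it and
* retry with a new one.
*/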
static int scoutfs_write_begin(struct file *file,
struct address_space *mapping, loff_t pos,
unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct write_begin_data *wbd;
u64 ind_seq;
int ret;
trace_scoutfs_write_begin(sb, scoutfs_ino(inode), (__u64)pos, len);
wbd = kmalloc(sizeof(struct write_begin_data), GFP_NOFS);
if (!wbd)
return -ENOMEM;
INIT_LIST_HEAD(&wbd->ind_locks);
*fsdata = wbd;
wbd->lock = scoutfs_per_task_get(&si->pt_data_lock);
if (WARN_ON_ONCE(!wbd->lock)) {
ret = -EINVAL;
goto out;
}
retry:
do {
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
true) ?:
scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
} while (ret > 0);
if (ret < 0)
goto out;
/* can't re-enter fs, have trans */
flags |= AOP_FLAG_NOFS;
/* generic write_end updates i_size and calls dirty_inode */
ret = scoutfs_dirty_inode_item(inode, wbd->lock) ?:
block_write_begin(mapping, pos, len, flags, pagep,
scoutfs_get_block_write);
if (ret < 0) {
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
if (ret == -ENOBUFS) {
/* Retry with a new transaction. */
scoutfs_inc_counter(sb, data_write_begin_enobufs_retry);
goto retry;
}
}
out:
if (ret < 0)
kfree(wbd);
return ret;
}
/* kinda like __filemap_fdatawrite_range! :P */
static int writepages_sync_none(struct address_space *mapping, loff_t start,
loff_t end)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = LONG_MAX,
.range_start = start,
.range_end = end,
};
return mapping->a_ops->writepages(mapping, &wbc);
}
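/*
* write_end finishes the generic copy, updates the inode's size, data
* version, and item, and releases the transaction that write_begin
* opened.
*/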
static int scoutfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct write_begin_data *wbd = fsdata;
int ret;
trace_scoutfs_write_end(sb, scoutfs_ino(inode), page->index, (u64)pos,
len, copied);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
if (ret > 0) {
if (!si->staging) {
scoutfs_inode_set_data_seq(inode);
scoutfs_inode_inc_data_version(inode);
}
inode_inc_iversion(inode);
scoutfs_update_inode_item(inode, wbd->lock, &wbd->ind_locks);
scoutfs_inode_queue_writeback(inode);
}
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
kfree(wbd);
/*
* Currently transactions are kept very simple. Only one is
* open at a time and commit excludes concurrent dirtying. It
* writes out all dirty file data during commit. This can lead
* to very long commit latencies with lots of dirty file data.
*
* This hack tries to minimize these writeback latencies while
* keeping concurrent large file streaming writes from
* suffering too terribly. Every N bytes we kick off background
* writeback on the previous N bytes. By the time transaction
* commit comes along it will find that dirty file blocks have
* already been written.
*/
#define BACKGROUND_WRITEBACK_BYTES (16 * 1024 * 1024)
#define BACKGROUND_WRITEBACK_MASK (BACKGROUND_WRITEBACK_BYTES - 1)
if (ret > 0 && ((pos + ret) & BACKGROUND_WRITEBACK_MASK) == 0)
writepages_sync_none(mapping,
pos + ret - BACKGROUND_WRITEBACK_BYTES,
pos + ret - 1);
return ret;
}
/*
* Try to allocate unwritten extents for any unallocated regions of the
* logical block extent from the caller. The caller manages locks and
* transactions. We limit ourselves to a reasonable number of extents
* before returning to open another transaction.
*
* We return an error or the number of blocks starting at iblock that
* were successfully processed. The caller will continue after those
* blocks until they reach last.
*/
static s64 fallocate_extents(struct super_block *sb, struct inode *inode,
u64 iblock, u64 last, struct scoutfs_lock *lock)
{
DECLARE_DATA_INFO(sb, datinf);
struct data_ext_args args = {
.ino = scoutfs_ino(inode),
.inode = inode,
.lock = lock,
};
struct scoutfs_extent ext;
u8 ext_fl;
u64 blkno;
u64 count;
s64 done = 0;
int ret = 0;
int err;
int i;
for (i = 0; iblock <= last && i < EXTENTS_PER_HOLD; i++) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret == -ENOENT)
ret = 0;
else if (ret < 0)
break;
/* default to allocate to end of region */
count = last - iblock + 1;
ext_fl = 0;
if (!ext.len) {
/* no extent, default alloc from above */
} else if (ext.start <= iblock && ext.map) {
/* skip portion of allocated extent */
count = min_t(u64, count,
ext.len - (iblock - ext.start));
iblock += count;
done += count;
continue;
} else if (ext.start <= iblock && !ext.map) {
/* alloc portion of unallocated extent */
count = min_t(u64, count,
ext.len - (iblock - ext.start));
ext_fl = ext.flags;
} else if (iblock < ext.start) {
/* alloc hole until next extent */
count = min_t(u64, count, ext.start - iblock);
}
/* limit allocation attempts */
count = min_t(u64, count, SCOUTFS_FALLOCATE_ALLOC_LIMIT);
mutex_lock(&datinf->mutex);
ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
&datinf->dalloc, count,
&blkno, &count);
if (ret == 0) {
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock,
count, blkno,
ext_fl | SEF_UNWRITTEN);
if (ret < 0) {
err = scoutfs_free_data(sb, datinf->alloc,
datinf->wri,
&datinf->data_freed,
blkno, count);
BUG_ON(err); /* inconsistent */
}
}
mutex_unlock(&datinf->mutex);
if (ret < 0)
break;
iblock += count;
done += count;
}
if (ret == 0)
ret = done;
return ret;
}
/*
* Modify the extents that map the blocks that store the len byte region
* starting at offset.
*
* The caller has only prevented freezing by entering a fs write
* context. We're responsible for all other locking and consistency.
*
* This can be used to preallocate files for staging. We find existing
* offline extents, allocate blocks for them, and mark them unwritten.
*/
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *lock = NULL;
LIST_HEAD(ind_locks);
loff_t end;
u64 iblock;
u64 last;
s64 ret;
/* XXX support more flags */
if (mode & ~(FALLOC_FL_KEEP_SIZE)) {
ret = -EOPNOTSUPP;
goto out;
}
/* catch wrapping */
if (offset + len < offset) {
ret = -EINVAL;
goto out;
}
if (len == 0) {
ret = 0;
goto out;
}
inode_lock(inode);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
if (ret)
goto out_mutex;
inode_dio_wait(inode);
down_write(&si->extent_sem);
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(offset + len > i_size_read(inode))) {
ret = inode_newsize_ok(inode, offset + len);
if (ret)
goto out_extent;
}
iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT;
last = (offset + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while (iblock <= last) {
ret = scoutfs_quota_check_data(sb, inode);
if (ret)
goto out_extent;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
if (ret)
goto out_extent;
ret = fallocate_extents(sb, inode, iblock, last, lock);
if (ret >= 0 && !(mode & FALLOC_FL_KEEP_SIZE)) {
end = (iblock + ret) << SCOUTFS_BLOCK_SM_SHIFT;
if (end > offset + len)
end = offset + len;
if (end > i_size_read(inode)) {
i_size_write(inode, end);
inode_inc_iversion(inode);
scoutfs_inode_inc_data_version(inode);
}
}
if (ret >= 0)
scoutfs_update_inode_item(inode, lock, &ind_locks);
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
/* txn couldn't meet the request. Let's try with a new txn */
if (ret == -ENOBUFS) {
scoutfs_inc_counter(sb, data_fallocate_enobufs_retry);
continue;
}
if (ret <= 0)
goto out_extent;
iblock += ret;
ret = 0;
}
out_extent:
up_write(&si->extent_sem);
out_mutex:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
inode_unlock(inode);
out:
trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
return ret;
}
/*
* A special case of initializing a single large offline extent. This
* chooses not to deal with any existing extents. It can only be used
* on regular files with no data extents. It's used to restore a file
* with an offline extent which can then trigger staging.
*
* The caller must take care of cluster locking, transactions, inode
* updates, and index updates (so that they can atomically make this
* change along with other metadata changes).
*/
int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct data_ext_args args = {
.ino = scoutfs_ino(inode),
.inode = inode,
.lock = lock,
};
const u64 count = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SM_SIZE);
u64 on;
u64 off;
int ret;
scoutfs_inode_get_onoff(inode, &on, &off);
/* caller should have checked */
if (on > 0 || off > 0) {
ret = -EINVAL;
goto out;
}
down_write(&si->extent_sem);
ret = scoutfs_ext_insert(sb, &data_ext_ops, &args,
0, count, 0, SEF_OFFLINE);
up_write(&si->extent_sem);
out:
return ret;
}
/*
* We're using truncate_inode_pages_range to maintain consistency
* between the page cache and extents that just changed. We have to
* call with fully aligned page offsets or it thinks that it should leave
* behind a zeroed partial page.
*/
static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
{
truncate_inode_pages_range(&inode->i_data,
start << SCOUTFS_BLOCK_SM_SHIFT,
((start + len) << SCOUTFS_BLOCK_SM_SHIFT) - 1);
}
/*
* Move extents from one file to another. The behaviour is more fully
* explained above the move_blocks ioctl argument structure definition.
*
* The caller has processed the ioctl args and performed the most basic
* argument sanity and inode checks, but we perform more detailed inode
* checks once we have the inode lock and refreshed inodes. Our job is
* to safely lock the two files and move the extents.
*/
#define MOVE_DATA_EXTENTS_PER_HOLD 16
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
u64 byte_len, struct inode *to, u64 to_off, bool is_stage,
u64 data_version)
{
struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
struct super_block *sb = from->i_sb;
struct scoutfs_lock *from_lock = NULL;
struct scoutfs_lock *to_lock = NULL;
struct data_ext_args from_args;
struct data_ext_args to_args;
struct scoutfs_extent ext;
struct kc_timespec cur_time;
LIST_HEAD(locks);
bool done = false;
loff_t from_size;
loff_t to_size;
u64 from_offline;
u64 to_offline;
u64 from_start;
u64 to_start;
u64 from_iblock;
u64 to_iblock;
u64 count;
u64 junk;
u64 seq;
u64 map;
u64 len;
int ret;
int err;
int i;
lock_two_nondirectories(from, to);
ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, from, &from_lock,
to, &to_lock, NULL, NULL, NULL, NULL);
if (ret)
goto out;
if (!is_stage && (ret = scoutfs_inode_check_retention(to)))
goto out;
if ((from_off & SCOUTFS_BLOCK_SM_MASK) ||
(to_off & SCOUTFS_BLOCK_SM_MASK) ||
((byte_len & SCOUTFS_BLOCK_SM_MASK) &&
(from_off + byte_len != i_size_read(from)))) {
ret = -EINVAL;
goto out;
}
if (is_stage && (data_version != SCOUTFS_I(to)->data_version)) {
ret = -ESTALE;
goto out;
}
from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
from_start = from_iblock;
/* only move extent blocks inside i_size, careful not to wrap */
from_size = i_size_read(from);
if (from_off >= from_size) {
ret = 0;
goto out;
}
if (from_off + byte_len > from_size)
count = ((from_size - from_off) + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
ret = -EISDIR;
goto out;
}
if (!S_ISREG(from->i_mode) || !S_ISREG(to->i_mode)) {
ret = -EINVAL;
goto out;
}
ret = inode_permission(KC_VFS_INIT_NS from, MAY_WRITE) ?:
inode_permission(KC_VFS_INIT_NS to, MAY_WRITE);
if (ret < 0)
goto out;
/* can't stage once data_version changes */
scoutfs_inode_get_onoff(from, &junk, &from_offline);
scoutfs_inode_get_onoff(to, &junk, &to_offline);
if (from_offline || (to_offline && !is_stage)) {
ret = -ENODATA;
goto out;
}
from_args = (struct data_ext_args) {
.ino = scoutfs_ino(from),
.inode = from,
.lock = from_lock,
};
to_args = (struct data_ext_args) {
.ino = scoutfs_ino(to),
.inode = to,
.lock = to_lock,
};
inode_dio_wait(from);
inode_dio_wait(to);
ret = filemap_write_and_wait_range(&from->i_data, from_off,
from_off + byte_len - 1);
if (ret < 0)
goto out;
for (;;) {
ret = scoutfs_inode_index_start(sb, &seq) ?:
scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
if (ret > 0)
continue;
if (ret < 0)
goto out;
ret = scoutfs_dirty_inode_item(from, from_lock) ?:
scoutfs_dirty_inode_item(to, to_lock);
if (ret < 0)
goto out;
down_write_two(&from_si->extent_sem, &to_si->extent_sem);
/* arbitrarily limit the number of extents per trans hold */
for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {
struct scoutfs_extent off_ext;
/* find the next extent to move */
ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
from_start, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT) {
done = true;
ret = 0;
}
break;
}
/* done if next extent starts after moving region */
if (ext.start >= from_iblock + count) {
done = true;
ret = 0;
break;
}
from_start = max(ext.start, from_iblock);
map = ext.map + (from_start - ext.start);
len = min(from_iblock + count, ext.start + ext.len) - from_start;
to_start = to_iblock + (from_start - from_iblock);
/* we'd get stuck, shouldn't happen */
if (WARN_ON_ONCE(len == 0)) {
ret = -EIO;
goto out;
}
if (is_stage) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &to_args,
to_start, 1, &off_ext);
if (ret)
break;
if (!scoutfs_ext_inside(to_start, len, &off_ext) ||
!(off_ext.flags & SEF_OFFLINE)) {
ret = -EINVAL;
break;
}
ret = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
to_start, len,
map, ext.flags);
} else {
/* insert the new, fails if it overlaps */
ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
to_start, len,
map, ext.flags);
}
if (ret < 0)
break;
/* remove the old, possibly splitting */
ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
from_start, len, 0, 0);
if (ret < 0) {
if (is_stage) {
/* re-mark dest range as offline */
WARN_ON_ONCE(!(off_ext.flags & SEF_OFFLINE));
err = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
to_start, len,
0, off_ext.flags);
} else {
/* remove inserted new on err */
err = scoutfs_ext_remove(sb, &data_ext_ops,
&to_args, to_start,
len);
}
BUG_ON(err); /* XXX inconsistent */
break;
}
trace_scoutfs_data_move_blocks(sb, scoutfs_ino(from),
from_start, len, map,
ext.flags,
scoutfs_ino(to),
to_start);
/* moved extent might extend i_size */
to_size = (to_start + len) << SCOUTFS_BLOCK_SM_SHIFT;
if (to_size > i_size_read(to)) {
/* while maintaining final partial */
from_size = (from_start + len) <<
SCOUTFS_BLOCK_SM_SHIFT;
if (from_size > i_size_read(from))
to_size -= from_size -
i_size_read(from);
i_size_write(to, to_size);
}
/* find next after moved extent, avoiding wrapping */
if (from_start + len < from_start)
from_start = from_iblock + count + 1;
else
from_start += len;
}
up_write(&from_si->extent_sem);
up_write(&to_si->extent_sem);
cur_time = current_time(from);
if (!is_stage) {
to->i_ctime = to->i_mtime = cur_time;
inode_inc_iversion(to);
scoutfs_inode_inc_data_version(to);
scoutfs_inode_set_data_seq(to);
}
from->i_ctime = from->i_mtime = cur_time;
inode_inc_iversion(from);
scoutfs_inode_inc_data_version(from);
scoutfs_inode_set_data_seq(from);
scoutfs_update_inode_item(from, from_lock, &locks);
scoutfs_update_inode_item(to, to_lock, &locks);
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &locks);
if (ret < 0 || done)
break;
}
/* remove any cached pages from old extents */
truncate_inode_pages_extent(from, from_iblock, count);
truncate_inode_pages_extent(to, to_iblock, count);
out:
scoutfs_unlock(sb, from_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, to_lock, SCOUTFS_LOCK_WRITE);
unlock_two_nondirectories(from, to);
return ret;
}
/*
* This copies to userspace :/
*/
static int fill_extent(struct fiemap_extent_info *fieinfo,
struct scoutfs_extent *ext, u32 fiemap_flags)
{
u32 flags;
if (ext->len == 0)
return 0;
flags = fiemap_flags;
if (ext->flags & SEF_OFFLINE)
flags |= FIEMAP_EXTENT_UNKNOWN;
else if (ext->flags & SEF_UNWRITTEN)
flags |= FIEMAP_EXTENT_UNWRITTEN;
return fiemap_fill_next_extent(fieinfo,
ext->start << SCOUTFS_BLOCK_SM_SHIFT,
ext->map << SCOUTFS_BLOCK_SM_SHIFT,
ext->len << SCOUTFS_BLOCK_SM_SHIFT,
flags);
}
/*
* Return all the file's extents whose blocks overlap with the caller's
* byte region. We set _LAST on the last extent and _UNKNOWN on offline
* extents.
*/
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *lock = NULL;
struct scoutfs_extent ext;
struct scoutfs_extent cur;
struct data_ext_args args;
u32 last_flags;
u64 iblock;
u64 last;
int ret;
if (len == 0) {
ret = 0;
goto out;
}
ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC);
if (ret)
goto out;
inode_lock(inode);
down_read(&si->extent_sem);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
if (ret)
goto unlock;
args.ino = ino;
args.inode = inode;
args.lock = lock;
/* use a dummy extent to track */
memset(&cur, 0, sizeof(cur));
last_flags = 0;
iblock = start >> SCOUTFS_BLOCK_SM_SHIFT;
last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while (iblock <= last) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
last_flags = FIEMAP_EXTENT_LAST;
break;
}
trace_scoutfs_data_fiemap_extent(sb, ino, &ext);
if (ext.start > last) {
/* not setting _LAST, it's for end of file */
ret = 0;
break;
}
if (scoutfs_ext_can_merge(&cur, &ext)) {
/* merged extents could be greater than input len */
cur.len += ext.len;
} else {
ret = fill_extent(fieinfo, &cur, 0);
if (ret != 0)
goto unlock;
cur = ext;
}
iblock = ext.start + ext.len;
}
if (cur.len)
ret = fill_extent(fieinfo, &cur, last_flags);
unlock:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
up_read(&si->extent_sem);
inode_unlock(inode);
out:
if (ret == 1)
ret = 0;
trace_scoutfs_data_fiemap(sb, start, len, ret);
return ret;
}
/*
* Insert a new waiter. This supports multiple tasks waiting for the
* same ino and iblock by also comparing waiters by their addresses.
*/
static void insert_offline_waiting(struct rb_root *root,
struct scoutfs_data_wait *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_data_wait *dw;
int cmp;
while (*node) {
parent = *node;
dw = rb_entry(*node, struct scoutfs_data_wait, node);
cmp = scoutfs_cmp_u64s(ins->ino, dw->ino) ?:
scoutfs_cmp_u64s(ins->iblock, dw->iblock) ?:
scoutfs_cmp(ins, dw);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, root);
}
static struct scoutfs_data_wait *next_data_wait(struct rb_root *root, u64 ino,
u64 iblock)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_data_wait *next = NULL;
struct scoutfs_data_wait *dw;
int cmp;
while (*node) {
parent = *node;
dw = rb_entry(*node, struct scoutfs_data_wait, node);
/* go left when ino/iblock are equal to get first task */
cmp = scoutfs_cmp_u64s(ino, dw->ino) ?:
scoutfs_cmp_u64s(iblock, dw->iblock);
if (cmp <= 0) {
node = &(*node)->rb_left;
next = dw;
} else if (cmp > 0) {
node = &(*node)->rb_right;
}
}
return next;
}
static struct scoutfs_data_wait *dw_next(struct scoutfs_data_wait *dw)
{
struct rb_node *node = rb_next(&dw->node);
if (node)
return container_of(node, struct scoutfs_data_wait, node);
return NULL;
}
/*
* Check if we should wait by looking for extents whose flags match.
* Returns 0 if no matching extents were found, or a negative errno on
* error.
*
* The caller must have acquired a cluster lock that covers the extent
* items. We acquire the extent_sem to protect our read from writers in
* other tasks.
*
* Returns 1 if any file extents in the caller's region matched. If the
* wait struct is provided then it is initialized to be woken when the
* extents change after the caller unlocks after the check. The caller
* must come through _data_wait() to clean up the wait struct if we set
* it up.
*/
int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
u8 sef, u8 op, struct scoutfs_data_wait *dw,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct data_ext_args args = {
.ino = ino,
.inode = inode,
.lock = lock,
};
DECLARE_DATA_WAIT_ROOT(sb, rt);
DECLARE_DATA_WAITQ(inode, wq);
struct scoutfs_extent ext = {0,};
u64 iblock;
u64 last_block;
u64 on;
u64 off;
int ret = 0;
if (WARN_ON_ONCE(sef & SEF_UNKNOWN) ||
WARN_ON_ONCE(op & SCOUTFS_IOC_DWO_UNKNOWN) ||
WARN_ON_ONCE(dw && !RB_EMPTY_NODE(&dw->node)) ||
WARN_ON_ONCE(pos + len < pos)) {
ret = -EINVAL;
goto out;
}
if ((sef & SEF_OFFLINE)) {
scoutfs_inode_get_onoff(inode, &on, &off);
if (off == 0) {
ret = 0;
goto out;
}
}
down_read(&si->extent_sem);
iblock = pos >> SCOUTFS_BLOCK_SM_SHIFT;
last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while (iblock <= last_block) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
break;
}
if (ext.start > last_block) {
ret = 0;
break;
}
if (sef & ext.flags) {
if (dw) {
dw->chg = atomic64_read(&wq->changed);
dw->ino = ino;
dw->iblock = max(iblock, ext.start);
dw->op = op;
spin_lock(&rt->lock);
insert_offline_waiting(&rt->root, dw);
spin_unlock(&rt->lock);
}
ret = 1;
break;
}
iblock = ext.start + ext.len;
}
up_read(&si->extent_sem);
out:
trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &ext, ret);
return ret;
}
bool scoutfs_data_wait_found(struct scoutfs_data_wait *dw)
{
return !RB_EMPTY_NODE(&dw->node);
}
int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, u8 sef,
u8 op, struct scoutfs_data_wait *dw,
struct scoutfs_lock *lock)
{
unsigned long i;
int ret = 0;
for (i = 0; i < nr_segs; i++) {
if (iov[i].iov_len == 0)
continue;
ret = scoutfs_data_wait_check(inode, pos, iov[i].iov_len, sef,
op, dw, lock);
if (ret != 0)
break;
pos += iov[i].iov_len;
}
return ret;
}
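/*
* Sleep until the inode's extents change after the caller sampled the
* change count in _wait_check, then remove the wait entry and return
* any error that was recorded against it.
*/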
int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *dw)
{
DECLARE_DATA_WAIT_ROOT(inode->i_sb, rt);
DECLARE_DATA_WAITQ(inode, wq);
int ret;
ret = wait_event_interruptible(wq->waitq,
atomic64_read(&wq->changed) != dw->chg);
spin_lock(&rt->lock);
rb_erase(&dw->node, &rt->root);
RB_CLEAR_NODE(&dw->node);
if (!ret && dw->err)
ret = dw->err;
spin_unlock(&rt->lock);
return ret;
}
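/* wake all tasks waiting on this inode's extents after they've changed */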
void scoutfs_data_wait_changed(struct inode *inode)
{
DECLARE_DATA_WAITQ(inode, wq);
atomic64_inc(&wq->changed);
wake_up(&wq->waitq);
}
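/*
* Record an error on all waiters for this inode whose blocks fall in
* the given range and whose op matches, then wake them. Returns the
* number of waiters that had the error set.
*/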
long scoutfs_data_wait_err(struct inode *inode, u64 sblock, u64 eblock,
u64 op, long err)
{
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
DECLARE_DATA_WAIT_ROOT(sb, rt);
struct scoutfs_data_wait *dw;
long nr = 0;
if (!err)
return 0;
spin_lock(&rt->lock);
for (dw = next_data_wait(&rt->root, ino, sblock);
dw; dw = dw_next(dw)) {
if (dw->ino != ino || dw->iblock > eblock)
break;
if ((dw->op & op) && !dw->err) {
dw->err = err;
nr++;
}
}
spin_unlock(&rt->lock);
if (nr)
scoutfs_data_wait_changed(inode);
return nr;
}
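/*
* Fill the caller's array with the tasks waiting on file blocks,
* starting from the given ino and iblock. Waiters on the same block
* are merged into one entry with their ops or-ed together.
*/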
int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
struct scoutfs_ioctl_data_waiting_entry *dwe,
unsigned int nr)
{
DECLARE_DATA_WAIT_ROOT(sb, rt);
struct scoutfs_data_wait *dw;
int ret = 0;
spin_lock(&rt->lock);
dw = next_data_wait(&rt->root, ino, iblock);
while (dw && ret < nr) {
dwe->ino = dw->ino;
dwe->iblock = dw->iblock;
dwe->op = dw->op;
while ((dw = dw_next(dw)) &&
(dw->ino == dwe->ino && dw->iblock == dwe->iblock)) {
dwe->op |= dw->op;
}
dwe++;
ret++;
}
spin_unlock(&rt->lock);
return ret;
}
const struct address_space_operations scoutfs_file_aops = {
#ifdef KC_MPAGE_READ_FOLIO
.read_folio = scoutfs_read_folio,
#else
.readpage = scoutfs_readpage,
#endif
#ifndef KC_FILE_AOPS_READAHEAD
.readpages = scoutfs_readpages,
#else
.readahead = scoutfs_readahead,
#endif
.writepage = scoutfs_writepage,
.writepages = scoutfs_writepages,
.write_begin = scoutfs_write_begin,
.write_end = scoutfs_write_end,
};
const struct file_operations scoutfs_file_fops = {
#ifdef KC_LINUX_HAVE_FOP_AIO_READ
.read = do_sync_read,
.write = do_sync_write,
.aio_read = scoutfs_file_aio_read,
.aio_write = scoutfs_file_aio_write,
#else
.read_iter = scoutfs_file_read_iter,
.write_iter = scoutfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
#endif
.unlocked_ioctl = scoutfs_ioctl,
.fsync = scoutfs_file_fsync,
.llseek = scoutfs_file_llseek,
.fallocate = scoutfs_fallocate,
};
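/*
* Point our cached data allocator and freed extent root at the roots
* in the caller's log trees.
*/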
void scoutfs_data_init_btrees(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_log_trees *lt)
{
DECLARE_DATA_INFO(sb, datinf);
mutex_lock(&datinf->mutex);
datinf->alloc = alloc;
datinf->wri = wri;
scoutfs_dalloc_init(&datinf->dalloc, &lt->data_avail);
datinf->data_freed = lt->data_freed;
mutex_unlock(&datinf->mutex);
}
void scoutfs_data_get_btrees(struct super_block *sb,
struct scoutfs_log_trees *lt)
{
DECLARE_DATA_INFO(sb, datinf);
mutex_lock(&datinf->mutex);
scoutfs_dalloc_get_root(&datinf->dalloc, &lt->data_avail);
lt->data_freed = datinf->data_freed;
mutex_unlock(&datinf->mutex);
}
/*
* This should be called before preparing the allocators for the commit
* because it can allocate and free btree blocks in the data allocator.
*/
int scoutfs_data_prepare_commit(struct super_block *sb)
{
DECLARE_DATA_INFO(sb, datinf);
int ret;
mutex_lock(&datinf->mutex);
ret = scoutfs_dalloc_return_cached(sb, datinf->alloc, datinf->wri,
&datinf->dalloc);
mutex_unlock(&datinf->mutex);
return ret;
}
/*
* Return true if the data allocator is lower than the caller's
* requirement and we haven't been told by the server that we're out of
* free extents.
*/
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
{
DECLARE_DATA_INFO(sb, datinf);
return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
!(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
}
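/* allocate and attach our per-super data_info */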
int scoutfs_data_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct data_info *datinf;
datinf = kzalloc(sizeof(struct data_info), GFP_KERNEL);
if (!datinf)
return -ENOMEM;
datinf->sb = sb;
mutex_init(&datinf->mutex);
sbi->data_info = datinf;
return 0;
}
void scoutfs_data_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct data_info *datinf = sbi->data_info;
if (datinf) {
sbi->data_info = NULL;
kfree(datinf);
}
}