scoutfs/kmod/src/data.c
Commit e2b06f2c92 by Auke Kok: mpage_readpage() is now replaced with
mpage_read_folio(). Folios are the new data type used for passing
pages. For now, folios only appear to have a single page. Future
kernels will change that.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2024-10-03 12:41:05 -07:00


/*
* Copyright (C) 2019 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
#include <linux/buffer_head.h>
#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/falloc.h>
#include <linux/fiemap.h>
#include <linux/writeback.h>
#include "format.h"
#include "super.h"
#include "inode.h"
#include "key.h"
#include "alloc.h"
#include "data.h"
#include "trans.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "item.h"
#include "ioctl.h"
#include "btree.h"
#include "lock.h"
#include "file.h"
#include "msg.h"
#include "ext.h"
#include "util.h"
/*
* We want to amortize work done after dirtying the shared transaction
* accounting, but we don't want to blow out dirty allocator btree
* blocks. Each allocation can dirty quite a few allocator btree blocks
* so we check in pretty often.
*/
#define EXTENTS_PER_HOLD 8
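/*
* Per-mount data allocation state: the metadata allocator and block
* writer used while modifying allocator btrees, the freed extent root
* that truncated data blocks are returned to, and the cached data
* allocator that new file extents are allocated from.
*/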
struct data_info {
struct super_block *sb;
struct mutex mutex;
struct scoutfs_alloc *alloc;
struct scoutfs_block_writer *wri;
struct scoutfs_alloc_root data_freed;
struct scoutfs_data_alloc dalloc;
};
#define DECLARE_DATA_INFO(sb, name) \
struct data_info *name = SCOUTFS_SB(sb)->data_info
struct data_ext_args {
u64 ino;
struct inode *inode;
struct scoutfs_lock *lock;
};
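/*
* Fill a persistent extent item key and value from an extent's fields.
* Keys are indexed by the last logical block of the extent so that a
* _next search from a block inside an extent finds the item that
* covers it.
*/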
static void item_from_extent(struct scoutfs_key *key,
struct scoutfs_data_extent_val *dv, u64 ino,
u64 start, u64 len, u64 map, u8 flags)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_FS_ZONE,
.skdx_ino = cpu_to_le64(ino),
.sk_type = SCOUTFS_DATA_EXTENT_TYPE,
.skdx_end = cpu_to_le64(start + len - 1),
.skdx_len = cpu_to_le64(len),
};
dv->blkno = cpu_to_le64(map);
dv->flags = flags;
}
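/* reconstruct the in-memory extent from an item's key and value */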
static void ext_from_item(struct scoutfs_extent *ext,
struct scoutfs_key *key,
struct scoutfs_data_extent_val *dv)
{
ext->start = le64_to_cpu(key->skdx_end) -
le64_to_cpu(key->skdx_len) + 1;
ext->len = le64_to_cpu(key->skdx_len);
ext->map = le64_to_cpu(dv->blkno);
ext->flags = dv->flags;
}
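/* extent item callbacks should be called with the inode's extent_sem held */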
static void data_ext_op_warn(struct inode *inode)
{
struct scoutfs_inode_info *si;
if (inode) {
si = SCOUTFS_I(inode);
WARN_ON_ONCE(!rwsem_is_locked(&si->extent_sem));
}
}
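/*
* The _next callback for the generic extent core: find the first
* extent item for this inode whose last block is at or after the
* caller's position and translate it into an extent.
*/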
static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len,
struct scoutfs_extent *ext)
{
struct data_ext_args *args = arg;
struct scoutfs_data_extent_val dv;
struct scoutfs_key key;
struct scoutfs_key last;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&last, &dv, args->ino, U64_MAX, 1, 0, 0);
item_from_extent(&key, &dv, args->ino, start, len, 0, 0);
ret = scoutfs_item_next(sb, &key, &last, &dv, sizeof(dv), args->lock);
if (ret == sizeof(dv)) {
ext_from_item(ext, &key, &dv);
ret = 0;
} else if (ret >= 0) {
ret = -EIO;
}
if (ret < 0)
memset(ext, 0, sizeof(struct scoutfs_extent));
return ret;
}
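/*
* Update the inode's online and offline block counts as a mapping of
* the given length is added (positive len) or removed (negative len).
*/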
static void add_onoff(struct inode *inode, u64 map, u8 flags, s64 len)
{
s64 on = 0;
s64 off = 0;
if (map && !(flags & SEF_UNWRITTEN))
on += len;
else if (flags & SEF_OFFLINE)
off += len;
scoutfs_inode_add_onoff(inode, on, off);
}
static int data_ext_insert(struct super_block *sb, void *arg, u64 start,
u64 len, u64 map, u8 flags)
{
struct data_ext_args *args = arg;
struct scoutfs_data_extent_val dv;
struct scoutfs_key key;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&key, &dv, args->ino, start, len, map, flags);
ret = scoutfs_item_create(sb, &key, &dv, sizeof(dv), args->lock);
if (ret == 0 && args->inode)
add_onoff(args->inode, map, flags, len);
return ret;
}
static int data_ext_remove(struct super_block *sb, void *arg, u64 start,
u64 len, u64 map, u8 flags)
{
struct data_ext_args *args = arg;
struct scoutfs_data_extent_val dv;
struct scoutfs_key key;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&key, &dv, args->ino, start, len, map, flags);
ret = scoutfs_item_delete(sb, &key, args->lock);
if (ret == 0 && args->inode)
add_onoff(args->inode, map, flags, -len);
return ret;
}
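/* the callbacks used by the generic extent core to read and modify our items */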
static struct scoutfs_ext_ops data_ext_ops = {
.next = data_ext_next,
.insert = data_ext_insert,
.remove = data_ext_remove,
};
/*
* Find and remove or mark offline the block mappings that intersect
* with the caller's range. The caller is responsible for transactions
* and locks.
*
* Returns:
* - -errno on errors
* - 0 when there are no more extents, which stops iteration
* - the positive iblock of the next logical block to continue
*   truncating from
*/
static s64 truncate_extents(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
{
DECLARE_DATA_INFO(sb, datinf);
struct data_ext_args args = {
.ino = ino,
.inode = inode,
.lock = lock,
};
struct scoutfs_extent ext;
struct scoutfs_extent tr;
u64 offset;
s64 ret;
u8 flags;
int err;
int i;
flags = offline ? SEF_OFFLINE : 0;
ret = 0;
for (i = 0; iblock <= last; i++) {
if (i == EXTENTS_PER_HOLD) {
ret = iblock;
break;
}
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
break;
}
/* done if we went past the region */
if (ext.start > last) {
ret = 0;
break;
}
/* nothing to do when already offline and unmapped */
if ((offline && (ext.flags & SEF_OFFLINE)) && !ext.map) {
iblock = ext.start + ext.len;
continue;
}
iblock = max(ext.start, iblock);
offset = iblock - ext.start;
tr.start = iblock;
tr.map = ext.map ? ext.map + offset : 0;
tr.len = min(ext.len - offset, last - iblock + 1);
tr.flags = ext.flags;
trace_scoutfs_data_extent_truncated(sb, ino, &tr);
ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
tr.start, tr.len, 0, flags);
if (ret < 0) {
if (WARN_ON_ONCE(ret == -EINVAL)) {
scoutfs_err(sb, "unexpected truncate inconsistency: ino %llu iblock %llu last %llu, start %llu len %llu",
ino, iblock, last, tr.start, tr.len);
}
break;
}
if (tr.map) {
mutex_lock(&datinf->mutex);
ret = scoutfs_free_data(sb, datinf->alloc,
datinf->wri,
&datinf->data_freed,
tr.map, tr.len);
mutex_unlock(&datinf->mutex);
if (ret < 0) {
err = scoutfs_ext_set(sb, &data_ext_ops, &args,
tr.start, tr.len, tr.map, tr.flags);
if (err < 0)
scoutfs_err(sb, "truncate err %d restoring extent after error %lld: ino %llu start %llu len %llu",
err, ret, ino, tr.start, tr.len);
break;
}
}
iblock += tr.len;
}
return ret;
}
/*
* Free blocks inside the logical block range from 'iblock' to 'last',
* inclusive.
*
* If 'offline' is given then blocks are freed and an offline mapping is
* left behind. Only blocks that have been allocated can be marked
* offline.
*
* If the inode is provided then we update its tracking of the online
* and offline blocks. If it's not provided then the inode is being
* destroyed and isn't reachable, so we don't need to update it.
*
* The caller is in charge of locking the inode and data, but we may
* have to modify far more items than fit in a transaction so we're in
* charge of batching updates into transactions. If the inode is
* provided then we're responsible for updating its item as we go.
*/
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = NULL;
LIST_HEAD(ind_locks);
s64 ret = 0;
WARN_ON_ONCE(inode && !inode_is_locked(inode));
/* clamp last to the last possible block? */
if (last > SCOUTFS_BLOCK_SM_MAX)
last = SCOUTFS_BLOCK_SM_MAX;
trace_scoutfs_data_truncate_items(sb, iblock, last, offline);
if (WARN_ON_ONCE(last < iblock))
return -EINVAL;
if (inode) {
si = SCOUTFS_I(inode);
down_write(&si->extent_sem);
}
while (iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
ret = scoutfs_hold_trans(sb, false);
if (ret)
break;
if (inode)
ret = scoutfs_dirty_inode_item(inode, lock);
else
ret = 0;
if (ret == 0)
ret = truncate_extents(sb, inode, ino, iblock, last,
offline, lock);
if (inode)
scoutfs_update_inode_item(inode, lock, &ind_locks);
scoutfs_release_trans(sb);
if (inode)
scoutfs_inode_index_unlock(sb, &ind_locks);
if (ret <= 0)
break;
iblock = ret;
ret = 0;
}
if (si)
up_write(&si->extent_sem);
return ret;
}
static inline u64 ext_last(struct scoutfs_extent *ext)
{
return ext->start + ext->len - 1;
}
/*
* The caller is writing to a logical iblock that doesn't have an
* allocated extent. The caller has searched for an extent containing
* iblock. If it already existed then it must be unallocated and
* offline.
*
* We implement two preallocation strategies. Typically we only
* preallocate for simple streaming writes and limit preallocation while
* the file is small. The largest efficient allocation size is
* typically large enough that it would be unreasonable to allocate that
* much for all small files.
*
* Optionally, we can simply preallocate large empty aligned regions.
* This can waste a lot of space for small or sparse files but is
* reasonable when a file population is known to be large and dense but
* is written with non-streaming write patterns.
*/
static int alloc_block(struct super_block *sb, struct inode *inode,
struct scoutfs_extent *ext, u64 iblock,
struct scoutfs_lock *lock)
{
DECLARE_DATA_INFO(sb, datinf);
struct scoutfs_mount_options opts;
const u64 ino = scoutfs_ino(inode);
struct data_ext_args args = {
.ino = ino,
.inode = inode,
.lock = lock,
};
struct scoutfs_extent found;
struct scoutfs_extent pre = {0,};
bool undo_pre = false;
u64 blkno = 0;
u64 online;
u64 offline;
u8 flags;
u64 start;
u64 count;
u64 rem;
int ret;
int err;
trace_scoutfs_data_alloc_block_enter(sb, ino, iblock, ext);
scoutfs_options_read(sb, &opts);
/* can only allocate over existing unallocated offline extent */
if (WARN_ON_ONCE(ext->len &&
!(iblock >= ext->start && iblock <= ext_last(ext) &&
ext->map == 0 && (ext->flags & SEF_OFFLINE))))
return -EINVAL;
mutex_lock(&datinf->mutex);
/* default to single allocation at the written block */
start = iblock;
count = 1;
/* copy existing flags for preallocated regions */
flags = ext->len ? ext->flags : 0;
if (ext->len) {
/*
* Assume that offline writers are going to be writing
* all the offline extents and try to preallocate the
* rest of the unwritten extent.
*/
count = ext->len - (iblock - ext->start);
} else if (opts.data_prealloc_contig_only) {
/*
* Only preallocate when a quick test of the online
* block counts looks like we're a simple streaming
* write. Try to write until the next extent but limit
* the preallocation size to the number of online
* blocks.
*/
scoutfs_inode_get_onoff(inode, &online, &offline);
if (iblock > 1 && iblock == online) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &found);
if (ret < 0 && ret != -ENOENT)
goto out;
if (found.len && found.start > iblock)
count = found.start - iblock;
else
count = opts.data_prealloc_blocks;
count = min(iblock, count);
}
} else {
/*
* Preallocation within aligned regions tries to
* allocate an extent to fill the hole in the region
* that contains iblock. We'd have to add a bit of plumbing
* to find previous extents so we only search for a next
* extent from the front of the region and from iblock.
*/
div64_u64_rem(iblock, opts.data_prealloc_blocks, &rem);
start = iblock - rem;
count = opts.data_prealloc_blocks;
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, start, 1, &found);
if (ret < 0 && ret != -ENOENT)
goto out;
/* trim count if there's an extent in the region before iblock */
if (found.len && found.start < iblock) {
count -= iblock - start;
start = iblock;
/* see if there's also an extent after iblock */
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &found);
if (ret < 0 && ret != -ENOENT)
goto out;
}
/* trim count by next extent after iblock */
if (found.len && found.start > start && found.start < start + count)
count = (found.start - start);
}
/* overall prealloc limit */
count = min_t(u64, count, opts.data_prealloc_blocks);
ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
&datinf->dalloc, count, &blkno, &count);
if (ret < 0)
goto out;
/*
* An aligned prealloc attempt that gets a smaller extent can
* fail to cover iblock, make sure that it does. This is a
* pathological case so we don't try to move the window past
* iblock. Just enough to cover it, which we know is safe.
*/
if (start + count <= iblock)
start += (iblock - (start + count) + 1);
if (count > 1) {
pre.start = start;
pre.len = count;
pre.map = blkno;
pre.flags = flags | SEF_UNWRITTEN;
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, pre.start,
pre.len, pre.map, pre.flags);
if (ret < 0)
goto out;
undo_pre = true;
}
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1, blkno + (iblock - start), 0);
if (ret < 0)
goto out;
/* tell the caller we have a single block, could check next? */
ext->start = iblock;
ext->len = 1;
ext->map = blkno + (iblock - start);
ext->flags = 0;
ret = 0;
out:
if (ret < 0 && blkno > 0) {
if (undo_pre) {
err = scoutfs_ext_set(sb, &data_ext_ops, &args,
pre.start, pre.len, 0, flags);
BUG_ON(err); /* leaked preallocated extent */
}
err = scoutfs_free_data(sb, datinf->alloc, datinf->wri,
&datinf->data_freed, blkno, count);
BUG_ON(err); /* leaked free blocks */
}
if (ret == 0) {
trace_scoutfs_data_alloc(sb, ino, ext);
trace_scoutfs_data_prealloc(sb, ino, &pre);
}
mutex_unlock(&datinf->mutex);
return ret;
}
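/*
* The core get_block used by buffered reads and writes. Find the
* extent containing the logical block and map the buffer head to it.
* When creating we convert a covering unwritten extent to written or
* allocate a new block, marking the buffer new in both cases. Holes
* and unwritten extents are left unmapped so reads see zeros.
*/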
static int scoutfs_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
const u64 ino = scoutfs_ino(inode);
struct super_block *sb = inode->i_sb;
struct data_ext_args args;
struct scoutfs_lock *lock = NULL;
struct scoutfs_extent ext = {0,};
struct scoutfs_extent un;
u64 offset;
int ret;
WARN_ON_ONCE(create && !inode_is_locked(inode));
/* make sure caller holds a cluster lock */
lock = scoutfs_per_task_get(&si->pt_data_lock);
if (WARN_ON_ONCE(!lock)) {
ret = -EINVAL;
goto out;
}
args.ino = ino;
args.inode = inode;
args.lock = lock;
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
if (ret == -ENOENT || (ret == 0 && ext.start > iblock))
memset(&ext, 0, sizeof(ext));
else if (ret < 0)
goto out;
if (ext.len)
trace_scoutfs_data_get_block_found(sb, ino, &ext);
/* non-staging callers should have waited on offline blocks */
if (WARN_ON_ONCE(ext.map && (ext.flags & SEF_OFFLINE) && !si->staging)) {
ret = -EIO;
goto out;
}
if (create && !si->staging) {
ret = scoutfs_inode_check_retention(inode);
if (ret < 0)
goto out;
}
/* convert unwritten to written, could be staging */
if (create && ext.map && (ext.flags & SEF_UNWRITTEN)) {
un.start = iblock;
un.len = 1;
un.map = ext.map + (iblock - ext.start);
un.flags = ext.flags & ~(SEF_OFFLINE|SEF_UNWRITTEN);
ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
un.start, un.len, un.map, un.flags);
if (ret == 0) {
ext = un;
set_buffer_new(bh);
}
goto out;
}
/* allocate and map blocks containing our logical block */
if (create && !ext.map) {
ret = alloc_block(sb, inode, &ext, iblock, lock);
if (ret == 0)
set_buffer_new(bh);
} else {
ret = 0;
}
out:
/* map usable extent, else leave bh unmapped for sparse reads */
if (ret == 0 && ext.map && !(ext.flags & SEF_UNWRITTEN)) {
offset = iblock - ext.start;
map_bh(bh, inode->i_sb, ext.map + offset);
bh->b_size = min_t(u64, bh->b_size,
(ext.len - offset) << SCOUTFS_BLOCK_SM_SHIFT);
trace_scoutfs_data_get_block_mapped(sb, ino, &ext);
}
trace_scoutfs_get_block(sb, scoutfs_ino(inode), iblock, create,
&ext, ret, bh->b_blocknr, bh->b_size);
return ret;
}
/*
* Typically extent item users are serialized by i_mutex. But page
* readers only hold the page lock and need to be protected from writers
* in other pages which can be manipulating neighbouring extents as
* they split and merge.
*/
static int scoutfs_get_block_read(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
int ret;
down_read(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create);
up_read(&si->extent_sem);
return ret;
}
int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh,
int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
int ret;
down_write(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create);
up_write(&si->extent_sem);
return ret;
}
/*
* This is almost never used. We can't block on a cluster lock while
* holding the page lock because lock invalidation gets the page lock
* while blocking locks. If a non blocking lock attempt fails we unlock
* the page and block acquiring the lock. We unlocked the page so it
* could have been truncated away, or whatever, so we return
* AOP_TRUNCATED_PAGE to have the caller try again.
*
* A similar process happens if we try to read from an offline extent
* that a caller hasn't already waited for. Instead of blocking
* acquiring the lock we block waiting for the offline extent. The page
* lock protects the page from release while we're checking and
* reading the extent.
*
* We can return errors from locking and checking offline extents. The
* page is unlocked if we return an error.
*/
#ifdef KC_MPAGE_READ_FOLIO
static int scoutfs_read_folio(struct file *file, struct folio *folio)
{
struct page *page = &folio->page;
#else
static int scoutfs_readpage(struct file *file, struct page *page)
{
#endif
struct inode *inode = file->f_inode;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
DECLARE_DATA_WAIT(dw);
int flags;
int ret;
flags = SCOUTFS_LKF_REFRESH_INODE | SCOUTFS_LKF_NONBLOCK;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, flags, inode,
&inode_lock);
if (ret < 0) {
unlock_page(page);
if (ret == -EAGAIN) {
flags &= ~SCOUTFS_LKF_NONBLOCK;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, flags,
inode, &inode_lock);
if (ret == 0) {
scoutfs_unlock(sb, inode_lock,
SCOUTFS_LOCK_READ);
ret = AOP_TRUNCATED_PAGE;
}
}
return ret;
}
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
ret = scoutfs_data_wait_check(inode, page_offset(page),
PAGE_SIZE, SEF_OFFLINE,
SCOUTFS_IOC_DWO_READ, &dw,
inode_lock);
if (ret != 0) {
unlock_page(page);
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
}
if (ret > 0) {
ret = scoutfs_data_wait(inode, &dw);
if (ret == 0)
ret = AOP_TRUNCATED_PAGE;
}
if (ret != 0)
return ret;
}
#ifdef KC_MPAGE_READ_FOLIO
ret = mpage_read_folio(folio, scoutfs_get_block_read);
#else
ret = mpage_readpage(page, scoutfs_get_block_read);
#endif
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
return ret;
}
#ifndef KC_FILE_AOPS_READAHEAD
/*
* This is used for opportunistic read-ahead which can throw the pages
* away if it needs to. If the caller didn't deal with offline extents
* then we drop those pages rather than trying to wait. Whoever is
* staging offline extents should be doing it in enormous chunks so that
* read-ahead can ramp up within each staged region. The check for
* offline extents is cheap when the inode has no offline extents.
*/
static int scoutfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = file->f_inode;
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
struct page *page;
struct page *tmp;
int ret;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
if (ret)
goto out;
list_for_each_entry_safe(page, tmp, pages, lru) {
ret = scoutfs_data_wait_check(inode, page_offset(page),
PAGE_SIZE, SEF_OFFLINE,
SCOUTFS_IOC_DWO_READ, NULL,
inode_lock);
if (ret < 0)
goto out;
if (ret > 0) {
list_del(&page->lru);
put_page(page);
if (--nr_pages == 0) {
ret = 0;
goto out;
}
}
}
ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block_read);
out:
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
BUG_ON(!list_empty(pages));
return ret;
}
#else
static void scoutfs_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->file->f_inode;
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
int ret;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
if (ret)
return;
ret = scoutfs_data_wait_check(inode, readahead_pos(rac),
readahead_length(rac), SEF_OFFLINE,
SCOUTFS_IOC_DWO_READ, NULL,
inode_lock);
if (ret == 0)
mpage_readahead(rac, scoutfs_get_block_read);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
}
#endif
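/*
* Writeback maps blocks with the _write get_block variant which takes
* the extent rwsem for writing in case mapping the page has to modify
* extent items.
*/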
static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, scoutfs_get_block_write, wbc);
}
static int scoutfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return mpage_writepages(mapping, wbc, scoutfs_get_block_write);
}
/* fsdata allocated in write_begin and freed in write_end */
struct write_begin_data {
struct list_head ind_locks;
struct scoutfs_lock *lock;
};
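/*
* write_begin enters a transaction, acquiring inode index locks and
* dirtying the inode item, before the generic block code allocates
* blocks and prepares the page. The transaction is held until
* write_end. If the transaction fills up (-ENOBUFS) we release it and
* retry with a new one.
*/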
static int scoutfs_write_begin(struct file *file,
struct address_space *mapping, loff_t pos,
unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct write_begin_data *wbd;
u64 ind_seq;
int ret;
trace_scoutfs_write_begin(sb, scoutfs_ino(inode), (__u64)pos, len);
wbd = kmalloc(sizeof(struct write_begin_data), GFP_NOFS);
if (!wbd)
return -ENOMEM;
INIT_LIST_HEAD(&wbd->ind_locks);
*fsdata = wbd;
wbd->lock = scoutfs_per_task_get(&si->pt_data_lock);
if (WARN_ON_ONCE(!wbd->lock)) {
ret = -EINVAL;
goto out;
}
retry:
do {
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
true) ?:
scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
} while (ret > 0);
if (ret < 0)
goto out;
/* can't re-enter fs, have trans */
flags |= AOP_FLAG_NOFS;
/* generic write_end updates i_size and calls dirty_inode */
ret = scoutfs_dirty_inode_item(inode, wbd->lock) ?:
block_write_begin(mapping, pos, len, flags, pagep,
scoutfs_get_block_write);
if (ret < 0) {
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
if (ret == -ENOBUFS) {
/* Retry with a new transaction. */
scoutfs_inc_counter(sb, data_write_begin_enobufs_retry);
goto retry;
}
}
out:
if (ret < 0)
kfree(wbd);
return ret;
}
/* kinda like __filemap_fdatawrite_range! :P */
static int writepages_sync_none(struct address_space *mapping, loff_t start,
loff_t end)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = LONG_MAX,
.range_start = start,
.range_end = end,
};
return mapping->a_ops->writepages(mapping, &wbc);
}
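/*
* write_end finishes the generic copy, updates the inode's size, data
* version, and item, and releases the transaction that write_begin
* opened.
*/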
static int scoutfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct write_begin_data *wbd = fsdata;
int ret;
trace_scoutfs_write_end(sb, scoutfs_ino(inode), page->index, (u64)pos,
len, copied);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
if (ret > 0) {
if (!si->staging) {
scoutfs_inode_set_data_seq(inode);
scoutfs_inode_inc_data_version(inode);
}
inode_inc_iversion(inode);
scoutfs_update_inode_item(inode, wbd->lock, &wbd->ind_locks);
scoutfs_inode_queue_writeback(inode);
}
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
kfree(wbd);
/*
* Currently transactions are kept very simple. Only one is
* open at a time and commit excludes concurrent dirtying. It
* writes out all dirty file data during commit. This can lead
* to very long commit latencies with lots of dirty file data.
*
* This hack tries to minimize these writeback latencies while
* keeping concurrent large file streaming writes from
* suffering too terribly. Every N bytes we kick off background
* writeback on the previous N bytes. By the time transaction
* commit comes along it will find that dirty file blocks have
* already been written.
*/
#define BACKGROUND_WRITEBACK_BYTES (16 * 1024 * 1024)
#define BACKGROUND_WRITEBACK_MASK (BACKGROUND_WRITEBACK_BYTES - 1)
if (ret > 0 && ((pos + ret) & BACKGROUND_WRITEBACK_MASK) == 0)
writepages_sync_none(mapping,
pos + ret - BACKGROUND_WRITEBACK_BYTES,
pos + ret - 1);
return ret;
}
/*
* Try to allocate unwritten extents for any unallocated regions of the
* logical block extent from the caller. The caller manages locks and
* transactions. We limit ourselves to a reasonable number of extents
* before returning to open another transaction.
*
* We return an error or the number of blocks starting at iblock that
* were successfully processed. The caller will continue after those
* blocks until they reach last.
*/
static s64 fallocate_extents(struct super_block *sb, struct inode *inode,
u64 iblock, u64 last, struct scoutfs_lock *lock)
{
DECLARE_DATA_INFO(sb, datinf);
struct data_ext_args args = {
.ino = scoutfs_ino(inode),
.inode = inode,
.lock = lock,
};
struct scoutfs_extent ext;
u8 ext_fl;
u64 blkno;
u64 count;
s64 done = 0;
int ret = 0;
int err;
int i;
for (i = 0; iblock <= last && i < EXTENTS_PER_HOLD; i++) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret == -ENOENT)
ret = 0;
else if (ret < 0)
break;
/* default to allocate to end of region */
count = last - iblock + 1;
ext_fl = 0;
if (!ext.len) {
/* no extent, default alloc from above */
} else if (ext.start <= iblock && ext.map) {
/* skip portion of allocated extent */
count = min_t(u64, count,
ext.len - (iblock - ext.start));
iblock += count;
done += count;
continue;
} else if (ext.start <= iblock && !ext.map) {
/* alloc portion of unallocated extent */
count = min_t(u64, count,
ext.len - (iblock - ext.start));
ext_fl = ext.flags;
} else if (iblock < ext.start) {
/* alloc hole until next extent */
count = min_t(u64, count, ext.start - iblock);
}
/* limit allocation attempts */
count = min_t(u64, count, SCOUTFS_FALLOCATE_ALLOC_LIMIT);
mutex_lock(&datinf->mutex);
ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
&datinf->dalloc, count,
&blkno, &count);
if (ret == 0) {
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock,
count, blkno,
ext_fl | SEF_UNWRITTEN);
if (ret < 0) {
err = scoutfs_free_data(sb, datinf->alloc,
datinf->wri,
&datinf->data_freed,
blkno, count);
BUG_ON(err); /* inconsistent */
}
}
mutex_unlock(&datinf->mutex);
if (ret < 0)
break;
iblock += count;
done += count;
}
if (ret == 0)
ret = done;
return ret;
}
/*
* Modify the extents that map the blocks that store the len byte region
* starting at offset.
*
* The caller has only prevented freezing by entering a fs write
* context. We're responsible for all other locking and consistency.
*
* This can be used to preallocate files for staging. We find existing
* offline extents, allocate blocks for them, and mark them unwritten.
*/
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *lock = NULL;
LIST_HEAD(ind_locks);
loff_t end;
u64 iblock;
u64 last;
s64 ret;
/* XXX support more flags */
if (mode & ~(FALLOC_FL_KEEP_SIZE)) {
ret = -EOPNOTSUPP;
goto out;
}
/* catch wrapping */
if (offset + len < offset) {
ret = -EINVAL;
goto out;
}
if (len == 0) {
ret = 0;
goto out;
}
inode_lock(inode);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
if (ret)
goto out_mutex;
inode_dio_wait(inode);
down_write(&si->extent_sem);
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(offset + len > i_size_read(inode))) {
ret = inode_newsize_ok(inode, offset + len);
if (ret)
goto out_extent;
}
iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT;
last = (offset + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while (iblock <= last) {
ret = scoutfs_quota_check_data(sb, inode);
if (ret)
goto out_extent;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
if (ret)
goto out_extent;
ret = fallocate_extents(sb, inode, iblock, last, lock);
if (ret >= 0 && !(mode & FALLOC_FL_KEEP_SIZE)) {
end = (iblock + ret) << SCOUTFS_BLOCK_SM_SHIFT;
if (end > offset + len)
end = offset + len;
if (end > i_size_read(inode)) {
i_size_write(inode, end);
inode_inc_iversion(inode);
scoutfs_inode_inc_data_version(inode);
}
}
if (ret >= 0)
scoutfs_update_inode_item(inode, lock, &ind_locks);
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
/* txn couldn't meet the request. Let's try with a new txn */
if (ret == -ENOBUFS) {
scoutfs_inc_counter(sb, data_fallocate_enobufs_retry);
continue;
}
if (ret <= 0)
goto out_extent;
iblock += ret;
ret = 0;
}
out_extent:
up_write(&si->extent_sem);
out_mutex:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
inode_unlock(inode);
out:
trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
return ret;
}
/*
* A special case of initializing a single large offline extent. This
* chooses not to deal with any existing extents. It can only be used
* on regular files with no data extents. It's used to restore a file
* with an offline extent which can then trigger staging.
*
* The caller must take care of cluster locking, transactions, inode
* updates, and index updates (so that they can atomically make this
* change along with other metadata changes).
*/
int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct data_ext_args args = {
.ino = scoutfs_ino(inode),
.inode = inode,
.lock = lock,
};
const u64 count = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SM_SIZE);
u64 on;
u64 off;
int ret;
scoutfs_inode_get_onoff(inode, &on, &off);
/* caller should have checked */
if (on > 0 || off > 0) {
ret = -EINVAL;
goto out;
}
down_write(&si->extent_sem);
ret = scoutfs_ext_insert(sb, &data_ext_ops, &args,
0, count, 0, SEF_OFFLINE);
up_write(&si->extent_sem);
out:
return ret;
}
/*
* We're using truncate_inode_pages_range to maintain consistency
* between the page cache and extents that just changed. We have to
* call with fully aligned page offsets or it thinks that it should leave
* behind a zeroed partial page.
*/
static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
{
truncate_inode_pages_range(&inode->i_data,
start << SCOUTFS_BLOCK_SM_SHIFT,
((start + len) << SCOUTFS_BLOCK_SM_SHIFT) - 1);
}
/*
* Move extents from one file to another. The behaviour is more fully
* explained above the move_blocks ioctl argument structure definition.
*
* The caller has processed the ioctl args and performed the most basic
* argument sanity and inode checks, but we perform more detailed inode
* checks once we have the inode lock and refreshed inodes. Our job is
* to safely lock the two files and move the extents.
*/
#define MOVE_DATA_EXTENTS_PER_HOLD 16
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
u64 byte_len, struct inode *to, u64 to_off, bool is_stage,
u64 data_version)
{
struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
struct super_block *sb = from->i_sb;
struct scoutfs_lock *from_lock = NULL;
struct scoutfs_lock *to_lock = NULL;
struct data_ext_args from_args;
struct data_ext_args to_args;
struct scoutfs_extent ext;
struct kc_timespec cur_time;
LIST_HEAD(locks);
bool done = false;
loff_t from_size;
loff_t to_size;
u64 from_offline;
u64 to_offline;
u64 from_start;
u64 to_start;
u64 from_iblock;
u64 to_iblock;
u64 count;
u64 junk;
u64 seq;
u64 map;
u64 len;
int ret;
int err;
int i;
lock_two_nondirectories(from, to);
ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, from, &from_lock,
to, &to_lock, NULL, NULL, NULL, NULL);
if (ret)
goto out;
if (!is_stage && (ret = scoutfs_inode_check_retention(to)))
goto out;
if ((from_off & SCOUTFS_BLOCK_SM_MASK) ||
(to_off & SCOUTFS_BLOCK_SM_MASK) ||
((byte_len & SCOUTFS_BLOCK_SM_MASK) &&
(from_off + byte_len != i_size_read(from)))) {
ret = -EINVAL;
goto out;
}
if (is_stage && (data_version != SCOUTFS_I(to)->data_version)) {
ret = -ESTALE;
goto out;
}
from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
from_start = from_iblock;
/* only move extent blocks inside i_size, careful not to wrap */
from_size = i_size_read(from);
if (from_off >= from_size) {
ret = 0;
goto out;
}
if (from_off + byte_len > from_size)
count = ((from_size - from_off) + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
ret = -EISDIR;
goto out;
}
if (!S_ISREG(from->i_mode) || !S_ISREG(to->i_mode)) {
ret = -EINVAL;
goto out;
}
ret = inode_permission(KC_VFS_INIT_NS from, MAY_WRITE) ?:
inode_permission(KC_VFS_INIT_NS to, MAY_WRITE);
if (ret < 0)
goto out;
/* can't stage once data_version changes */
scoutfs_inode_get_onoff(from, &junk, &from_offline);
scoutfs_inode_get_onoff(to, &junk, &to_offline);
if (from_offline || (to_offline && !is_stage)) {
ret = -ENODATA;
goto out;
}
from_args = (struct data_ext_args) {
.ino = scoutfs_ino(from),
.inode = from,
.lock = from_lock,
};
to_args = (struct data_ext_args) {
.ino = scoutfs_ino(to),
.inode = to,
.lock = to_lock,
};
inode_dio_wait(from);
inode_dio_wait(to);
ret = filemap_write_and_wait_range(&from->i_data, from_off,
from_off + byte_len - 1);
if (ret < 0)
goto out;
for (;;) {
ret = scoutfs_inode_index_start(sb, &seq) ?:
scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
if (ret > 0)
continue;
if (ret < 0)
goto out;
ret = scoutfs_dirty_inode_item(from, from_lock) ?:
scoutfs_dirty_inode_item(to, to_lock);
if (ret < 0)
goto out;
down_write_two(&from_si->extent_sem, &to_si->extent_sem);
/* arbitrarily limit the number of extents per trans hold */
for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {
struct scoutfs_extent off_ext;
/* find the next extent to move */
ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
from_start, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT) {
done = true;
ret = 0;
}
break;
}
/* done if next extent starts after moving region */
if (ext.start >= from_iblock + count) {
done = true;
ret = 0;
break;
}
from_start = max(ext.start, from_iblock);
map = ext.map + (from_start - ext.start);
len = min(from_iblock + count, ext.start + ext.len) - from_start;
to_start = to_iblock + (from_start - from_iblock);
/* we'd get stuck, shouldn't happen */
if (WARN_ON_ONCE(len == 0)) {
ret = -EIO;
goto out;
}
if (is_stage) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &to_args,
to_start, 1, &off_ext);
if (ret)
break;
if (!scoutfs_ext_inside(to_start, len, &off_ext) ||
!(off_ext.flags & SEF_OFFLINE)) {
ret = -EINVAL;
break;
}
ret = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
to_start, len,
map, ext.flags);
} else {
/* insert the new, fails if it overlaps */
ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
to_start, len,
map, ext.flags);
}
if (ret < 0)
break;
/* remove the old, possibly splitting */
ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
from_start, len, 0, 0);
if (ret < 0) {
if (is_stage) {
/* re-mark dest range as offline */
WARN_ON_ONCE(!(off_ext.flags & SEF_OFFLINE));
err = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
to_start, len,
0, off_ext.flags);
} else {
/* remove inserted new on err */
err = scoutfs_ext_remove(sb, &data_ext_ops,
&to_args, to_start,
len);
}
BUG_ON(err); /* XXX inconsistent */
break;
}
trace_scoutfs_data_move_blocks(sb, scoutfs_ino(from),
from_start, len, map,
ext.flags,
scoutfs_ino(to),
to_start);
/* moved extent might extend i_size */
to_size = (to_start + len) << SCOUTFS_BLOCK_SM_SHIFT;
if (to_size > i_size_read(to)) {
/* while maintaining final partial */
from_size = (from_start + len) <<
SCOUTFS_BLOCK_SM_SHIFT;
if (from_size > i_size_read(from))
to_size -= from_size -
i_size_read(from);
i_size_write(to, to_size);
}
/* find next after moved extent, avoiding wrapping */
if (from_start + len < from_start)
from_start = from_iblock + count + 1;
else
from_start += len;
}
up_write(&from_si->extent_sem);
up_write(&to_si->extent_sem);
cur_time = current_time(from);
if (!is_stage) {
to->i_ctime = to->i_mtime = cur_time;
inode_inc_iversion(to);
scoutfs_inode_inc_data_version(to);
scoutfs_inode_set_data_seq(to);
}
from->i_ctime = from->i_mtime = cur_time;
inode_inc_iversion(from);
scoutfs_inode_inc_data_version(from);
scoutfs_inode_set_data_seq(from);
scoutfs_update_inode_item(from, from_lock, &locks);
scoutfs_update_inode_item(to, to_lock, &locks);
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &locks);
if (ret < 0 || done)
break;
}
/* remove any cached pages from old extents */
truncate_inode_pages_extent(from, from_iblock, count);
truncate_inode_pages_extent(to, to_iblock, count);
out:
scoutfs_unlock(sb, from_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, to_lock, SCOUTFS_LOCK_WRITE);
unlock_two_nondirectories(from, to);
return ret;
}
/*
* This copies to userspace :/
*/
static int fill_extent(struct fiemap_extent_info *fieinfo,
struct scoutfs_extent *ext, u32 fiemap_flags)
{
u32 flags;
if (ext->len == 0)
return 0;
flags = fiemap_flags;
if (ext->flags & SEF_OFFLINE)
flags |= FIEMAP_EXTENT_UNKNOWN;
else if (ext->flags & SEF_UNWRITTEN)
flags |= FIEMAP_EXTENT_UNWRITTEN;
return fiemap_fill_next_extent(fieinfo,
ext->start << SCOUTFS_BLOCK_SM_SHIFT,
ext->map << SCOUTFS_BLOCK_SM_SHIFT,
ext->len << SCOUTFS_BLOCK_SM_SHIFT,
flags);
}
/*
* Return all the file's extents whose blocks overlap with the caller's
* byte region. We set _LAST on the last extent and _UNKNOWN on offline
* extents.
*/
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *lock = NULL;
struct scoutfs_extent ext;
struct scoutfs_extent cur;
struct data_ext_args args;
u32 last_flags;
u64 iblock;
u64 last;
int ret;
if (len == 0) {
ret = 0;
goto out;
}
ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC);
if (ret)
goto out;
inode_lock(inode);
down_read(&si->extent_sem);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
if (ret)
goto unlock;
args.ino = ino;
args.inode = inode;
args.lock = lock;
/* use a dummy extent to track */
memset(&cur, 0, sizeof(cur));
last_flags = 0;
iblock = start >> SCOUTFS_BLOCK_SM_SHIFT;
last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while (iblock <= last) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
last_flags = FIEMAP_EXTENT_LAST;
break;
}
trace_scoutfs_data_fiemap_extent(sb, ino, &ext);
if (ext.start > last) {
/* not setting _LAST, it's for end of file */
ret = 0;
break;
}
if (scoutfs_ext_can_merge(&cur, &ext)) {
/* merged extents could be greater than input len */
cur.len += ext.len;
} else {
ret = fill_extent(fieinfo, &cur, 0);
if (ret != 0)
goto unlock;
cur = ext;
}
iblock = ext.start + ext.len;
}
if (cur.len)
ret = fill_extent(fieinfo, &cur, last_flags);
unlock:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
up_read(&si->extent_sem);
inode_unlock(inode);
out:
if (ret == 1)
ret = 0;
trace_scoutfs_data_fiemap(sb, start, len, ret);
return ret;
}
/*
* Insert a new waiter. This supports multiple tasks waiting for the
* same ino and iblock by also comparing waiters by their addresses.
*/
static void insert_offline_waiting(struct rb_root *root,
struct scoutfs_data_wait *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_data_wait *dw;
int cmp;
while (*node) {
parent = *node;
dw = rb_entry(*node, struct scoutfs_data_wait, node);
cmp = scoutfs_cmp_u64s(ins->ino, dw->ino) ?:
scoutfs_cmp_u64s(ins->iblock, dw->iblock) ?:
scoutfs_cmp(ins, dw);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, root);
}
static struct scoutfs_data_wait *next_data_wait(struct rb_root *root, u64 ino,
u64 iblock)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_data_wait *next = NULL;
struct scoutfs_data_wait *dw;
int cmp;
while (*node) {
parent = *node;
dw = rb_entry(*node, struct scoutfs_data_wait, node);
/* go left when ino/iblock are equal to get first task */
cmp = scoutfs_cmp_u64s(ino, dw->ino) ?:
scoutfs_cmp_u64s(iblock, dw->iblock);
if (cmp <= 0) {
node = &(*node)->rb_left;
next = dw;
} else if (cmp > 0) {
node = &(*node)->rb_right;
}
}
return next;
}
static struct scoutfs_data_wait *dw_next(struct scoutfs_data_wait *dw)
{
struct rb_node *node = rb_next(&dw->node);
if (node)
return container_of(node, struct scoutfs_data_wait, node);
return NULL;
}
/*
* Check if we should wait by looking for extents whose flags match.
* Returns 0 if no matching extents were found, or a negative errno on
* error.
*
* The caller must have acquired a cluster lock that covers the extent
* items. We acquire the extent_sem to protect our read from writers in
* other tasks.
*
* Returns 1 if any file extents in the caller's region matched. If the
* wait struct is provided then it is initialized to be woken when the
* extents change after the caller unlocks after the check. The caller
* must come through _data_wait() to clean up the wait struct if we set
* it up.
*/
int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
u8 sef, u8 op, struct scoutfs_data_wait *dw,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct data_ext_args args = {
.ino = ino,
.inode = inode,
.lock = lock,
};
DECLARE_DATA_WAIT_ROOT(sb, rt);
DECLARE_DATA_WAITQ(inode, wq);
struct scoutfs_extent ext = {0,};
u64 iblock;
u64 last_block;
u64 on;
u64 off;
int ret = 0;
if (WARN_ON_ONCE(sef & SEF_UNKNOWN) ||
WARN_ON_ONCE(op & SCOUTFS_IOC_DWO_UNKNOWN) ||
WARN_ON_ONCE(dw && !RB_EMPTY_NODE(&dw->node)) ||
WARN_ON_ONCE(pos + len < pos)) {
ret = -EINVAL;
goto out;
}
if ((sef & SEF_OFFLINE)) {
scoutfs_inode_get_onoff(inode, &on, &off);
if (off == 0) {
ret = 0;
goto out;
}
}
down_read(&si->extent_sem);
iblock = pos >> SCOUTFS_BLOCK_SM_SHIFT;
last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while (iblock <= last_block) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
break;
}
if (ext.start > last_block) {
ret = 0;
break;
}
if (sef & ext.flags) {
if (dw) {
dw->chg = atomic64_read(&wq->changed);
dw->ino = ino;
dw->iblock = max(iblock, ext.start);
dw->op = op;
spin_lock(&rt->lock);
insert_offline_waiting(&rt->root, dw);
spin_unlock(&rt->lock);
}
ret = 1;
break;
}
iblock = ext.start + ext.len;
}
up_read(&si->extent_sem);
out:
trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &ext, ret);
return ret;
}
bool scoutfs_data_wait_found(struct scoutfs_data_wait *dw)
{
return !RB_EMPTY_NODE(&dw->node);
}
int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, u8 sef,
u8 op, struct scoutfs_data_wait *dw,
struct scoutfs_lock *lock)
{
unsigned long i;
int ret = 0;
for (i = 0; i < nr_segs; i++) {
if (iov[i].iov_len == 0)
continue;
ret = scoutfs_data_wait_check(inode, pos, iov[i].iov_len, sef,
op, dw, lock);
if (ret != 0)
break;
pos += iov[i].iov_len;
}
return ret;
}
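/*
* Sleep until the inode's extents change after the caller sampled the
* change count in _wait_check, then remove the wait entry and return
* any error that was recorded against it.
*/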
int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *dw)
{
DECLARE_DATA_WAIT_ROOT(inode->i_sb, rt);
DECLARE_DATA_WAITQ(inode, wq);
int ret;
ret = wait_event_interruptible(wq->waitq,
atomic64_read(&wq->changed) != dw->chg);
spin_lock(&rt->lock);
rb_erase(&dw->node, &rt->root);
RB_CLEAR_NODE(&dw->node);
if (!ret && dw->err)
ret = dw->err;
spin_unlock(&rt->lock);
return ret;
}
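/* wake all tasks waiting on this inode's extents after they've changed */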
void scoutfs_data_wait_changed(struct inode *inode)
{
DECLARE_DATA_WAITQ(inode, wq);
atomic64_inc(&wq->changed);
wake_up(&wq->waitq);
}
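/*
* Record an error on all waiters for this inode whose blocks fall in
* the given range and whose op matches, then wake them. Returns the
* number of waiters that had the error set.
*/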
long scoutfs_data_wait_err(struct inode *inode, u64 sblock, u64 eblock,
u64 op, long err)
{
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
DECLARE_DATA_WAIT_ROOT(sb, rt);
struct scoutfs_data_wait *dw;
long nr = 0;
if (!err)
return 0;
spin_lock(&rt->lock);
for (dw = next_data_wait(&rt->root, ino, sblock);
dw; dw = dw_next(dw)) {
if (dw->ino != ino || dw->iblock > eblock)
break;
if ((dw->op & op) && !dw->err) {
dw->err = err;
nr++;
}
}
spin_unlock(&rt->lock);
if (nr)
scoutfs_data_wait_changed(inode);
return nr;
}
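/*
* Fill the caller's array with the tasks waiting on file blocks,
* starting from the given ino and iblock. Waiters on the same block
* are merged into one entry with their ops or-ed together.
*/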
int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
struct scoutfs_ioctl_data_waiting_entry *dwe,
unsigned int nr)
{
DECLARE_DATA_WAIT_ROOT(sb, rt);
struct scoutfs_data_wait *dw;
int ret = 0;
spin_lock(&rt->lock);
dw = next_data_wait(&rt->root, ino, iblock);
while (dw && ret < nr) {
dwe->ino = dw->ino;
dwe->iblock = dw->iblock;
dwe->op = dw->op;
while ((dw = dw_next(dw)) &&
(dw->ino == dwe->ino && dw->iblock == dwe->iblock)) {
dwe->op |= dw->op;
}
dwe++;
ret++;
}
spin_unlock(&rt->lock);
return ret;
}
const struct address_space_operations scoutfs_file_aops = {
#ifdef KC_MPAGE_READ_FOLIO
.read_folio = scoutfs_read_folio,
#else
.readpage = scoutfs_readpage,
#endif
#ifndef KC_FILE_AOPS_READAHEAD
.readpages = scoutfs_readpages,
#else
.readahead = scoutfs_readahead,
#endif
.writepage = scoutfs_writepage,
.writepages = scoutfs_writepages,
.write_begin = scoutfs_write_begin,
.write_end = scoutfs_write_end,
};
const struct file_operations scoutfs_file_fops = {
#ifdef KC_LINUX_HAVE_FOP_AIO_READ
.read = do_sync_read,
.write = do_sync_write,
.aio_read = scoutfs_file_aio_read,
.aio_write = scoutfs_file_aio_write,
#else
.read_iter = scoutfs_file_read_iter,
.write_iter = scoutfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
#endif
.unlocked_ioctl = scoutfs_ioctl,
.fsync = scoutfs_file_fsync,
.llseek = scoutfs_file_llseek,
.fallocate = scoutfs_fallocate,
};
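/*
* Point our cached data allocator and freed extent root at the roots
* in the caller's log trees.
*/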
void scoutfs_data_init_btrees(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_log_trees *lt)
{
DECLARE_DATA_INFO(sb, datinf);
mutex_lock(&datinf->mutex);
datinf->alloc = alloc;
datinf->wri = wri;
scoutfs_dalloc_init(&datinf->dalloc, &lt->data_avail);
datinf->data_freed = lt->data_freed;
mutex_unlock(&datinf->mutex);
}
void scoutfs_data_get_btrees(struct super_block *sb,
struct scoutfs_log_trees *lt)
{
DECLARE_DATA_INFO(sb, datinf);
mutex_lock(&datinf->mutex);
scoutfs_dalloc_get_root(&datinf->dalloc, &lt->data_avail);
lt->data_freed = datinf->data_freed;
mutex_unlock(&datinf->mutex);
}
/*
* This should be called before preparing the allocators for the commit
* because it can allocate and free btree blocks in the data allocator.
*/
int scoutfs_data_prepare_commit(struct super_block *sb)
{
DECLARE_DATA_INFO(sb, datinf);
int ret;
mutex_lock(&datinf->mutex);
ret = scoutfs_dalloc_return_cached(sb, datinf->alloc, datinf->wri,
&datinf->dalloc);
mutex_unlock(&datinf->mutex);
return ret;
}
/*
* Return true if the data allocator is lower than the caller's
* requirement and we haven't been told by the server that we're out of
* free extents.
*/
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
{
DECLARE_DATA_INFO(sb, datinf);
return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
!(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
}
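/* allocate and attach our per-super data_info */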
int scoutfs_data_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct data_info *datinf;
datinf = kzalloc(sizeof(struct data_info), GFP_KERNEL);
if (!datinf)
return -ENOMEM;
datinf->sb = sb;
mutex_init(&datinf->mutex);
sbi->data_info = datinf;
return 0;
}
void scoutfs_data_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct data_info *datinf = sbi->data_info;
if (datinf) {
sbi->data_info = NULL;
kfree(datinf);
}
}