From a6782fc03ff3959d065232b6c9a762721783f4d2 Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@versity.com>
Date: Tue, 23 Apr 2019 09:38:34 -0700
Subject: [PATCH] scoutfs: add data waiting

One of the core features of scoutfs is the ability to transparently
migrate file contents to and from an archive tier.  For this to be
transparent we need file system operations to trigger staging the file
contents back into the file system as needed.

This adds the infrastructure which operations use to wait for offline
extents to come online and which provides userspace with a list of
blocks that the operations are waiting for.

We add some waiting infrastructure that callers use to lock, check for
offline extents, and unlock and wait before checking again to see if
they're still offline.  We add these checks and waiting to data io
operations that could encounter offline extents.

This has to be done carefully so that we don't wait while holding locks
that would prevent staging.  We use per-task structures to discover when
we are the first user of a cluster lock on an inode, indicating that
it's safe for us to wait because we don't hold any locks.

And while we're waiting our operation is tracked and reported to
userspace through an ioctl.  This is a non-blocking ioctl, it's up to
userspace to decide how often to check and how large a region to stage.

Waiters are woken up when the file contents could have changed, not
specifically when we know that the extent has come online.  This lets us
wake waiters when their lock is revoked so that they can block waiting
to reacquire the lock and test the extents again.  It lets us provide
coherent demand staging across the cluster without fine grained waiting
protocols sent between the nodes.  It may result in some spurious wakeups
and work but hopefully it won't, and it's a very simple and functional
first pass.

Signed-off-by: Zach Brown <zab@versity.com>
---
 kmod/src/data.c          | 311 ++++++++++++++++++++++++++++++++++++++-
 kmod/src/data.h          |  50 +++++++
 kmod/src/file.c          |  54 ++++++-
 kmod/src/format.h        |   5 +-
 kmod/src/inode.c         |  43 ++++++
 kmod/src/inode.h         |   3 +
 kmod/src/ioctl.c         |  52 +++++++
 kmod/src/ioctl.h         |  24 +++
 kmod/src/lock.c          |   5 +-
 kmod/src/scoutfs_trace.h |  39 +++++
 kmod/src/super.c         |   2 +
 kmod/src/super.h         |   4 +
 12 files changed, 576 insertions(+), 16 deletions(-)

diff --git a/kmod/src/data.c b/kmod/src/data.c
index cee53511..ce31e642 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -732,9 +732,9 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock,
 	if (ext.len)
 		trace_scoutfs_data_get_block_intersection(sb, &ext);
 
-	/* fail read and write if it's offline and we're not staging */
-	if ((ext.flags & SEF_OFFLINE) && !si->staging) {
-		ret = -EINVAL;
+	/* non-staging callers should have waited on offline blocks */
+	if (WARN_ON_ONCE((ext.flags & SEF_OFFLINE) && !si->staging)) {
+		ret = -EIO;
 		goto out;
 	}
 
@@ -780,14 +780,28 @@ out:
 /*
  * This is almost never used.  We can't block on a cluster lock while
  * holding the page lock because lock invalidation gets the page lock
- * while blocking locks.  If we can't use an existing lock then we drop
- * the page lock and try again.
+ * while blocking locks.  If a non blocking lock attempt fails we unlock
+ * the page and block acquiring the lock.  We unlocked the page so it
+ * could have been truncated away, or whatever, so we return
+ * AOP_TRUNCATED_PAGE to have the caller try again.
+ *
+ * A similar process happens if we try to read from an offline extent
+ * that a caller hasn't already waited for.  Instead of blocking
+ * acquiring the lock we block waiting for the offline extent.  The page
+ * lock protects the page from release while we're checking and
+ * reading the extent.
+ *
+ * We can return errors from locking and checking offline extents.  The
+ * page is unlocked if we return an error.
  */
 static int scoutfs_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = file->f_inode;
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *inode_lock = NULL;
+	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
+	DECLARE_DATA_WAIT(dw);
 	int flags;
 	int ret;
 
@@ -809,27 +823,77 @@ static int scoutfs_readpage(struct file *file, struct page *page)
 		return ret;
 	}
 
+	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
+		ret = scoutfs_data_wait_check(inode, page_offset(page),
+					      PAGE_CACHE_SIZE, SEF_OFFLINE,
+					      SCOUTFS_IOC_DWO_READ, &dw,
+					      inode_lock);
+		if (ret != 0) {
+			unlock_page(page);
+			scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
+			scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
+		}
+		if (ret > 0) {
+			ret = scoutfs_data_wait(inode, &dw);
+			if (ret == 0)
+				ret = AOP_TRUNCATED_PAGE;
+		}
+		if (ret != 0)
+			return ret;
+	}
+
 	ret = mpage_readpage(page, scoutfs_get_block);
+
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
+	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
+
 	return ret;
 }
 
+/*
+ * This is used for opportunistic read-ahead which can throw the pages
+ * away if it needs to.  If the caller didn't deal with offline extents
+ * then we drop those pages rather than trying to wait.  Whoever is
+ * staging offline extents should be doing it in enormous chunks so that
+ * read-ahead can ramp up within each staged region.  The check for
+ * offline extents is cheap when the inode has no offline extents.
+ */
 static int scoutfs_readpages(struct file *file, struct address_space *mapping,
 			     struct list_head *pages, unsigned nr_pages)
 {
 	struct inode *inode = file->f_inode;
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *inode_lock = NULL;
+	struct page *page;
+	struct page *tmp;
 	int ret;
 
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
 	if (ret)
-		return ret;
+		goto out;
+
+	list_for_each_entry_safe(page, tmp, pages, lru) {
+		ret = scoutfs_data_wait_check(inode, page_offset(page),
+					      PAGE_CACHE_SIZE, SEF_OFFLINE,
+					      SCOUTFS_IOC_DWO_READ, NULL,
+					      inode_lock);
+		if (ret < 0)
+			goto out;
+		if (ret > 0) {
+			list_del(&page->lru);
+			page_cache_release(page);
+			if (--nr_pages == 0) {
+				ret = 0;
+				goto out;
+			}
+		}
+	}
 
 	ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block);
-
+out:
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
+	BUG_ON(ret == 0 && !list_empty(pages)); /* errors leave pages listed */
 	return ret;
 }
 
@@ -1249,6 +1313,239 @@ out:
 	return ret;
 }
 
+/*
+ * Insert a new waiter.  This supports multiple tasks waiting for the
+ * same ino and iblock by also comparing waiters by their addresses.
+ */
+static void insert_offline_waiting(struct rb_root *root,
+				   struct scoutfs_data_wait *ins)
+{
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct scoutfs_data_wait *dw;
+	int cmp;
+
+	while (*node) {
+		parent = *node;
+		dw = rb_entry(*node, struct scoutfs_data_wait, node);
+
+		cmp = scoutfs_cmp_u64s(ins->ino, dw->ino) ?:
+		      scoutfs_cmp_u64s(ins->iblock, dw->iblock) ?:
+		      scoutfs_cmp(ins, dw);
+		if (cmp < 0)
+			node = &(*node)->rb_left;
+		else
+			node = &(*node)->rb_right;
+	}
+
+	rb_link_node(&ins->node, parent, node);
+	rb_insert_color(&ins->node, root);
+}
+
+/*
+ * Return the first waiter at or after (ino, iblock), or NULL if none.
+ */
+static struct scoutfs_data_wait *next_data_wait(struct rb_root *root, u64 ino,
+						u64 iblock)
+{
+	struct rb_node **node = &root->rb_node;
+	struct scoutfs_data_wait *next = NULL;
+	struct scoutfs_data_wait *dw;
+	int cmp;
+
+	while (*node) {
+		dw = rb_entry(*node, struct scoutfs_data_wait, node);
+		/* go left when ino/iblock are equal to get first task */
+		cmp = scoutfs_cmp_u64s(ino, dw->ino) ?:
+		      scoutfs_cmp_u64s(iblock, dw->iblock);
+		if (cmp <= 0) {
+			node = &(*node)->rb_left;
+			next = dw;
+		} else {
+			node = &(*node)->rb_right;
+		}
+	}
+
+	return next;
+}
+
+static struct scoutfs_data_wait *dw_next(struct scoutfs_data_wait *dw)
+{
+	struct rb_node *node = rb_next(&dw->node);
+	if (node)
+		return container_of(node, struct scoutfs_data_wait, node);
+	return NULL;
+}
+
+/*
+ * Check if we should wait by looking for extents whose flags match.
+ * Returns 0 if no extents matched, or a negative errno on error.
+ *
+ * The caller must have locked the extents before calling, both across
+ * mounts and within this mount.
+ *
+ * Returns 1 if any file extents in the caller's region matched.  If the
+ * wait struct is provided then it is initialized to be woken when the
+ * extents change after the caller unlocks after the check.  The caller
+ * must come through _data_wait() to clean up the wait struct if we set
+ * it up.
+ */
+int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
+			    u8 sef, u8 op, struct scoutfs_data_wait *dw,
+			    struct scoutfs_lock *lock)
+{
+	struct super_block *sb = inode->i_sb;
+	DECLARE_DATA_WAIT_ROOT(sb, rt);
+	DECLARE_DATA_WAITQ(inode, wq);
+	struct scoutfs_extent ext = {0,};
+	u64 iblock;
+	u64 last_block;
+	u64 on;
+	u64 off;
+	int ret = 0;
+
+	if (WARN_ON_ONCE(sef & SEF_UNKNOWN) ||
+	    WARN_ON_ONCE(op & SCOUTFS_IOC_DWO_UNKNOWN) ||
+	    WARN_ON_ONCE(dw && !RB_EMPTY_NODE(&dw->node)) ||
+	    WARN_ON_ONCE(pos + len < pos)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((sef & SEF_OFFLINE)) {
+		scoutfs_inode_get_onoff(inode, &on, &off);
+		if (off == 0) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	iblock = pos >> SCOUTFS_BLOCK_SHIFT;
+	last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SHIFT;
+
+	while(iblock <= last_block) {
+		scoutfs_extent_init(&ext, SCOUTFS_FILE_EXTENT_TYPE,
+				    scoutfs_ino(inode), iblock, 1, 0, 0);
+		ret = scoutfs_extent_next(sb, data_extent_io, &ext, lock);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			break;
+		}
+
+		if (ext.start > last_block)
+			break;
+
+		if (sef & ext.flags) {
+			if (dw) {
+				dw->chg = atomic64_read(&wq->changed);
+				dw->ino = scoutfs_ino(inode);
+				dw->iblock = max(iblock, ext.start);
+				dw->op = op;
+
+				spin_lock(&rt->lock);
+				insert_offline_waiting(&rt->root, dw);
+				spin_unlock(&rt->lock);
+			}
+
+			ret = 1;
+			break;
+		}
+
+		iblock = ext.start + ext.len;
+	}
+
+out:
+	trace_scoutfs_data_wait_check(sb, scoutfs_ino(inode), pos, len,
+				      sef, op, ext.start, ext.len, ext.flags,
+				      ret);
+	return ret;
+}
+
+bool scoutfs_data_wait_found(struct scoutfs_data_wait *dw)
+{
+	return !RB_EMPTY_NODE(&dw->node);
+}
+
+int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos, u8 sef,
+				u8 op, struct scoutfs_data_wait *dw,
+				struct scoutfs_lock *lock)
+{
+	unsigned long i;
+	int ret = 0;
+
+	for (i = 0; i < nr_segs; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
+
+		ret = scoutfs_data_wait_check(inode, pos, iov[i].iov_len, sef,
+					      op, dw, lock);
+		if (ret != 0)
+			break;
+
+		pos += iov[i].iov_len;
+	}
+
+	return ret;
+}
+
+int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *dw)
+{
+	DECLARE_DATA_WAIT_ROOT(inode->i_sb, rt);
+	DECLARE_DATA_WAITQ(inode, wq);
+	int ret;
+
+	ret = wait_event_interruptible(wq->waitq,
+					atomic64_read(&wq->changed) != dw->chg);
+
+	spin_lock(&rt->lock);
+	rb_erase(&dw->node, &rt->root);
+	RB_CLEAR_NODE(&dw->node);
+	spin_unlock(&rt->lock);
+
+	return ret;
+}
+
+void scoutfs_data_wait_changed(struct inode *inode)
+{
+	DECLARE_DATA_WAITQ(inode, wq);
+
+	atomic64_inc(&wq->changed);
+	wake_up(&wq->waitq);
+}
+
+int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
+			 struct scoutfs_ioctl_data_waiting_entry *dwe,
+			 unsigned int nr)
+{
+	DECLARE_DATA_WAIT_ROOT(sb, rt);
+	struct scoutfs_data_wait *dw;
+	int ret = 0;
+
+	spin_lock(&rt->lock);
+
+	dw = next_data_wait(&rt->root, ino, iblock);
+	while (dw && ret < nr) {
+
+		dwe->ino = dw->ino;
+		dwe->iblock = dw->iblock;
+		dwe->op = dw->op;
+
+		while ((dw = dw_next(dw)) &&
+		       (dw->ino == dwe->ino && dw->iblock == dwe->iblock)) {
+			dwe->op |= dw->op;
+		}
+
+		dwe++;
+		ret++;
+	}
+
+	spin_unlock(&rt->lock);
+
+	return ret;
+}
+
 const struct address_space_operations scoutfs_file_aops = {
 	.readpage		= scoutfs_readpage,
 	.readpages		= scoutfs_readpages,
diff --git a/kmod/src/data.h b/kmod/src/data.h
index bd9f84fa..d45114da 100644
--- a/kmod/src/data.h
+++ b/kmod/src/data.h
@@ -1,6 +1,41 @@
 #ifndef _SCOUTFS_FILERW_H_
 #define _SCOUTFS_FILERW_H_
 
+struct scoutfs_lock;
+struct scoutfs_ioctl_data_waiting_entry;
+
+struct scoutfs_data_wait_root {
+	spinlock_t lock;
+	struct rb_root root;
+};
+
+#define DECLARE_DATA_WAIT_ROOT(sb, nm) \
+	struct scoutfs_data_wait_root *nm = &SCOUTFS_SB(sb)->data_wait_root
+
+struct scoutfs_data_waitq {
+	atomic64_t changed;
+	wait_queue_head_t waitq;
+};
+
+#define DECLARE_DATA_WAITQ(in, nm) \
+	struct scoutfs_data_waitq *nm = &SCOUTFS_I(in)->data_waitq
+
+/*
+ * Tasks can wait for data extents.
+ */
+struct scoutfs_data_wait {
+	struct rb_node node;
+	u64 chg;
+	u64 ino;
+	u64 iblock;
+	u8 op;
+};
+
+#define DECLARE_DATA_WAIT(nm)						\
+	struct scoutfs_data_wait nm = {					\
+		.node.__rb_parent_color = (unsigned long)(&nm.node),	\
+	}
+
 extern const struct address_space_operations scoutfs_file_aops;
 extern const struct file_operations scoutfs_file_fops;
 
@@ -11,6 +46,21 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			u64 start, u64 len);
 long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
 
+int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
+			    u8 sef, u8 op, struct scoutfs_data_wait *ow,
+			    struct scoutfs_lock *lock);
+int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos, u8 sef,
+				u8 op, struct scoutfs_data_wait *ow,
+				struct scoutfs_lock *lock);
+bool scoutfs_data_wait_found(struct scoutfs_data_wait *ow);
+int scoutfs_data_wait(struct inode *inode,
+			      struct scoutfs_data_wait *ow);
+void scoutfs_data_wait_changed(struct inode *inode);
+int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
+			 struct scoutfs_ioctl_data_waiting_entry *dwe,
+			 unsigned int nr);
+
 int scoutfs_data_setup(struct super_block *sb);
 void scoutfs_data_destroy(struct super_block *sb);
 
diff --git a/kmod/src/file.c b/kmod/src/file.c
index f78e5721..765e5f1e 100644
--- a/kmod/src/file.c
+++ b/kmod/src/file.c
@@ -39,15 +39,40 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *inode_lock = NULL;
 	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
+	DECLARE_DATA_WAIT(dw);
 	int ret;
 
+retry:
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
-	if (ret == 0) {
-		scoutfs_per_task_add(&si->pt_data_lock, &pt_ent, inode_lock);
-		ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-		scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
-		scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
+	if (ret)
+		goto out;
+
+	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
+		/* protect checked extents from stage/release */
+		mutex_lock(&inode->i_mutex);
+		atomic_inc(&inode->i_dio_count);
+		mutex_unlock(&inode->i_mutex);
+
+		ret = scoutfs_data_wait_check_iov(inode, iov, nr_segs, pos,
+						  SEF_OFFLINE,
+						  SCOUTFS_IOC_DWO_READ,
+						  &dw, inode_lock);
+		if (ret != 0)
+			goto out;
+	}
+
+	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+out:
+	if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent))
+		inode_dio_done(inode);
+	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
+
+	if (scoutfs_data_wait_found(&dw)) {
+		ret = scoutfs_data_wait(inode, &dw);
+		if (ret == 0)
+			goto retry;
 	}
 
 	return ret;
@@ -62,11 +87,13 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *inode_lock = NULL;
 	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
+	DECLARE_DATA_WAIT(dw);
 	int ret;
 
 	if (iocb->ki_left == 0) /* Does this even happen? */
 		return 0;
 
+retry:
 	mutex_lock(&inode->i_mutex);
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
@@ -77,16 +104,31 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (ret)
 		goto out;
 
-	scoutfs_per_task_add(&si->pt_data_lock, &pt_ent, inode_lock);
+	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
+		/* data_version is per inode, whole file must be online */
+		ret = scoutfs_data_wait_check(inode, 0, i_size_read(inode),
+					      SEF_OFFLINE,
+					      SCOUTFS_IOC_DWO_WRITE,
+					      &dw, inode_lock);
+		if (ret != 0)
+			goto out;
+	}
 
 	/* XXX: remove SUID bit */
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+
 out:
 	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
 	mutex_unlock(&inode->i_mutex);
 
+	if (scoutfs_data_wait_found(&dw)) {
+		ret = scoutfs_data_wait(inode, &dw);
+		if (ret == 0)
+			goto retry;
+	}
+
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
 
diff --git a/kmod/src/format.h b/kmod/src/format.h
index 936b0717..9fcbc082 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -390,8 +390,9 @@ struct scoutfs_file_extent {
 	__u8 flags;
 } __packed;
 
-#define SEF_OFFLINE	0x1
-#define SEF_UNWRITTEN	0x2
+#define SEF_OFFLINE	(1 << 0)
+#define SEF_UNWRITTEN	(1 << 1)
+#define SEF_UNKNOWN	(U8_MAX << 2)
 
 /*
  * The first xattr part item has a header that describes the xattr.  The
diff --git a/kmod/src/inode.c b/kmod/src/inode.c
index b9e6fcf9..3bdc756a 100644
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -70,6 +70,8 @@ static void scoutfs_inode_ctor(void *obj)
 	seqcount_init(&ci->seqcount);
 	ci->staging = false;
 	scoutfs_per_task_init(&ci->pt_data_lock);
+	atomic64_set(&ci->data_waitq.changed, 0);
+	init_waitqueue_head(&ci->data_waitq.waitq);
 	init_rwsem(&ci->xattr_rwsem);
 	RB_CLEAR_NODE(&ci->writeback_node);
 	spin_lock_init(&ci->ino_alloc.lock);
@@ -340,6 +342,9 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	if (ret)
 		return ret;
 
+	if (new_size != i_size_read(inode))
+		scoutfs_inode_inc_data_version(inode);
+
 	truncate_setsize(inode, new_size);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	if (truncate)
@@ -394,11 +399,22 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
 	return ret ? ret : err;
 }
 
+/*
+ * If we're changing the file size then the contents of the file are
+ * changing and we increment the data_version.  This would prevent
+ * staging because the data_version is per-inode today, not per-extent.
+ * So if there are any offline extents within the new size then we need
+ * to stage them before we truncate.  And this is called with the
+ * i_mutex held which would prevent staging so we release it and
+ * re-acquire it.  Ideally we'd fix this so that we can acquire the lock
+ * instead of the caller.
+ */
 int scoutfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *lock = NULL;
+	DECLARE_DATA_WAIT(dw);
 	LIST_HEAD(ind_locks);
 	bool truncate = false;
 	u64 attr_size;
@@ -406,6 +422,7 @@ int scoutfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	trace_scoutfs_setattr(dentry, attr);
 
+retry:
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
 	if (ret)
@@ -427,6 +444,28 @@ int scoutfs_setattr(struct dentry *dentry, struct iattr *attr)
 		if (ret)
 			goto out;
 
+		/* data_version is per inode, all must be online */
+		if (attr_size > 0 && attr_size != i_size_read(inode)) {
+			ret = scoutfs_data_wait_check(inode, 0, attr_size,
+						SEF_OFFLINE,
+						SCOUTFS_IOC_DWO_CHANGE_SIZE,
+						&dw, lock);
+			if (ret < 0)
+				goto out;
+			if (scoutfs_data_wait_found(&dw)) {
+				scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+
+				/* XXX callee locks instead? */
+				mutex_unlock(&inode->i_mutex);
+				ret = scoutfs_data_wait(inode, &dw);
+				mutex_lock(&inode->i_mutex);
+
+				if (ret == 0)
+					goto retry;
+				goto out;
+			}
+		}
+
 		/* truncating to current size truncates extents past size */
 		truncate = i_size_read(inode) >= attr_size;
 
@@ -532,6 +571,10 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off)
 		write_seqcount_end(&si->seqcount);
 		preempt_enable();
 	}
+
+	/* any time offline extents decreased we try and wake waiters */
+	if (inode && off < 0)
+		scoutfs_data_wait_changed(inode);
 }
 
 static u64 read_seqcount_u64(struct inode *inode, u64 *val)
diff --git a/kmod/src/inode.h b/kmod/src/inode.h
index 7ae34de8..0ccd0184 100644
--- a/kmod/src/inode.h
+++ b/kmod/src/inode.h
@@ -6,6 +6,7 @@
 #include "per_task.h"
 #include "count.h"
 #include "format.h"
+#include "data.h"
 
 struct scoutfs_lock;
 
@@ -48,8 +49,10 @@ struct scoutfs_inode_info {
 	seqcount_t seqcount;
 	bool staging;			/* holder of i_mutex is staging */
 	struct scoutfs_per_task pt_data_lock;
+	struct scoutfs_data_waitq data_waitq;
 	struct rw_semaphore xattr_rwsem;
 	struct rb_node writeback_node;
+
 	struct inode inode;
 };
 
diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c
index 2c1fa74e..738173e9 100644
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -541,6 +541,56 @@ static long scoutfs_ioc_item_cache_keys(struct file *file, unsigned long arg)
 	return ret ?: total;
 }
 
+static bool inc_wrapped(u64 *ino, u64 *iblock)
+{
+	return (++(*iblock) == 0) && (++(*ino) == 0);
+}
+
+static long scoutfs_ioc_data_waiting(struct file *file, unsigned long arg)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_data_waiting idw;
+	struct scoutfs_ioctl_data_waiting_entry __user *udwe;
+	struct scoutfs_ioctl_data_waiting_entry dwe[16];
+	unsigned int nr;
+	int total;
+	int ret;
+
+	if (copy_from_user(&idw, (void __user *)arg, sizeof(idw)))
+		return -EFAULT;
+
+	if (idw.flags & SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN)
+		return -EINVAL;
+
+	udwe = (void __user *)(long)idw.ents_ptr;
+	total = 0;
+	ret = 0;
+	while (idw.ents_nr && !inc_wrapped(&idw.after_ino, &idw.after_iblock)) {
+		nr = min_t(size_t, idw.ents_nr, ARRAY_SIZE(dwe));
+
+		ret = scoutfs_data_waiting(sb, idw.after_ino, idw.after_iblock,
+					   dwe, nr);
+		BUG_ON(ret > nr); /* stack overflow \o/ */
+		if (ret <= 0)
+			break;
+
+		if (copy_to_user(udwe, dwe, ret * sizeof(dwe[0]))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		idw.after_ino = dwe[ret - 1].ino;
+		idw.after_iblock = dwe[ret - 1].iblock;
+
+		udwe += ret;
+		idw.ents_nr -= ret;
+		total += ret;
+		ret = 0;
+	}
+
+	return ret ?: total;
+}
+
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -556,6 +606,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_stat_more(file, arg);
 	case SCOUTFS_IOC_ITEM_CACHE_KEYS:
 		return scoutfs_ioc_item_cache_keys(file, arg);
+	case SCOUTFS_IOC_DATA_WAITING:
+		return scoutfs_ioc_data_waiting(file, arg);
 	}
 
 	return -ENOTTY;
diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h
index 915a130b..1b592522 100644
--- a/kmod/src/ioctl.h
+++ b/kmod/src/ioctl.h
@@ -229,4 +229,28 @@ enum {
 #define SCOUTFS_IOC_ITEM_CACHE_KEYS _IOW(SCOUTFS_IOCTL_MAGIC, 8, \
 					 struct scoutfs_ioctl_item_cache_keys)
 
+struct scoutfs_ioctl_data_waiting_entry {
+	__u64 ino;
+	__u64 iblock;
+	__u8 op;
+} __packed;
+
+#define SCOUTFS_IOC_DWO_READ		(1 << 0)
+#define SCOUTFS_IOC_DWO_WRITE		(1 << 1)
+#define SCOUTFS_IOC_DWO_CHANGE_SIZE	(1 << 2)
+#define SCOUTFS_IOC_DWO_UNKNOWN		(U8_MAX << 3)
+
+struct scoutfs_ioctl_data_waiting {
+	__u64 flags;
+	__u64 after_ino;
+	__u64 after_iblock;
+	__u64 ents_ptr;
+	__u16 ents_nr;
+} __packed;
+/* flags is a __u64 with no bits defined yet, so all 64 bits are unknown */
+#define SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN		(U64_MAX << 0)
+
+#define SCOUTFS_IOC_DATA_WAITING _IOW(SCOUTFS_IOCTL_MAGIC, 9, \
+				      struct scoutfs_ioctl_data_waiting)
+
 #endif
diff --git a/kmod/src/lock.c b/kmod/src/lock.c
index d268dd82..6697ec66 100644
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -32,6 +32,7 @@
 #include "triggers.h"
 #include "tseq.h"
 #include "client.h"
+#include "data.h"
 
 /*
  * scoutfs uses a lock service to manage item cache consistency between
@@ -126,8 +127,10 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 	inode = scoutfs_ilookup(sb, ino);
 	if (inode) {
 		scoutfs_inc_counter(sb, lock_invalidate_inode);
-		if (S_ISREG(inode->i_mode))
+		if (S_ISREG(inode->i_mode)) {
 			truncate_inode_pages(inode->i_mapping, 0);
+			scoutfs_data_wait_changed(inode);
+		}
 		iput(inode);
 	}
 }
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index 5e85305e..08d4572b 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -514,6 +514,45 @@ TRACE_EVENT(scoutfs_data_truncate_items,
 		  __entry->iblock, __entry->last, __entry->offline)
 );
 
+TRACE_EVENT(scoutfs_data_wait_check,
+	TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u64 len,
+		 __u8 sef, __u8 op, __u64 ext_start, __u64 ext_len,
+		 __u8 ext_flags, int ret),
+
+	TP_ARGS(sb, ino, pos, len, sef, op, ext_start, ext_len, ext_flags, ret),
+
+	TP_STRUCT__entry(
+		__field(__u64, fsid)
+		__field(__u64, ino)
+		__field(__u64, pos)
+		__field(__u64, len)
+		__field(__u8, sef)
+		__field(__u8, op)
+		__field(__u64, ext_start)
+		__field(__u64, ext_len)
+		__field(__u8, ext_flags)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->fsid = FSID_ARG(sb);
+		__entry->ino = ino;
+		__entry->pos = pos;
+		__entry->len = len;
+		__entry->sef = sef;
+		__entry->op = op;
+		__entry->ext_start = ext_start;
+		__entry->ext_len = ext_len;
+		__entry->ext_flags = ext_flags;
+		__entry->ret = ret;
+	),
+
+	TP_printk(FSID_FMT" ino %llu pos %llu len %llu sef 0x%x op 0x%x ext_start %llu ext_len %llu ext_flags 0x%x ret %d",
+			__entry->fsid, __entry->ino, __entry->pos, __entry->len,
+			__entry->sef, __entry->op, __entry->ext_start,
+			__entry->ext_len, __entry->ext_flags, __entry->ret)
+);
+
 TRACE_EVENT(scoutfs_sync_fs,
 	TP_PROTO(struct super_block *sb, int wait),
 
diff --git a/kmod/src/super.c b/kmod/src/super.c
index 02f38934..ff39a6fb 100644
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -339,6 +339,8 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	spin_lock_init(&sbi->next_ino_lock);
 	init_waitqueue_head(&sbi->trans_hold_wq);
+	spin_lock_init(&sbi->data_wait_root.lock);
+	sbi->data_wait_root.root = RB_ROOT;
 	spin_lock_init(&sbi->trans_write_lock);
 	INIT_DELAYED_WORK(&sbi->trans_write_work, scoutfs_trans_write_func);
 	init_waitqueue_head(&sbi->trans_write_wq);
diff --git a/kmod/src/super.h b/kmod/src/super.h
index e24f4d9a..6dd03ac6 100644
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -6,6 +6,7 @@
 
 #include "format.h"
 #include "options.h"
+#include "data.h"
 
 struct scoutfs_counters;
 struct scoutfs_triggers;
@@ -49,6 +50,9 @@ struct scoutfs_sb_info {
 	wait_queue_head_t trans_hold_wq;
 	struct task_struct *trans_task;
 
+	/* tracks tasks waiting for data extents */
+	struct scoutfs_data_wait_root data_wait_root;
+
 	spinlock_t trans_write_lock;
 	u64 trans_write_count;
 	u64 trans_seq;