scoutfs: add basic file page cache read and write

Add basic file data support by implementing the address space page and
file read and write methods.  This passes basic read/write tests but is
only the seed of a final implementation.

Signed-off-by: Zach Brown <zab@versity.com>
Zach Brown
2016-03-26 10:58:06 -07:00
parent 867d717d2b
commit 9cf87ee571
5 changed files with 260 additions and 6 deletions

kmod/src/Makefile

@@ -1,4 +1,4 @@
 obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o
 
-scoutfs-y += block.o bloom.o chunk.o crc.o dir.o inode.o manifest.o msg.o \
-	     ring.o segment.o skip.o super.o
+scoutfs-y += block.o bloom.o chunk.o crc.o dir.o filerw.o inode.o manifest.o \
+	     msg.o ring.o segment.o skip.o super.o

kmod/src/filerw.c Normal file (218 lines added)

@@ -0,0 +1,218 @@
/*
 * Copyright (C) 2016 Versity Software, Inc.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include "format.h"
#include "segment.h"
#include "inode.h"
#include "key.h"
#include "filerw.h"
/*
 * File data is stored in items just like everything else.  This is
 * very easy to implement but incurs a copying overhead.  We'll see how
 * expensive that gets.
 *
 * By making the max item size a bit less than the block size we still
 * have room for the block header, which gets us file data checksums.
 * File item key offsets are multiples of this max item size, though
 * items can be smaller if the data is sparse.  This lets us do lookups
 * for specific keys and take advantage of the bloom filters.
 *
 * This is a minimal first pass and will need more work.  It'll need to
 * worry about enospc in writepage and cluster access for a start.
 */

/*
 * Track the intersection of the logical region of a file with a page
 * and file data item.
 */
struct data_region {
	u64 item_key;
	unsigned int page_off;
	unsigned short len;
	unsigned short item_off;
};

/*
* Map the file offset to its intersection with the page and item region.
* Returns false if the byte position is outside the page.
*/
static bool map_data_region(struct data_region *dr, u64 pos, struct page *page)
{
if (pos >> PAGE_SHIFT != page->index)
return false;
dr->page_off = pos & ~PAGE_MASK;
dr->item_off = do_div(pos, SCOUTFS_MAX_ITEM_LEN);
dr->item_key = pos;
dr->len = min(SCOUTFS_MAX_ITEM_LEN - dr->item_off,
PAGE_SIZE - dr->page_off);
return true;
}
#define for_each_data_region(dr, page, pos)			\
	for (pos = (u64)page->index << PAGE_SHIFT;		\
	     map_data_region(dr, pos, page); pos += (dr)->len)

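/*
 * A worked example (a sketch, assuming for illustration that
 * SCOUTFS_MAX_ITEM_LEN works out to 4040 and PAGE_SIZE is 4096): the
 * page at index 1 covers bytes [4096, 8192).  The first region maps to
 * page_off 0, item_key 1, item_off 56, len 3984; the second maps to
 * page_off 3984, item_key 2, item_off 0, len 112, which ends the page.
 */
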
/*
 * Copy the contents of file data items into the page.  If we don't
 * find an item then we zero that region of the page.
 *
 * XXX i_size?
 * XXX async?
 */
static int scoutfs_readpage(struct file *file, struct page *page)
{
	struct inode *inode = file->f_mapping->host;
	struct super_block *sb = inode->i_sb;
	DECLARE_SCOUTFS_ITEM_REF(ref);
	struct scoutfs_key key;
	struct data_region dr;
	int ret = 0;
	void *addr;
	u64 pos;

	for_each_data_region(&dr, page, pos) {
		scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_DATA_KEY,
				dr.item_key);

		ret = scoutfs_read_item(sb, &key, &ref);
		if (ret == -ENOENT) {
			/* sparse regions read as zeros */
			addr = kmap_atomic(page);
			memset(addr + dr.page_off, 0, dr.len);
			kunmap_atomic(addr);
			ret = 0;
			continue;
		}
		if (ret)
			break;

		addr = kmap_atomic(page);
		memcpy(addr + dr.page_off, ref.val + dr.item_off, dr.len);
		kunmap_atomic(addr);

		/* drop the item ref now that its value has been copied */
		scoutfs_put_ref(&ref);
	}

	if (!ret)
		SetPageUptodate(page);
	unlock_page(page);
	return ret;
}

/*
 * Copy the contents of the page into file items.  Data integrity syncs
 * will later write the dirty segment to the device.
 *
 * XXX zeroing regions of data items?
 * XXX wbc counters?
 * XXX reserve space so dirty item doesn't get enospc -- our "delalloc"?
 */
static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct super_block *sb = inode->i_sb;
	DECLARE_SCOUTFS_ITEM_REF(ref);
	struct scoutfs_key key;
	struct data_region dr;
	void *addr;
	u64 pos;
	int ret;

	set_page_writeback(page);

	for_each_data_region(&dr, page, pos) {
		scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_DATA_KEY,
				dr.item_key);

		ret = scoutfs_dirty_item(sb, &key, SCOUTFS_MAX_ITEM_LEN, &ref);
		if (ret)
			break;

		addr = kmap_atomic(page);
		memcpy(ref.val + dr.item_off, addr + dr.page_off, dr.len);
		kunmap_atomic(addr);
		scoutfs_put_ref(&ref);
	}
	scoutfs_put_ref(&ref);

	if (ret) {
		SetPageError(page);
		mapping_set_error(&inode->i_data, ret);
	}
	end_page_writeback(page);
	unlock_page(page);
	return ret;
}

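/*
 * Lock and return the page cache page that will be copied into.  This
 * minimal pass doesn't read existing items into the page around the
 * copy; write_end zeroes the stale tail of a short copy instead.
 */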
static int scoutfs_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	*pagep = page;
	return 0;
}

static int scoutfs_write_end(struct file *file, struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned off;

	off = pos & (PAGE_CACHE_SIZE - 1);

	/* zero the stale tail of the copied region if we did a short copy */
	if (copied < len)
		zero_user_segment(page, off + copied, off + len);

	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	if (!PageUptodate(page))
		SetPageUptodate(page);
	set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);
	return copied;
}

const struct address_space_operations scoutfs_file_aops = {
	.readpage	= scoutfs_readpage,
	.writepage	= scoutfs_writepage,
	.write_begin	= scoutfs_write_begin,
	.write_end	= scoutfs_write_end,
};

const struct file_operations scoutfs_file_fops = {
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write,
};
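
For context, the basic read/write test the commit message mentions can be
sketched from userspace roughly like this (the mount point and file name
are hypothetical; note that the read is served from the page cache unless
it is dropped first, so forcing readpage takes a remount or drop_caches):

#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[8192], out[8192];
	int fd;

	memset(buf, 'a', sizeof(buf));

	/* write two pages through write_begin/write_end */
	fd = open("/mnt/scoutfs/testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	assert(fd >= 0);
	assert(write(fd, buf, sizeof(buf)) == (ssize_t)sizeof(buf));

	/* fsync pushes the dirty pages through writepage */
	assert(fsync(fd) == 0);

	/* read back and verify */
	assert(pread(fd, out, sizeof(out), 0) == (ssize_t)sizeof(out));
	assert(memcmp(buf, out, sizeof(out)) == 0);

	assert(close(fd) == 0);
	return 0;
}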

kmod/src/filerw.h Normal file (7 lines added)

@@ -0,0 +1,7 @@
#ifndef _SCOUTFS_FILERW_H_
#define _SCOUTFS_FILERW_H_

extern const struct address_space_operations scoutfs_file_aops;
extern const struct file_operations scoutfs_file_fops;

#endif

kmod/src/format.h

@@ -100,8 +100,14 @@ struct scoutfs_key {
 #define SCOUTFS_ROOT_INO 1
 
-#define SCOUTFS_INODE_KEY 128
-#define SCOUTFS_DIRENT_KEY 192
+/*
+ * Currently we sort keys by the numeric value of the types, but that
+ * isn't necessary.  We could have an arbitrary sort order.  So we
+ * don't have to stress about cleverly allocating the types.
+ */
+#define SCOUTFS_INODE_KEY 1
+#define SCOUTFS_DIRENT_KEY 2
+#define SCOUTFS_DATA_KEY 3
 
 struct scoutfs_ring_map_block {
 	struct scoutfs_block_header hdr;
@@ -203,6 +209,13 @@ struct scoutfs_item {
 	__le32 skip_next[0];
 } __packed;
 
+/*
+ * The max item size caps file data item lengths so that items fit in
+ * checksummed 4k blocks with a bit of expansion room.
+ */
+#define SCOUTFS_MAX_ITEM_LEN \
+	(SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_block_header) - 32)
+
 struct scoutfs_timespec {
 	__le64 sec;
 	__le32 nsec;
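
To make the key math concrete, here is a minimal userspace sketch of how
file byte offsets split into (item key, item offset) regions; the 16-byte
header size is a hypothetical stand-in for
sizeof(struct scoutfs_block_header):

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE	4096
#define HDR_SIZE	16	/* stand-in for the real block header size */
#define MAX_ITEM_LEN	(BLOCK_SIZE - HDR_SIZE - 32)

int main(void)
{
	uint64_t pos = 0;

	/* walk the first 16k of a file, one item intersection at a time */
	while (pos < 16384) {
		uint64_t key = pos / MAX_ITEM_LEN;   /* file data item key */
		unsigned off = pos % MAX_ITEM_LEN;   /* offset within item */
		unsigned len = MAX_ITEM_LEN - off;   /* bytes left in item */

		printf("pos %llu -> key %llu off %u len %u\n",
		       (unsigned long long)pos, (unsigned long long)key,
		       off, len);
		pos += len;
	}
	return 0;
}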

kmod/src/inode.c

@@ -21,6 +21,7 @@
 #include "inode.h"
 #include "segment.h"
 #include "dir.h"
+#include "filerw.h"
 
 /*
  * XXX
@@ -68,10 +69,25 @@ void scoutfs_destroy_inode(struct inode *inode)
 static void set_inode_ops(struct inode *inode)
 {
 	switch (inode->i_mode & S_IFMT) {
+	/*
+	 * I guess we add a reg.c for regular files?  Or pagecache.c?
+	 * I guess that makes more sense.
+	 *
+	 * - page dirtying makes sure there's a dirty item
+	 * - sync writes back page cache pages
+	 * - writepage copies to dirty item
+	 * - crc calculated after copying
+	 * - pages can be pretty large
+	 * - tail items can be partial?
+	 * - tracing all over the place
+	 * - maybe just less than 4k is the answer?
+	 * - so allocation pulls the value back
+	 * - probably leave some overhead for header growth
+	 */
 	case S_IFREG:
-		// inode->i_mapping->a_ops = &scoutfs_file_aops;
+		inode->i_mapping->a_ops = &scoutfs_file_aops;
 		// inode->i_op = &scoutfs_file_iops;
-		// inode->i_fop = &scoutfs_file_fops;
+		inode->i_fop = &scoutfs_file_fops;
 		break;
 	case S_IFDIR:
 		inode->i_op = &scoutfs_dir_iops;