diff --git a/kmod/src/data.c b/kmod/src/data.c index 61547c2e..8533a3c5 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -14,8 +14,12 @@ #include #include #include -#include -#include +#include +#include +#include +#include +#include +#include #include "format.h" #include "super.h" @@ -27,354 +31,844 @@ #include "scoutfs_trace.h" #include "item.h" #include "ioctl.h" +#include "net.h" /* - * scoutfs stores data in items that can be up to the small 4K block - * size. The page cache address space callbacks work with the item - * cache. Each OS page can be stored in multiple of our smaller fixed - * size items. The code doesn't understand OS pages that are smaller - * than our block size. + * scoutfs uses extent records to reference file data. * - * readpage does a blocking read of the item and then copies its - * contents into the page. Since the segments are huge we sort of get - * limited read-ahead by reading in segments at a time. + * The extent items map logical file regions to device blocks at 4K + * block granularity. File data isn't overwritten in place, so overwriting + * doesn't generate extent item locking and modification. * - * Writing is quite a bit more fiddly. We want to pack small files. - * The item cache and transactions want to accurately track the size of - * dirty items to fill the next segment. And we would like to minimize - * cpu copying as much as we can. + * Nodes have their own free extent items stored at their node id to + * avoid lock contention during allocation and freeing. These pools are + * filled and drained with RPCs to the server, which allocates blocks in + * segment-sized regions. * - * This simplest first pass creates dirty items as pages are dirtied - * whose values reference the page contents. They're freed after - * they're written to the segment so that we don't have to worry about - * items that reference clean pages. Invalidatepage forgets any items - * if a dirty page is truncated away. + * Block allocation maintains a fixed number of allocation cursors that + * remember the position of tasks within free regions. This is very + * simple and maintains decent extents for simple streaming writes. It + * eventually won't be good enough and we'll spend complexity on + * delalloc but we want to put that off as long as possible. * - * Writeback is built around all the dirty items being written by a - * commit. This can happen naturally in the backgroud. Or writepage - * can initiate it to start by kicking the commit thread. In either - * case our dirty pages are "in writeback" by being put on a list that - * is walked by the end of the commit. Because writes and page dirtying - * are serialized with the commit we know that there can be no dirty - * pages after the commit and we can mark writeback complete on all the - * pages that started writeback before the commit finished. motivate - * having items in the item cache while there are dirty pages. + * There are no unwritten extents. As we dirty file data pages, possibly + * allocating extents for the first time, we track their inodes. Before + * we commit dirty metadata we write out all tracked inodes. This + * ensures that data is persistent before the metadata that references + * it is usable. * - * Data is copied from the dirty page contents into the segment pages - * for writing.
This lets us easily pack small files without worrying - * about DMA alignment and avoids the stable page problem of the page - * being modified after the cpu calculates the checksum but before the - * DMA reads to the device. + * Weirdly, the extents are indexed by the *final* logical block and + * blkno of the extent. This lets us search for neighbouring previous + * extents with a _next() call and avoids having to implement item + * reading that iterates backwards through the manifest and segments. + * + * There are two items that track free extents, one indexed by the block + * location of the free extent and one indexed by the size of the free + * region. This means that one allocation can update a great number of + * items throughout the tree as file and both kinds of free extents + * split and merge. The code goes to great lengths to stage these + * updates so that it can always unwind and return errors without + * leaving the items inconsistent. * * XXX * - truncate * - mmap * - better io error propagation - * - async readpages for more concurrent readahead * - forced unmount with dirty data * - direct IO - * - probably stitch page vecs into block struct page fragments for bios - * - maybe cut segment boundaries on aligned data offsets - * - maybe decouple metadata and data segment writes */ struct data_info { - struct llist_head writeback_pages; + struct rw_semaphore alloc_rwsem; + u64 next_large_blkno; + struct rhashtable cursors; + struct list_head cursor_lru; }; #define DECLARE_DATA_INFO(sb, name) \ struct data_info *name = SCOUTFS_SB(sb)->data_info -/* - * trace_printk() doesn't support %c? - * - * 1 - 1ocked - * a - uptodAte - * d - Dirty - * b - writeBack - * e - Error - */ -#define page_hexflag(page, name, val, shift) \ - (Page##name(page) ? (val << (shift * 4)) : 0) - -#define page_hexflags(page) \ - (page_hexflag(page, Locked, 0x1, 4) | \ - page_hexflag(page, Uptodate, 0xa, 3) | \ - page_hexflag(page, Dirty, 0xd, 2) | \ - page_hexflag(page, Writeback, 0xb, 1) | \ - page_hexflag(page, Error, 0xe, 0)) - -#define PGF "page %p [index %lu flags %x]" -#define PGA(page) \ - (page), (page)->index, page_hexflags(page) \ - -#define BHF "bh %p [blocknr %llu size %zu state %lx]" -#define BHA(bh) \ - (bh), (u64)(bh)->b_blocknr, (bh)->b_size, (bh)->b_state \ - -static void init_data_key(struct scoutfs_key_buf *key, - struct scoutfs_data_key *dkey, u64 ino, u64 block) -{ - dkey->type = SCOUTFS_DATA_KEY; - dkey->ino = cpu_to_be64(ino); - dkey->block = cpu_to_be64(block); - - scoutfs_key_init(key, dkey, sizeof(struct scoutfs_data_key)); -} +/* more than enough for a few tasks per core on moderate hardware */ +#define NR_CURSORS 4096 /* - * Delete the data block items in the given region. + * This is the size of extents that are tracked by a cursor and so end + * up being the largest file item extent length given concurrent + * streaming writes. * - * This is the low level extent item truncate code. Callers manage - * higher order truncation and orphan cleanup. - * - * XXX - * - restore support for releasing data. - * - for final unlink this would be better as a range deletion - * - probably don't want to read items to find them for removal + * XXX We probably want this to be a bit larger to further reduce the + * amount of item churn involved in truncating tremendous files. 
*/ -int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, - u64 len, bool offline) -{ - struct scoutfs_data_key last_dkey; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf last; - struct scoutfs_key_buf key; - int ret; +#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS - trace_printk("iblock %llu len %llu offline %u\n", - iblock, len, offline); +struct cursor_id { + struct task_struct *task; + pid_t pid; +} __packed; /* rhashtable_lookup() always memcmp()s, avoid padding */ - if (WARN_ON_ONCE(iblock + len <= iblock) || - WARN_ON_ONCE(offline)) - return -EINVAL; - - init_data_key(&key, &dkey, ino, iblock); - init_data_key(&last, &last_dkey, ino, iblock + len - 1); - - for (;;) { - ret = scoutfs_item_next(sb, &key, &last, NULL); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - break; - } - - /* XXX would set offline bit items here */ - - ret = scoutfs_item_delete(sb, &key); - if (ret) - break; - } - - return ret; -} - -static inline struct page *page_from_llist_node(struct llist_node *node) -{ - BUILD_BUG_ON(member_sizeof(struct page, private) != - sizeof(struct llist_node)); - - return container_of((void *)node, struct page, private); -} - -static inline struct llist_node *llist_node_from_page(struct page *page) -{ - return (void *)&page->private; -} - -static inline void page_llist_add(struct page *page, struct llist_head *head) -{ - llist_add(llist_node_from_page(page), head); -} +struct task_cursor { + u64 blkno; + u64 blocks; + struct rhash_head hash_head; + struct list_head list_head; + struct cursor_id id; +}; /* - * The transaction has committed so there are no more dirty items. End - * writeback on all the dirty pages that started writeback before the - * commit finished. The commit doesn't start until all holders which - * could dirty are released so there couldn't have been new dirty pages - * and writeback entries while the commit was in flight. + * Both file extent and free extent keys are converted into this native + * form for manipulation. The free extents set blk_off to blkno. 
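+ * + * All fields are in units of 4K blocks. Since the keys store the final + * blk_off and blkno of an extent, loading an extent works backwards from + * those endpoints using the block count.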
*/ -void scoutfs_data_end_writeback(struct super_block *sb, int err) +struct native_extent { + u64 blk_off; + u64 blkno; + u64 blocks; +}; + +/* These are stored in a (type==0) terminated array on caller's stacks */ +struct extent_change { + struct native_extent ext; + u64 arg; + unsigned ins:1, + type; +}; + +/* insert file extent + remove both blkno and blocks extents + 0 term */ +#define MAX_CHANGES (3 + 3 + 3 + 1) + +/* XXX avoiding dynamic on-stack array initializers :/ */ +union extent_key_union { + struct scoutfs_file_extent_key file; + struct scoutfs_free_extent_blkno_key blkno; + struct scoutfs_free_extent_blocks_key blocks; +} __packed; +#define MAX_KEY_BYTES sizeof(union extent_key_union) + +static void init_file_extent_key(struct scoutfs_key_buf *key, void *key_bytes, + struct native_extent *ext, u64 arg) { - DECLARE_DATA_INFO(sb, datinf); - struct llist_node *node; - struct page *page; + struct scoutfs_file_extent_key *fkey = key_bytes; - /* XXX haven't thought about errors here */ - BUG_ON(err); + fkey->type = SCOUTFS_FILE_EXTENT_KEY; + fkey->ino = cpu_to_be64(arg); + fkey->last_blk_off = cpu_to_be64(ext->blk_off + ext->blocks - 1); + fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); + fkey->blocks = cpu_to_be64(ext->blocks); - node = llist_del_all(&datinf->writeback_pages); - - while (node) { - page = page_from_llist_node(node); - node = llist_next(node); - - trace_printk("ending writeback "PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_end_writeback_page); - - - set_page_private(page, 0); - end_page_writeback(page); - page_cache_release(page); - } + scoutfs_key_init(key, fkey, sizeof(struct scoutfs_file_extent_key)); } -#define for_each_page_block(page, start, loff, block, key, dkey, val) \ - for (start = 0; \ - start < PAGE_CACHE_SIZE && \ - (loff = ((loff_t)page->index << PAGE_CACHE_SHIFT) + start, \ - block = loff >> SCOUTFS_BLOCK_SHIFT, \ - init_data_key(&key, &dkey, \ - scoutfs_ino(page->mapping->host), block), \ - scoutfs_kvec_init(val, page_address(page) + start, \ - SCOUTFS_BLOCK_SIZE), \ - 1); \ - start += SCOUTFS_BLOCK_SIZE) +#define INIT_FREE_EXTENT_KEY(which_type, key, key_bytes, ext, arg, type) \ +do { \ + struct which_type *fkey = key_bytes; \ + \ + fkey->type = type; \ + fkey->node_id = cpu_to_be64(arg); \ + fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); \ + fkey->blocks = cpu_to_be64(ext->blocks); \ + \ + scoutfs_key_init(key, fkey, sizeof(struct which_type)); \ +} while (0) -/* - * Copy the contents of each item that makes up the page into their - * regions of the page, zeroing any page contents not covered by items. - * - * This is the simplest loop that looks up every possible block. We - * could instead have a readpages() that iterates over present items and - * puts them in the pages in the batch. 
- */ -static int scoutfs_readpage(struct file *file, struct page *page) +static void init_extent_key(struct scoutfs_key_buf *key, void *key_bytes, + struct native_extent *ext, u64 arg, u8 type) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - loff_t size = i_size_read(inode); - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - unsigned start; - loff_t loff; - u64 block; - int ret = 0; - - - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_readpage); - - for_each_page_block(page, start, loff, block, key, dkey, val) { - /* the rest of the page is zero when block is past i_size */ - if (loff >= size) - break; - - /* copy the block item contents into the page */ - ret = scoutfs_item_lookup(sb, &key, val); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - else - break; - } - - /* - * XXX do we need to clamp the item length by i_size? - * truncate should purge the item cache and create - * truncation range items that'd merge away old data - * items, and invalidatepage should shrink any ephemeral - * vecs. Seems like the item length should be accurate? - */ - - /* zero the tail of the block */ - if (ret < SCOUTFS_BLOCK_SIZE) - zero_user(page, start, SCOUTFS_BLOCK_SIZE - ret); - } - - /* zero any remaining tail blocks */ - if (start < PAGE_CACHE_SIZE) - zero_user(page, start, PAGE_CACHE_SIZE - start); - - if (ret == 0) - SetPageUptodate(page); + if (type == SCOUTFS_FILE_EXTENT_KEY) + init_file_extent_key(key, key_bytes, ext, arg); + else if(type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) + INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blkno_key, + key, key_bytes, ext, arg, type); else - SetPageError(page); + INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blocks_key, + key, key_bytes, ext, arg, type); +} - trace_printk("ret %d\n", ret); - unlock_page(page); - return ret; +/* XXX could have some sanity checks */ +static void load_file_extent(struct native_extent *ext, + struct scoutfs_key_buf *key) +{ + struct scoutfs_file_extent_key *fkey = key->data; + + ext->blocks = be64_to_cpu(fkey->blocks); + ext->blk_off = be64_to_cpu(fkey->last_blk_off) - ext->blocks + 1; + ext->blkno = be64_to_cpu(fkey->last_blkno) - ext->blocks + 1; +} + +#define LOAD_FREE_EXTENT(which_type, ext, key) \ +do { \ + struct which_type *fkey = key->data; \ + \ + ext->blkno = be64_to_cpu(fkey->last_blkno) - \ + be64_to_cpu(fkey->blocks) + 1; \ + ext->blk_off = ext->blkno; \ + ext->blocks = be64_to_cpu(fkey->blocks); \ +} while (0) + +static void load_extent(struct native_extent *ext, struct scoutfs_key_buf *key) +{ + struct scoutfs_free_extent_blocks_key *fkey = key->data; + + BUILD_BUG_ON(offsetof(struct scoutfs_file_extent_key, type) != + offsetof(struct scoutfs_free_extent_blkno_key, type) || + offsetof(struct scoutfs_file_extent_key, type) != + offsetof(struct scoutfs_free_extent_blocks_key, type)); + + if (fkey->type == SCOUTFS_FILE_EXTENT_KEY) + load_file_extent(ext, key); + else if (fkey->type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) + LOAD_FREE_EXTENT(scoutfs_free_extent_blkno_key, ext, key); + else + LOAD_FREE_EXTENT(scoutfs_free_extent_blocks_key, ext, key); } /* - * Start writeback on a dirty page. We always try to kick off a commit. - * Repeated calls harmlessly bounce off the thread work's pending bit. - * (we could probably test that the writeback pgaes list is empty before - * trying to kick off a commit.) - * - * We add ourselves to a list of pages that the commit will end - * writeback on once its done. 
If there's no dirty data the commit - * thread will end writeback after not doing anything. + * Merge two extents if they're adjacent. First we arrange them to + * only test their adjoining endpoints, then we are careful not to reference + * fields after we've modified them. */ -static int scoutfs_writepage(struct page *page, struct writeback_control *wbc) +static int merge_extents(struct native_extent *mod, + struct native_extent *ext) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - DECLARE_DATA_INFO(sb, datinf); + struct native_extent *left; + struct native_extent *right; - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_writepage); + if (mod->blk_off < ext->blk_off) { + left = mod; + right = ext; + } else { + left = ext; + right = mod; + } - BUG_ON(PageWriteback(page)); - BUG_ON(page->private != 0); - - ClearPagePrivate(page); /* invalidatepage not needed */ - set_page_writeback(page); - page_cache_get(page); - page_llist_add(page, &datinf->writeback_pages); - unlock_page(page); - scoutfs_sync_fs(sb, 0); + if (left->blk_off + left->blocks == right->blk_off && + left->blkno + left->blocks == right->blkno) { + mod->blk_off = left->blk_off; + mod->blkno = left->blkno; + mod->blocks = left->blocks + right->blocks; + return 1; + } return 0; } /* - * Truncate is invalidating part of the contents of a page. - * - * We can't return errors here so our job is not to create dirty items - * that end up executing the truncate. That's the job of higher level - * callers. Our job is to make sure that we update references to the - * page from existing ephemeral items if they already exist. + * The caller has ensured that the inner extent is entirely within + * the outer extent. Fill out the left and right regions of outer + * that don't overlap with inner. */ -static void scoutfs_invalidatepage(struct page *page, unsigned long offset) +static void trim_extents(struct native_extent *left, + struct native_extent *right, + struct native_extent *outer, + struct native_extent *inner) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - unsigned start; - loff_t loff; - u64 block; + left->blk_off = outer->blk_off; + left->blkno = outer->blkno; + left->blocks = inner->blk_off - outer->blk_off; - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_invalidatepage); + right->blk_off = inner->blk_off + inner->blocks; + right->blkno = inner->blkno + inner->blocks; + right->blocks = (outer->blk_off + outer->blocks) - right->blk_off; +} - for_each_page_block(page, start, loff, block, key, dkey, val) { - if (offset) { - /* XXX maybe integrate offset into foreach */ - /* XXX ugh, kvecs are still clumsy :) */ - if (start + SCOUTFS_BLOCK_SIZE > offset) - val[0].iov_len = offset - start; - scoutfs_item_update_ephemeral(sb, &key, val); - } else { - scoutfs_item_forget(sb, &key); - } - } +/* return true if inner is fully contained by outer */ +static bool extents_within(struct native_extent *outer, + struct native_extent *inner) +{ + u64 outer_end = outer->blk_off + outer->blocks - 1; + u64 inner_end = inner->blk_off + inner->blocks - 1; + + return outer->blk_off <= inner->blk_off && outer_end >= inner_end; } /* - * Start modifying a page cache page. - * - * We hold the transaction for write_end's inode updates before - * acquiring the page lock.
- * - * We give the writer the current page contents in the relatively rare - * case of writing a partial page inside i_size. write_end will zero - * any region around the write if the page isn't uptodate. + * Add a new entry to the array of changes. The _BLOCKS extent items + * exactly match the _BLKNO items but with different field order for + * searching by size. We keep them in sync by always adding a _BLOCKS + * change for every _BLKNO change. */ +static struct extent_change *append_change(struct extent_change *chg, + bool ins, struct native_extent *ext, + u64 arg, u8 type) +{ + trace_printk("appending ins %d blk_off %llu blkno %llu blocks %llu arg %llu type %u\n", + ins, ext->blk_off, ext->blkno, ext->blocks, + arg, type); + + chg->ext = *ext; + chg->arg = arg; + chg->ins = ins; + chg->type = type; + + if (type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) { + chg++; + *chg = *(chg - 1); + chg->type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY; + } + + return chg + 1; +} + +/* + * Find an adjacent extent in the direction of the delta. If we can + * merge with it then we modify the incoming cur extent. nei is set to + * the neighbour we found. > 0 is returned if we merged, 0 if not, and + * < 0 on error. + */ +static int try_merge(struct super_block *sb, struct native_extent *cur, + s64 delta, struct native_extent *nei, u64 arg, u8 type) +{ + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent ext; + int ret; + + /* short circuit prev search for common first block alloc */ + if (cur->blk_off == 0 && delta < 0) + return 0; + + trace_printk("nei %lld from blk_off %llu blkno %llu blocks %llu\n", + delta, cur->blk_off, cur->blkno, cur->blocks); + + memset(&ext, ~0, sizeof(ext)); + init_extent_key(&last, last_bytes, &ext, arg, type); + + ext.blk_off = cur->blk_off + delta; + ext.blkno = cur->blkno + delta; + ext.blocks = 1; + init_extent_key(&key, key_bytes, &ext, arg, type); + + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + load_extent(nei, &key); + trace_printk("found nei blk_off %llu blkno %llu blocks %llu\n", + nei->blk_off, nei->blkno, nei->blocks); + + ret = merge_extents(cur, nei); +out: + return ret; +} + +/* + * Build the changes needed to insert the given extent. The semantics + * of the extents and callers means that we should not find existing extents + * that overlap the insertion. + */ +static int record_insert_changes(struct super_block *sb, + struct extent_change *chg, + struct native_extent *caller_ins, + u64 arg, u8 type) +{ + struct native_extent ins = *caller_ins; + struct native_extent ext; + int ret; + + trace_printk("inserting arg %llu type %u blk_off %llu blkno %llu blocks %llu\n", + arg, type, ins.blk_off, ins.blkno, ins.blocks); + + /* find the end */ + while (chg->type) + chg++; + + /* find previous that might be adjacent */ + ret = try_merge(sb, &ins, -1, &ext, arg, type); + if (ret < 0) + goto out; + else if (ret > 0) + chg = append_change(chg, false, &ext, arg, type); + + /* find next that might be adjacent */ + ret = try_merge(sb, &ins, 1, &ext, arg, type); + if (ret < 0) + goto out; + else if (ret > 0) + chg = append_change(chg, false, &ext, arg, type); + + /* and insert the new extent, possibly including merged neighbours */ + chg = append_change(chg, true, &ins, arg, type); + ret = 0; +out: + return ret; +} + +/* + * Record the changes needed to remove a portion of an existing extent. 
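+ * + * We find the single existing extent that entirely contains the removal, + * record its deletion, and then record insertion of the left and right + * remainders that fall outside the removed region.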
+ */ +static int record_remove_changes(struct super_block *sb, + struct extent_change *chg, + struct native_extent *rem, u64 arg, + u8 type) +{ + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent left; + struct native_extent right; + struct native_extent outer; + int ret; + + trace_printk("removing arg %llu type %u blk_off %llu blkno %llu blocks %llu\n", + arg, type, rem->blk_off, rem->blkno, rem->blocks); + + /* find the end */ + while (chg->type) + chg++; + + memset(&outer, ~0, sizeof(outer)); + init_extent_key(&last, last_bytes, &outer, arg, type); + + /* find outer existing extent that contains removal extent */ + init_extent_key(&key, key_bytes, rem, arg, type); + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret) + goto out; + + load_extent(&outer, &key); + + trace_printk("found outer blk_off %llu blkno %llu blocks %llu\n", + outer.blk_off, outer.blkno, outer.blocks); + + if (!extents_within(&outer, rem)) { + ret = -EIO; + goto out; + } + + trim_extents(&left, &right, &outer, rem); + + chg = append_change(chg, false, &outer, arg, type); + + if (left.blocks) { + trace_printk("left trim blk_off %llu blkno %llu blocks %llu\n", + left.blk_off, left.blkno, left.blocks); + chg = append_change(chg, true, &left, arg, type); + } + + if (right.blocks) { + trace_printk("right trim blk_off %llu blkno %llu blocks %llu\n", + right.blk_off, right.blkno, right.blocks); + chg = append_change(chg, true, &right, arg, type); + } + + ret = 0; +out: + if (ret) + trace_printk("ret %d\n", ret); + return ret; +} + +/* + * Any given allocation or free of a file data extent can involve both + * insertion and deletion of both file extent and free extent items. To + * make these atomic we record all the insertions and deletions that are + * performed. We first dirty the deletions, then insert, then delete. + * This lets us always safely unwind on failure. + */ +static int apply_changes(struct super_block *sb, struct extent_change *changes) +{ + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf key; + struct extent_change *chg; + int ret; + int err; + + for (chg = changes; chg->type; chg++) { + if (chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_dirty(sb, &key); + if (ret) + goto out; + } + + for (chg = changes; chg->type; chg++) { + if (!chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_create(sb, &key, NULL); + if (ret) { + while ((--chg) >= changes) { + if (!chg->ins) + continue; + init_extent_key(&key, key_bytes, &chg->ext, + chg->arg, chg->type); + err = scoutfs_item_delete(sb, &key); + BUG_ON(err); + } + goto out; + } + } + + for (chg = changes; chg->type; chg++) { + if (chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_delete(sb, &key); + BUG_ON(ret); + } + +out: + return ret; +} + +int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, + u64 len, bool offline) +{ + BUG(); /* NYI */ +} + +/* + * These cheesy cursors are only meant to encourage nice IO patterns for + * concurrent tasks either streaming large file writes or creating lots + * of small files. It will do very poorly in many other situations. To + * do better we'd need to go further down the road to delalloc and take + * more surrounding context into account. 
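+ * + * Cursors live in a hash table keyed by the calling task and pid and are + * recycled from the tail of an LRU when a task shows up without one.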
+ */ +static struct task_cursor *get_cursor(struct data_info *datinf) +{ + struct task_cursor *curs; + struct cursor_id id = { + .task = current, + .pid = current->pid, + }; + + curs = rhashtable_lookup(&datinf->cursors, &id); + if (!curs) { + curs = list_last_entry(&datinf->cursor_lru, + struct task_cursor, list_head); + trace_printk("resetting curs %p was task %p pid %u\n", + curs, curs->id.task, curs->id.pid); + rhashtable_remove(&datinf->cursors, &curs->hash_head, GFP_NOFS); + curs->id = id; + rhashtable_insert(&datinf->cursors, &curs->hash_head, GFP_NOFS); + curs->blkno = 0; + curs->blocks = 0; + } + + list_move(&curs->list_head, &datinf->cursor_lru); + + return curs; +} + +static int bulk_alloc(struct super_block *sb) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct extent_change changes[MAX_CHANGES]; + struct native_extent ext; + u64 *segnos = NULL; + int ret; + int i; + + segnos = scoutfs_net_bulk_alloc(sb); + if (IS_ERR(segnos)) { + ret = PTR_ERR(segnos); + goto out; + } + + for (i = 0; segnos[i]; i++) { + memset(changes, 0, sizeof(changes)); + + /* merge or set this one */ + if (i > 0 && (segnos[i] == segnos[i - 1] + 1)) { + ext.blocks += SCOUTFS_SEGMENT_BLOCKS; + trace_printk("merged segno [%u] %llu blocks %llu\n", + i, segnos[i], ext.blocks); + } else { + ext.blkno = segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT; + ext.blocks = SCOUTFS_SEGMENT_BLOCKS; + trace_printk("set extent segno [%u] %llu blkno %llu\n", + i, segnos[i], ext.blkno); + } + + /* don't write if we merge with the next one */ + if ((segnos[i] + 1) == segnos[i + 1]) + continue; + + trace_printk("inserting extent [%u] blkno %llu blocks %llu\n", + i, ext.blkno, ext.blocks); + + ext.blk_off = ext.blkno; + ret = record_insert_changes(sb, changes, &ext, sbi->node_id, + SCOUTFS_FREE_EXTENT_BLKNO_KEY) ?: + apply_changes(sb, changes); + /* XXX error here leaks segnos */ + if (ret) + break; + } + +out: + if (!IS_ERR_OR_NULL(segnos)) + kfree(segnos); + + return ret; +} + +/* + * Allocate a single block for the logical block offset in the file. + * + * We try to merge single block allocations into large extents by using + * per-task cursors. Each cursor tracks a block region that should be + * searched for free extents. If we don't have a cursor, or we find + * free space outside of our cursor, then we look for the next large + * free extent. 
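+ * + * If the search comes up empty we fall back in stages: drop the cursor + * and search for the next large free extent, wrap that large search once, + * ask the server for more segments with a bulk allocation, and finally + * accept any free block at all before returning -ENOSPC.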
+ */ +static int allocate_block(struct inode *inode, sector_t iblock, u64 *blkno) +{ + struct super_block *sb = inode->i_sb; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + DECLARE_DATA_INFO(sb, datinf); + struct extent_change changes[MAX_CHANGES] = {{{0,}}}; + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent last_ext; + struct native_extent found; + struct native_extent ext; + struct task_cursor *curs; + bool alloced = false; + u8 type; + int ret; + + memset(&last_ext, ~0, sizeof(last_ext)); + + down_write(&datinf->alloc_rwsem); + + curs = get_cursor(datinf); + + /* start from the cursor or look for the next large extent */ +reset_cursor: + if (curs->blocks) { + ext.blkno = curs->blkno; + ext.blocks = 0; + type = SCOUTFS_FREE_EXTENT_BLKNO_KEY; + } else { + ext.blkno = datinf->next_large_blkno; + ext.blocks = LARGE_EXTENT_BLOCKS; + type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY; + } + +retry: + trace_printk("searching %llu,%llu curs %p task %p pid %u %llu,%llu\n", + ext.blkno, ext.blocks, curs, curs->id.task, curs->id.pid, + curs->blkno, curs->blocks); + + ext.blk_off = ext.blkno; + init_extent_key(&key, key_bytes, &ext, sbi->node_id, type); + init_extent_key(&last, last_bytes, &last_ext, sbi->node_id, type); + + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + /* if the cursor's empty fall back to next large */ + if (ext.blkno && ext.blocks == 0) { + curs->blkno = 0; + curs->blocks = 0; + goto reset_cursor; + } + + /* wrap the search for large extents */ + if (ext.blkno > LARGE_EXTENT_BLOCKS && ext.blocks) { + datinf->next_large_blkno = LARGE_EXTENT_BLOCKS; + ext.blkno = datinf->next_large_blkno; + goto retry; + } + + /* ask the server for more extents */ + if (ext.blocks && !alloced) { + ret = bulk_alloc(sb); + if (ret < 0) + goto out; + alloced = true; + goto retry; + } + + /* finally look for any free block at all */ + if (ext.blocks) { + ext.blkno = 0; + ext.blocks = 0; + type = SCOUTFS_FREE_EXTENT_BLKNO_KEY; + goto retry; + } + + /* after all that return -ENOSPC */ + ret = -ENOSPC; + } + goto out; + } + + load_extent(&found, &key); + trace_printk("found %llu,%llu\n", found.blkno, found.blocks); + + /* look for a new large extent if found is outside cursor */ + if (curs->blocks && + (found.blkno + found.blocks <= curs->blkno || + found.blkno >= curs->blkno + curs->blocks)) { + curs->blkno = 0; + curs->blocks = 0; + goto reset_cursor; + } + + /* + * Set the cursor if: + * - we didn't already have one + * - it's large enough for a large extent with alignment padding + * - the sufficiently large free region is past next large + */ + if (!curs->blocks && + found.blocks >= (2 * LARGE_EXTENT_BLOCKS) && + (found.blkno + found.blocks - (2 * LARGE_EXTENT_BLOCKS) >= + datinf->next_large_blkno)) { + + curs->blkno = ALIGN(max(found.blkno, datinf->next_large_blkno), + LARGE_EXTENT_BLOCKS); + curs->blocks = LARGE_EXTENT_BLOCKS; + found.blkno = curs->blkno; + found.blocks = curs->blocks; + + datinf->next_large_blkno = curs->blkno + LARGE_EXTENT_BLOCKS; + } + + trace_printk("using %llu,%llu curs %llu,%llu\n", + found.blkno, found.blocks, curs->blkno, curs->blocks); + + *blkno = found.blkno; + ext.blk_off = iblock; + ext.blkno = found.blkno; + ext.blocks = 1; + ret = record_insert_changes(sb, changes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + if (ret < 0) + goto out; + + ext.blk_off = ext.blkno; + ret = record_remove_changes(sb, changes, &ext, 
sbi->node_id, + SCOUTFS_FREE_EXTENT_BLKNO_KEY) ?: + apply_changes(sb, changes); + + /* advance cursor if we're using it */ + if (ret == 0 && curs->blocks) { + if (--curs->blocks == 0) + curs->blkno = 0; + else + curs->blkno++; + } + +out: + up_write(&datinf->alloc_rwsem); + return ret; +} + +static int scoutfs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct super_block *sb = inode->i_sb; + DECLARE_DATA_INFO(sb, datinf); + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent ext; + u64 blocks; + u64 blkno; + u64 off; + int ret; + + bh->b_blocknr = 0; + bh->b_size = 0; + blocks = 0; + + ext.blk_off = iblock; + ext.blocks = 1; + ext.blkno = 0; + init_extent_key(&key, key_bytes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + + ext.blk_off = ~0ULL; + ext.blkno = ~0ULL; + ext.blocks = ~0ULL; + init_extent_key(&last, last_bytes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + + /* + * XXX think about how far this next can go, given locking and + * item consistency. + */ + down_read(&datinf->alloc_rwsem); + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + up_read(&datinf->alloc_rwsem); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + else + goto out; + } else { + load_extent(&ext, &key); + trace_printk("found blk_off %llu blkno %llu blocks %llu\n", + ext.blk_off, ext.blkno, ext.blocks); + if (iblock >= ext.blk_off && + iblock < (ext.blk_off + ext.blocks)) { + off = iblock - ext.blk_off; + blkno = ext.blkno + off; + blocks = ext.blocks - off; + } + } + + if (blocks == 0 && create) { + ret = allocate_block(inode, iblock, &blkno); + if (ret) + goto out; + + blocks = 1; + } + + if (blocks) { + map_bh(bh, inode->i_sb, blkno); + bh->b_size = min_t(u64, SIZE_MAX, + blocks << SCOUTFS_BLOCK_SHIFT); + } + +out: + trace_printk("ino %llu iblock %llu create %d ret %d bnr %llu size %zu\n", + scoutfs_ino(inode), (u64)iblock, create, ret, + (u64)bh->b_blocknr, bh->b_size); + + return ret; +} + +static int scoutfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, scoutfs_get_block); +} + +static int scoutfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block); +} + +static int scoutfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, scoutfs_get_block, wbc); +} + +static int scoutfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, scoutfs_get_block); +} + static int scoutfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -382,158 +876,56 @@ static int scoutfs_write_begin(struct file *file, { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - pgoff_t index = pos >> PAGE_SHIFT; - loff_t size = i_size_read(inode); - struct page *page; int ret; - trace_printk("ino %llu pos %llu len %u flags %x\n", - scoutfs_ino(inode), (u64)pos, len, flags); - scoutfs_inc_counter(sb, data_write_begin); + trace_printk("ino %llu pos %llu len %u\n", + scoutfs_ino(inode), (u64)pos, len); ret = scoutfs_hold_trans(sb); if (ret) - return ret; + goto out; /* can't re-enter fs, have trans */ flags |= AOP_FLAG_NOFS; + /* generic write_end updates i_size and calls dirty_inode */ ret = scoutfs_dirty_inode_item(inode); - if (ret) 
- goto out; - -retry: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - trace_printk(PGF"\n", PGA(page)); - - if (!PageUptodate(page) && (pos < size && len < PAGE_CACHE_SIZE)) { - ClearPageError(page); - ret = scoutfs_readpage(file, page); - if (!ret) { - wait_on_page_locked(page); - if (!PageUptodate(page)) - ret = -EIO; - } - page_cache_release(page); - if (ret) - goto out; - - /* let grab_ lock and check for truncated pages */ - goto retry; - } - - *pagep = page; - ret = 0; -out: + if (ret == 0) + ret = block_write_begin(mapping, pos, len, flags, pagep, + scoutfs_get_block); if (ret) scoutfs_release_trans(sb); - - trace_printk("ret %d\n", ret); +out: return ret; } -/* - * Finish modification of a page cache page. - * - * write_begin has held the transaction and dirtied the inode. We - * create items for each dirty block whose value references the page - * contents that will be written. - * - * We Modify the dirty item and its dependent metadata items while - * holding the transaction so that we never get missing data. - * - * XXX - * - detect no change with copied == 0? - * - only iterate over written blocks, not the whole page? - * - make sure page granular locking and concurrent extending writes works - * - error handling needs work, truncate partial writes on failure? - */ static int scoutfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - loff_t old_size = i_size_read(inode); - bool update_inode = false; - loff_t new_size; - unsigned start; - loff_t loff; - u64 block; int ret; - trace_printk("ino %llu "PGF" pos %llu len %u copied %d\n", - scoutfs_ino(inode), PGA(page), (u64)pos, len, copied); - scoutfs_inc_counter(sb, data_write_end); + trace_printk("ino %llu pgind %lu pos %llu len %u copied %d\n", + scoutfs_ino(inode), page->index, (u64)pos, len, copied); - /* zero any unwritten portions of a new page around the write */ - if (!PageUptodate(page)) { - if (copied != PAGE_CACHE_SIZE) { - start = pos & ~PAGE_CACHE_MASK; - zero_user_segments(page, 0, start, - start + copied, PAGE_CACHE_SIZE); - } - SetPageUptodate(page); - } - - new_size = pos + copied; - - for_each_page_block(page, start, loff, block, key, dkey, val) { - - /* only put data inside i_size in items */ - /* XXX ugh, kvecs are still clumsy :) */ - if (loff + SCOUTFS_BLOCK_SIZE > new_size) - val[0].iov_len = new_size - loff; - - ret = scoutfs_item_create_ephemeral(sb, &key, val); - if (ret) - goto out; - } - - /* update i_size if we extended */ - if (new_size > inode->i_size) { - i_size_write(inode, new_size); - update_inode = true; - } - - if (old_size < pos) - pagecache_isize_extended(inode, old_size, pos); - - if (copied) { + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret > 0) { scoutfs_inode_inc_data_version(inode); - update_inode = true; - } - - if (update_inode) + /* XXX kind of a big hammer, inode life cycle needs work */ scoutfs_update_inode_item(inode); - - flush_dcache_page(page); - set_page_dirty(page); - SetPagePrivate(page); /* call invalidatepage */ - - ret = copied; -out: - unlock_page(page); + scoutfs_inode_queue_writeback(inode); + } scoutfs_release_trans(sb); - - /* XXX error handling needs work */ - 
WARN_ON_ONCE(ret < 0); return ret; } const struct address_space_operations scoutfs_file_aops = { .readpage = scoutfs_readpage, + .readpages = scoutfs_readpages, .writepage = scoutfs_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, - .invalidatepage = scoutfs_invalidatepage, + .writepages = scoutfs_writepages, .write_begin = scoutfs_write_begin, .write_end = scoutfs_write_end, }; @@ -545,23 +937,75 @@ const struct file_operations scoutfs_file_fops = { .aio_write = generic_file_aio_write, .unlocked_ioctl = scoutfs_ioctl, .fsync = scoutfs_file_fsync, - .llseek = generic_file_llseek, }; +static int derpy_global_mutex_is_held(void) +{ + return 1; +} + +static struct rhashtable_params cursor_hash_params = { + .key_len = member_sizeof(struct task_cursor, id), + .key_offset = offsetof(struct task_cursor, id), + .head_offset = offsetof(struct task_cursor, hash_head), + .hashfn = arch_fast_hash, + .grow_decision = rht_grow_above_75, + .shrink_decision = rht_shrink_below_30, + + .mutex_is_held = derpy_global_mutex_is_held, +}; + +static void destroy_cursors(struct data_info *datinf) +{ + struct task_cursor *curs; + struct task_cursor *pos; + + list_for_each_entry_safe(curs, pos, &datinf->cursor_lru, list_head) { + list_del_init(&curs->list_head); + kfree(curs); + } + rhashtable_destroy(&datinf->cursors); +} + int scoutfs_data_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct data_info *datinf; - - /* page block iteration doesn't understand multiple pages per block */ - BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SIZE); + struct task_cursor *curs; + int ret; + int i; datinf = kzalloc(sizeof(struct data_info), GFP_KERNEL); if (!datinf) return -ENOMEM; - sbi->data_info = datinf; - init_llist_head(&datinf->writeback_pages); + init_rwsem(&datinf->alloc_rwsem); + INIT_LIST_HEAD(&datinf->cursor_lru); + /* always search for large aligned extents */ + datinf->next_large_blkno = LARGE_EXTENT_BLOCKS; + + ret = rhashtable_init(&datinf->cursors, &cursor_hash_params); + if (ret) { + kfree(datinf); + return -ENOMEM; + } + + /* just allocate all of these up front */ + for (i = 0; i < NR_CURSORS; i++) { + curs = kzalloc(sizeof(struct task_cursor), GFP_KERNEL); + if (!curs) { + destroy_cursors(datinf); + kfree(datinf); + return -ENOMEM; + } + + curs->id.pid = i; + rhashtable_insert(&datinf->cursors, &curs->hash_head, + GFP_KERNEL); + list_add(&curs->list_head, &datinf->cursor_lru); + } + + sbi->data_info = datinf; return 0; } @@ -572,7 +1016,7 @@ void scoutfs_data_destroy(struct super_block *sb) struct data_info *datinf = sbi->data_info; if (datinf) { - WARN_ON_ONCE(!llist_empty(&datinf->writeback_pages)); + destroy_cursors(datinf); kfree(datinf); } } diff --git a/kmod/src/data.h b/kmod/src/data.h index 189b2cba..1319d100 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -6,7 +6,6 @@ extern const struct file_operations scoutfs_file_fops; int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, u64 len, bool offline); -void scoutfs_data_end_writeback(struct super_block *sb, int err); int scoutfs_data_setup(struct super_block *sb); void scoutfs_data_destroy(struct super_block *sb); diff --git a/kmod/src/format.h b/kmod/src/format.h index 5d3c184b..a58d12b6 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -156,9 +156,10 @@ struct scoutfs_segment_block { #define SCOUTFS_READDIR_KEY 6 #define SCOUTFS_LINK_BACKREF_KEY 7 #define SCOUTFS_SYMLINK_KEY 8 -#define SCOUTFS_EXTENT_KEY 9 +#define SCOUTFS_FILE_EXTENT_KEY 9 #define SCOUTFS_ORPHAN_KEY 10 -#define 
SCOUTFS_DATA_KEY 11 +#define SCOUTFS_FREE_EXTENT_BLKNO_KEY 11 +#define SCOUTFS_FREE_EXTENT_BLOCKS_KEY 12 /* not found in the fs */ #define SCOUTFS_MAX_UNUSED_KEY 253 #define SCOUTFS_NET_ADDR_KEY 254 @@ -198,11 +199,28 @@ struct scoutfs_orphan_key { __be64 ino; } __packed; -/* value is data payload bytes */ -struct scoutfs_data_key { +/* no value */ +struct scoutfs_file_extent_key { __u8 type; __be64 ino; - __be64 block; + __be64 last_blk_off; + __be64 last_blkno; + __be64 blocks; +} __packed; + +/* no value */ +struct scoutfs_free_extent_blkno_key { + __u8 type; + __be64 node_id; + __be64 last_blkno; + __be64 blocks; +} __packed; + +struct scoutfs_free_extent_blocks_key { + __u8 type; + __be64 node_id; + __be64 blocks; + __be64 last_blkno; } __packed; /* value is each item's part of the full xattr value for the off/len */ @@ -384,6 +402,11 @@ struct scoutfs_net_manifest_entries { struct scoutfs_manifest_entry ments[0]; } __packed; +struct scoutfs_net_segnos { + __le16 nr; + __le64 segnos[0]; +} __packed; + enum { /* sends and receives a struct scoutfs_timeval */ SCOUTFS_NET_TRADE_TIME = 0, @@ -391,6 +414,7 @@ enum { SCOUTFS_NET_MANIFEST_RANGE_ENTRIES, SCOUTFS_NET_ALLOC_SEGNO, SCOUTFS_NET_RECORD_SEGMENT, + SCOUTFS_NET_BULK_ALLOC, SCOUTFS_NET_UNKNOWN, }; diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 7f61f552..71a98d33 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -47,6 +47,16 @@ struct free_ino_pool { bool in_flight; }; +struct inode_sb_info { + struct free_ino_pool pool; + + spinlock_t writeback_lock; + struct rb_root writeback_inodes; +}; + +#define DECLARE_INODE_SB_INFO(sb, name) \ + struct inode_sb_info *name = SCOUTFS_SB(sb)->inode_sb_info + static struct kmem_cache *scoutfs_inode_cachep; /* @@ -61,6 +71,7 @@ static void scoutfs_inode_ctor(void *obj) seqcount_init(&ci->seqcount); ci->staging = false; init_rwsem(&ci->xattr_rwsem); + RB_CLEAR_NODE(&ci->writeback_node); inode_init_once(&ci->inode); } @@ -84,8 +95,48 @@ static void scoutfs_i_callback(struct rcu_head *head) kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode)); } +static void insert_writeback_inode(struct inode_sb_info *inf, + struct scoutfs_inode_info *ins) +{ + struct rb_root *root = &inf->writeback_inodes; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct scoutfs_inode_info *si; + + while (*node) { + parent = *node; + si = container_of(*node, struct scoutfs_inode_info, + writeback_node); + + if (ins->ino < si->ino) + node = &(*node)->rb_left; + else if (ins->ino > si->ino) + node = &(*node)->rb_right; + else + BUG(); + } + + rb_link_node(&ins->writeback_node, parent, node); + rb_insert_color(&ins->writeback_node, root); +} + +static void remove_writeback_inode(struct inode_sb_info *inf, + struct scoutfs_inode_info *si) +{ + if (!RB_EMPTY_NODE(&si->writeback_node)) { + rb_erase(&si->writeback_node, &inf->writeback_inodes); + RB_CLEAR_NODE(&si->writeback_node); + } +} + void scoutfs_destroy_inode(struct inode *inode) { + DECLARE_INODE_SB_INFO(inode->i_sb, inf); + + spin_lock(&inf->writeback_lock); + remove_writeback_inode(inf, SCOUTFS_I(inode)); + spin_unlock(&inf->writeback_lock); + call_rcu(&inode->i_rcu, scoutfs_i_callback); } @@ -393,7 +444,7 @@ u64 scoutfs_last_ino(struct super_block *sb) */ void scoutfs_inode_fill_pool(struct super_block *sb, u64 ino, u64 nr) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool; trace_printk("filling ino %llu nr %llu\n", ino, nr); @@ -427,7 +478,7 @@ 
static bool pool_in_flight(struct free_ino_pool *pool) */ static int alloc_ino(struct super_block *sb, u64 *ino) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool; bool request; int ret; @@ -733,28 +784,121 @@ int scoutfs_orphan_inode(struct inode *inode) return ret; } +/* + * Track an inode that could have dirty pages. Used to kick off writeback + * on all dirty pages during transaction commit without tying ourselves in + * knots trying to call through the high level vfs sync methods. + */ +void scoutfs_inode_queue_writeback(struct inode *inode) +{ + DECLARE_INODE_SB_INFO(inode->i_sb, inf); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + spin_lock(&inf->writeback_lock); + if (RB_EMPTY_NODE(&si->writeback_node)) + insert_writeback_inode(inf, si); + spin_unlock(&inf->writeback_lock); +} + +/* + * Walk our dirty inodes in ino order and either start dirty page + * writeback or wait for writeback to complete. + * + * This is called by transaction commiting so other writers are + * excluded. We're still very careful to iterate over the tree while it + * and the inodes could be changing. + * + * Because writes are excluded we know that there's no remaining dirty + * pages once waiting returns successfully. + * + * XXX not sure what to do about retrying io errors. + */ +int scoutfs_inode_walk_writeback(struct super_block *sb, bool write) +{ + DECLARE_INODE_SB_INFO(sb, inf); + struct scoutfs_inode_info *si; + struct rb_node *node; + struct inode *inode; + struct inode *defer_iput = NULL; + int ret; + + spin_lock(&inf->writeback_lock); + + node = rb_first(&inf->writeback_inodes); + while (node) { + si = container_of(node, struct scoutfs_inode_info, + writeback_node); + node = rb_next(node); + inode = igrab(&si->inode); + if (!inode) + continue; + + spin_unlock(&inf->writeback_lock); + + if (defer_iput) { + iput(defer_iput); + defer_iput = NULL; + } + + if (write) + ret = filemap_fdatawrite(inode->i_mapping); + else + ret = filemap_fdatawait(inode->i_mapping); + trace_printk("ino %llu write %d ret %d\n", + scoutfs_ino(inode), write, ret); + if (ret) { + iput(inode); + goto out; + } + + spin_lock(&inf->writeback_lock); + + if (WARN_ON_ONCE(RB_EMPTY_NODE(&si->writeback_node))) + node = rb_first(&inf->writeback_inodes); + else + node = rb_next(&si->writeback_node); + + if (!write) + remove_writeback_inode(inf, si); + + /* avoid iput->destroy lock deadlock */ + defer_iput = inode; + } + + spin_unlock(&inf->writeback_lock); +out: + if (defer_iput) + iput(defer_iput); + return ret; +} + int scoutfs_inode_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct free_ino_pool *pool; + struct inode_sb_info *inf; - pool = kzalloc(sizeof(struct free_ino_pool), GFP_KERNEL); - if (!pool) + inf = kzalloc(sizeof(struct inode_sb_info), GFP_KERNEL); + if (!inf) return -ENOMEM; + pool = &inf->pool; init_waitqueue_head(&pool->waitq); spin_lock_init(&pool->lock); - sbi->free_ino_pool = pool; + spin_lock_init(&inf->writeback_lock); + inf->writeback_inodes = RB_ROOT; + + sbi->inode_sb_info = inf; return 0; } void scoutfs_inode_destroy(struct super_block *sb) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info; - kfree(pool); + kfree(inf); } void scoutfs_inode_exit(void) diff --git a/kmod/src/inode.h b/kmod/src/inode.h index da24e9af..59da8f7a 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -13,6 +13,7 @@ struct 
scoutfs_inode_info { seqcount_t seqcount; bool staging; /* holder of i_mutex is staging */ struct rw_semaphore xattr_rwsem; + struct rb_node writeback_node; struct inode inode; }; @@ -48,6 +49,9 @@ u64 scoutfs_inode_get_data_version(struct inode *inode); int scoutfs_scan_orphans(struct super_block *sb); +void scoutfs_inode_queue_writeback(struct inode *inode); +int scoutfs_inode_walk_writeback(struct super_block *sb, bool write); + u64 scoutfs_last_ino(struct super_block *sb); void scoutfs_inode_exit(void); diff --git a/kmod/src/net.c b/kmod/src/net.c index 11e5fe30..33d2da3c 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "format.h" #include "net.h" @@ -363,6 +364,61 @@ static struct send_buf *alloc_sbuf(unsigned data_len) return sbuf; } +/* XXX I dunno, totally made up */ +#define BULK_COUNT 32 + +static struct send_buf *process_bulk_alloc(struct super_block *sb,void *req, + int req_len) +{ + DECLARE_NET_INFO(sb, nti); + struct scoutfs_net_segnos *ns; + struct commit_waiter cw; + struct send_buf *sbuf; + u64 segno; + int ret; + int i; + + if (req_len != 0) + return ERR_PTR(-EINVAL); + + sbuf = alloc_sbuf(offsetof(struct scoutfs_net_segnos, + segnos[BULK_COUNT])); + if (!sbuf) + return ERR_PTR(-ENOMEM); + + ns = (void *)sbuf->nh->data; + ns->nr = cpu_to_le16(BULK_COUNT); + + down_read(&nti->ring_commit_rwsem); + + for (i = 0; i < BULK_COUNT; i++) { + ret = scoutfs_alloc_segno(sb, &segno); + if (ret) { + while (i-- > 0) + scoutfs_alloc_free(sb, + le64_to_cpu(ns->segnos[i])); + break; + } + + ns->segnos[i] = cpu_to_le64(segno); + } + + + if (ret == 0) + queue_commit_work(nti, &cw); + up_read(&nti->ring_commit_rwsem); + + if (ret == 0) + ret = wait_for_commit(&cw); + + if (ret) + sbuf->nh->status = SCOUTFS_NET_STATUS_ERROR; + else + sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS; + + return sbuf; +} + static struct send_buf *process_record_segment(struct super_block *sb, void *req, int req_len) { @@ -616,6 +672,7 @@ static proc_func_t type_proc_func(u8 type) process_manifest_range_entries, [SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno, [SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment, + [SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc, }; return type < SCOUTFS_NET_UNKNOWN ? funcs[type] : NULL; @@ -1100,6 +1157,113 @@ static int add_send_buf(struct super_block *sb, int type, void *data, return 0; } +struct bulk_alloc_args { + struct completion comp; + u64 *segnos; + int ret; +}; + +static int sort_cmp_u64s(const void *A, const void *B) +{ + const u64 *a = A; + const u64 *b = B; + + return *a < *b ? -1 : *a > *b ? 1 : 0; +} + +static void sort_swap_u64s(void *A, void *B, int size) +{ + u64 *a = A; + u64 *b = B; + + swap(*a, *b); +} + +static int bulk_alloc_reply(struct super_block *sb, void *reply, int ret, + void *arg) +{ + struct bulk_alloc_args *args = arg; + struct scoutfs_net_segnos *ns = reply; + u16 nr; + int i; + + if (ret < sizeof(struct scoutfs_net_segnos) || + ret != offsetof(struct scoutfs_net_segnos, + segnos[le16_to_cpu(ns->nr)])) { + ret = -EINVAL; + goto out; + } + + nr = le16_to_cpu(ns->nr); + + args->segnos = kmalloc((nr + 1) * sizeof(args->segnos[0]), GFP_NOFS); + if (args->segnos == NULL) { + ret = -ENOMEM; /* XXX hmm. 
*/ + goto out; + } + + for (i = 0; i < nr; i++) { + args->segnos[i] = le64_to_cpu(ns->segnos[i]); + + /* make sure they're all non-zero */ + if (args->segnos[i] == 0) { + ret = -EINVAL; + goto out; + } + } + + sort(args->segnos, nr, sizeof(args->segnos[0]), + sort_cmp_u64s, sort_swap_u64s); + + /* make sure they're all unique */ + for (i = 1; i < nr; i++) { + if (args->segnos[i] == args->segnos[i - 1]) { + ret = -EINVAL; + goto out; + } + } + + args->segnos[nr] = 0; + ret = 0; +out: + if (ret && args->segnos) { + kfree(args->segnos); + args->segnos = NULL; + } + args->ret = ret; + complete(&args->comp); + return args->ret; +} + +/* + * Returns a 0-terminated allocated array of segnos, the caller is + * responsible for freeing it. + */ +u64 *scoutfs_net_bulk_alloc(struct super_block *sb) +{ + struct bulk_alloc_args args; + int ret; + + args.segnos = NULL; + init_completion(&args.comp); + + ret = add_send_buf(sb, SCOUTFS_NET_BULK_ALLOC, NULL, 0, + bulk_alloc_reply, &args); + if (ret == 0) { + wait_for_completion(&args.comp); + ret = args.ret; + if (ret == 0 && (args.segnos == NULL || args.segnos[0] == 0)) + ret = -ENOSPC; + } + + if (ret) { + kfree(args.segnos); + args.segnos = ERR_PTR(ret); + } + + return args.segnos; +} + /* * Eventually we're going to have messages that control compaction. * Each client mount would have long-lived work that sends requests diff --git a/kmod/src/net.h b/kmod/src/net.h index d48fc2b7..125bf327 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -13,6 +13,7 @@ int scoutfs_net_manifest_range_entries(struct super_block *sb, int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno); int scoutfs_net_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level); +u64 *scoutfs_net_bulk_alloc(struct super_block *sb); int scoutfs_net_get_compaction(struct super_block *sb, void *curs); int scoutfs_net_finish_compaction(struct super_block *sb, void *curs, diff --git a/kmod/src/super.c b/kmod/src/super.c index 48fb27d2..0fe2ed52 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -204,6 +204,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; + /* + * XXX this is random today for initial testing, but we'll want + * it to be assigned by the server. + */ + get_random_bytes_arch(&sbi->node_id, sizeof(sbi->node_id)); + spin_lock_init(&sbi->next_ino_lock); atomic_set(&sbi->trans_holds, 0); init_waitqueue_head(&sbi->trans_hold_wq); diff --git a/kmod/src/super.h b/kmod/src/super.h index 184c92b8..5b6d5903 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -14,11 +14,13 @@ struct compact_info; struct data_info; struct lock_info; struct net_info; -struct free_ino_pool; +struct inode_sb_info; struct scoutfs_sb_info { struct super_block *sb; + u64 node_id; + struct scoutfs_super_block super; spinlock_t next_ino_lock; @@ -29,7 +31,7 @@ struct scoutfs_sb_info { struct seg_alloc *seg_alloc; struct compact_info *compact_info; struct data_info *data_info; - struct free_ino_pool *free_ino_pool; + struct inode_sb_info *inode_sb_info; atomic_t trans_holds; wait_queue_head_t trans_hold_wq; diff --git a/kmod/src/trans.c b/kmod/src/trans.c index e6247bc0..11941c7e 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -26,6 +26,7 @@ #include "seg.h" #include "counters.h" #include "net.h" +#include "inode.h" #include "scoutfs_trace.h" /* @@ -97,10 +98,12 @@ void scoutfs_trans_write_func(struct work_struct *work) * about leaking segnos nor duplicate manifest entries * on crashes between us and the server. 
*/ - ret = scoutfs_net_alloc_segno(sb, &segno) ?: + ret = scoutfs_inode_walk_writeback(sb, true) ?: + scoutfs_net_alloc_segno(sb, &segno) ?: scoutfs_seg_alloc(sb, segno, &seg) ?: scoutfs_item_dirty_seg(sb, seg) ?: scoutfs_seg_submit_write(sb, seg, &comp) ?: + scoutfs_inode_walk_writeback(sb, false) ?: scoutfs_bio_wait_comp(sb, &comp) ?: scoutfs_net_record_segment(sb, seg, 0); if (ret) @@ -112,9 +115,6 @@ out: /* XXX this all needs serious work for dealing with errors */ WARN_ON_ONCE(ret); - /* must be done before waking waiting trans holders who might dirty */ - scoutfs_data_end_writeback(sb, ret); - spin_lock(&sbi->trans_write_lock); sbi->trans_write_count++; sbi->trans_write_ret = ret;