diff --git a/kmod/src/block.c b/kmod/src/block.c
index cee3b681..08b37570 100644
--- a/kmod/src/block.c
+++ b/kmod/src/block.c
@@ -11,7 +11,6 @@
  * General Public License for more details.
  */
 #include
-#include <linux/buffer_head.h>
 #include
 #include
@@ -23,54 +22,105 @@
 #include "buddy.h"
 
 /*
- * scoutfs has a fixed 4k small block size for metadata blocks.  This
- * lets us consistently use buffer heads without worrying about having a
- * block size greater than the page size.
+ * scoutfs maintains a cache of metadata blocks in a radix tree.  This
+ * gives us blocks bigger than page size and avoids fixing the location
+ * of a logical cached block in one possible position in a larger block
+ * device page cache page.
  *
- * This block interface does the work to cow dirty blocks, track dirty
- * blocks, generate checksums as they're written, only write them in
- * transactions, verify checksums on read, and invalidate and retry
- * reads of stale cached blocks.  (That last bit only has a hint of an
- * implementation.)
+ * This does the work to cow dirty blocks, track dirty blocks, generate
+ * checksums as they're written, only write them in transactions, verify
+ * checksums on read, and invalidate and retry reads of stale cached
+ * blocks.  (That last bit only has a hint of an implementation.)
  *
  * XXX
  *  - tear down dirty blocks left by write errors on unmount
- *  - should invalidate dirty blocks if freed
+ *  - multiple smaller page allocs
+ *  - vmalloc?  vm_map_ram?
+ *  - blocks allocated from per-cpu pages when page size > block size
+ *  - cmwq crc calcs if that makes sense
+ *  - slab of block structs
+ *  - don't verify checksums in end_io context?
+ *  - fall back to multiple single bios per block io if bio alloc fails?
+ *  - fail mount if total_blocks is greater than long radix blkno
 */
 
-struct scoutfs_block;
-
-struct block_bh_private {
-	struct super_block *sb;
-	struct buffer_head *bh;
-	struct rb_node node;
+struct scoutfs_block {
 	struct rw_semaphore rwsem;
-	bool rwsem_class;
+	atomic_t refcount;
+	u64 blkno;
+
+	unsigned long bits;
+
+	struct super_block *sb;
+	struct page *page;
+	void *data;
 };
 
+#define DIRTY_RADIX_TAG 0
+
 enum {
-	BH_ScoutfsVerified = BH_PrivateStart,
+	BLOCK_BIT_UPTODATE = 0,
+	BLOCK_BIT_ERROR,
+	BLOCK_BIT_CLASS_SET,
 };
-BUFFER_FNS(ScoutfsVerified, scoutfs_verified)
 
-static int verify_block_header(struct scoutfs_sb_info *sbi,
-			       struct buffer_head *bh)
+static struct scoutfs_block *alloc_block(struct super_block *sb, u64 blkno)
 {
+	struct scoutfs_block *bl;
+	struct page *page;
+
+	/* we'd need to be just a bit more careful */
+	BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_SIZE);
+
+	bl = kzalloc(sizeof(struct scoutfs_block), GFP_NOFS);
+	if (bl) {
+		/* change _from_contents if allocs not aligned */
+		page = alloc_pages(GFP_NOFS, SCOUTFS_BLOCK_PAGE_ORDER);
+		WARN_ON_ONCE(!page);
+		if (page) {
+			init_rwsem(&bl->rwsem);
+			atomic_set(&bl->refcount, 1);
+			bl->blkno = blkno;
+			bl->sb = sb;
+			bl->page = page;
+			bl->data = page_address(page);
+			trace_printk("allocated bl %p\n", bl);
+		} else {
+			kfree(bl);
+			bl = NULL;
+		}
+	}
+
+	return bl;
+}
+
+void scoutfs_block_put(struct scoutfs_block *bl)
+{
+	if (!IS_ERR_OR_NULL(bl) && atomic_dec_and_test(&bl->refcount)) {
+		trace_printk("freeing bl %p\n", bl);
+		/* count the free before the struct itself is freed */
+		scoutfs_inc_counter(bl->sb, block_mem_free);
+		__free_pages(bl->page, SCOUTFS_BLOCK_PAGE_ORDER);
+		kfree(bl);
+	}
+}
+
+static int verify_block_header(struct super_block *sb, struct scoutfs_block *bl)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
-	struct scoutfs_block_header *hdr = (void *)bh->b_data;
+	struct scoutfs_block_header *hdr = bl->data;
 	u32 crc = scoutfs_crc_block(hdr);
 	int ret = -EIO;
 
 	if (le32_to_cpu(hdr->crc) != crc) {
-		printk("blkno %llu hdr crc %x != calculated %x\n",
-		       (u64)bh->b_blocknr, le32_to_cpu(hdr->crc), crc);
+		printk("blkno %llu hdr crc %x != calculated %x\n", bl->blkno,
+		       le32_to_cpu(hdr->crc), crc);
 	} else if (super->hdr.fsid && hdr->fsid != super->hdr.fsid) {
-		printk("blkno %llu fsid %llx != super fsid %llx\n",
-		       (u64)bh->b_blocknr, le64_to_cpu(hdr->fsid),
-		       le64_to_cpu(super->hdr.fsid));
+		printk("blkno %llu fsid %llx != super fsid %llx\n", bl->blkno,
+		       le64_to_cpu(hdr->fsid), le64_to_cpu(super->hdr.fsid));
-	} else if (le64_to_cpu(hdr->blkno) != bh->b_blocknr) {
-		printk("blkno %llu invalid hdr blkno %llx\n",
-		       (u64)bh->b_blocknr, le64_to_cpu(hdr->blkno));
+	} else if (le64_to_cpu(hdr->blkno) != bl->blkno) {
+		printk("blkno %llu invalid hdr blkno %llx\n", bl->blkno,
+		       le64_to_cpu(hdr->blkno));
 	} else {
 		ret = 0;
 	}
@@ -78,143 +128,161 @@ static int verify_block_header(struct scoutfs_sb_info *sbi,
 	return ret;
 }
 
-static struct buffer_head *bh_from_bhp_node(struct rb_node *node)
+static void block_read_end_io(struct bio *bio, int err)
 {
-	struct block_bh_private *bhp;
+	struct scoutfs_block *bl = bio->bi_private;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
 
-	bhp = container_of(node, struct block_bh_private, node);
-	return bhp->bh;
-}
+	if (!err && !verify_block_header(bl->sb, bl))
+		set_bit(BLOCK_BIT_UPTODATE, &bl->bits);
+	else
+		set_bit(BLOCK_BIT_ERROR, &bl->bits);
 
-static struct scoutfs_sb_info *sbi_from_bh(struct buffer_head *bh)
-{
-	struct block_bh_private *bhp = bh->b_private;
+	/*
+	 * uncontended spin_lock in wake_up and unconditional smp_mb to
+	 * make waitqueue_active safe are about the same cost, so we
+	 * prefer the obviously safe choice.
+	 */
+	wake_up(&sbi->block_wq);
 
-	return SCOUTFS_SB(bhp->sb);
-}
-
-static void insert_bhp_rb(struct rb_root *root, struct buffer_head *ins)
-{
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct block_bh_private *bhp;
-	struct buffer_head *bh;
-
-	while (*node) {
-		parent = *node;
-		bh = bh_from_bhp_node(*node);
-
-		if (ins->b_blocknr < bh->b_blocknr)
-			node = &(*node)->rb_left;
-		else
-			node = &(*node)->rb_right;
-	}
-
-	bhp = ins->b_private;
-	rb_link_node(&bhp->node, parent, node);
-	rb_insert_color(&bhp->node, root);
+	scoutfs_block_put(bl);
+	bio_put(bio);
 }
 
 /*
- * Track a dirty block by allocating private data and inserting it into
- * the dirty rbtree in the super block.
- *
- * Callers are in transactions that prevent metadata writeback so blocks
- * won't be written and cleaned while we're trying to dirty them.  We
- * serialize racing to add dirty tracking to the same block in case the
- * caller didn't.
- *
- * Presence in the dirty tree holds a bh ref.
+ * Once a transaction block is persistent it's fine to drop the dirty
+ * tag.  It's been checksummed so it can be read in again.  Its seq
+ * will be in the current transaction so it'll simply be dirtied and
+ * checksummed and written out again.
 */
-static int insert_bhp(struct super_block *sb, struct buffer_head *bh)
+static void block_write_end_io(struct bio *bio, int err)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct block_bh_private *bhp;
+	struct scoutfs_block *bl = bio->bi_private;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
 	unsigned long flags;
-	int ret = 0;
 
-	if (bh->b_private)
-		return 0;
-
-	lock_buffer(bh);
-	if (bh->b_private)
-		goto out;
-
-	bhp = kmalloc(sizeof(*bhp), GFP_NOFS);
-	if (!bhp) {
-		ret = -ENOMEM;
-		goto out;
+	if (!err) {
+		spin_lock_irqsave(&sbi->block_lock, flags);
+		radix_tree_tag_clear(&sbi->block_radix,
+				     bl->blkno, DIRTY_RADIX_TAG);
+		spin_unlock_irqrestore(&sbi->block_lock, flags);
 	}
 
-	bhp->sb = sb;
-	bhp->bh = bh;
-	get_bh(bh);
-	bh->b_private = bhp;
-	/* lockdep class can be set by callers that use the lock */
-	init_rwsem(&bhp->rwsem);
-	bhp->rwsem_class = false;
+	/* not too worried about racing ints */
+	if (err && !sbi->block_write_err)
+		sbi->block_write_err = err;
 
-	spin_lock_irqsave(&sbi->block_lock, flags);
-	insert_bhp_rb(&sbi->block_dirty_tree, bh);
-	spin_unlock_irqrestore(&sbi->block_lock, flags);
+	if (atomic_dec_and_test(&sbi->block_writes))
+		wake_up(&sbi->block_wq);
+
+	scoutfs_block_put(bl);
+	bio_put(bio);
-	trace_printk("blkno %llu bh %p\n", (u64)bh->b_blocknr, bh);
-out:
-	unlock_buffer(bh);
-	return ret;
 }
 
-static void erase_bhp(struct buffer_head *bh)
+static int block_submit_bio(struct scoutfs_block *bl, int rw)
 {
-	struct block_bh_private *bhp = bh->b_private;
-	struct scoutfs_sb_info *sbi = sbi_from_bh(bh);
-	unsigned long flags;
+	struct super_block *sb = bl->sb;
+	struct bio *bio;
+	int ret;
 
-	spin_lock_irqsave(&sbi->block_lock, flags);
-	rb_erase(&bhp->node, &sbi->block_dirty_tree);
-	spin_unlock_irqrestore(&sbi->block_lock, flags);
+	bio = bio_alloc(GFP_NOFS, SCOUTFS_PAGES_PER_BLOCK);
+	if (WARN_ON_ONCE(!bio))
+		return -ENOMEM;
 
-	put_bh(bh);
-	kfree(bhp);
-	bh->b_private = NULL;
+	bio->bi_sector = bl->blkno << (SCOUTFS_BLOCK_SHIFT - 9);
+	bio->bi_bdev = sb->s_bdev;
+	if (rw & WRITE)
+		bio->bi_end_io = block_write_end_io;
+	else
+		bio->bi_end_io = block_read_end_io;
+	bio->bi_private = bl;
 
-	trace_printk("blkno %llu bh %p\n", (u64)bh->b_blocknr, bh);
+	ret = bio_add_page(bio, bl->page, SCOUTFS_BLOCK_SIZE, 0);
+	if (WARN_ON_ONCE(ret != SCOUTFS_BLOCK_SIZE)) {
+		bio_put(bio);
+		return -ENOMEM;
+	}
+
+	atomic_inc(&bl->refcount);
+	submit_bio(rw, bio);
+
+	return 0;
 }
 
 /*
  * Read an existing block from the device and verify its metadata header.
- * The buffer head is returned unlocked and uptodate.
 */
 struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct buffer_head *bh;
+	struct scoutfs_block *found;
+	struct scoutfs_block *bl;
+	unsigned long flags;
 	int ret;
 
-	bh = sb_bread(sb, blkno);
-	if (!bh) {
-		bh = ERR_PTR(-EIO);
+	/* find an existing block, dropping if it's errored */
+	spin_lock_irqsave(&sbi->block_lock, flags);
+
+	bl = radix_tree_lookup(&sbi->block_radix, blkno);
+	if (bl) {
+		if (test_bit(BLOCK_BIT_ERROR, &bl->bits)) {
+			radix_tree_delete(&sbi->block_radix, bl->blkno);
+			scoutfs_block_put(bl);
+			bl = NULL;
+		} else {
+			atomic_inc(&bl->refcount);
+		}
+	}
+	spin_unlock_irqrestore(&sbi->block_lock, flags);
+	if (bl)
+		goto wait;
+
+	/* allocate a new block and try to insert it */
+	bl = alloc_block(sb, blkno);
+	if (!bl) {
+		ret = -EIO;
 		goto out;
 	}
 
-	if (!buffer_scoutfs_verified(bh)) {
-		lock_buffer(bh);
-		if (!buffer_scoutfs_verified(bh)) {
-			ret = verify_block_header(sbi, bh);
-			if (!ret)
-				set_buffer_scoutfs_verified(bh);
-		} else {
-			ret = 0;
-		}
-		unlock_buffer(bh);
-		if (ret < 0) {
-			scoutfs_block_put((void *)bh);
-			bh = ERR_PTR(ret);
-		}
+	ret = radix_tree_preload(GFP_NOFS);
+	if (ret)
+		goto out;
+
+	spin_lock_irqsave(&sbi->block_lock, flags);
+
+	found = radix_tree_lookup(&sbi->block_radix, blkno);
+	if (found) {
+		scoutfs_block_put(bl);
+		bl = found;
+		atomic_inc(&bl->refcount);
+	} else {
+		radix_tree_insert(&sbi->block_radix, blkno, bl);
+		atomic_inc(&bl->refcount);
 	}
+	spin_unlock_irqrestore(&sbi->block_lock, flags);
+	radix_tree_preload_end();
+
+	if (!found) {
+		ret = block_submit_bio(bl, READ_SYNC | REQ_META);
+		if (ret)
+			goto out;
+	}
+
+wait:
+	ret = wait_event_interruptible(sbi->block_wq,
+			test_bit(BLOCK_BIT_UPTODATE, &bl->bits) ||
+			test_bit(BLOCK_BIT_ERROR, &bl->bits));
+	if (ret == 0 && test_bit(BLOCK_BIT_ERROR, &bl->bits))
+		ret = -EIO;
 out:
-	return (void *)bh;
+	if (ret) {
+		scoutfs_block_put(bl);
+		bl = ERR_PTR(ret);
+	}
+
+	return bl;
 }
 
 /*
@@ -226,7 +294,8 @@ out:
  * many times the caller assumes that we've hit persistent corruption
  * and returns an error.
  *
- * XXX how does this race with
+ * XXX:
+ *  - actually implement this
  *  - reads that span transactions?
  *  - writers creating a new dirty block?
  */
@@ -240,7 +309,6 @@ struct scoutfs_block *scoutfs_block_read_ref(struct super_block *sb,
 	if (!IS_ERR(bl)) {
 		hdr = scoutfs_block_data(bl);
 		if (WARN_ON_ONCE(hdr->seq != ref->seq)) {
-			clear_buffer_uptodate(bl);
 			scoutfs_block_put(bl);
 			bl = ERR_PTR(-EAGAIN);
 		}
@@ -250,35 +318,19 @@ struct scoutfs_block *scoutfs_block_read_ref(struct super_block *sb,
 }
 
 /*
- * We stop tracking dirty metadata blocks when their IO succeeds.  This
- * happens in the context of transaction commit which excludes other
- * metadata dirtying paths.
+ * The caller knows that it's not racing with writers.
 */
-static void block_write_end_io(struct buffer_head *bh, int uptodate)
+int scoutfs_block_has_dirty(struct super_block *sb)
 {
-	struct scoutfs_sb_info *sbi = sbi_from_bh(bh);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 
-	trace_printk("bh %p uptdate %d\n", bh, uptodate);
-
-	/* XXX */
-	unlock_buffer(bh);
-
-	if (uptodate) {
-		erase_bhp(bh);
-	} else {
-		/* don't care if this is racey? */
-		if (!sbi->block_write_err)
-			sbi->block_write_err = -EIO;
-	}
-
-	if (atomic_dec_and_test(&sbi->block_writes))
-		wake_up(&sbi->block_wq);
+	return radix_tree_tagged(&sbi->block_radix, DIRTY_RADIX_TAG);
 }
 
 /*
- * Submit writes for all the buffer heads in the dirty block tree.  The
- * write transaction machinery ensures that the dirty blocks form a
- * consistent image and excludes future dirtying while we're working.
+ * Submit writes for all the blocks in the radix with their dirty tag
+ * set.  The transaction machinery ensures that the dirty blocks form a
+ * consistent image and excludes future dirtying while IO is in flight.
  *
  * Presence in the dirty tree holds a reference.  Blocks are only
  * removed from the tree which drops the ref when IO completes.
@@ -291,38 +343,49 @@ static void block_write_end_io(struct buffer_head *bh, int uptodate)
 int scoutfs_block_write_dirty(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct buffer_head *bh;
-	struct rb_node *node;
+	struct scoutfs_block *blocks[16];
+	struct scoutfs_block *bl;
 	struct blk_plug plug;
 	unsigned long flags;
+	u64 blkno;
 	int ret;
+	int nr;
+	int i;
 
 	atomic_set(&sbi->block_writes, 1);
 	sbi->block_write_err = 0;
+	blkno = 0;
 	ret = 0;
 
 	blk_start_plug(&plug);
-	spin_lock_irqsave(&sbi->block_lock, flags);
-	node = rb_first(&sbi->block_dirty_tree);
-	while(node) {
-		bh = bh_from_bhp_node(node);
-		node = rb_next(node);
+	do {
+		/* get refs to a bunch of dirty blocks */
+		spin_lock_irqsave(&sbi->block_lock, flags);
+		nr = radix_tree_gang_lookup_tag(&sbi->block_radix,
+						(void **)blocks, blkno,
+						ARRAY_SIZE(blocks),
+						DIRTY_RADIX_TAG);
+		if (nr > 0)
+			blkno = blocks[nr - 1]->blkno + 1;
+		for (i = 0; i < nr; i++)
+			atomic_inc(&blocks[i]->refcount);
 		spin_unlock_irqrestore(&sbi->block_lock, flags);
 
-		atomic_inc(&sbi->block_writes);
-		scoutfs_block_set_crc((void *)bh);
+		/* submit them in order, being careful to put all on err */
+		for (i = 0; i < nr; i++) {
+			bl = blocks[i];
 
-		lock_buffer(bh);
-
-		bh->b_end_io = block_write_end_io;
-		ret = submit_bh(WRITE, bh); /* doesn't actually fail? */
-
-		spin_lock_irqsave(&sbi->block_lock, flags);
-		if (ret)
-			break;
-	}
-	spin_unlock_irqrestore(&sbi->block_lock, flags);
+			if (ret == 0) {
+				scoutfs_block_set_crc(bl);
+				atomic_inc(&sbi->block_writes);
+				ret = block_submit_bio(bl, WRITE);
+				if (ret)
+					atomic_dec(&sbi->block_writes);
+			}
+			scoutfs_block_put(bl);
+		}
+	} while (nr && !ret);
 
 	blk_finish_plug(&plug);
@@ -330,18 +393,29 @@ int scoutfs_block_write_dirty(struct super_block *sb)
 	atomic_dec(&sbi->block_writes);
 	wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0);
 
-	trace_printk("ret %d\n", ret);
-	return ret;
+	return ret ?: sbi->block_write_err;
 }
 
 /*
- * The caller knows that it's not racing with writers.
+ * XXX This is a gross hack for writing the super.  It doesn't have
+ * per-block write completion indication.  It knows that it's the only
+ * thing that will be writing.
 */
-int scoutfs_block_has_dirty(struct super_block *sb)
+int scoutfs_block_write_sync(struct scoutfs_block *bl)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
+	int ret;
 
-	return !RB_EMPTY_ROOT(&sbi->block_dirty_tree);
+	BUG_ON(atomic_read(&sbi->block_writes) != 0);
+
+	atomic_inc(&sbi->block_writes);
+	ret = block_submit_bio(bl, WRITE);
+	if (ret)
+		atomic_dec(&sbi->block_writes);
+	else
+		wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0);
+
+	return ret ?: sbi->block_write_err;
 }
 
 /*
@@ -418,37 +492,64 @@ out:
  * Return a dirty metadata block with an updated block header to match
  * the current dirty seq.  Callers are responsible for serializing
  * access to the block and for zeroing unwritten block contents.
+ *
+ * Always allocating a new block and replacing any old cached block
+ * serves a very specific purpose.  We can have an unlocked reader
+ * traversing stable structures actively using a clean block while a
+ * writer gets that same blkno from the allocator and starts modifying
+ * it.  By always allocating a new block we let the reader continue
+ * safely using their old immutable block while the writer works on the
+ * newly allocated block.  The old stable block will be freed once the
+ * reader drops their reference.
 */
 struct scoutfs_block *scoutfs_block_dirty(struct super_block *sb, u64 blkno)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_block_header *hdr;
-	struct buffer_head *bh;
+	struct scoutfs_block *found;
+	struct scoutfs_block *bl;
+	unsigned long flags;
 	int ret;
 
 	/* allocate a new block and try to insert it */
-	bh = sb_getblk(sb, blkno);
-	if (!bh) {
-		bh = ERR_PTR(-ENOMEM);
+	bl = alloc_block(sb, blkno);
+	if (!bl) {
+		ret = -EIO;
 		goto out;
 	}
 
-	ret = insert_bhp(sb, bh);
-	if (ret < 0) {
-		scoutfs_block_put((void *)bh);
-		bh = ERR_PTR(ret);
-		goto out;
-	}
+	set_bit(BLOCK_BIT_UPTODATE, &bl->bits);
 
-	hdr = scoutfs_block_data((void *)bh);
+	ret = radix_tree_preload(GFP_NOFS);
+	if (ret)
+		goto out;
+
+	hdr = bl->data;
 	*hdr = sbi->super.hdr;
 	hdr->blkno = cpu_to_le64(blkno);
 	hdr->seq = sbi->super.hdr.seq;
 
-	set_buffer_uptodate(bh);
-	set_buffer_scoutfs_verified(bh);
+	spin_lock_irqsave(&sbi->block_lock, flags);
+	found = radix_tree_lookup(&sbi->block_radix, blkno);
+	if (found) {
+		radix_tree_delete(&sbi->block_radix, blkno);
+		scoutfs_block_put(found);
+	}
+
+	radix_tree_insert(&sbi->block_radix, blkno, bl);
+	radix_tree_tag_set(&sbi->block_radix, blkno, DIRTY_RADIX_TAG);
+	atomic_inc(&bl->refcount);
+	spin_unlock_irqrestore(&sbi->block_lock, flags);
+
+	radix_tree_preload_end();
+	ret = 0;
 out:
-	return (void *)bh;
+	if (ret) {
+		scoutfs_block_put(bl);
+		bl = ERR_PTR(ret);
+	}
+
+	return bl;
 }
 
 /*
@@ -476,29 +577,6 @@ struct scoutfs_block *scoutfs_block_dirty_alloc(struct super_block *sb)
 	return bl;
 }
 
-/*
- * Make sure that we don't have a dirty block at the given blkno.  If we
- * do we remove it from our tree of dirty blocks and clear the buffer
- * dirty bit.
- *
- * XXX for now callers have only needed to forget blknos, maybe they'll
- * have the bh some day.
- */
-void scoutfs_block_forget(struct super_block *sb, u64 blkno)
-{
-	struct block_bh_private *bhp;
-	struct buffer_head *bh;
-
-	bh = sb_find_get_block(sb, blkno);
-	if (bh) {
-		bhp = bh->b_private;
-		if (bhp) {
-			erase_bhp(bh);
-			bforget(bh);
-		}
-	}
-}
-
 void scoutfs_block_set_crc(struct scoutfs_block *bl)
 {
 	struct scoutfs_block_header *hdr = scoutfs_block_data(bl);
@@ -531,46 +609,41 @@ void scoutfs_block_zero_from(struct scoutfs_block *bl, void *ptr)
 
 void scoutfs_block_set_lock_class(struct scoutfs_block *bl,
 				  struct lock_class_key *class)
 {
-	struct buffer_head *bh = (void *)bl;
-	struct block_bh_private *bhp = bh->b_private;
-
-	if (bhp && !bhp->rwsem_class) {
-		lockdep_set_class(&bhp->rwsem, class);
-		bhp->rwsem_class = true;
+	if (!test_bit(BLOCK_BIT_CLASS_SET, &bl->bits)) {
+		lockdep_set_class(&bl->rwsem, class);
+		set_bit(BLOCK_BIT_CLASS_SET, &bl->bits);
 	}
 }
 
 void scoutfs_block_lock(struct scoutfs_block *bl, bool write, int subclass)
 {
-	struct buffer_head *bh = (void *)bl;
-	struct block_bh_private *bhp = bh->b_private;
+	struct scoutfs_block_header *hdr = scoutfs_block_data(bl);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
 
-	if (bhp) {
+	if (hdr->seq == sbi->super.hdr.seq) {
 		if (write)
-			down_write_nested(&bhp->rwsem, subclass);
+			down_write_nested(&bl->rwsem, subclass);
 		else
-			down_read_nested(&bhp->rwsem, subclass);
+			down_read_nested(&bl->rwsem, subclass);
 	}
 }
 
 void scoutfs_block_unlock(struct scoutfs_block *bl, bool write)
 {
-	struct buffer_head *bh = (void *)bl;
-	struct block_bh_private *bhp = bh->b_private;
+	struct scoutfs_block_header *hdr = scoutfs_block_data(bl);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
 
-	if (bhp) {
+	if (hdr->seq == sbi->super.hdr.seq) {
 		if (write)
-			up_write(&bhp->rwsem);
+			up_write(&bl->rwsem);
 		else
-			up_read(&bhp->rwsem);
+			up_read(&bl->rwsem);
 	}
 }
 
 void *scoutfs_block_data(struct scoutfs_block *bl)
 {
-	struct buffer_head *bh = (void *)bl;
-
-	return (void *)bh->b_data;
+	return bl->data;
 }
 
 void *scoutfs_block_data_from_contents(const void *ptr)
@@ -580,10 +653,23 @@ void *scoutfs_block_data_from_contents(const void *ptr)
 	return (void *)(addr & ~((unsigned long)SCOUTFS_BLOCK_MASK));
 }
 
-void scoutfs_block_put(struct scoutfs_block *bl)
+void scoutfs_block_destroy(struct super_block *sb)
 {
-	struct buffer_head *bh = (void *)bl;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_block *blocks[16];
+	struct scoutfs_block *bl;
+	unsigned long blkno = 0;
+	int nr;
+	int i;
 
-	if (!IS_ERR_OR_NULL(bh))
-		brelse(bh);
+	do {
+		nr = radix_tree_gang_lookup(&sbi->block_radix, (void **)blocks,
+					    blkno, ARRAY_SIZE(blocks));
+		for (i = 0; i < nr; i++) {
+			bl = blocks[i];
+			radix_tree_delete(&sbi->block_radix, bl->blkno);
+			blkno = bl->blkno + 1;
+			scoutfs_block_put(bl);
+		}
+	} while (nr);
 }
diff --git a/kmod/src/block.h b/kmod/src/block.h
index 58a0f952..8981fd9d 100644
--- a/kmod/src/block.h
+++ b/kmod/src/block.h
@@ -16,6 +16,7 @@ struct scoutfs_block *scoutfs_block_dirty_ref(struct super_block *sb,
 
 int scoutfs_block_has_dirty(struct super_block *sb);
 int scoutfs_block_write_dirty(struct super_block *sb);
+int scoutfs_block_write_sync(struct scoutfs_block *bl);
 
 void scoutfs_block_set_crc(struct scoutfs_block *bl);
 void scoutfs_block_zero(struct scoutfs_block *bl, size_t off);
@@ -26,10 +27,10 @@ void scoutfs_block_set_lock_class(struct scoutfs_block *bl,
 void scoutfs_block_lock(struct scoutfs_block *bl, bool write, int subclass);
 void scoutfs_block_unlock(struct scoutfs_block *bl, bool write);
 
-void scoutfs_block_forget(struct super_block *sb, u64 blkno);
-
 void *scoutfs_block_data(struct scoutfs_block *bl);
 void *scoutfs_block_data_from_contents(const void *ptr);
 void scoutfs_block_put(struct scoutfs_block *bl);
+void scoutfs_block_destroy(struct super_block *sb);
+
 #endif
diff --git a/kmod/src/super.c b/kmod/src/super.c
index 471bffef..aa7c2bcc 100644
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -15,7 +15,6 @@
 #include
 #include
 #include
-#include <linux/buffer_head.h>
 #include
 #include
@@ -109,23 +108,19 @@ int scoutfs_write_dirty_super(struct super_block *sb)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super;
 	struct scoutfs_block *bl;
-	struct buffer_head *bh;
 	int ret;
 
-	/* XXX hack is immediately repaired in the coming patches */
-	bh = sb_getblk(sb, le64_to_cpu(sbi->super.hdr.blkno));
-	if (!bh)
-		return -ENOMEM;
-	bl = (void *)bh;
+	/* XXX prealloc? */
+	bl = scoutfs_block_dirty(sb, le64_to_cpu(sbi->super.hdr.blkno));
+	if (WARN_ON_ONCE(IS_ERR(bl)))
+		return PTR_ERR(bl);
 
 	super = scoutfs_block_data(bl);
-	*super = sbi->super;
-	scoutfs_block_zero(bl, sizeof(struct scoutfs_super_block));
+	memcpy(super, &sbi->super, sizeof(*super));
+	scoutfs_block_zero(bl, sizeof(*super));
 	scoutfs_block_set_crc(bl);
 
-	mark_buffer_dirty(bh);
-	ret = sync_dirty_buffer(bh);
-
+	ret = scoutfs_block_write_sync(bl);
 	scoutfs_block_put(bl);
 	return ret;
 }
@@ -193,7 +188,8 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	spin_lock_init(&sbi->next_ino_lock);
 	spin_lock_init(&sbi->block_lock);
-	sbi->block_dirty_tree = RB_ROOT;
+	/* radix only inserted with NOFS _preload */
+	INIT_RADIX_TREE(&sbi->block_radix, GFP_ATOMIC);
 	init_waitqueue_head(&sbi->block_wq);
 	atomic_set(&sbi->block_writes, 0);
 	init_rwsem(&sbi->btree_rwsem);
@@ -204,11 +200,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	init_waitqueue_head(&sbi->trans_write_wq);
 	spin_lock_init(&sbi->file_alloc_lock);
 
-	if (!sb_set_blocksize(sb, SCOUTFS_BLOCK_SIZE)) {
-		printk(KERN_ERR "couldn't set blocksize\n");
-		return -EINVAL;
-	}
-
 	/* XXX can have multiple mounts of a device, need mount id */
 	sbi->kset = kset_create_and_add(sb->s_id, NULL, &scoutfs_kset->kobj);
 	if (!sbi->kset)
@@ -250,12 +241,10 @@ static void scoutfs_kill_sb(struct super_block *sb)
 	if (sbi) {
 		scoutfs_shutdown_trans(sb);
 		scoutfs_buddy_destroy(sb);
+		scoutfs_block_destroy(sb);
 		scoutfs_destroy_counters(sb);
 		if (sbi->kset)
 			kset_unregister(sbi->kset);
-
-		/* XXX write errors can leave dirty blocks */
-		WARN_ON_ONCE(!RB_EMPTY_ROOT(&sbi->block_dirty_tree));
 		kfree(sbi);
 	}
 }
diff --git a/kmod/src/super.h b/kmod/src/super.h
index 9247f917..141a606d 100644
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -19,7 +19,7 @@ struct scoutfs_sb_info {
 	spinlock_t next_ino_lock;
 
 	spinlock_t block_lock;
-	struct rb_root block_dirty_tree;
+	struct radix_tree_root block_radix;
 	wait_queue_head_t block_wq;
 	atomic_t block_writes;
 	int block_write_err;
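
For review context, here is a rough sketch of how a caller is expected to drive the block API after this change. The scoutfs_block_* calls are the ones declared in block.h above; the example_* function and the surrounding flow are illustrative assumptions, not code from this series:

/*
 * Illustrative sketch only: read and verify a cached block, then get a
 * dirty block for the same blkno tracked in the radix for the current
 * transaction.  The scoutfs_block_* names come from block.h in this
 * patch; everything else here is a hypothetical caller.
 */
static int example_cow_block(struct super_block *sb, u64 blkno)
{
	struct scoutfs_block *bl;

	/* scoutfs_block_read() returns a referenced, verified block */
	bl = scoutfs_block_read(sb, blkno);
	if (IS_ERR(bl))
		return PTR_ERR(bl);
	/* ... read stable contents via scoutfs_block_data(bl) ... */
	scoutfs_block_put(bl);

	/*
	 * scoutfs_block_dirty() always allocates a fresh block, tags it
	 * dirty in the radix, and leaves any old cached block to readers
	 * that still hold references.
	 */
	bl = scoutfs_block_dirty(sb, blkno);
	if (IS_ERR(bl))
		return PTR_ERR(bl);
	/* ... fill in contents past the header, then drop the ref ... */
	scoutfs_block_put(bl);

	return 0;
}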