From 97cb75bd88ba0de3f0aacfb8d82ab3b0441db758 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 10 Feb 2017 09:58:37 -0800 Subject: [PATCH] Remove dead btree, block, and buddy code Remove all the unused dead code from the previous btree block design. Signed-off-by: Zach Brown --- kmod/src/Makefile | 6 +- kmod/src/block.c | 786 ------------------- kmod/src/block.h | 38 - kmod/src/btree.c | 1582 -------------------------------------- kmod/src/btree.h | 77 -- kmod/src/buddy.c | 1063 ------------------------- kmod/src/buddy.h | 20 - kmod/src/counters.h | 2 - kmod/src/crc.c | 23 - kmod/src/crc.h | 6 - kmod/src/dir.c | 2 - kmod/src/format.h | 163 ---- kmod/src/inode.c | 7 - kmod/src/inode.h | 2 - kmod/src/ioctl.c | 1 - kmod/src/key.h | 123 --- kmod/src/kvec.c | 2 - kmod/src/name.c | 35 - kmod/src/name.h | 8 - kmod/src/scoutfs_trace.c | 1 - kmod/src/scoutfs_trace.h | 165 ---- kmod/src/super.c | 21 - kmod/src/super.h | 28 - kmod/src/trans.c | 2 - kmod/src/xattr.c | 1 - 25 files changed, 3 insertions(+), 4161 deletions(-) delete mode 100644 kmod/src/block.c delete mode 100644 kmod/src/block.h delete mode 100644 kmod/src/btree.c delete mode 100644 kmod/src/btree.h delete mode 100644 kmod/src/buddy.c delete mode 100644 kmod/src/buddy.h delete mode 100644 kmod/src/crc.c delete mode 100644 kmod/src/crc.h delete mode 100644 kmod/src/name.c delete mode 100644 kmod/src/name.h diff --git a/kmod/src/Makefile b/kmod/src/Makefile index 3e7c9b35..e31924ed 100644 --- a/kmod/src/Makefile +++ b/kmod/src/Makefile @@ -2,6 +2,6 @@ obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include -scoutfs-y += alloc.o bio.o block.o btree.o buddy.o compact.o counters.o crc.o \ - data.o dir.o kvec.o inode.o ioctl.o item.o key.o manifest.o \ - msg.o name.o seg.o scoutfs_trace.o super.o trans.o treap.o xattr.o +scoutfs-y += alloc.o bio.o compact.o counters.o data.o dir.o kvec.o inode.o \ + ioctl.o item.o key.o manifest.o msg.o seg.o scoutfs_trace.o \ + super.o trans.o treap.o xattr.o diff --git a/kmod/src/block.c b/kmod/src/block.c deleted file mode 100644 index 3ecf6a7b..00000000 --- a/kmod/src/block.c +++ /dev/null @@ -1,786 +0,0 @@ -/* - * Copyright (C) 2016 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include - -#include "super.h" -#include "format.h" -#include "block.h" -#include "crc.h" -#include "counters.h" -#include "buddy.h" - -/* - * scoutfs maintains a cache of metadata blocks in a radix tree. This - * gives us blocks bigger than page size and avoids fixing the location - * of a logical cached block in one possible position in a larger block - * device page cache page. - * - * This does the work to cow dirty blocks, track dirty blocks, generate - * checksums as they're written, only write them in transactions, verify - * checksums on read, and invalidate and retry reads of stale cached - * blocks. (That last bit only has a hint of an implementation.) - * - * XXX - * - tear down dirty blocks left by write errors on unmount - * - multiple smaller page allocs - * - vmalloc? vm_map_ram? 
- * - blocks allocated from per-cpu pages when page size > block size - * - cmwq crc calcs if that makes sense - * - slab of block structs - * - don't verify checksums in end_io context? - * - fall back to multiple single bios per block io if bio alloc fails? - * - fail mount if total_blocks is greater than long radix blkno - */ - -struct scoutfs_block { - struct rw_semaphore rwsem; - atomic_t refcount; - struct list_head lru_entry; - u64 blkno; - - unsigned long bits; - - struct super_block *sb; - struct page *page; - void *data; -}; - -#define DIRTY_RADIX_TAG 0 - -enum { - BLOCK_BIT_UPTODATE = 0, - BLOCK_BIT_ERROR, - BLOCK_BIT_CLASS_SET, -}; - -static struct scoutfs_block *alloc_block(struct super_block *sb, u64 blkno) -{ - struct scoutfs_block *bl; - struct page *page; - - /* we'd need to be just a bit more careful */ - BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_SIZE); - - bl = kzalloc(sizeof(struct scoutfs_block), GFP_NOFS); - if (bl) { - /* change _from_contents if allocs not aligned */ - page = alloc_pages(GFP_NOFS, SCOUTFS_BLOCK_PAGE_ORDER); - WARN_ON_ONCE(!page); - if (page) { - init_rwsem(&bl->rwsem); - atomic_set(&bl->refcount, 1); - INIT_LIST_HEAD(&bl->lru_entry); - bl->blkno = blkno; - bl->sb = sb; - bl->page = page; - bl->data = page_address(page); - trace_printk("allocated bl %p\n", bl); - } else { - kfree(bl); - bl = NULL; - } - } - - return bl; -} - -void scoutfs_block_put(struct scoutfs_block *bl) -{ - if (!IS_ERR_OR_NULL(bl) && atomic_dec_and_test(&bl->refcount)) { - trace_printk("freeing bl %p\n", bl); - WARN_ON_ONCE(!list_empty(&bl->lru_entry)); - __free_pages(bl->page, SCOUTFS_BLOCK_PAGE_ORDER); - kfree(bl); - scoutfs_inc_counter(bl->sb, block_mem_free); - } -} - -static void lru_add(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl) -{ - if (list_empty(&bl->lru_entry)) { - list_add_tail(&bl->lru_entry, &sbi->block_lru_list); - sbi->block_lru_nr++; - } -} - -static void lru_del(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl) -{ - if (!list_empty(&bl->lru_entry)) { - list_del_init(&bl->lru_entry); - sbi->block_lru_nr--; - } -} - -/* - * The caller is referencing a block but doesn't know if its in the LRU - * or not. If it is move it to the tail so it's last to be dropped by - * the shrinker. 
- */ -static void lru_move(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl) -{ - if (!list_empty(&bl->lru_entry)) - list_move_tail(&bl->lru_entry, &sbi->block_lru_list); -} - -static void radix_insert(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl, - bool dirty) -{ - radix_tree_insert(&sbi->block_radix, bl->blkno, bl); - if (dirty) - radix_tree_tag_set(&sbi->block_radix, bl->blkno, - DIRTY_RADIX_TAG); - else - lru_add(sbi, bl); - atomic_inc(&bl->refcount); -} - -/* deleting the blkno from the radix also clears the dirty tag if it was set */ -static void radix_delete(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl) -{ - lru_del(sbi, bl); - radix_tree_delete(&sbi->block_radix, bl->blkno); - scoutfs_block_put(bl); -} - -static int verify_block_header(struct super_block *sb, struct scoutfs_block *bl) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct scoutfs_block_header *hdr = bl->data; - u32 crc = scoutfs_crc_block(hdr); - int ret = -EIO; - - if (le32_to_cpu(hdr->crc) != crc) { - printk("blkno %llu hdr crc %x != calculated %x\n", bl->blkno, - le32_to_cpu(hdr->crc), crc); - } else if (super->hdr.fsid && hdr->fsid != super->hdr.fsid) { - printk("blkno %llu fsid %llx != super fsid %llx\n", bl->blkno, - le64_to_cpu(hdr->fsid), le64_to_cpu(super->hdr.fsid)); - } else if (le64_to_cpu(hdr->blkno) != bl->blkno) { - printk("blkno %llu invalid hdr blkno %llx\n", bl->blkno, - le64_to_cpu(hdr->blkno)); - } else { - ret = 0; - } - - return ret; -} - -static void block_read_end_io(struct bio *bio, int err) -{ - struct scoutfs_block *bl = bio->bi_private; - struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb); - - if (!err && !verify_block_header(bl->sb, bl)) - set_bit(BLOCK_BIT_UPTODATE, &bl->bits); - else - set_bit(BLOCK_BIT_ERROR, &bl->bits); - - /* - * uncontended spin_lock in wake_up and unconditional smp_mb to - * make waitqueue_active safe are about the same cost, so we - * prefer the obviously safe choice. - */ - wake_up(&sbi->block_wq); - - scoutfs_block_put(bl); - bio_put(bio); -} - -/* - * Once a transaction block is persistent it's fine to drop the dirty - * tag. It's been checksummed so it can be read in again. It's seq - * will be in the current transaction so it'll simply be dirtied and - * checksummed and written out again. 
- */ -static void block_write_end_io(struct bio *bio, int err) -{ - struct scoutfs_block *bl = bio->bi_private; - struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb); - unsigned long flags; - - if (!err) { - spin_lock_irqsave(&sbi->block_lock, flags); - radix_tree_tag_clear(&sbi->block_radix, - bl->blkno, DIRTY_RADIX_TAG); - lru_add(sbi, bl); - spin_unlock_irqrestore(&sbi->block_lock, flags); - } - - /* not too worried about racing ints */ - if (err && !sbi->block_write_err) - sbi->block_write_err = err; - - if (atomic_dec_and_test(&sbi->block_writes)) - wake_up(&sbi->block_wq); - - scoutfs_block_put(bl); - bio_put(bio); - -} - -static int block_submit_bio(struct scoutfs_block *bl, int rw) -{ - struct super_block *sb = bl->sb; - struct bio *bio; - int ret; - - bio = bio_alloc(GFP_NOFS, SCOUTFS_PAGES_PER_BLOCK); - if (WARN_ON_ONCE(!bio)) - return -ENOMEM; - - bio->bi_sector = bl->blkno << (SCOUTFS_BLOCK_SHIFT - 9); - bio->bi_bdev = sb->s_bdev; - if (rw & WRITE) { - bio->bi_end_io = block_write_end_io; - } else - bio->bi_end_io = block_read_end_io; - bio->bi_private = bl; - - ret = bio_add_page(bio, bl->page, SCOUTFS_BLOCK_SIZE, 0); - if (WARN_ON_ONCE(ret != SCOUTFS_BLOCK_SIZE)) { - bio_put(bio); - return -ENOMEM; - } - - atomic_inc(&bl->refcount); - submit_bio(rw, bio); - - return 0; -} - -/* - * Read an existing block from the device and verify its metadata header. - */ -struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block *found; - struct scoutfs_block *bl; - unsigned long flags; - int ret; - - /* find an existing block, dropping if it's errored */ - spin_lock_irqsave(&sbi->block_lock, flags); - - bl = radix_tree_lookup(&sbi->block_radix, blkno); - if (bl) { - if (test_bit(BLOCK_BIT_ERROR, &bl->bits)) { - radix_delete(sbi, bl); - bl = NULL; - } else { - lru_move(sbi, bl); - atomic_inc(&bl->refcount); - } - } - spin_unlock_irqrestore(&sbi->block_lock, flags); - if (bl) - goto wait; - - /* allocate a new block and try to insert it */ - bl = alloc_block(sb, blkno); - if (!bl) { - ret = -EIO; - goto out; - } - - ret = radix_tree_preload(GFP_NOFS); - if (ret) - goto out; - - spin_lock_irqsave(&sbi->block_lock, flags); - - found = radix_tree_lookup(&sbi->block_radix, blkno); - if (found) { - scoutfs_block_put(bl); - bl = found; - lru_move(sbi, bl); - atomic_inc(&bl->refcount); - } else { - radix_insert(sbi, bl, false); - } - - spin_unlock_irqrestore(&sbi->block_lock, flags); - radix_tree_preload_end(); - - if (!found) { - ret = block_submit_bio(bl, READ_SYNC | REQ_META); - if (ret) - goto out; - } - -wait: - ret = wait_event_interruptible(sbi->block_wq, - test_bit(BLOCK_BIT_UPTODATE, &bl->bits) || - test_bit(BLOCK_BIT_ERROR, &bl->bits)); - if (ret == 0 && test_bit(BLOCK_BIT_ERROR, &bl->bits)) - ret = -EIO; -out: - if (ret) { - scoutfs_block_put(bl); - bl = ERR_PTR(ret); - } - - return bl; -} - -/* - * Read an existing block from the device described by the caller's - * reference. - * - * If the reference sequence numbers don't match then we could be racing - * with another writer. We back off and try again. If it happens too - * many times the caller assumes that we've hit persistent corruption - * and returns an error. - * - * XXX: - * - actually implement this - * - reads that span transactions? - * - writers creating a new dirty block? 
- */ -struct scoutfs_block *scoutfs_block_read_ref(struct super_block *sb, - struct scoutfs_block_ref *ref) -{ - struct scoutfs_block_header *hdr; - struct scoutfs_block *bl; - - bl = scoutfs_block_read(sb, le64_to_cpu(ref->blkno)); - if (!IS_ERR(bl)) { - hdr = scoutfs_block_data(bl); - if (WARN_ON_ONCE(hdr->seq != ref->seq)) { - scoutfs_block_put(bl); - bl = ERR_PTR(-EAGAIN); - } - } - - return bl; -} - -/* - * The caller knows that it's not racing with writers. - */ -int scoutfs_block_has_dirty(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - - return radix_tree_tagged(&sbi->block_radix, DIRTY_RADIX_TAG); -} - -/* - * Submit writes for all the blocks in the radix with their dirty tag - * set. The transaction machinery ensures that the dirty blocks form a - * consistent image and excludes future dirtying while IO is in flight. - * - * Presence in the dirty tree holds a reference. Blocks are only - * removed from the tree which drops the ref when IO completes. - * - * Blocks that see write errors remain in the dirty tree and will try to - * be written again in the next transaction commit. - * - * Reads can traverse the blocks while they're in flight. - */ -int scoutfs_block_write_dirty(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block *blocks[16]; - struct scoutfs_block *bl; - struct blk_plug plug; - unsigned long flags; - u64 blkno; - int ret; - int nr; - int i; - - atomic_set(&sbi->block_writes, 1); - sbi->block_write_err = 0; - blkno = 0; - ret = 0; - - blk_start_plug(&plug); - - do { - /* get refs to a bunch of dirty blocks */ - spin_lock_irqsave(&sbi->block_lock, flags); - nr = radix_tree_gang_lookup_tag(&sbi->block_radix, - (void **)blocks, blkno, - ARRAY_SIZE(blocks), - DIRTY_RADIX_TAG); - if (nr > 0) - blkno = blocks[nr - 1]->blkno + 1; - for (i = 0; i < nr; i++) - atomic_inc(&blocks[i]->refcount); - spin_unlock_irqrestore(&sbi->block_lock, flags); - - /* submit them in order, being careful to put all on err */ - for (i = 0; i < nr; i++) { - bl = blocks[i]; - - if (ret == 0) { - scoutfs_block_set_crc(bl); - atomic_inc(&sbi->block_writes); - ret = block_submit_bio(bl, WRITE); - if (ret) - atomic_dec(&sbi->block_writes); - } - scoutfs_block_put(bl); - } - } while (nr && !ret); - - blk_finish_plug(&plug); - - /* wait for all io to drain */ - atomic_dec(&sbi->block_writes); - wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0); - - return ret ?: sbi->block_write_err; -} - -/* - * XXX This is a gross hack for writing the super. It doesn't have - * per-block write completion indication. It knows that it's the only - * thing that will be writing. - */ -int scoutfs_block_write_sync(struct scoutfs_block *bl) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb); - int ret; - - BUG_ON(atomic_read(&sbi->block_writes) != 0); - - atomic_inc(&sbi->block_writes); - ret = block_submit_bio(bl, WRITE); - if (ret) - atomic_dec(&sbi->block_writes); - else - wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0); - - return ret ?: sbi->block_write_err; -} - -/* - * Give the caller a dirty block that they can safely modify. If the - * reference refers to a stable clean block then we allocate a new block - * and update the reference. - * - * Blocks are dirtied and modified within a transaction that has a given - * sequence number which we use to determine if the block is currently - * dirty or not. - * - * For now we're using the dirty super block in the sb_info to track the - * dirty seq. 
That'll be different when we have multiple btrees. - * - * Callers are responsible for serializing modification to the reference - * which is probably embedded in some other dirty persistent structure. - */ -struct scoutfs_block *scoutfs_block_dirty_ref(struct super_block *sb, - struct scoutfs_block_ref *ref) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block_header *hdr; - struct scoutfs_block *copy_bl = NULL; - struct scoutfs_block *bl; - u64 blkno = 0; - int ret; - int err; - - bl = scoutfs_block_read(sb, le64_to_cpu(ref->blkno)); - if (IS_ERR(bl) || ref->seq == sbi->super.hdr.seq) - return bl; - - ret = scoutfs_buddy_alloc_same(sb, &blkno, le64_to_cpu(ref->blkno)); - if (ret < 0) - goto out; - - copy_bl = scoutfs_block_dirty(sb, blkno); - if (IS_ERR(copy_bl)) { - ret = PTR_ERR(copy_bl); - goto out; - } - - hdr = scoutfs_block_data(bl); - ret = scoutfs_buddy_free(sb, hdr->seq, le64_to_cpu(hdr->blkno), 0); - if (ret) - goto out; - - memcpy(scoutfs_block_data(copy_bl), scoutfs_block_data(bl), - SCOUTFS_BLOCK_SIZE); - - hdr = scoutfs_block_data(copy_bl); - hdr->blkno = cpu_to_le64(blkno); - hdr->seq = sbi->super.hdr.seq; - ref->blkno = hdr->blkno; - ref->seq = hdr->seq; - - ret = 0; -out: - scoutfs_block_put(bl); - if (ret) { - if (!IS_ERR_OR_NULL(copy_bl)) { - err = scoutfs_buddy_free(sb, sbi->super.hdr.seq, - blkno, 0); - WARN_ON_ONCE(err); /* freeing dirty must work */ - } - scoutfs_block_put(copy_bl); - copy_bl = ERR_PTR(ret); - } - - return copy_bl; -} - -/* - * Return a dirty metadata block with an updated block header to match - * the current dirty seq. Callers are responsible for serializing - * access to the block and for zeroing unwritten block contents. - * - * Always allocating a new block and replacing any old cached block - * serves a very specific purpose. We can have an unlocked reader - * traversing stable structures actively using a clean block while a - * writer gets that same blkno from the allocator and starts modifying - * it. By always allocating a new block we let the reader continue - * safely using their old immutable block while the writer works on the - * newly allocated block. The old stable block will be freed once the - * reader drops their reference. - */ -struct scoutfs_block *scoutfs_block_dirty(struct super_block *sb, u64 blkno) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block_header *hdr; - struct scoutfs_block *found; - struct scoutfs_block *bl; - unsigned long flags; - int ret; - - /* allocate a new block and try to insert it */ - bl = alloc_block(sb, blkno); - if (!bl) { - ret = -EIO; - goto out; - } - - set_bit(BLOCK_BIT_UPTODATE, &bl->bits); - - ret = radix_tree_preload(GFP_NOFS); - if (ret) - goto out; - - hdr = bl->data; - *hdr = sbi->super.hdr; - hdr->blkno = cpu_to_le64(blkno); - hdr->seq = sbi->super.hdr.seq; - - spin_lock_irqsave(&sbi->block_lock, flags); - found = radix_tree_lookup(&sbi->block_radix, blkno); - if (found) - radix_delete(sbi, found); - radix_insert(sbi, bl, true); - spin_unlock_irqrestore(&sbi->block_lock, flags); - - radix_tree_preload_end(); - ret = 0; -out: - if (ret) { - scoutfs_block_put(bl); - bl = ERR_PTR(ret); - } - - return bl; -} - -/* - * Allocate a new dirty writable block. The caller must be in a - * transaction so that we can assign the dirty seq. 
- */ -struct scoutfs_block *scoutfs_block_dirty_alloc(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->stable_super; - struct scoutfs_block *bl; - u64 blkno; - int ret; - int err; - - ret = scoutfs_buddy_alloc(sb, &blkno, 0); - if (ret < 0) - return ERR_PTR(ret); - - bl = scoutfs_block_dirty(sb, blkno); - if (IS_ERR(bl)) { - err = scoutfs_buddy_free(sb, super->hdr.seq, blkno, 0); - WARN_ON_ONCE(err); /* freeing dirty must work */ - } - return bl; -} - -/* - * Forget the given block by removing it from the radix and clearing its - * dirty tag. It will not be found by future lookups and will not be - * written out. The caller can still use it until it drops its - * reference. - */ -void scoutfs_block_forget(struct scoutfs_block *bl) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb); - struct scoutfs_block *found; - unsigned long flags; - u64 blkno = bl->blkno; - - spin_lock_irqsave(&sbi->block_lock, flags); - found = radix_tree_lookup(&sbi->block_radix, blkno); - if (found == bl) - radix_delete(sbi, bl); - spin_unlock_irqrestore(&sbi->block_lock, flags); -} - -/* - * We maintain an LRU of blocks so that the shrinker can free the oldest - * under memory pressure. We can't reclaim dirty blocks so only clean - * blocks are kept in the LRU. Blocks are only in the LRU while their - * presence in the radix holds a reference. We don't care if a reader - * has an active ref on a clean block that gets reclaimed. All we're - * doing is removing from the radix. The caller can still work with the - * block and it will be freed once they drop their ref. - * - * If this is called with nr_to_scan == 0 then it only returns the nr. - * We avoid acquiring the lock in that case. - * - * Lookup code only moves blocks around in the LRU while they're in the - * radix. Once we remove the block from the radix we're able to use the - * lru_entry to drop all the blocks outside the lock. - * - * XXX: - * - are sc->nr_to_scan and our return meant to be in units of pages? - * - should we sync a transaction here? - */ -int scoutfs_block_shrink(struct shrinker *shrink, struct shrink_control *sc) -{ - struct scoutfs_sb_info *sbi = container_of(shrink, - struct scoutfs_sb_info, - block_shrinker); - struct scoutfs_block *tmp; - struct scoutfs_block *bl; - unsigned long flags; - unsigned long nr; - LIST_HEAD(list); - - nr = sc->nr_to_scan; - if (!nr) - goto out; - - spin_lock_irqsave(&sbi->block_lock, flags); - - list_for_each_entry_safe(bl, tmp, &sbi->block_lru_list, lru_entry) { - if (nr-- == 0) - break; - atomic_inc(&bl->refcount); - radix_delete(sbi, bl); - list_add(&bl->lru_entry, &list); - } - - spin_unlock_irqrestore(&sbi->block_lock, flags); - - list_for_each_entry_safe(bl, tmp, &list, lru_entry) { - list_del_init(&bl->lru_entry); - scoutfs_block_put(bl); - } - -out: - return min_t(unsigned long, sbi->block_lru_nr, INT_MAX); -} - -void scoutfs_block_set_crc(struct scoutfs_block *bl) -{ - struct scoutfs_block_header *hdr = scoutfs_block_data(bl); - - hdr->crc = cpu_to_le32(scoutfs_crc_block(hdr)); -} - -/* - * Zero the block from the given byte to the end of the block. - */ -void scoutfs_block_zero(struct scoutfs_block *bl, size_t off) -{ - if (WARN_ON_ONCE(off > SCOUTFS_BLOCK_SIZE)) - return; - - if (off < SCOUTFS_BLOCK_SIZE) - memset(scoutfs_block_data(bl) + off, 0, - SCOUTFS_BLOCK_SIZE - off); -} - -/* - * Zero the block from the given byte to the end of the block. 
- */ -void scoutfs_block_zero_from(struct scoutfs_block *bl, void *ptr) -{ - return scoutfs_block_zero(bl, (char *)ptr - - (char *)scoutfs_block_data(bl)); -} - -void scoutfs_block_set_lock_class(struct scoutfs_block *bl, - struct lock_class_key *class) -{ - if (!test_bit(BLOCK_BIT_CLASS_SET, &bl->bits)) { - lockdep_set_class(&bl->rwsem, class); - set_bit(BLOCK_BIT_CLASS_SET, &bl->bits); - } -} - -void scoutfs_block_lock(struct scoutfs_block *bl, bool write, int subclass) -{ - if (write) - down_write_nested(&bl->rwsem, subclass); - else - down_read_nested(&bl->rwsem, subclass); -} - -void scoutfs_block_unlock(struct scoutfs_block *bl, bool write) -{ - if (write) - up_write(&bl->rwsem); - else - up_read(&bl->rwsem); -} - -void *scoutfs_block_data(struct scoutfs_block *bl) -{ - return bl->data; -} - -void *scoutfs_block_data_from_contents(const void *ptr) -{ - unsigned long addr = (unsigned long)ptr; - - return (void *)(addr & ~((unsigned long)SCOUTFS_BLOCK_MASK)); -} - -void scoutfs_block_destroy(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block *blocks[16]; - struct scoutfs_block *bl; - unsigned long blkno = 0; - int nr; - int i; - - do { - nr = radix_tree_gang_lookup(&sbi->block_radix, (void **)blocks, - blkno, ARRAY_SIZE(blocks)); - for (i = 0; i < nr; i++) { - bl = blocks[i]; - blkno = bl->blkno + 1; - radix_delete(sbi, bl); - } - } while (nr); -} diff --git a/kmod/src/block.h b/kmod/src/block.h deleted file mode 100644 index 0eb86837..00000000 --- a/kmod/src/block.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _SCOUTFS_BLOCK_H_ -#define _SCOUTFS_BLOCK_H_ - -struct scoutfs_block; - -#include - -struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno); -struct scoutfs_block *scoutfs_block_read_ref(struct super_block *sb, - struct scoutfs_block_ref *ref); - -struct scoutfs_block *scoutfs_block_dirty(struct super_block *sb, u64 blkno); -struct scoutfs_block *scoutfs_block_dirty_alloc(struct super_block *sb); -struct scoutfs_block *scoutfs_block_dirty_ref(struct super_block *sb, - struct scoutfs_block_ref *ref); - -int scoutfs_block_has_dirty(struct super_block *sb); -int scoutfs_block_write_dirty(struct super_block *sb); -int scoutfs_block_write_sync(struct scoutfs_block *bl); - -void scoutfs_block_set_crc(struct scoutfs_block *bl); -void scoutfs_block_zero(struct scoutfs_block *bl, size_t off); -void scoutfs_block_zero_from(struct scoutfs_block *bl, void *ptr); - -void scoutfs_block_set_lock_class(struct scoutfs_block *bl, - struct lock_class_key *class); -void scoutfs_block_lock(struct scoutfs_block *bl, bool write, int subclass); -void scoutfs_block_unlock(struct scoutfs_block *bl, bool write); - -void *scoutfs_block_data(struct scoutfs_block *bl); -void *scoutfs_block_data_from_contents(const void *ptr); -void scoutfs_block_forget(struct scoutfs_block *bl); -void scoutfs_block_put(struct scoutfs_block *bl); - -int scoutfs_block_shrink(struct shrinker *shrink, struct shrink_control *sc); -void scoutfs_block_destroy(struct super_block *sb); - -#endif diff --git a/kmod/src/btree.c b/kmod/src/btree.c deleted file mode 100644 index a1410134..00000000 --- a/kmod/src/btree.c +++ /dev/null @@ -1,1582 +0,0 @@ -/* - * Copyright (C) 2016 Zach Brown. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include -#include - -#include "super.h" -#include "format.h" -#include "block.h" -#include "key.h" -#include "btree.h" - -#include "scoutfs_trace.h" - -/* - * scoutfs stores file system metadata in btrees whose items have fixed - * sized keys and variable length values. - * - * Items are stored as a small header with the key followed by the - * value. New items are allocated from the back of the block towards - * the front. Deleted items can be reclaimed by packing items towards - * the back of the block by walking them in reverse offset order. - * - * A dense array of item offsets after the btree block header header - * maintains the sorted order of the items by their keys. The array is - * small enough that the memmoves to keep it dense involves a few cache - * lines at most. - * - * Parent blocks in the btree have the same format as leaf blocks. - * There's one key for every child reference instead of having separator - * keys between child references. The key in a child reference contains - * the largest key that may be found in the child subtree. The right - * spine of the tree has maximal keys so that they don't have to be - * updated if we insert an item with a key greater than everything in - * the tree. - * - * btree blocks, block references, and items all have sequence numbers - * that are set to the current dirty btree sequence number when they're - * modified. This lets us efficiently search a range of keys for items - * that are newer than a given sequence number. - * - * Operations are performed in one pass down the tree. This lets us - * cascade locks from the root down to the leaves and avoids having to - * maintain a record of the path down the tree. Splits and merges are - * performed as we descend. - * - * XXX - * - do we want a level in the btree header? seems like we would? - * - validate structures on read? - * - internal bl/pos/cmp interface is clumsy.. 
- */ - -/* number of contiguous bytes used by the item header and val of given len */ -static inline unsigned int val_bytes(unsigned int val_len) -{ - return sizeof(struct scoutfs_btree_item) + val_len; -} - -/* number of contiguous bytes used by the item header its current value */ -static inline unsigned int item_bytes(struct scoutfs_btree_item *item) -{ - return val_bytes(le16_to_cpu(item->val_len)); -} - -/* total bytes consumed by an item with given val len: offset, header, value */ -static inline unsigned int all_val_bytes(unsigned int val_len) -{ - return sizeof(((struct scoutfs_btree_block *)NULL)->item_offs[0]) + - val_bytes(val_len); -} - -/* total bytes consumed by an item with its current value */ -static inline unsigned int all_item_bytes(struct scoutfs_btree_item *item) -{ - return all_val_bytes(le16_to_cpu(item->val_len)); -} - -/* number of contig free bytes between item offset and first item */ -static inline unsigned int contig_free(struct scoutfs_btree_block *bt) -{ - unsigned int nr = le16_to_cpu(bt->nr_items); - - return le16_to_cpu(bt->free_end) - - offsetof(struct scoutfs_btree_block, item_offs[nr]); -} - -/* number of contig bytes free after reclaiming free amongst items */ -static inline unsigned int reclaimable_free(struct scoutfs_btree_block *bt) -{ - return contig_free(bt) + le16_to_cpu(bt->free_reclaim); -} - -/* all bytes used by item offsets, headers, and values */ -static inline unsigned int used_total(struct scoutfs_btree_block *bt) -{ - return SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block) - - reclaimable_free(bt); -} - -static inline struct scoutfs_btree_item * -off_item(struct scoutfs_btree_block *bt, __le16 off) -{ - return (void *)bt + le16_to_cpu(off); -} - -static inline struct scoutfs_btree_item * -pos_item(struct scoutfs_btree_block *bt, unsigned int pos) -{ - return off_item(bt, bt->item_offs[pos]); -} - -static inline struct scoutfs_key *greatest_key(struct scoutfs_btree_block *bt) -{ - unsigned int nr = le16_to_cpu(bt->nr_items); - - return &pos_item(bt, nr - 1)->key; -} - -/* - * Copy as much of the item as fits in the value vector. The min of the - * value vec length and the item length is returned, including possibly - * 0. - */ -static int copy_to_val(struct scoutfs_btree_val *val, - struct scoutfs_btree_item *item) -{ - size_t val_len = le16_to_cpu(item->val_len); - char *val_ptr = item->val; - struct kvec *kv; - size_t bytes; - size_t off; - int i; - - /* - * Corruption check, right now we just return -EIO if the - * caller wants this. In the future we can grow this to do - * different things (go readonly, ignore, return error) based - * on the severity of the problem. - */ - /* XXX corruption */ - if (val->check_size_eq && val_len != scoutfs_btree_val_length(val)) - return -EIO; - if (val->check_size_lte && val_len > scoutfs_btree_val_length(val)) - return -EOVERFLOW; - - for (i = 0, off = 0; val_len > 0 && i < ARRAY_SIZE(val->vec); i++) { - kv = &val->vec[i]; - - if (WARN_ON_ONCE(kv->iov_len && !kv->iov_base)) - return -EINVAL; - - bytes = min(val_len, kv->iov_len); - if (bytes) - memcpy(kv->iov_base, val_ptr + off, bytes); - - val_len -= bytes; - off += bytes; - } - - return off; -} - -/* - * Copy the caller's value vector into the item in the tree block. This - * is only called when the item should exactly match the value vector. - * - * -EINVAL is returned if the lengths don't match. 
- */ -static int copy_to_item(struct scoutfs_btree_item *item, - struct scoutfs_btree_val *val) -{ - size_t val_len = le16_to_cpu(item->val_len); - char *val_ptr = item->val; - struct kvec *kv; - size_t bytes; - int i; - - if (val_len != scoutfs_btree_val_length(val)) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(val->vec); i++) { - kv = &val->vec[i]; - - if (WARN_ON_ONCE(kv->iov_len && !kv->iov_base)) - return -EINVAL; - - bytes = min(val_len, kv->iov_len); - if (bytes) - memcpy(val_ptr, kv->iov_base, bytes); - - val_len -= bytes; - val_ptr += bytes; - } - - return 0; -} - -/* - * Returns the sorted item position that an item with the given key - * should occupy. - * - * It sets *cmp to the final comparison of the given key and the - * position's item key. - * - * If the given key is greater then all items' keys then the number of - * items can be returned. Callers need to be careful to test for this - * invalid index. - */ -static int find_pos(struct scoutfs_btree_block *bt, struct scoutfs_key *key, - int *cmp) -{ - unsigned int start = 0; - unsigned int end = le16_to_cpu(bt->nr_items); - unsigned int pos = 0; - - *cmp = -1; - - while (start < end) { - pos = start + (end - start) / 2; - - *cmp = scoutfs_key_cmp(key, &pos_item(bt, pos)->key); - if (*cmp < 0) { - end = pos; - } else if (*cmp > 0) { - start = ++pos; - *cmp = -1; - } else { - break; - } - } - - return pos; -} - -/* move a number of contigous elements from the src index to the dst index */ -#define memmove_arr(arr, dst, src, nr) \ - memmove(&(arr)[dst], &(arr)[src], (nr) * sizeof(*(arr))) - -/* - * Allocate and insert a new item into the block. The caller has made - * sure that there's room for everything. The caller is responsible for - * initializing the value. - */ -static struct scoutfs_btree_item *create_item(struct scoutfs_btree_block *bt, - unsigned int pos, - struct scoutfs_key *key, - unsigned int val_len) -{ - unsigned int nr = le16_to_cpu(bt->nr_items); - struct scoutfs_btree_item *item; - - if (pos < nr) - memmove_arr(bt->item_offs, pos + 1, pos, nr - pos); - - le16_add_cpu(&bt->free_end, -val_bytes(val_len)); - bt->item_offs[pos] = bt->free_end; - nr++; - bt->nr_items = cpu_to_le16(nr); - - BUG_ON(le16_to_cpu(bt->free_end) < - offsetof(struct scoutfs_btree_block, item_offs[nr])); - - item = pos_item(bt, pos); - item->key = *key; - item->seq = bt->hdr.seq; - item->val_len = cpu_to_le16(val_len); - - trace_printk("pos %u off %u\n", pos, le16_to_cpu(bt->item_offs[pos])); - - return item; -} - -/* - * Delete an item from a btree block. We record the amount of space it - * frees to later decide if we can satisfy an insertion by compaction - * instead of splitting. - */ -static void delete_item(struct scoutfs_btree_block *bt, unsigned int pos) -{ - struct scoutfs_btree_item *item = pos_item(bt, pos); - unsigned int nr = le16_to_cpu(bt->nr_items); - - trace_printk("pos %u off %u\n", pos, le16_to_cpu(bt->item_offs[pos])); - - if (pos < (nr - 1)) - memmove_arr(bt->item_offs, pos, pos + 1, nr - 1 - pos); - - le16_add_cpu(&bt->free_reclaim, item_bytes(item)); - nr--; - bt->nr_items = cpu_to_le16(nr); - - /* wipe deleted items to avoid leaking data */ - memset(item, 0, item_bytes(item)); -} - -/* - * Move items from a source block to a destination block. The caller - * tells us if we're moving from the tail of the source block right to - * the head of the destination block, or vice versa. We stop moving - * once we've moved enough bytes of items. 
- */ -static void move_items(struct scoutfs_btree_block *dst, - struct scoutfs_btree_block *src, bool move_right, - int to_move) -{ - struct scoutfs_btree_item *from; - struct scoutfs_btree_item *to; - unsigned int t; - unsigned int f; - - if (move_right) { - f = le16_to_cpu(src->nr_items) - 1; - t = 0; - } else { - f = 0; - t = le16_to_cpu(dst->nr_items); - } - - while (f < le16_to_cpu(src->nr_items) && to_move > 0) { - from = pos_item(src, f); - - to = create_item(dst, t, &from->key, - le16_to_cpu(from->val_len)); - - memcpy(to, from, item_bytes(from)); - to_move -= all_item_bytes(from); - - delete_item(src, f); - if (move_right) - f--; - else - t++; - } -} - -static int sort_key_cmp(const void *A, const void *B) -{ - struct scoutfs_btree_block *bt = scoutfs_block_data_from_contents(A); - const __le16 * __packed a = A; - const __le16 * __packed b = B; - - return scoutfs_key_cmp(&off_item(bt, *a)->key, &off_item(bt, *b)->key); -} - -static int sort_off_cmp(const void *A, const void *B) -{ - const __le16 * __packed a = A; - const __le16 * __packed b = B; - - return (int)le16_to_cpu(*a) - (int)le16_to_cpu(*b); -} - -static void sort_off_swap(void *A, void *B, int size) -{ - __le16 * __packed a = A; - __le16 * __packed b = B; - - swap(*a, *b); -} - -/* - * As items are deleted they create fragmented free space. Even if we - * indexed free space in the block it could still get sufficiently - * fragmented to force a split on insertion even though the two - * resulting blocks would have less than the minimum space consumed by - * items. - * - * We don't bother implementing free space indexing and addressing that - * corner case. Instead we track the number of bytes that could be - * reclaimed if we compacted the item space after the free_end offset. - * block. If this additional free space would satisfy an insertion then - * we compact the items instead of splitting the block. - * - * We move the free space to the center of the block by walking - * backwards through the items in offset order, moving items into free - * space between items towards the end of the block. - * - * We don't have specific metadata to either walk the items in offset - * order or to update the item offsets as we move items. We sort the - * item offset array to achieve both ends. First we sort it by offset - * so we can walk in reverse order. As we move items we update their - * position and then sort by keys once we're done. - * - * Compaction is only attempted during descent as we find a block that - * needs more or less free space. The caller has the parent locked for - * writing and there are no references to the items at this point so - * it's safe to scramble the block contents. - */ -static void compact_items(struct scoutfs_btree_block *bt) -{ - unsigned int nr = le16_to_cpu(bt->nr_items); - struct scoutfs_btree_item *from; - struct scoutfs_btree_item *to; - unsigned int bytes; - __le16 end; - int i; - - trace_printk("free_reclaim %u\n", le16_to_cpu(bt->free_reclaim)); - - sort(bt->item_offs, nr, sizeof(bt->item_offs[0]), - sort_off_cmp, sort_off_swap); - - end = cpu_to_le16(SCOUTFS_BLOCK_SIZE); - - for (i = nr - 1; i >= 0; i--) { - from = pos_item(bt, i); - - bytes = item_bytes(from); - le16_add_cpu(&end, -bytes); - to = off_item(bt, end); - bt->item_offs[i] = end; - - if (from != to) - memmove(to, from, bytes); - } - - bt->free_end = end; - bt->free_reclaim = 0; - - sort(bt->item_offs, nr, sizeof(bt->item_offs[0]), - sort_key_cmp, sort_off_swap); -} - - -/* - * Let's talk about btree locking. 
- * - * The main metadata btree has lots of callers who want concurrency. - * They have their own locks that protect multi item consistency -- say - * an inode's i_mutex protecting the items related to a given inode. - * But it's our responsibility to lock the btree itself. - * - * Our btree operations are implemented with a single walk down the - * tree. This gives us the opportunity to cascade block locks down the - * tree. We first lock the root. Then we lock the first block and - * unlock the root. Then lock the next block and unlock the first - * block. And so on down the tree. After contention on the root and - * first block we have lots of concurrency down paths of the tree to the - * leaves. - * - * Merging during descent has to lock the sibling block that it's - * pulling items from. It has to acquire these nested locks in - * consistent tree order. - * - * The cow tree updates let us skip block locking entirely for stable - * blocks because they're read only. All the blocks in the stable - * super tree are stable so we don't have to lock that tree at all. - * We let the block layer use the header's seq to avoid locking - * stable blocks. - * - * lockdep has to not be freaked out by all of this. The cascading - * block locks really make it angry without annotation so we add classes - * for each level and use nested subclasses for the locking of siblings - * during merge. - */ - -static void set_block_lock_class(struct scoutfs_block *bl, int level) -{ -#ifdef CONFIG_LOCKDEP - static struct lock_class_key tree_depth_classes[SCOUTFS_BTREE_MAX_DEPTH]; - - scoutfs_block_set_lock_class(bl, &tree_depth_classes[level]); -#endif -} - -static void lock_tree_block(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_block *bl, bool write, int subclass) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - - if (root == &sbi->super.btree_root) { - if (bl) { - scoutfs_block_lock(bl, write, subclass); - } else { - if (write) - down_write(&sbi->btree_rwsem); - else - down_read(&sbi->btree_rwsem); - } - } -} - -static void unlock_tree_block(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_block *bl, bool write) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - - if (root == &sbi->super.btree_root) { - if (bl) { - scoutfs_block_unlock(bl, write); - } else { - if (write) - up_write(&sbi->btree_rwsem); - else - up_read(&sbi->btree_rwsem); - } - } -} - -/* - * Allocate and initialize a new tree block. The caller adds references - * to it. - */ -static struct scoutfs_block *alloc_tree_block(struct super_block *sb, int level) -{ - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - - bl = scoutfs_block_dirty_alloc(sb); - if (!IS_ERR(bl)) { - bt = scoutfs_block_data(bl); - - bt->free_end = cpu_to_le16(SCOUTFS_BLOCK_SIZE); - bt->free_reclaim = 0; - bt->nr_items = 0; - - set_block_lock_class(bl, level); - } - - return bl; -} - -/* the caller has ensured that the free must succeed */ -static void free_tree_block(struct super_block *sb, struct scoutfs_block *bl) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct scoutfs_btree_block *bt = scoutfs_block_data(bl); - int err; - - BUG_ON(bt->hdr.seq != super->hdr.seq); - - scoutfs_block_forget(bl); - err = scoutfs_buddy_free(sb, bt->hdr.seq, - le64_to_cpu(bt->hdr.blkno), 0); - BUG_ON(err); -} - -/* - * Allocate a new tree block and point the root at it. The caller - * is responsible for the items in the new root block. 
- */ -static struct scoutfs_block *grow_tree(struct super_block *sb, - struct scoutfs_btree_root *root) -{ - struct scoutfs_block_header *hdr; - struct scoutfs_block *bl; - - bl = alloc_tree_block(sb, root->height); - if (!IS_ERR(bl)) { - hdr = scoutfs_block_data(bl); - - root->height++; - root->ref.blkno = hdr->blkno; - root->ref.seq = hdr->seq; - - set_block_lock_class(bl, root->height - 1); - } - - return bl; -} - -static struct scoutfs_block *get_block_ref(struct super_block *sb, int level, - struct scoutfs_block_ref *ref, - bool dirty) -{ - struct scoutfs_block *bl; - - if (dirty) - bl = scoutfs_block_dirty_ref(sb, ref); - else - bl = scoutfs_block_read_ref(sb, ref); - - if (!IS_ERR(bl)) - set_block_lock_class(bl, level); - - return bl; -} - -/* - * Create a new item in the parent which references the child. The caller - * specifies the key in the item that describes the items in the child. - */ -static void create_parent_item(struct scoutfs_btree_block *parent, - unsigned int pos, - struct scoutfs_btree_block *child, - struct scoutfs_key *key) -{ - struct scoutfs_btree_item *item; - struct scoutfs_block_ref ref = { - .blkno = child->hdr.blkno, - .seq = child->hdr.seq, - }; - - item = create_item(parent, pos, key, sizeof(ref)); - memcpy(&item->val, &ref, sizeof(ref)); -} - -/* - * See if we need to split this block while descending for insertion so - * that we have enough space to insert. - * - * Parent blocks need enough space for a new item and child ref if a - * child block splits. Leaf blocks need enough space to insert the new - * item with its value. - * - * We split to the left so that the greatest key in the existing block - * doesn't change so we don't have to update the key in its parent item. - * - * If the search key falls in the new split block then we return it to - * the caller to walk through. - * - * The caller has the parent (or root) and our block locked. We don't - * have to lock the blocks we allocate while we have the references to - * them locked. We only need to lock the new sibling if we return it - * instead of our given block for the caller to continue descent. 
- */ -static struct scoutfs_block *try_split(struct super_block *sb, - struct scoutfs_btree_root *root, - int level, struct scoutfs_key *key, - unsigned int val_len, - struct scoutfs_btree_block *parent, - unsigned int parent_pos, - struct scoutfs_block *right_bl) -{ - struct scoutfs_btree_block *right = scoutfs_block_data(right_bl); - struct scoutfs_btree_block *left; - struct scoutfs_block *left_bl; - struct scoutfs_block *par_bl = NULL; - struct scoutfs_key maximal; - unsigned int all_bytes; - - if (level) - val_len = sizeof(struct scoutfs_block_ref); - all_bytes = all_val_bytes(val_len); - - if (contig_free(right) >= all_bytes) - return right_bl; - - if (reclaimable_free(right) >= all_bytes) { - compact_items(right); - return right_bl; - } - - /* alloc split neighbour first to avoid unwinding tree growth */ - left_bl = alloc_tree_block(sb, level); - if (IS_ERR(left_bl)) { - unlock_tree_block(sb, root, right_bl, true); - scoutfs_block_put(right_bl); - return left_bl; - } - left = scoutfs_block_data(left_bl); - - if (!parent) { - par_bl = grow_tree(sb, root); - if (IS_ERR(par_bl)) { - free_tree_block(sb, left_bl); - scoutfs_block_put(left_bl); - unlock_tree_block(sb, root, right_bl, true); - scoutfs_block_put(right_bl); - return par_bl; - } - - parent = scoutfs_block_data(par_bl); - parent_pos = 0; - - scoutfs_set_max_key(&maximal); - create_parent_item(parent, parent_pos, right, &maximal); - } - - move_items(left, right, false, used_total(right) / 2); - create_parent_item(parent, parent_pos, left, greatest_key(left)); - parent_pos++; /* not that anything uses it again :P */ - - if (scoutfs_key_cmp(key, greatest_key(left)) <= 0) { - /* insertion will go to the new left block */ - unlock_tree_block(sb, root, right_bl, true); - lock_tree_block(sb, root, left_bl, true, 0); - swap(right_bl, left_bl); - } else { - /* insertion will still go through us, might need to compact */ - if (contig_free(right) < all_bytes) - compact_items(right); - } - - scoutfs_block_put(par_bl); - scoutfs_block_put(left_bl); - - return right_bl; -} - -/* - * This is called during descent for deletion when we have a parent and - * might need to merge items from a sibling block if this block has too - * much free space. Eventually we'll be able to fit all of the - * sibling's items in our free space which lets us delete the sibling - * block. - * - * The error handling here is a little weird. We're returning an - * ERR_PTR buffer to match splitting so that the walk can handle errors - * from both easily. We have to unlock and release our buffer to return - * an error. - * - * The caller locks the parent and our given block. We need to - * lock sibling blocks in consistent tree order. Our common case - * has us pulling from our left sibling so we prefer to lock blocks - * from right to left. Splitting doesn't hold both sibling locks. - * - * We free sibling or parent btree block blknos if we drain them of items. - * They're dirtied either by descent or before we start migrating items - * so freeing their blkno must succeed. 
- * - * XXX this could more cleverly chose a merge candidate sibling - */ -static struct scoutfs_block *try_merge(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_block *par_bl, - int level, unsigned int pos, - struct scoutfs_block *bl) -{ - struct scoutfs_btree_block *parent = scoutfs_block_data(par_bl); - struct scoutfs_btree_block *bt = scoutfs_block_data(bl); - struct scoutfs_btree_item *sib_item; - struct scoutfs_btree_block *sib_bt; - struct scoutfs_block *sib_bl; - unsigned int sib_pos; - bool move_right; - int to_move; - - if (reclaimable_free(bt) <= SCOUTFS_BTREE_FREE_LIMIT) - return bl; - - /* move items right into our block if we have a left sibling */ - if (pos) { - sib_pos = pos - 1; - move_right = true; - } else { - sib_pos = pos + 1; - move_right = false; - } - sib_item = pos_item(parent, sib_pos); - - sib_bl = get_block_ref(sb, level, (void *)sib_item->val, true); - if (IS_ERR(sib_bl)) { - /* XXX do we need to unlock this? don't think so */ - scoutfs_block_put(bl); - return sib_bl; - } - sib_bt = scoutfs_block_data(sib_bl); - - if (!move_right) { - unlock_tree_block(sb, root, bl, true); - lock_tree_block(sb, root, sib_bl, true, 0); - lock_tree_block(sb, root, bl, true, 1); - - if (reclaimable_free(bt) <= SCOUTFS_BTREE_FREE_LIMIT) { - unlock_tree_block(sb, root, sib_bl, true); - scoutfs_block_put(sib_bl); - return bl; - } - } else { - lock_tree_block(sb, root, sib_bl, true, 1); - } - - if (used_total(sib_bt) <= reclaimable_free(bt)) - to_move = used_total(sib_bt); - else - to_move = reclaimable_free(bt) - SCOUTFS_BTREE_FREE_LIMIT; - - /* - * Make sure there's room to move a max size item if it's the - * next in line when we only have one byte left to try and move. - * - * XXX This is getting awfully fiddly. Should we be refactoring - * item insertion/deletion to do this for us? - */ - if (contig_free(bt) < (to_move + (SCOUTFS_MAX_ITEM_LEN - 1))) - compact_items(bt); - - trace_printk("sib_pos %d move_right %u to_move %u\n", - sib_pos, move_right, to_move); - - move_items(bt, sib_bt, move_right, to_move); - - /* update our parent's ref if we changed our greatest key */ - if (!move_right) - pos_item(parent, pos)->key = *greatest_key(bt); - - /* delete an empty sib or update if we changed its greatest key */ - if (le16_to_cpu(sib_bt->nr_items) == 0) { - delete_item(parent, sib_pos); - free_tree_block(sb, sib_bl); - } else if (move_right) { - sib_item->key = *greatest_key(sib_bt); - } - - /* and finally shrink the tree if our parent is the root with 1 */ - if (le16_to_cpu(parent->nr_items) == 1) { - root->height--; - root->ref.blkno = bt->hdr.blkno; - root->ref.seq = bt->hdr.seq; - free_tree_block(sb, par_bl); - /* caller just unlocks and drops parent */ - } - - unlock_tree_block(sb, root, sib_bl, true); - scoutfs_block_put(sib_bl); - - return bl; -} - -enum { - WALK_INSERT = 1, - WALK_DELETE, - WALK_NEXT_SEQ, - WALK_DIRTY, -}; - -static u64 item_block_ref_seq(struct scoutfs_btree_item *item) -{ - struct scoutfs_block_ref *ref = (void *)item->val; - - return le64_to_cpu(ref->seq); -} - -/* - * Return true if we should skip this item while iterating by sequence - * number. If it's a parent then we test the block ref's seq, if it's a - * leaf item then we check the item's seq. 
- */ -static bool skip_pos_seq(struct scoutfs_btree_block *bt, unsigned int pos, - int level, u64 seq, int op) -{ - struct scoutfs_btree_item *item; - - if (op != WALK_NEXT_SEQ || pos >= le16_to_cpu(bt->nr_items)) - return false; - - item = pos_item(bt, pos); - - return ((level > 0 && item_block_ref_seq(item) < seq) || - (level == 0 && le64_to_cpu(item->seq) < seq)); -} - -/* - * Return the next sorted item position, possibly skipping those with - * sequence numbers less than the desired sequence number. - */ -static unsigned int next_pos_seq(struct scoutfs_btree_block *bt, - unsigned int pos, int level, u64 seq, int op) -{ - do { - pos++; - } while (skip_pos_seq(bt, pos, level, seq, op)); - - return pos; -} - -/* - * Return the first item after the given key, possibly skipping those - * with sequence numbers less than the desired sequence number. - */ -static unsigned int find_pos_after_seq(struct scoutfs_btree_block *bt, - struct scoutfs_key *key, int level, - u64 seq, int op) -{ - unsigned int pos; - int cmp; - - pos = find_pos(bt, key, &cmp); - if (skip_pos_seq(bt, pos, level, seq, op)) - pos = next_pos_seq(bt, pos, level, seq, op); - - return pos; -} - -/* - * Verify that the btree block isn't corrupt. This is way too expensive - * to do for each block access though that's very helpful for debugging - * btree block corruption. - * - * It should be done the first time we read blocks and it doing it for - * every block access should be hidden behind runtime options. - * - * XXX - * - make sure items don't overlap - * - make sure offs point to live items - * - do things with level - * - see if item keys make sense - */ -static int verify_btree_block(struct scoutfs_btree_block *bt, int level, - struct scoutfs_key *small, - struct scoutfs_key *large) -{ - struct scoutfs_btree_item *item; - struct scoutfs_key *prev; - unsigned int bytes = 0; - unsigned int after_offs = sizeof(struct scoutfs_btree_block); - unsigned int first_off; - unsigned int off; - unsigned int nr; - unsigned int i = 0; - int bad = 1; - - nr = le16_to_cpu(bt->nr_items); - if (nr == 0) - goto out; - - if (nr > SCOUTFS_BTREE_MAX_ITEMS) { - nr = SCOUTFS_BTREE_MAX_ITEMS; - goto out; - } - - after_offs = offsetof(struct scoutfs_btree_block, item_offs[nr]); - first_off = SCOUTFS_BLOCK_SIZE; - - for (i = 0; i < nr; i++) { - - off = le16_to_cpu(bt->item_offs[i]); - if (off >= SCOUTFS_BLOCK_SIZE || off < after_offs) - goto out; - - first_off = min(first_off, off); - - item = pos_item(bt, i); - bytes += item_bytes(item); - - if ((i == 0 && scoutfs_key_cmp(&item->key, small) < 0) || - (i > 0 && scoutfs_key_cmp(&item->key, prev) <= 0) || - (i == (nr - 1) && scoutfs_key_cmp(&item->key, large) > 0)) - goto out; - - prev = &item->key; - } - - if (first_off < le16_to_cpu(bt->free_end)) - goto out; - - if ((le16_to_cpu(bt->free_end) + bytes + - le16_to_cpu(bt->free_reclaim)) != SCOUTFS_BLOCK_SIZE) - goto out; - - bad = 0; -out: - if (bad) { - printk("bt %p blkno %llu level %d small "CKF" large "CKF" end %u reclaim %u nr %u (max %lu after %u bytes %u)\n", - bt, le64_to_cpu(bt->hdr.blkno), level, - CKA(small), CKA(large), le16_to_cpu(bt->free_end), - le16_to_cpu(bt->free_reclaim), bt->nr_items, - SCOUTFS_BTREE_MAX_ITEMS, after_offs, bytes); - for (i = 0; i < nr; i++) { - item = pos_item(bt, i); - off = le16_to_cpu(bt->item_offs[i]); - printk(" [%u] off %u key "CKF" len %u\n", - i, off, CKA(&item->key), - le16_to_cpu(item->val_len)); - } - BUG_ON(bad); - } - - return 0; -} - -/* - * Return the leaf block that should contain the given key. 
The caller - * is responsible for searching the leaf block and performing their - * operation. The block is returned locked for either reading or - * writing depending on the operation. - * - * As we descend through parent items we set prev_key or next_key to the - * last key in the previous sibling's block or to the first key in the - * next sibling's block, respectively. This is used by iteration to - * keep searching sibling blocks if their search key falls at the end of - * a leaf in their search direction. - */ -static struct scoutfs_block *btree_walk(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_key *prev_key, - struct scoutfs_key *next_key, - unsigned int val_len, u64 seq, int op) -{ - struct scoutfs_btree_block *parent = NULL; - struct scoutfs_block *par_bl = NULL; - struct scoutfs_block *bl = NULL; - struct scoutfs_btree_item *item = NULL; - struct scoutfs_block_ref *ref; - struct scoutfs_key small; - struct scoutfs_key large; - unsigned int level; - unsigned int pos = 0; - const bool dirty = op == WALK_INSERT || op == WALK_DELETE || - op == WALK_DIRTY; - int ret; - - /* no sibling blocks if we don't have parent blocks */ - if (next_key) - scoutfs_set_max_key(next_key); - if (prev_key) - scoutfs_key_set_zero(prev_key); - - lock_tree_block(sb, root, NULL, dirty, 0); - - ref = &root->ref; - level = root->height; - - if (!root->height) { - if (op == WALK_INSERT) { - bl = ERR_PTR(-ENOENT); - } else { - bl = grow_tree(sb, root); - if (!IS_ERR(bl)) { - lock_tree_block(sb, root, bl, dirty, 0); - unlock_tree_block(sb, root, NULL, dirty); - } - } - goto out; - } - - - /* skip the whole tree if the root ref's seq is old */ - if (op == WALK_NEXT_SEQ && le64_to_cpu(ref->seq) < seq) { - bl = ERR_PTR(-ENOENT); - goto out; - } - - scoutfs_set_key(&small, 0, 0, 0); - scoutfs_set_key(&large, ~0ULL, ~0, ~0ULL); - - while (level--) { - /* XXX hmm, need to think about retry */ - bl = get_block_ref(sb, level, ref, dirty); - if (IS_ERR(bl)) - break; - - /* XXX enable this */ - ret = 0 && verify_btree_block(scoutfs_block_data(bl), level, - &small, &large); - if (ret) { - scoutfs_block_put(bl); - bl = ERR_PTR(ret); - break; - } - - lock_tree_block(sb, root, bl, dirty, 0); - - if (op == WALK_INSERT) - bl = try_split(sb, root, level, key, val_len, parent, - pos, bl); - if ((op == WALK_DELETE) && parent) - bl = try_merge(sb, root, par_bl, level, pos, bl); - if (IS_ERR(bl)) - break; - - unlock_tree_block(sb, root, par_bl, dirty); - - if (!level) - break; - - scoutfs_block_put(par_bl); - par_bl = bl; - parent = scoutfs_block_data(par_bl); - - /* - * Find the parent item that references the next child - * block to search. If we're skipping items with old - * seqs then we might not have any child items to - * search. - */ - pos = find_pos_after_seq(parent, key, level, seq, op); - if (pos >= le16_to_cpu(parent->nr_items)) { - /* current block dropped as parent below */ - if (op == WALK_NEXT_SEQ) - bl = ERR_PTR(-ENOENT); - else - bl = ERR_PTR(-EIO); - break; - } - - /* XXX verify sane length */ - item = pos_item(parent, pos); - ref = (void *)item->val; - - /* - * Update the keys that iterators should continue - * searching from. Keep in mind that iteration is read - * only so the parent item won't be changed splitting or - * merging. 
- */ - if (next_key) { - *next_key = item->key; - scoutfs_inc_key(next_key); - } - - if (pos) { - small = pos_item(parent, pos - 1)->key; - if (prev_key) - *prev_key = small; - } - large = item->key; - } - -out: - if (IS_ERR(bl)) - unlock_tree_block(sb, root, par_bl, dirty); - scoutfs_block_put(par_bl); - - return bl; -} - -/* - * Copy the given value identified by the given key into the caller's - * buffer. The number of bytes copied is returned, -ENOENT if the key - * wasn't found, or -errno on errors. - */ -int scoutfs_btree_lookup(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - unsigned int pos; - int cmp; - int ret; - - trace_scoutfs_btree_lookup(sb, key, scoutfs_btree_val_length(val)); - - bl = btree_walk(sb, root, key, NULL, NULL, 0, 0, 0); - if (IS_ERR(bl)) - return PTR_ERR(bl); - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - item = pos_item(bt, pos); - ret = copy_to_val(val, item); - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - - trace_printk("key "CKF" ret %d\n", CKA(key), ret); - - return ret; -} - -/* - * Insert a new item in the tree. - * - * 0 is returned on success. -EEXIST is returned if the key is already - * present in the tree. - * - * If no value pointer is given then the item is created with a zero - * length value. - */ -int scoutfs_btree_insert(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - unsigned int val_len; - int pos; - int cmp; - int ret; - - if (val) - val_len = scoutfs_btree_val_length(val); - else - val_len = 0; - - trace_scoutfs_btree_insert(sb, key, val_len); - - if (WARN_ON_ONCE(val_len > SCOUTFS_MAX_ITEM_LEN)) - return -EINVAL; - - bl = btree_walk(sb, root, key, NULL, NULL, val_len, 0, WALK_INSERT); - if (IS_ERR(bl)) - return PTR_ERR(bl); - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, key, &cmp); - if (cmp) { - item = create_item(bt, pos, key, val_len); - if (val) - ret = copy_to_item(item, val); - else - ret = 0; - } else { - ret = -EEXIST; - } - - unlock_tree_block(sb, root, bl, true); - scoutfs_block_put(bl); - - return ret; -} - -/* - * Delete an item from the tree. -ENOENT is returned if the key isn't - * found. - */ -int scoutfs_btree_delete(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key) -{ - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - int pos; - int cmp; - int ret; - - trace_scoutfs_btree_delete(sb, key, 0); - - bl = btree_walk(sb, root, key, NULL, NULL, 0, 0, WALK_DELETE); - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - goto out; - } - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - delete_item(bt, pos); - ret = 0; - - /* XXX this locking is broken.. hold root rwsem? */ - - /* delete the final block in the tree */ - if (bt->nr_items == 0) { - root->height = 0; - root->ref.blkno = 0; - root->ref.seq = 0; - - free_tree_block(sb, bl); - } - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, true); - scoutfs_block_put(bl); - -out: - trace_printk("key "CKF" ret %d\n", CKA(key), ret); - return ret; -} - -/* - * Find the next key in the tree starting from 'first', and ending at - * 'last'. 
'found', 'found_seq', and 'val' are set to the discovered - * item if they're provided. - * - * The caller can limit results to items with a sequence number greater - * than or equal to their sequence number. - * - * The only tricky bit is that they key we're searching for might not - * exist in the tree. We can get to the leaf and find that there are no - * greater items in the leaf. We have to search again from the keys - * greater than the parent item's keys which the walk gives us. We also - * star the search over from this next key if walking while filtering - * based on seqs terminates early. - * - * Returns the bytes copied into the value (0 if not provided), -ENOENT - * if there is no item past first until last, or -errno on errors. - * - * It's a common pattern to use the same key for first and found so we're - * careful to copy first before we modify found. - */ -static int btree_next(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - u64 seq, int op, struct scoutfs_key *found, - u64 *found_seq, struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_key start = *first; - struct scoutfs_key key = *first; - struct scoutfs_key next_key; - struct scoutfs_block *bl; - int pos; - int ret; - - trace_printk("finding next first "CKF" last "CKF"\n", - CKA(&start), CKA(last)); - - /* find the leaf that contains the next item after the key */ - ret = -ENOENT; - while (scoutfs_key_cmp(&key, last) <= 0) { - - bl = btree_walk(sb, root, &key, NULL, &next_key, 0, seq, op); - - /* next seq walks can terminate in parents with old seqs */ - if (op == WALK_NEXT_SEQ && bl == ERR_PTR(-ENOENT)) { - key = next_key; - continue; - } - - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - break; - } - bt = scoutfs_block_data(bl); - - /* keep trying leaves until next_key passes last */ - pos = find_pos_after_seq(bt, &key, 0, seq, op); - if (pos >= le16_to_cpu(bt->nr_items)) { - key = next_key; - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - continue; - } - - item = pos_item(bt, pos); - if (scoutfs_key_cmp(&item->key, last) <= 0) { - *found = item->key; - if (found_seq) - *found_seq = le64_to_cpu(item->seq); - if (val) - ret = copy_to_val(val, item); - else - ret = 0; - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - break; - } - - trace_printk("next first "CKF" last "CKF" found "CKF" ret %d\n", - CKA(&start), CKA(last), CKA(found), ret); - return ret; -} - -int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - struct scoutfs_key *found, - struct scoutfs_btree_val *val) -{ - trace_scoutfs_btree_next(sb, first, last); - - return btree_next(sb, root, first, last, 0, 0, found, NULL, val); -} - -int scoutfs_btree_since(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - u64 seq, struct scoutfs_key *found, u64 *found_seq, - struct scoutfs_btree_val *val) -{ - trace_scoutfs_btree_since(sb, first, last); - - return btree_next(sb, root, first, last, seq, WALK_NEXT_SEQ, - found, found_seq, val); -} - -/* - * Find the greatest key that is >= first and <= last, starting at last. - * For each search cursor key we descend to the leaf and find its - * position in the items. 
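For the reverse direction, a minimal hypothetical caller could count the items in a range by walking backwards with scoutfs_btree_prev(); the helper name is made up, the key helpers come from the removed key.h, and passing NULL for found_seq and val simply skips those copies:

static int example_count_reverse(struct super_block *sb,
                                 struct scoutfs_btree_root *root,
                                 struct scoutfs_key *first,
                                 struct scoutfs_key *last, u64 *nr)
{
        struct scoutfs_key key = *last;
        struct scoutfs_key found;
        int ret;

        *nr = 0;

        for (;;) {
                ret = scoutfs_btree_prev(sb, root, first, &key, &found,
                                         NULL, NULL);
                if (ret == -ENOENT)
                        return 0;       /* nothing left at or after first */
                if (ret < 0)
                        return ret;

                (*nr)++;

                if (scoutfs_key_cmp(&found, first) == 0)
                        return 0;

                /* continue the search before the item we just counted */
                key = found;
                scoutfs_dec_key(&key);
        }
}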
The item binary search returns the position - * that the key would be inserted into, so if we didn't find the key - * specifically we go to the previous position. The btree walk gives us - * the previous key to search from if we fall off the front of the - * block. - * - * This doesn't support filtering the tree traversal by seqs. - */ -int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - struct scoutfs_key *found, u64 *found_seq, - struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_key key = *last; - struct scoutfs_key prev_key; - struct scoutfs_block *bl; - int pos; - int cmp; - int ret; - - trace_scoutfs_btree_prev(sb, first, last); - - /* find the leaf that contains the next item after the key */ - ret = -ENOENT; - while (scoutfs_key_cmp(&key, first) >= 0) { - - bl = btree_walk(sb, root, &key, &prev_key, NULL, 0, 0, 0); - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - break; - } - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, &key, &cmp); - - /* walk to the prev leaf if we hit the front of this leaf */ - if (pos == 0 && cmp != 0) { - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - if (scoutfs_key_is_zero(&key)) - break; - key = prev_key; - continue; - } - - /* we want the item before a non-matching position */ - if (pos && cmp) - pos--; - - /* return the item if it's still within our first bound */ - item = pos_item(bt, pos); - if (cmp == 0 || scoutfs_key_cmp(&item->key, first) >= 0) { - *found = item->key; - if (found_seq) - *found_seq = le64_to_cpu(item->seq); - if (val) - ret = copy_to_val(val, item); - else - ret = 0; - } - - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - break; - } - - return ret; -} - -/* - * Ensure that the blocks that lead to the item with the given key are - * dirty. caller can hold a transaction to pin the dirty blocks and - * guarantee that later updates of the item will succeed. - * - * <0 is returned on error, including -ENOENT if the key isn't present. - */ -int scoutfs_btree_dirty(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key) -{ - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - int cmp; - int ret; - - trace_scoutfs_btree_dirty(sb, key, 0); - - bl = btree_walk(sb, root, key, NULL, NULL, 0, 0, WALK_DIRTY); - if (IS_ERR(bl)) - return PTR_ERR(bl); - bt = scoutfs_block_data(bl); - - find_pos(bt, key, &cmp); - if (cmp == 0) { - ret = 0; - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, true); - scoutfs_block_put(bl); - - trace_printk("key "CKF" ret %d\n", CKA(key), ret); - - return ret; -} - -/* - * This is guaranteed not to fail if the caller has already dirtied the - * block that contains the item in the current transaction. - * - * 0 is returned on success. -EINVAL is returned if the caller's value - * length doesn't match the existing item's value length. - */ -int scoutfs_btree_update(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - int pos; - int cmp; - int ret; - - trace_scoutfs_btree_update(sb, key, - val ? 
scoutfs_btree_val_length(val) : 0); - - bl = btree_walk(sb, root, key, NULL, NULL, 0, 0, WALK_DIRTY); - if (IS_ERR(bl)) - return PTR_ERR(bl); - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - item = pos_item(bt, pos); - ret = copy_to_item(item, val); - if (ret == 0) - item->seq = bt->hdr.seq; - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, true); - scoutfs_block_put(bl); - - return ret; -} - -/* - * Set hole to a missing key in the caller's range. - * - * 0 is returned if we find a missing key, -ENOSPC is returned if all - * the keys in the range are present in the tree, and -errno is returned - * if we saw an error. - * - * We try to find the first key in the range. If the next key is past - * the first key then we return the key before the found key. This will - * tend to let us find the hole with one btree search. - * - * We keep searching as long as we keep finding the first key and will - * return -ENOSPC if we fall off the end of the range doing so. - */ -int scoutfs_btree_hole(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, - struct scoutfs_key *last, struct scoutfs_key *hole) -{ - struct scoutfs_key key = *first; - struct scoutfs_key found; - int ret; - - trace_scoutfs_btree_hole(sb, first, last); - - if (WARN_ON_ONCE(scoutfs_key_cmp(first, last) > 0)) { - scoutfs_key_set_zero(hole); - return -EINVAL; - } - - /* search as long as we keep finding our first key */ - do { - ret = scoutfs_btree_next(sb, root, &key, last, &found, NULL); - } while (ret == 0 && - scoutfs_key_cmp(&found, &key) == 0 && - (scoutfs_inc_key(&key), ret = -ENOSPC, - scoutfs_key_cmp(&key, last) <= 0)); - - if (ret == 0) { - *hole = found; - scoutfs_dec_key(hole); - } else if (ret == -ENOENT) { - *hole = *last; - ret = 0; - } - - trace_printk("first "CKF" last "CKF" hole "CKF" ret %d\n", - CKA(first), CKA(last), CKA(hole), ret); - - return ret; -} diff --git a/kmod/src/btree.h b/kmod/src/btree.h deleted file mode 100644 index dec2310c..00000000 --- a/kmod/src/btree.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef _SCOUTFS_BTREE_H_ -#define _SCOUTFS_BTREE_H_ - -#include - -struct scoutfs_btree_val { - struct kvec vec[3]; - unsigned int check_size_eq:1; - unsigned int check_size_lte:1; -}; - -static inline void __scoutfs_btree_init_val(struct scoutfs_btree_val *val, - void *ptr0, unsigned int len0, - void *ptr1, unsigned int len1, - void *ptr2, unsigned int len2) -{ - *val = (struct scoutfs_btree_val) { - { { ptr0, len0 }, { ptr1, len1 }, { ptr2, len2 } } - }; -} - -#define _scoutfs_btree_init_val(v, p0, l0, p1, l1, p2, l2, ...) \ - __scoutfs_btree_init_val(v, p0, l0, p1, l1, p2, l2) - -/* - * Provide a nice variadic initialization function without having to - * iterate over the callers arg types. We play some macro games to pad - * out the callers ptr/len pairs to the full possible number. This will - * produce confusing errors if an odd number of arguments is given and - * the padded ptr/length types aren't compatible with the fixed - * arguments in the static inline. - */ -#define scoutfs_btree_init_val(val, ...) 
\ - _scoutfs_btree_init_val(val, __VA_ARGS__, NULL, 0, NULL, 0, NULL, 0) - -static inline int scoutfs_btree_val_length(struct scoutfs_btree_val *val) -{ - - return iov_length((struct iovec *)val->vec, ARRAY_SIZE(val->vec)); -} - -int scoutfs_btree_lookup(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val); -int scoutfs_btree_insert(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val); -int scoutfs_btree_delete(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key); -int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - struct scoutfs_key *found, - struct scoutfs_btree_val *val); -int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - struct scoutfs_key *found, u64 *found_seq, - struct scoutfs_btree_val *val); -int scoutfs_btree_dirty(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key); -int scoutfs_btree_update(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val); -int scoutfs_btree_hole(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, - struct scoutfs_key *last, struct scoutfs_key *hole); -int scoutfs_btree_since(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - u64 seq, struct scoutfs_key *found, u64 *found_seq, - struct scoutfs_btree_val *val); - -#endif diff --git a/kmod/src/buddy.c b/kmod/src/buddy.c deleted file mode 100644 index 9f4a16fd..00000000 --- a/kmod/src/buddy.c +++ /dev/null @@ -1,1063 +0,0 @@ -/* - * Copyright (C) 2016 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include - -#include "super.h" -#include "format.h" -#include "block.h" -#include "buddy.h" -#include "scoutfs_trace.h" - -/* - * scoutfs uses buddy bitmaps in an augmented radix to index free space. - * - * At the heart of the allocator are the buddy bitmaps in the radix - * leaves. For a given region of blocks there are bitmaps for each - * power of two order of blocks that can be allocated. N bits record - * whether each order 0 size block region is allocated or freed, then - * N/2 bits describe order 1 regions that span pairs of order 0 blocks, - * and so on. This ends up using two bits in the bitmaps for each - * device block that's managed. - * - * An order bit is set when it is free. All of its lower order bits - * will be clear. To allocate we clear a bit. A partial allocation - * clears the higher order bit and each buddy for each lower order until - * the allocated order. Freeing sets an order bit. Then if it's buddy - * order is also set we clear both and set their higher order bit. This - * proceeds to the highest order. - * - * Each buddy block records the first set bit in each order bitmap. 
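The per-order layout this implies can be sketched in userspace; N and ORDERS below are stand-ins for SCOUTFS_BUDDY_ORDER0_BITS and SCOUTFS_BUDDY_ORDERS rather than the real format constants, and example_order_off() mirrors the order_off() helper further down in this file:

#include <stdio.h>

#define N       1024    /* stands in for SCOUTFS_BUDDY_ORDER0_BITS */
#define ORDERS  11      /* stands in for SCOUTFS_BUDDY_ORDERS */

/* mirrors order_off(): order 0 starts at bit 0, each higher order
 * starts where the previous, half-sized bitmap ends */
static int example_order_off(int order)
{
        if (order == 0)
                return 0;
        return (2 * N) - (N / (1 << (order - 1)));
}

int main(void)
{
        int total = 0;
        int order;

        for (order = 0; order < ORDERS; order++) {
                int nbits = N >> order;

                printf("order %2d: bits [%4d, %4d)\n", order,
                       example_order_off(order),
                       example_order_off(order) + nbits);
                total += nbits;
        }
        /* 2 * N - 1 bits in all: the two bits per managed block noted above */
        printf("total %d\n", total);
        return 0;
}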
As - * bits are set they update these first set records if they're before - * the previous value. As bits are cleared we find the next set if it - * was the first. - * - * These buddy bitmap blocks that each fully describe a region of blocks - * are assembled into a radix tree. Each reference to a leaf block in - * parent blocks have a bitmap of the orders that are free in its leaf - * block. The parent blocks then also record the first slot that has - * each order bit set in its child references. This indexing holds all - * the way to the root. This lets us quickly determine an order that - * will satisfy an allocation and descend to the leaf that contains the - * first free region of that order. - * - * These buddy blocks themselves are located in preallocated space. Each - * logical position in the tree occupies two blocks on the device. In - * each transaction we use the currently referenced block to cow into - * its partner. Since the block positions are calculated the block - * references only need a bit to specify which of the pair is being - * referenced. The number of blocks needed is precisely calculated by - * taking the number of leaf blocks needed to track the device blocks - * and dividing by the radix fanout until we have a single root block. - * - * Each aligned block allocation order is stored in a path down the - * radix to a leaf that's a function of the block offset. This lets us - * ensure that we can allocate or free a given allocation order by - * dirtying those blocks. If we've allocated an order in a transaction - * it can always be freed (or re-allocated) while the transaction holds - * the dirty buddy blocks. - * - * We use that property to ensure that frees of stable data don't - * satisfy allocation until the next transaction. When we free stable - * data we dirty the path to its position in the radix and record the - * free in an rbtree. We can then apply these frees as we commit the - * transaction. If the transaction fails we can undo the frees and let - * the file system carry on. We'll try to reapply the frees before the - * next transaction commits. The allocator never introduces - * unrecoverable errors. - * - * The radix isn't fully populated when it's created. mkfs only - * initializes the two paths down the tree that have partially - * initialized parent slots and leaf bitmaps. The path down the left - * spine has the initial file system blocks allocated. The path down - * the right spine can have partial parent slots and bits set in the - * leaf when device sizes aren't multiples of the leaf block bit count - * and radix fanout. The kernel then only has to initialize the rest of - * the buddy blocks blocks which have fully populated parent slots and - * leaf bitmaps. - * - * XXX - * - resize is going to be a thing. figure out that thing. 
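The "divide by the fanout until we have a single root block" sizing rule can be made concrete with a small userspace sketch; ORDER0_BITS and SLOTS here are illustrative stand-ins, not the real format constants:

#include <stdint.h>
#include <stdio.h>

#define ORDER0_BITS     16000ULL        /* stand-in */
#define SLOTS           500ULL          /* stand-in */

static uint64_t example_buddy_blocks(uint64_t device_blocks)
{
        uint64_t blocks = (device_blocks + ORDER0_BITS - 1) / ORDER0_BITS;
        uint64_t total = 0;

        for (;;) {
                total += blocks * 2;    /* each logical block is a cow pair */
                if (blocks == 1)
                        break;
                blocks = (blocks + SLOTS - 1) / SLOTS;
        }
        return total;
}

int main(void)
{
        uint64_t blocks = 1ULL << 28;   /* e.g. 2^28 managed blocks */

        printf("%llu buddy blocks\n",
               (unsigned long long)example_buddy_blocks(blocks));
        return 0;
}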
- */ - -struct buddy_info { - struct mutex mutex; - - atomic_t alloc_count; - struct rb_root pending_frees; - - /* max height given total blocks */ - u8 max_height; - /* the device blkno of the first block of a given level */ - u64 level_blkno[SCOUTFS_BUDDY_MAX_HEIGHT]; - /* blk divisor to find slot index at each level */ - u64 level_div[SCOUTFS_BUDDY_MAX_HEIGHT]; - - struct buddy_stack { - struct scoutfs_block *bl[SCOUTFS_BUDDY_MAX_HEIGHT]; - u16 sl[SCOUTFS_BUDDY_MAX_HEIGHT]; - int nr; - } stack; -}; - -/* the first device blkno covered by the buddy allocator */ -static u64 first_blkno(struct scoutfs_super_block *super) -{ - return SCOUTFS_BUDDY_BLKNO + le64_to_cpu(super->buddy_blocks); -} - -/* the last device blkno covered by the buddy allocator */ -static u64 last_blkno(struct scoutfs_super_block *super) -{ - return le64_to_cpu(super->total_blocks) - 1; -} - -/* the last relative blkno covered by the buddy allocator */ -static u64 last_blk(struct scoutfs_super_block *super) -{ - return last_blkno(super) - first_blkno(super); -} - -/* true when the device blkno is covered by the allocator */ -static bool device_blkno(struct scoutfs_super_block *super, u64 blkno) -{ - return blkno >= first_blkno(super) && blkno <= last_blkno(super); -} - -/* true when the device blkno is used for buddy blocks */ -static bool buddy_blkno(struct scoutfs_super_block *super, u64 blkno) -{ - return blkno < first_blkno(super); -} - -/* the order 0 bit offset in a buddy block of a given relative blk */ -static int buddy_bit(u64 blk) -{ - return do_div(blk, SCOUTFS_BUDDY_ORDER0_BITS); -} - -/* true if the rel blk could be the start of an allocation of the order */ -static bool valid_order(u64 blk, int order) -{ - return (buddy_bit(blk) & ((1 << order) - 1)) == 0; -} - -/* the block bit offset of the first bit of the given order's bitmap */ -static int order_off(int order) -{ - if (order == 0) - return 0; - - return (2 * SCOUTFS_BUDDY_ORDER0_BITS) - - (SCOUTFS_BUDDY_ORDER0_BITS / (1 << (order - 1))); -} - -/* the bit offset in the block bitmap of an order's bit */ -static int order_nr(int order, int nr) -{ - return order_off(order) + nr; -} - -static void stack_push(struct buddy_stack *sta, struct scoutfs_block *bl, - u16 sl) -{ - sta->bl[sta->nr] = bl; - sta->sl[sta->nr++] = sl; -} - -/* sl isn't returned because callers peek the leaf where sl is meaningless */ -static struct scoutfs_block *stack_peek(struct buddy_stack *sta) -{ - if (sta->nr) - return sta->bl[sta->nr - 1]; - - return NULL; -} - -static struct scoutfs_block *stack_pop(struct buddy_stack *sta, u16 *sl) -{ - if (sta->nr) { - *sl = sta->sl[--sta->nr]; - return sta->bl[sta->nr]; - } - - return NULL; -} - -/* update first_set if the caller set an earlier nr for the given order */ -static void set_order_nr(struct scoutfs_buddy_block *bud, int order, u16 nr) -{ - u16 first = le16_to_cpu(bud->first_set[order]); - - trace_printk("set level %u order %d nr %u first %u\n", - bud->level, order, nr, first); - - if (nr <= first) - bud->first_set[order] = cpu_to_le16(nr); -} - -/* find the next first set if the caller just cleared the current first_set */ -static void clear_order_nr(struct scoutfs_buddy_block *bud, int order, u16 nr) -{ - u16 first = le16_to_cpu(bud->first_set[order]); - int size; - int i; - - trace_printk("cleared level %u order %d nr %u first %u\n", - bud->level, order, nr, first); - - if (nr != first) - return; - - if (bud->level) { - for (i = nr + 1; i < SCOUTFS_BUDDY_SLOTS; i++) { - if (le16_to_cpu(bud->slots[i].free_orders) & - (1 << 
order)) - break; - } - if (i == SCOUTFS_BUDDY_SLOTS) - i = U16_MAX; - - } else { - size = order_off(order + 1); - i = find_next_bit_le(bud->bits, size, - order_nr(order, first) + 1); - if (i >= size) - i = U16_MAX; - else - i -= order_off(order); - } - - bud->first_set[order] = cpu_to_le16(i); - -} - -#define for_each_changed_bit(nr, bit, old, new, tmp) \ - for (tmp = old ^ new; \ - tmp && (nr = ffs(tmp) - 1, bit = 1 << nr, 1); \ - tmp ^= bit) - -/* - * Set a slot's free_orders value and update first_set for each order - * that it changes. Returns true of the slot's free_orders was changed. - */ -static bool set_slot_free_orders(struct scoutfs_buddy_block *bud, u16 sl, - u16 free_orders) -{ - u16 old = le16_to_cpu(bud->slots[sl].free_orders); - int order; - int tmp; - int bit; - - if (old == free_orders) - return false; - - for_each_changed_bit(order, bit, old, free_orders, tmp) { - if (old & bit) - clear_order_nr(bud, order, sl); - else - set_order_nr(bud, order, sl); - } - - bud->slots[sl].free_orders = cpu_to_le16(free_orders); - return true; -} - -/* - * The block at the top of the stack has changed its bits or slots and - * updated its first set. We propagate those changes up through - * free_orders in parents slots and their first_set up through the tree - * to free_orders in the root. We can stop when a block's first_set - * values don't change free_orders in their parent's slot. - */ -static void stack_cleanup(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct buddy_stack *sta = &binf->stack; - struct scoutfs_buddy_root *root = &sbi->super.buddy_root; - struct scoutfs_buddy_block *bud; - struct scoutfs_block *bl; - u16 free_orders = 0; - bool parent; - u16 sl; - int i; - - parent = false; - while ((bl = stack_pop(sta, &sl))) { - - bud = scoutfs_block_data(bl); - if (parent && !set_slot_free_orders(bud, sl, free_orders)) { - scoutfs_block_put(bl); - break; - } - - free_orders = 0; - for (i = 0; i < ARRAY_SIZE(bud->first_set); i++) { - if (bud->first_set[i] != cpu_to_le16(U16_MAX)) - free_orders |= 1 << i; - } - - scoutfs_block_put(bl); - parent = true; - } - - /* set root if we got that far */ - if (bl == NULL) - root->slot.free_orders = cpu_to_le16(free_orders); - - /* put any remaining blocks */ - while ((bl = stack_pop(sta, &sl))) - scoutfs_block_put(bl); - -} - -static int test_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) -{ - return !!test_bit_le(order_nr(order, nr), bud->bits); -} - -static void set_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) -{ - if (!test_and_set_bit_le(order_nr(order, nr), bud->bits)) - set_order_nr(bud, order, nr); -} - -static void clear_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) -{ - if (test_and_clear_bit_le(order_nr(order, nr), bud->bits)) - clear_order_nr(bud, order, nr); -} - -/* - * mkfs always writes the paths down the sides of the radix that have - * partially populated blocks. We only have to initialize full blocks - * in the middle of the tree. 
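The for_each_changed_bit() macro defined a little earlier is easiest to see with a toy userspace run; the two free_orders masks below are arbitrary examples, and the macro is copied here only so the sketch builds on its own:

#include <stdio.h>
#include <strings.h>    /* ffs() */

#define for_each_changed_bit(nr, bit, old, new, tmp)            \
        for (tmp = old ^ new;                                   \
             tmp && (nr = ffs(tmp) - 1, bit = 1 << nr, 1);      \
             tmp ^= bit)

int main(void)
{
        unsigned int old = 0x0056;      /* orders 1, 2, 4, 6 free */
        unsigned int new = 0x0062;      /* orders 1, 5, 6 free */
        unsigned int tmp, bit;
        int nr;

        /* visits each order that flipped between free and allocated */
        for_each_changed_bit(nr, bit, old, new, tmp)
                printf("order %d %s\n", nr,
                       (new & bit) ? "became free" : "was allocated");
        return 0;
}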
- */ -static void init_buddy_block(struct buddy_info *binf, - struct scoutfs_super_block *super, - struct scoutfs_block *bl, int level) -{ - struct scoutfs_buddy_block *bud = scoutfs_block_data(bl); - u16 count; - int nr; - int i; - - scoutfs_block_zero(bl, sizeof(bud->hdr)); - - for (i = 0; i < ARRAY_SIZE(bud->first_set); i++) - bud->first_set[i] = cpu_to_le16(U16_MAX); - - bud->level = level; - - if (level) { - for (i = 0; i < SCOUTFS_BUDDY_SLOTS; i++) - set_slot_free_orders(bud, i, SCOUTFS_BUDDY_ORDER0_BITS); - } else { - /* ensure that there aren't multiple highest orders */ - BUILD_BUG_ON((SCOUTFS_BUDDY_ORDER0_BITS / - (1 << (SCOUTFS_BUDDY_ORDERS - 1))) > 1); - - count = SCOUTFS_BUDDY_ORDER0_BITS; - nr = 0; - for (i = SCOUTFS_BUDDY_ORDERS - 1; i >= 0; i--) { - if (count & (1 << i)) { - set_buddy_bit(bud, i, nr); - nr = (nr + 1) << 1; - } else { - nr <<= 1; - } - } - } -} - -/* - * Give the caller the block referenced by the given slot. They've - * calculated the blkno of the pair of blocks while walking the tree. - * The slot describes which of the pair its referencing. The caller is - * always going to modify the block so we always try and cow it. We - * construct a fake ref so we can re-use the block ref cow code. When - * we initialize the first use of a block we use the first of the pair. - */ -static struct scoutfs_block *get_buddy_block(struct super_block *sb, - struct scoutfs_buddy_slot *slot, - u64 blkno, int level) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - struct scoutfs_buddy_block *bud; - struct scoutfs_block_ref ref; - struct scoutfs_block *bl; - - trace_printk("getting block level %d blkno %llu slot seq %llu off %u\n", - level, blkno, le64_to_cpu(slot->seq), slot->blkno_off); - - /* init a new block for an unused slot */ - if (slot->seq == 0) { - bl = scoutfs_block_dirty(sb, blkno); - if (!IS_ERR(bl)) - init_buddy_block(binf, super, bl, level); - } else { - /* construct block ref from tree walk blkno and slot ref */ - ref.blkno = cpu_to_le64(blkno + slot->blkno_off); - ref.seq = slot->seq; - bl = scoutfs_block_dirty_ref(sb, &ref); - } - - if (!IS_ERR(bl)) { - bud = scoutfs_block_data(bl); - - /* rebuild slot ref to blkno */ - if (slot->seq != bud->hdr.seq) { - slot->blkno_off = le64_to_cpu(bud->hdr.blkno) - blkno; - /* alloc_same only xors low bit */ - BUG_ON(slot->blkno_off > 1); - slot->seq = bud->hdr.seq; - } - } - - return bl; -} - -/* - * Walk the buddy block radix to the leaf that contains either the given - * relative blk or the first free given order. The radix is of a fixed - * depth and we initialize new blocks as we descend through - * uninitialized refs. - * - * If order is -1 then we search for the blk. - * - * As we descend we calculate the base blk offset of the path we're - * taking down the tree. This is used to find the blkno of the next - * block relative to the blkno of the given level. It's then used by - * the caller to calculate the total blk offset by adding the bit they - * find in the block. - * - * The path through the tree is recorded in the stack in the buddy info. - * The caller is responsible for cleaning up the stack and must do so - * even if we return an error. 
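The blkno and slot arithmetic performed during this descent can be sketched in userspace for a three-level tree; the divisor and fanout values are stand-ins for the real format constants, and level_div is built the way scoutfs_buddy_setup() builds it later in this file:

#include <stdint.h>
#include <stdio.h>

#define ORDER0_BITS     16000ULL        /* stand-in */
#define SLOTS           500ULL          /* stand-in */

int main(void)
{
        uint64_t level_div[3] = { 0, ORDER0_BITS, ORDER0_BITS * SLOTS };
        uint64_t blk = 123456789ULL;    /* blk relative to first_blkno() */
        uint64_t base = 0;
        int level;

        for (level = 2; level > 0; level--) {
                uint64_t sl = blk / level_div[level];

                blk %= level_div[level];
                base = base * SLOTS + sl;

                /* the child pair lives at level_blkno[level - 1] + base * 2 */
                printf("level %d slot %llu pair index %llu\n", level,
                       (unsigned long long)sl,
                       (unsigned long long)(base * 2));
        }

        /* at the leaf, base * ORDER0_BITS + blk is the original blk */
        printf("leaf bit %llu\n", (unsigned long long)blk);
        return 0;
}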
- */ -static int buddy_walk(struct super_block *sb, u64 blk, int order, u64 *base) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - struct buddy_stack *sta = &binf->stack; - struct scoutfs_buddy_root *root = &sbi->super.buddy_root; - struct scoutfs_buddy_block *bud; - struct scoutfs_buddy_slot *slot; - struct scoutfs_block *bl; - u64 blkno; - int level; - int ret = 0; - int sl = 0; - - /* XXX corruption? */ - if (blk > last_blk(super) || root->height == 0 || - root->height > SCOUTFS_BUDDY_MAX_HEIGHT) - return -EIO; - - slot = &root->slot; - level = root->height; - blkno = SCOUTFS_BUDDY_BLKNO; - *base = 0; - - while (level--) { - /* XXX do base and level make sense here? */ - bl = get_buddy_block(sb, slot, blkno, level); - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - break; - } - - trace_printk("before blk %llu order %d level %d blkno %llu base %llu sl %d\n", - blk, order, level, blkno, *base, sl); - - bud = scoutfs_block_data(bl); - - if (level) { - if (order >= 0) { - /* find first slot with order free */ - sl = le16_to_cpu(bud->first_set[order]); - /* XXX corruption */ - if (sl == U16_MAX) { - scoutfs_block_put(bl); - ret = -EIO; - break; - } - } else { - /* find slot based on blk */ - sl = div64_u64_rem(blk, binf->level_div[level], - &blk); - } - - /* shouldn't be sl * 2, right? */ - *base = (*base * SCOUTFS_BUDDY_SLOTS) + sl; - /* this is the only place we * 2 */ - blkno = binf->level_blkno[level - 1] + (*base * 2); - slot = &bud->slots[sl]; - } else { - *base *= SCOUTFS_BUDDY_ORDER0_BITS; - /* sl in stack is 0 for final leaf block */ - sl = 0; - } - - trace_printk("after blk %llu order %d level %d blkno %llu base %llu sl %d\n", - blk, order, level, blkno, *base, sl); - - - stack_push(sta, bl, sl); - } - - trace_printk("walking ret %d\n", ret); - - return ret; -} - -/* - * Find the order to search for to allocate a requested order. We try - * to use the smallest greater or equal order and then the largest - * smaller order. - */ -static int find_free_order(struct scoutfs_buddy_root *root, int order) -{ - u16 free = le16_to_cpu(root->slot.free_orders); - u16 smaller_mask = (1 << order) - 1; - u16 larger = free & ~smaller_mask; - u16 smaller = free & smaller_mask; - - if (larger) - return ffs(larger) - 1; - if (smaller) - return fls(smaller) - 1; - - return -ENOSPC; -} - -/* - * Walk to the leaf that contains the found order and allocate a region - * of the given order, returning the relative blk to the caller. 
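The buddy split that this allocation performs when the first free region is larger than requested can be modelled with a toy bitmap; the sizes and starting state below are invented purely for illustration:

#include <stdio.h>

#define ORDERS  5
#define BITS    16      /* order-0 bits in the toy leaf */

static unsigned char bits[ORDERS][BITS];        /* 1 == free */

int main(void)
{
        int order = 0;          /* caller asked for a single block */
        int found = 3;          /* first free region is order 3: blocks 0-7 */
        int blk = 0;            /* relative blk of that region */
        int nr;
        int i;

        bits[found][blk >> found] = 1;          /* starting state */

        bits[found][blk >> found] = 0;          /* clear_buddy_bit(): take it */

        nr = blk >> order;
        for (i = order; i < found; i++) {       /* free the buddies between */
                bits[i][nr ^ 1] = 1;            /* set_buddy_bit() */
                nr >>= 1;
        }

        /* block 0 is allocated; order 0 bit 1, order 1 bit 1 and order 2
         * bit 1 (blocks 1, 2-3 and 4-7) are left free */
        for (i = 0; i < ORDERS; i++) {
                printf("order %d:", i);
                for (nr = 0; nr < (BITS >> i); nr++)
                        printf(" %d", bits[i][nr]);
                printf("\n");
        }
        return 0;
}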
- */ -static int buddy_alloc(struct super_block *sb, u64 *blk, int order, int found) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct buddy_stack *sta = &binf->stack; - struct scoutfs_buddy_block *bud; - struct scoutfs_block *bl; - u64 base; - int ret; - int nr; - int i; - - trace_printk("alloc order %d found %d\n", order, found); - - if (WARN_ON_ONCE(found >= 0 && order > found)) - return -EINVAL; - - ret = buddy_walk(sb, *blk, found, &base); - if (ret) - goto out; - - bl = stack_peek(sta); - bud = scoutfs_block_data(bl); - - if (found >= 0) { - nr = le16_to_cpu(bud->first_set[found]); - /* XXX corruption */ - if (nr == U16_MAX) { - ret = -EIO; - goto out; - } - - /* give caller the found blk for the order */ - *blk = base + (nr << found); - } else { - nr = buddy_bit(*blk) >> found; - } - - /* always allocate the higher or equal found order */ - clear_buddy_bit(bud, found, nr); - - /* and maybe free our buddies between smaller order and larger found */ - nr = buddy_bit(*blk) >> order; - for (i = order; i < found; i++) { - set_buddy_bit(bud, i, nr ^ 1); - nr >>= 1; - } - - ret = 0; -out: - trace_printk("alloc order %d found %d blk %llu ret %d\n", - order, found, *blk, ret); - stack_cleanup(sb); - return ret; -} - -/* - * Free a given order by setting its order bit. If the order's buddy - * isn't set then it isn't free and we can't merge so we set our order - * and are done. If the buddy is free then we can clear it and ascend - * up to try and set the next higher order. That performs the same - * buddy merging test. Eventually we make it to the highest order which - * doesn't have a buddy so we can always set it. - * - * As we're freeing orders in the final buddy bitmap that only partially - * covers the end of the device we might try to test buddies which are - * past the end of the device. The test will still fall within the leaf - * block bitmap and those bits past the device will never be set so we - * will fail the merge and correctly set the orders free. - */ -static int buddy_free(struct super_block *sb, u64 blk, int order) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct buddy_stack *sta = &binf->stack; - struct scoutfs_buddy_block *bud; - struct scoutfs_block *bl; - u64 unused; - int ret; - int nr; - int i; - - ret = buddy_walk(sb, blk, -1, &unused); - if (ret) - goto out; - - bl = stack_peek(sta); - bud = scoutfs_block_data(bl); - - nr = buddy_bit(blk) >> order; - for (i = order; i < SCOUTFS_BUDDY_ORDERS - 2; i++) { - - if (!test_buddy_bit(bud, i, nr ^ 1)) - break; - - clear_buddy_bit(bud, i, nr ^ 1); - nr >>= 1; - } - - set_buddy_bit(bud, i, nr); - - ret = 0; -out: - stack_cleanup(sb); - return ret; -} - -/* - * Try to allocate an extent with the size number of blocks. blkno is - * set to the start of the extent and the order of the block count is - * returned. 
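A hypothetical caller that needs a run of blocks that isn't a power of two could combine this with scoutfs_buddy_free_extent(), defined further down, to hand back the unused tail in the same transaction; example_alloc_extent() is not code from this tree, only a sketch of the calling convention:

static int example_alloc_extent(struct super_block *sb, u64 count,
                                u64 *blkno_ret, u64 *count_ret)
{
        int order = count > 1 ? fls64(count - 1) : 0;
        u64 blkno;
        u64 got;
        int ret;

        ret = scoutfs_buddy_alloc(sb, &blkno, order);
        if (ret < 0)
                return ret;

        /* ret is the order that was actually allocated */
        got = 1ULL << ret;
        if (got > count) {
                /* give the unused tail back within this transaction */
                scoutfs_buddy_free_extent(sb, blkno + count, got - count);
                got = count;
        }

        *blkno_ret = blkno;
        *count_ret = got;
        return 0;
}

Because the allocator may return a smaller order than asked for, such a caller also has to be prepared for got to come back smaller than count and loop for the remainder.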
- */ -int scoutfs_buddy_alloc(struct super_block *sb, u64 *blkno, int order) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - int found; - u64 blk; - int ret; - - trace_printk("order %d\n", order); - - mutex_lock(&binf->mutex); - - found = find_free_order(&super->buddy_root, order); - if (found < 0) { - ret = found; - goto out; - } - - if (found < order) - order = found; - - blk = 0; - ret = buddy_alloc(sb, &blk, order, found); - if (ret) - goto out; - - *blkno = first_blkno(super) + blk; - le64_add_cpu(&super->free_blocks, -(1ULL << order)); - atomic_add((1ULL << order), &binf->alloc_count); - ret = order; - -out: - trace_printk("blkno %llu order %d ret %d\n", *blkno, order, ret); - mutex_unlock(&binf->mutex); - return ret; -} - -/* - * We use the block _ref() routines to dirty existing blocks to reuse - * all the block verification and cow machinery. During cow this is - * called to allocate a new blkno to cow an existing buddy block. We - * use the existing blkno to see if we have to return the other mirrored - * buddy blkno or do a real allocation for every other kind of block - * being cowed. - */ -int scoutfs_buddy_alloc_same(struct super_block *sb, u64 *blkno, u64 existing) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - - if (buddy_blkno(super, existing)) { - *blkno = existing ^ 1; - trace_printk("existing %llu ret blkno %llu\n", - existing, *blkno); - return 0; - } - - return scoutfs_buddy_alloc(sb, blkno, 0); -} - -struct extent_node { - struct rb_node node; - u64 start; - u64 len; -}; - -static int add_enode_extent(struct rb_root *root, u64 start, u64 len) -{ - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - struct extent_node *left = NULL; - struct extent_node *right = NULL; - struct extent_node *enode; - - trace_printk("adding enode [%llu,%llu]\n", start, len); - - while (*node && !(left && right)) { - parent = *node; - enode = container_of(*node, struct extent_node, node); - - if (start < enode->start) { - if (!right && start + len == enode->start) - right = enode; - node = &(*node)->rb_left; - } else { - if (!left && enode->start + enode->len == start) - left = enode; - node = &(*node)->rb_right; - } - } - - if (right) { - right->start = start; - right->len += len; - trace_printk("right now [%llu, %llu]\n", - right->start, right->len); - } - - if (left) { - if (right) { - left->len += right->len; - rb_erase(&right->node, root); - kfree(right); - } else { - left->len += len; - } - trace_printk("left now [%llu, %llu]\n", left->start, left->len); - } - - if (left || right) - return 0; - - enode = kmalloc(sizeof(struct extent_node), GFP_NOFS); - if (!enode) - return -ENOMEM; - - enode->start = start; - enode->len = len; - - trace_printk("inserted new [%llu, %llu]\n", enode->start, enode->len); - - rb_link_node(&enode->node, parent, node); - rb_insert_color(&enode->node, root); - - return 0; -} - -static void destroy_pending_frees(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct extent_node *enode; - struct rb_node *node; - - for (node = rb_first(&binf->pending_frees); node;) { - enode = rb_entry(node, struct extent_node, node); - node = rb_next(node); - - rb_erase(&enode->node, &binf->pending_frees); - kfree(enode); - } -} - -/* XXX this should be generic */ -#define min3_t(t, a, b, c) min3((t)(a), (t)(b), (t)(c)) - -/* 
- * Allocate or free all the orders that make up a given arbitrary block - * extent. Today this is used by callers who know that the blocks for - * the extent have already been pinned so we BUG on error. - */ -static void apply_extent(struct super_block *sb, bool alloc, u64 blk, u64 len) -{ - unsigned int blk_order; - unsigned int blk_bit; - unsigned int size; - int order; - int ret; - - trace_printk("applying extent blk %llu len %llu\n", blk, len); - - while (len) { - /* buddy bit might be 0, len always has a bit set */ - blk_bit = buddy_bit(blk); - blk_order = blk_bit ? ffs(blk_bit) - 1 : 0; - order = min3_t(int, blk_order, fls64(len) - 1, - SCOUTFS_BUDDY_ORDERS - 1); - size = 1 << order; - - trace_printk("applying blk %llu order %d\n", blk, order); - - if (alloc) - ret = buddy_alloc(sb, &blk, order, -1); - else - ret = buddy_free(sb, blk, order); - BUG_ON(ret); - - blk += size; - len -= size; - } -} - -/* - * The pending rbtree has recorded frees of stable data that we had to - * wait until transaction commit to record. Once these are tracked in - * the allocator we can't use the allocator until the commit succeeds. - * This is called by transaction commit to get these pending frees into - * the current commit. If it fails they pull them back out. - */ -int scoutfs_buddy_apply_pending(struct super_block *sb, bool alloc) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct extent_node *enode; - struct rb_node *node; - - for (node = rb_first(&binf->pending_frees); node;) { - enode = rb_entry(node, struct extent_node, node); - node = rb_next(node); - - apply_extent(sb, alloc, enode->start, enode->len); - } - - return 0; -} - -/* - * Free a given allocated extent. The seq tells us which transaction - * first allocated the extent. If it was allocated in this transaction - * then we can return it to the free buddy and that must succeed. - * - * If it was allocated in a previous transaction then we dirty the - * blocks it will take to free it then record it in an rbtree. The - * rbtree entries are replayed into the dirty blocks as the transaction - * commits. - * - * Buddy block numbers are preallocated and calculated from the radix - * tree structure so we can ignore the block layer's calls to free buddy - * blocks during cow. - */ -int scoutfs_buddy_free(struct super_block *sb, __le64 seq, u64 blkno, int order) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - u64 unused; - u64 blk; - int ret; - - trace_printk("seq %llu blkno %llu order %d rsv %u\n", - le64_to_cpu(seq), blkno, order, buddy_blkno(super, blkno)); - - /* no specific free tracking for buddy blocks */ - if (buddy_blkno(super, blkno)) - return 0; - - /* XXX corruption? */ - if (!device_blkno(super, blkno)) - return -EINVAL; - - blk = blkno - first_blkno(super); - - if (!valid_order(blk, order)) - return -EINVAL; - - mutex_lock(&binf->mutex); - - if (seq == super->hdr.seq) { - ret = buddy_free(sb, blk, order); - /* - * If this order was allocated in this transaction then its - * blocks should be pinned and we should always be able - * to free it. 
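The greedy decomposition performed by apply_extent() above is easy to reproduce in userspace; this sketch uses the relative blk directly for the alignment test, where the real code uses the blk's offset within its leaf bitmap, and MAX_ORDER stands in for SCOUTFS_BUDDY_ORDERS - 1:

#include <stdint.h>
#include <stdio.h>

#define MAX_ORDER 8     /* stands in for SCOUTFS_BUDDY_ORDERS - 1 */

int main(void)
{
        uint64_t blk = 3;       /* start of the extent, relative blk */
        uint64_t len = 13;      /* extent length in blocks */

        while (len) {
                /* largest order aligned at blk, 0 if blk is 0 */
                int align = blk ? __builtin_ffsll(blk) - 1 : 0;
                /* largest order that still fits in len */
                int fit = 63 - __builtin_clzll(len);
                int order = align < fit ? align : fit;

                if (order > MAX_ORDER)
                        order = MAX_ORDER;

                printf("blk %llu order %d (%llu blocks)\n",
                       (unsigned long long)blk, order,
                       (unsigned long long)(1ULL << order));

                blk += 1ULL << order;
                len -= 1ULL << order;
        }
        return 0;
}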
- */ - BUG_ON(ret); - } else { - ret = buddy_walk(sb, blk, -1, &unused) ?: - add_enode_extent(&binf->pending_frees, blk, 1 << order); - if (ret == 0) - trace_printk("added blk %llu order %d\n", blk, order); - stack_cleanup(sb); - } - - if (ret == 0) - le64_add_cpu(&super->free_blocks, 1ULL << order); - - mutex_unlock(&binf->mutex); - - return ret; -} - -/* - * This is current only used to return partial extents from larger - * allocations in this transaction. - */ -void scoutfs_buddy_free_extent(struct super_block *sb, u64 blkno, u64 count) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct scoutfs_super_block *super = &sbi->stable_super; - u64 blk; - - BUG_ON(!device_blkno(super, blkno)); - - blk = blkno - first_blkno(super); - - mutex_lock(&binf->mutex); - - apply_extent(sb, false, blkno - first_blkno(super), count); - le64_add_cpu(&super->free_blocks, count); - - mutex_unlock(&binf->mutex); -} - -/* - * Return the number of block allocations since the last time the - * counter was reset. This count doesn't include dirty buddy blocks. - */ -unsigned int scoutfs_buddy_alloc_count(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - - return atomic_read(&binf->alloc_count); -} - -u64 scoutfs_buddy_bfree(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct scoutfs_super_block *super = &sbi->super; - u64 ret; - - mutex_lock(&binf->mutex); - ret = le64_to_cpu(super->free_blocks); - mutex_unlock(&binf->mutex); - - return ret; -} - -void scoutfs_buddy_committed(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - - atomic_set(&binf->alloc_count, 0); - destroy_pending_frees(sb); -} - -int scoutfs_buddy_setup(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - u64 level_blocks[SCOUTFS_BUDDY_MAX_HEIGHT]; - u64 blocks; - int i; - - /* first bit offsets in blocks are __le16 */ - BUILD_BUG_ON(SCOUTFS_BUDDY_ORDER0_BITS >= U16_MAX); - - /* bits need to be naturally aligned to long for _le bitops */ - BUILD_BUG_ON(offsetof(struct scoutfs_buddy_block, bits) & - (sizeof(long) - 1)); - - binf = kzalloc(sizeof(struct buddy_info), GFP_KERNEL); - if (!binf) - return -ENOMEM; - sbi->buddy_info = binf; - - mutex_init(&binf->mutex); - atomic_set(&binf->alloc_count, 0); - binf->pending_frees = RB_ROOT; - - /* calculate blocks at each level */ - blocks = DIV_ROUND_UP_ULL(last_blk(super) + 1, - SCOUTFS_BUDDY_ORDER0_BITS); - for (i = 0; i < SCOUTFS_BUDDY_MAX_HEIGHT; i++) { - level_blocks[i] = (blocks * 2); - if (blocks == 1) { - binf->max_height = i + 1; - break; - } - blocks = DIV_ROUND_UP_ULL(blocks, SCOUTFS_BUDDY_SLOTS); - } - - /* calculate device blkno of first block in each level */ - binf->level_blkno[binf->max_height - 1] = SCOUTFS_BUDDY_BLKNO; - for (i = (binf->max_height - 2); i >= 0; i--) { - binf->level_blkno[i] = binf->level_blkno[i + 1] + - level_blocks[i + 1]; - } - - /* calculate blk divisor to find slot at a given level */ - binf->level_div[1] = SCOUTFS_BUDDY_ORDER0_BITS; - for (i = 2; i < binf->max_height; i++) { - binf->level_div[i] = binf->level_div[i - 1] * - SCOUTFS_BUDDY_SLOTS; - } - - for (i = 0; i < binf->max_height; i++) - trace_printk("level %d div %llu blkno %llu blocks %llu\n", - i, binf->level_div[i], 
binf->level_blkno[i], - level_blocks[i]); - - return 0; -} - -void scoutfs_buddy_destroy(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - - if (binf) - WARN_ON_ONCE(!RB_EMPTY_ROOT(&binf->pending_frees)); - kfree(binf); -} - diff --git a/kmod/src/buddy.h b/kmod/src/buddy.h deleted file mode 100644 index 24c0ed0c..00000000 --- a/kmod/src/buddy.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _SCOUTFS_BUDDY_H_ -#define _SCOUTFS_BUDDY_H_ - -int scoutfs_buddy_alloc(struct super_block *sb, u64 *blkno, int order); -int scoutfs_buddy_alloc_same(struct super_block *sb, u64 *blkno, u64 existing); -int scoutfs_buddy_free(struct super_block *sb, __le64 seq, u64 blkno, - int order); -void scoutfs_buddy_free_extent(struct super_block *sb, u64 blkno, u64 count); - -int scoutfs_buddy_was_free(struct super_block *sb, u64 blkno, int order); -u64 scoutfs_buddy_bfree(struct super_block *sb); - -unsigned int scoutfs_buddy_alloc_count(struct super_block *sb); -int scoutfs_buddy_apply_pending(struct super_block *sb, bool alloc); -void scoutfs_buddy_committed(struct super_block *sb); - -int scoutfs_buddy_setup(struct super_block *sb); -void scoutfs_buddy_destroy(struct super_block *sb); - -#endif diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 137ebbae..c9d081a6 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -14,8 +14,6 @@ #define EXPAND_EACH_COUNTER \ EXPAND_COUNTER(alloc_alloc) \ EXPAND_COUNTER(alloc_free) \ - EXPAND_COUNTER(block_mem_alloc) \ - EXPAND_COUNTER(block_mem_free) \ EXPAND_COUNTER(seg_lru_shrink) \ EXPAND_COUNTER(trans_level0_seg_write) \ EXPAND_COUNTER(manifest_compact_migrate) \ diff --git a/kmod/src/crc.c b/kmod/src/crc.c deleted file mode 100644 index cde9a1ae..00000000 --- a/kmod/src/crc.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) 2015 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- */ -#include -#include - -#include "format.h" -#include "crc.h" - -u32 scoutfs_crc_block(struct scoutfs_block_header *hdr) -{ - return crc32c(~0, (char *)hdr + sizeof(hdr->crc), - SCOUTFS_BLOCK_SIZE - sizeof(hdr->crc)); -} diff --git a/kmod/src/crc.h b/kmod/src/crc.h deleted file mode 100644 index 7f1fbf56..00000000 --- a/kmod/src/crc.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _SCOUTFS_CRC_H_ -#define _SCOUTFS_CRC_H_ - -u32 scoutfs_crc_block(struct scoutfs_block_header *hdr); - -#endif diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 0d5f0bb2..79b75dcc 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -23,9 +23,7 @@ #include "inode.h" #include "key.h" #include "super.h" -#include "btree.h" #include "trans.h" -#include "name.h" #include "xattr.h" #include "kvec.h" #include "item.h" diff --git a/kmod/src/format.h b/kmod/src/format.h index a3784bcb..b877d4d8 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -35,9 +35,6 @@ */ #define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT) #define SCOUTFS_SUPER_NR 2 -#define SCOUTFS_BUDDY_BLKNO (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR) - -#define SCOUTFS_MAX_TRANS_BLOCKS (128 * 1024 * 1024 / SCOUTFS_BLOCK_SIZE) /* * This header is found at the start of every block so that we can @@ -161,70 +158,6 @@ struct scoutfs_segment_block { /* packed vals */ } __packed; -/* - * Block references include the sequence number so that we can detect - * readers racing with writers and so that we can tell that we don't - * need to follow a reference when traversing based on seqs. - */ -struct scoutfs_block_ref { - __le64 blkno; - __le64 seq; -} __packed; - -/* - * If the block was full of bits the largest possible order would be - * the block size shift + 3 (BITS_PER_BYTE). But the header uses - * up some space and then the buddy bits mean two bits per block. - * Then +1 for this being the number, not the greatest order. - */ -#define SCOUTFS_BUDDY_ORDERS (SCOUTFS_BLOCK_SHIFT + 3 - 2 + 1) - -struct scoutfs_buddy_block { - struct scoutfs_block_header hdr; - __le16 first_set[SCOUTFS_BUDDY_ORDERS]; - __u8 level; - __u8 __pad[3]; /* naturally align bits */ - union { - struct scoutfs_buddy_slot { - __le64 seq; - __le16 free_orders; - /* XXX seems like we could hide a bit somewhere */ - __u8 blkno_off; - } __packed slots[0]; - __le64 bits[0]; - } __packed; -} __packed; - -/* - * Each buddy leaf block references order 0 blocks with half of its - * bitmap. The other half of the bits are used for the higher order - * bits. - */ -#define SCOUTFS_BUDDY_ORDER0_BITS \ - (((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) * 8) / 2) - -#define SCOUTFS_BUDDY_SLOTS \ - ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) / \ - sizeof(struct scoutfs_buddy_slot)) - -struct scoutfs_buddy_root { - struct scoutfs_buddy_slot slot; - __u8 height; -} __packed; - -/* ((SCOUTFS_BUDDY_SLOTS^5) * SCOUTFS_BUDDY_ORDER0_BITS) > 2^52 */ -#define SCOUTFS_BUDDY_MAX_HEIGHT 6 - -/* - * We should be able to make the offset smaller if neither dirents nor - * data items use the full 64 bits. - */ -struct scoutfs_key { - __le64 inode; - u8 type; - __le64 offset; -} __packed; - /* * Currently we sort keys by the numeric value of the types, but that * isn't necessary. We could have an arbitrary sort order. 
So we don't @@ -241,8 +174,6 @@ struct scoutfs_key { #define SCOUTFS_DATA_KEY 11 #define SCOUTFS_MAX_UNUSED_KEY 255 -#define SCOUTFS_MAX_ITEM_LEN 512 - /* value is struct scoutfs_inode */ struct scoutfs_inode_key { __u8 type; @@ -307,66 +238,9 @@ struct scoutfs_symlink_key { __be64 ino; } __packed; -struct scoutfs_btree_root { - u8 height; - struct scoutfs_block_ref ref; -} __packed; - -/* - * @free_end: records the byte offset of the first byte after the free - * space in the block between the header and the first item. New items - * are allocated by subtracting the space they need. - * - * @free_reclaim: records the number of bytes of free space amongst the - * items after free_end. If a block is compacted then this much new - * free space would be reclaimed. - */ -struct scoutfs_btree_block { - struct scoutfs_block_header hdr; - __le16 free_end; - __le16 free_reclaim; - __le16 nr_items; - __le16 item_offs[0]; -} __packed; - -/* - * The item sequence number is set to the dirty block's sequence number - * when the item is modified. It is not changed by splits or merges. - */ -struct scoutfs_btree_item { - struct scoutfs_key key; - __le64 seq; - __le16 val_len; - char val[0]; -} __packed; - -/* Blocks are no more than half free. */ -#define SCOUTFS_BTREE_FREE_LIMIT \ - ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 2) - /* XXX does this exist upstream somewhere? */ #define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER)) -#define SCOUTFS_BTREE_MAX_ITEMS \ - ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / \ - (member_sizeof(struct scoutfs_btree_block, item_offs[0]) + \ - sizeof(struct scoutfs_btree_item))) - -/* - * We can calculate the max tree depth by calculating how many leaf - * blocks the tree could reference. The block device can only reference - * 2^64 bytes. The tallest parent tree has half full parent blocks. - * - * So we have the relation: - * - * ceil(max_items / 2) ^ (max_depth - 1) >= 2^64 / block_size - * - * and solve for depth: - * - * max_depth = log(ceil(max_items / 2), 2^64 / block_size) + 1 - */ -#define SCOUTFS_BTREE_MAX_DEPTH 10 - #define SCOUTFS_UUID_BYTES 16 /* @@ -382,16 +256,11 @@ struct scoutfs_super_block { __le64 alloc_uninit; __le64 total_segs; __le64 free_segs; - __le64 total_blocks; - __le64 free_blocks; __le64 ring_blkno; __le64 ring_blocks; __le64 ring_tail_block; __le64 ring_gen; __le64 next_seg_seq; - __le64 buddy_blocks; - struct scoutfs_buddy_root buddy_root; - struct scoutfs_btree_root btree_root; struct scoutfs_treap_root alloc_treap_root; struct scoutfs_manifest manifest; } __packed; @@ -418,7 +287,6 @@ struct scoutfs_timespec { struct scoutfs_inode { __le64 size; __le64 blocks; - __le64 link_counter; __le64 data_version; __le64 next_readdir_pos; __le32 nlink; @@ -426,7 +294,6 @@ struct scoutfs_inode { __le32 gid; __le32 mode; __le32 rdev; - __le32 salt; struct scoutfs_timespec atime; struct scoutfs_timespec ctime; struct scoutfs_timespec mtime; @@ -449,20 +316,6 @@ struct scoutfs_dirent { __u8 name[0]; } __packed; -/* - * Dirent items are stored at keys with the offset set to the hash of - * the name. Creation can find that hash values collide and will - * attempt to linearly probe this many following hash values looking for - * an unused value. - * - * In small directories this doesn't really matter because hash values - * will so very rarely collide. At around 50k items we start to see our - * first collisions. 
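A hedged sketch of the probing this constant implies: creation tries SCOUTFS_DIRENT_COLL_NR consecutive hash values before giving up with -ENOSPC. scoutfs_name_hash() is the helper removed from name.h later in this patch, while item_exists() and the function name itself are stand-ins for whatever dirent item lookup the directory code actually performs:

/* hypothetical stand-in for a dirent item lookup at dir_ino/hash */
static bool item_exists(struct super_block *sb, u64 dir_ino, u64 hash);

static int example_find_dirent_hash(struct super_block *sb, u64 dir_ino,
                                    const char *name, unsigned int len,
                                    u64 *hash_ret)
{
        u64 hash = scoutfs_name_hash(name, len);
        int i;

        for (i = 0; i < SCOUTFS_DIRENT_COLL_NR; i++) {
                if (!item_exists(sb, dir_ino, hash + i)) {
                        *hash_ret = hash + i;
                        return 0;
                }
        }

        /* every probed hash value is already taken */
        return -ENOSPC;
}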
16 slots is still pretty quick to scan in the - * btree and it gets us up into the hundreds of millions of entries - * before enospc is returned as we run out of hash values. - */ -#define SCOUTFS_DIRENT_COLL_NR 16 - #define SCOUTFS_NAME_LEN 255 /* S32_MAX avoids the (int) sign bit and might avoid sloppy bugs */ @@ -475,14 +328,6 @@ struct scoutfs_dirent { #define SCOUTFS_XATTR_MAX_PARTS \ DIV_ROUND_UP(SCOUTFS_XATTR_MAX_SIZE, SCOUTFS_XATTR_PART_SIZE) -/* - * We only use 31 bits for readdir positions so that we don't confuse - * old signed 32bit f_pos applications or those on the other side of - * network protocols that have limited readir positions. - */ - -#define SCOUTFS_DIRENT_OFF_BITS 31 -#define SCOUTFS_DIRENT_OFF_MASK ((1U << SCOUTFS_DIRENT_OFF_BITS) - 1) /* entries begin after . and .. */ #define SCOUTFS_DIRENT_FIRST_POS 2 /* getdents returns next pos with an entry, no entry at (f_pos)~0 */ @@ -499,14 +344,6 @@ enum { SCOUTFS_DT_WHT, }; -struct scoutfs_extent { - __le64 blkno; - __le64 len; - __u8 flags; -} __packed; - -#define SCOUTFS_EXTENT_FLAG_OFFLINE (1 << 0) - /* ino_path can search for backref items with a null term */ #define SCOUTFS_MAX_KEY_SIZE \ offsetof(struct scoutfs_link_backref_key, name[SCOUTFS_NAME_LEN + 1]) diff --git a/kmod/src/inode.c b/kmod/src/inode.c index ad1caa79..e34441de 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -22,7 +22,6 @@ #include "super.h" #include "key.h" #include "inode.h" -#include "btree.h" #include "dir.h" #include "data.h" #include "scoutfs_trace.h" @@ -126,8 +125,6 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec); inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec); - ci->salt = le32_to_cpu(cinode->salt); - atomic64_set(&ci->link_counter, le64_to_cpu(cinode->link_counter)); ci->data_version = le64_to_cpu(cinode->data_version); ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos); } @@ -247,8 +244,6 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode) cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec); cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - cinode->salt = cpu_to_le32(ci->salt); - cinode->link_counter = cpu_to_le64(atomic64_read(&ci->link_counter)); cinode->data_version = cpu_to_le64(ci->data_version); cinode->next_readdir_pos = cpu_to_le64(ci->next_readdir_pos); } @@ -415,8 +410,6 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, ci->data_version = 0; ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS; ci->staging = false; - get_random_bytes(&ci->salt, sizeof(ci->salt)); - atomic64_set(&ci->link_counter, 0); inode->i_ino = ino; /* XXX overflow */ inode_init_owner(inode, dir, mode); diff --git a/kmod/src/inode.h b/kmod/src/inode.h index f3badfb4..6dcb03d8 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -5,7 +5,6 @@ struct scoutfs_inode_info { u64 ino; - u32 salt; seqcount_t seqcount; u64 data_version; @@ -14,7 +13,6 @@ struct scoutfs_inode_info { /* holder of i_mutex is staging */ bool staging; - atomic64_t link_counter; struct rw_semaphore xattr_rwsem; struct inode inode; diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 255e167f..82375047 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -24,7 +24,6 @@ #include "format.h" #include "key.h" #include "dir.h" -#include "name.h" #include "ioctl.h" #include "super.h" #include "inode.h" diff --git a/kmod/src/key.h b/kmod/src/key.h index 7d3b2230..3c108555 100644 --- a/kmod/src/key.h +++ 
b/kmod/src/key.h @@ -126,127 +126,4 @@ static inline void scoutfs_key_set_max(struct scoutfs_key_buf *key) scoutfs_key_memset(key, 0xff, sizeof(struct scoutfs_inode_key)); } -/* - * What follows are the key functions for the small fixed size btree - * keys. It will all be removed once the callers are converted from - * the btree to the item cache. - */ - -#define CKF "%llu.%u.%llu" -#define CKA(key) \ - le64_to_cpu((key)->inode), (key)->type, le64_to_cpu((key)->offset) - -static inline u64 scoutfs_key_inode(struct scoutfs_key *key) -{ - return le64_to_cpu(key->inode); -} - -static inline u64 scoutfs_key_offset(struct scoutfs_key *key) -{ - return le64_to_cpu(key->offset); -} - -static inline int le64_cmp(__le64 a, __le64 b) -{ - return le64_to_cpu(a) < le64_to_cpu(b) ? -1 : - le64_to_cpu(a) > le64_to_cpu(b) ? 1 : 0; -} - -/* - * Items are sorted by type and then by inode to reflect the relative - * frequency of use. Inodes and xattrs are hot, then dirents, then file - * data extents. We want each use class to be hot and dense, we don't - * want a scan of the inodes to have to skip over each inode's extent - * items. - */ -static inline int scoutfs_key_cmp(struct scoutfs_key *a, struct scoutfs_key *b) -{ - return ((short)a->type - (short)b->type) ?: - le64_cmp(a->inode, b->inode) ?: - le64_cmp(a->offset, b->offset); -} - -/* - * return -ve if the first range is completely before the second, +ve for - * completely after, and 0 if they intersect. - */ -static inline int scoutfs_cmp_key_ranges(struct scoutfs_key *a_first, - struct scoutfs_key *a_last, - struct scoutfs_key *b_first, - struct scoutfs_key *b_last) -{ - if (scoutfs_key_cmp(a_last, b_first) < 0) - return -1; - if (scoutfs_key_cmp(a_first, b_last) > 0) - return 1; - return 0; -} - -static inline int scoutfs_cmp_key_range(struct scoutfs_key *key, - struct scoutfs_key *first, - struct scoutfs_key *last) -{ - return scoutfs_cmp_key_ranges(key, key, first, last); -} - -static inline void scoutfs_set_key(struct scoutfs_key *key, u64 inode, u8 type, - u64 offset) -{ - key->inode = cpu_to_le64(inode); - key->type = type; - key->offset = cpu_to_le64(offset); -} - -static inline void scoutfs_set_max_key(struct scoutfs_key *key) -{ - scoutfs_set_key(key, ~0ULL, ~0, ~0ULL); -} - -/* - * This saturates at (~0,~0,~0) instead of wrapping. This will never be - * an issue for real item keys but parent item keys along the right - * spine of the tree have maximal key values that could wrap if - * incremented. - */ -static inline void scoutfs_inc_key(struct scoutfs_key *key) -{ - if (key->inode == cpu_to_le64(~0ULL) && - key->type == (u8)~0 && - key->offset == cpu_to_le64(~0ULL)) - return; - - le64_add_cpu(&key->offset, 1); - if (!key->offset) { - if (++key->type == 0) - le64_add_cpu(&key->inode, 1); - } -} - -static inline void scoutfs_dec_key(struct scoutfs_key *key) -{ - le64_add_cpu(&key->offset, -1ULL); - if (key->offset == cpu_to_le64(~0ULL)) { - if (key->type-- == 0) - le64_add_cpu(&key->inode, -1ULL); - } -} - -static inline struct scoutfs_key *scoutfs_max_key(struct scoutfs_key *a, - struct scoutfs_key *b) -{ - return scoutfs_key_cmp(a, b) > 0 ? 
a : b; -} - -static inline bool scoutfs_key_is_zero(struct scoutfs_key *key) -{ - return key->inode == 0 && key->type == 0 && key->offset == 0; -} - -static inline void scoutfs_key_set_zero(struct scoutfs_key *key) -{ - key->inode = 0; - key->type = 0; - key->offset = 0; -} - #endif diff --git a/kmod/src/kvec.c b/kmod/src/kvec.c index 5e49c6a3..422a4fc5 100644 --- a/kmod/src/kvec.c +++ b/kmod/src/kvec.c @@ -25,10 +25,8 @@ #include "dir.h" #include "xattr.h" #include "msg.h" -#include "block.h" #include "counters.h" #include "trans.h" -#include "buddy.h" #include "kvec.h" #include "scoutfs_trace.h" diff --git a/kmod/src/name.c b/kmod/src/name.c deleted file mode 100644 index e14f52bd..00000000 --- a/kmod/src/name.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (C) 2016 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include - -#include "name.h" - -/* - * XXX This crc nonsense is a quick hack. We'll want something a - * lot stronger like siphash. - */ -u64 scoutfs_name_hash(const char *name, unsigned int len) -{ - unsigned int half = (len + 1) / 2; - - return crc32c(~0, name, half) | - ((u64)crc32c(~0, name + len - half, half) << 32); -} - -int scoutfs_names_equal(const char *name_a, int len_a, - const char *name_b, int len_b) -{ - return (len_a == len_b) && !memcmp(name_a, name_b, len_a); -} diff --git a/kmod/src/name.h b/kmod/src/name.h deleted file mode 100644 index 020ecb0f..00000000 --- a/kmod/src/name.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _SCOUTFS_NAME_H_ -#define _SCOUTFS_NAME_H_ - -u64 scoutfs_name_hash(const char *data, unsigned int len); -int scoutfs_names_equal(const char *name_a, int len_a, - const char *name_b, int len_b); - -#endif diff --git a/kmod/src/scoutfs_trace.c b/kmod/src/scoutfs_trace.c index 038eb228..6c775b9f 100644 --- a/kmod/src/scoutfs_trace.c +++ b/kmod/src/scoutfs_trace.c @@ -23,7 +23,6 @@ #include "inode.h" #include "dir.h" #include "msg.h" -#include "block.h" #define CREATE_TRACE_POINTS #include "scoutfs_trace.h" diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 4a1d7a7f..669b99f4 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -180,171 +180,6 @@ TRACE_EVENT(scoutfs_scan_orphans, TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); -TRACE_EVENT(scoutfs_buddy_alloc, - TP_PROTO(u64 blkno, int order, int region, int ret), - - TP_ARGS(blkno, order, region, ret), - - TP_STRUCT__entry( - __field(u64, blkno) - __field(int, order) - __field(int, region) - __field(int, ret) - ), - - TP_fast_assign( - __entry->blkno = blkno; - __entry->order = order; - __entry->region = region; - __entry->ret = ret; - ), - - TP_printk("blkno %llu order %d region %d ret %d", - __entry->blkno, __entry->order, __entry->region, __entry->ret) -); - - -TRACE_EVENT(scoutfs_buddy_free, - TP_PROTO(u64 blkno, int order, int region, int ret), - - TP_ARGS(blkno, order, region, ret), - - TP_STRUCT__entry( - __field(u64, blkno) - __field(int, order) - __field(int, region) - __field(int, ret) - ), - - TP_fast_assign( - __entry->blkno = blkno; - __entry->order = order; - 
__entry->region = region; - __entry->ret = ret; - ), - - TP_printk("blkno %llu order %d region %d ret %d", - __entry->blkno, __entry->order, __entry->region, __entry->ret) -); - -DECLARE_EVENT_CLASS(scoutfs_btree_op, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( u64, key_ino ) - __field( u64, key_off ) - __field( u8, key_type ) - __field( int, val_len ) - ), - - TP_fast_assign( - __entry->dev = sb->s_dev; - __entry->key_ino = le64_to_cpu(key->inode); - __entry->key_off = le64_to_cpu(key->offset); - __entry->key_type = key->type; - __entry->val_len = len; - ), - - TP_printk("dev %d,%d key "TRACE_KEYF" size %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->key_ino, show_key_type(__entry->key_type), - __entry->key_off, __entry->val_len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_lookup, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_insert, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_delete, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_dirty, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_update, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DECLARE_EVENT_CLASS(scoutfs_btree_ranged_op, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( u64, first_ino ) - __field( u64, first_off ) - __field( u8, first_type ) - __field( u64, last_ino ) - __field( u64, last_off ) - __field( u8, last_type ) - ), - - TP_fast_assign( - __entry->dev = sb->s_dev; - __entry->first_ino = le64_to_cpu(first->inode); - __entry->first_off = le64_to_cpu(first->offset); - __entry->first_type = first->type; - __entry->last_ino = le64_to_cpu(last->inode); - __entry->last_off = le64_to_cpu(last->offset); - __entry->last_type = last->type; - ), - - TP_printk("dev %d,%d first key "TRACE_KEYF" last key "TRACE_KEYF, - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->first_ino, - show_key_type(__entry->first_type), __entry->first_off, - __entry->last_ino, show_key_type(__entry->last_type), - __entry->last_off) -); - -DEFINE_EVENT(scoutfs_btree_ranged_op, scoutfs_btree_hole, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last) -); - -DEFINE_EVENT(scoutfs_btree_ranged_op, scoutfs_btree_next, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last) -); - -DEFINE_EVENT(scoutfs_btree_ranged_op, scoutfs_btree_prev, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last) -); - -DEFINE_EVENT(scoutfs_btree_ranged_op, scoutfs_btree_since, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last) -); - TRACE_EVENT(scoutfs_manifest_add, TP_PROTO(struct super_block *sb, struct kvec *first, struct kvec *last, u64 segno, u64 seq, u8 level), diff --git a/kmod/src/super.c b/kmod/src/super.c 
index 57dcbd46..158052c1 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -25,10 +25,8 @@ #include "dir.h" #include "xattr.h" #include "msg.h" -#include "block.h" #include "counters.h" #include "trans.h" -#include "buddy.h" #include "item.h" #include "manifest.h" #include "seg.h" @@ -96,8 +94,6 @@ void scoutfs_advance_dirty_super(struct super_block *sb) struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; - sbi->stable_super = sbi->super; - le64_add_cpu(&super->hdr.blkno, 1); if (le64_to_cpu(super->hdr.blkno) == (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR)) @@ -182,8 +178,6 @@ static int read_supers(struct super_block *sb) scoutfs_info(sb, "using super %u with seq %llu", found, le64_to_cpu(sbi->super.hdr.seq)); - sbi->stable_super = sbi->super; - return 0; } @@ -204,23 +198,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; spin_lock_init(&sbi->next_ino_lock); - spin_lock_init(&sbi->block_lock); - /* radix only inserted with NOFS _preload */ - INIT_RADIX_TREE(&sbi->block_radix, GFP_ATOMIC); - init_waitqueue_head(&sbi->block_wq); - atomic_set(&sbi->block_writes, 0); - INIT_LIST_HEAD(&sbi->block_lru_list); - init_rwsem(&sbi->btree_rwsem); atomic_set(&sbi->trans_holds, 0); init_waitqueue_head(&sbi->trans_hold_wq); spin_lock_init(&sbi->trans_write_lock); INIT_WORK(&sbi->trans_write_work, scoutfs_trans_write_func); init_waitqueue_head(&sbi->trans_write_wq); - sbi->block_shrinker.shrink = scoutfs_block_shrink; - sbi->block_shrinker.seeks = DEFAULT_SEEKS; - register_shrinker(&sbi->block_shrinker); - /* XXX can have multiple mounts of a device, need mount id */ sbi->kset = kset_create_and_add(sb->s_id, NULL, &scoutfs_kset->kobj); if (!sbi->kset) @@ -269,16 +252,12 @@ static void scoutfs_kill_sb(struct super_block *sb) if (sbi) { scoutfs_compact_destroy(sb); scoutfs_shutdown_trans(sb); - scoutfs_buddy_destroy(sb); - if (sbi->block_shrinker.shrink == scoutfs_block_shrink) - unregister_shrinker(&sbi->block_shrinker); scoutfs_data_destroy(sb); scoutfs_item_destroy(sb); scoutfs_alloc_destroy(sb); scoutfs_manifest_destroy(sb); scoutfs_treap_destroy(sb); scoutfs_seg_destroy(sb); - scoutfs_block_destroy(sb); scoutfs_destroy_counters(sb); if (sbi->kset) kset_unregister(sbi->kset); diff --git a/kmod/src/super.h b/kmod/src/super.h index 82eb6bba..e791e76d 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -5,10 +5,8 @@ #include #include "format.h" -#include "buddy.h" struct scoutfs_counters; -struct buddy_info; struct item_cache; struct manifest; struct segment_cache; @@ -20,20 +18,9 @@ struct scoutfs_sb_info { struct super_block *sb; struct scoutfs_super_block super; - struct scoutfs_super_block stable_super; spinlock_t next_ino_lock; - spinlock_t block_lock; - struct radix_tree_root block_radix; - wait_queue_head_t block_wq; - atomic_t block_writes; - int block_write_err; - /* block cache lru */ - struct shrinker block_shrinker; - struct list_head block_lru_list; - unsigned long block_lru_nr; - struct manifest *manifest; struct item_cache *item_cache; struct segment_cache *segment_cache; @@ -42,10 +29,6 @@ struct scoutfs_sb_info { struct compact_info *compact_info; struct data_info *data_info; - struct buddy_info *buddy_info; - - struct rw_semaphore btree_rwsem; - atomic_t trans_holds; wait_queue_head_t trans_hold_wq; struct task_struct *trans_task; @@ -68,17 +51,6 @@ static inline struct scoutfs_sb_info *SCOUTFS_SB(struct super_block *sb) return sb->s_fs_info; } -/* The root of the metadata btree */ -static 
inline struct scoutfs_btree_root *SCOUTFS_META(struct super_block *sb) -{ - return &SCOUTFS_SB(sb)->super.btree_root; -} - -static inline struct scoutfs_btree_root *SCOUTFS_STABLE_META(struct super_block *sb) -{ - return &SCOUTFS_SB(sb)->stable_super.btree_root; -} - void scoutfs_advance_dirty_super(struct super_block *sb); int scoutfs_write_dirty_super(struct super_block *sb); diff --git a/kmod/src/trans.c b/kmod/src/trans.c index 487514f6..d596bf68 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -18,9 +18,7 @@ #include #include "super.h" -#include "block.h" #include "trans.h" -#include "buddy.h" #include "data.h" #include "bio.h" #include "item.h" diff --git a/kmod/src/xattr.c b/kmod/src/xattr.c index 52f1acd7..afe1cc14 100644 --- a/kmod/src/xattr.c +++ b/kmod/src/xattr.c @@ -22,7 +22,6 @@ #include "kvec.h" #include "item.h" #include "trans.h" -#include "name.h" #include "xattr.h" /*
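For reference, the readdir-position comment removed from format.h above capped positions at 31 bits so that old signed 32-bit f_pos consumers and limited network protocols keep working. Below is a minimal userspace sketch of a position counter that respects that cap; the allocator, its names, and the choice to reserve the final position as an end-of-directory cursor are illustrative assumptions, not scoutfs's actual dirent placement.

/*
 * Illustrative only: hand out readdir positions that fit in 31 bits,
 * starting after . and .., per the comment removed from format.h.
 * Reserving the last position is an assumption based on the retained
 * "no entry at (f_pos)~0" comment.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define DIRENT_OFF_BITS 31
#define DIRENT_OFF_MASK ((1U << DIRENT_OFF_BITS) - 1)
#define DIRENT_FIRST_POS 2

/* next_pos plays the role of the inode's next_readdir_pos counter */
static bool alloc_readdir_pos(uint64_t *next_pos, uint32_t *pos_ret)
{
	/* keep the final 31-bit position free as the "no more entries" cursor */
	if (*next_pos >= DIRENT_OFF_MASK)
		return false;

	*pos_ret = (uint32_t)*next_pos;
	(*next_pos)++;
	return true;
}

int main(void)
{
	uint64_t next_pos = DIRENT_FIRST_POS;
	uint32_t pos;

	if (alloc_readdir_pos(&next_pos, &pos))
		printf("first entry at f_pos %u, last usable %u\n",
		       pos, DIRENT_OFF_MASK - 1);
	return 0;
}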
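The fixed-size key helpers removed from key.h above sort by (type, inode, offset) so each item class stays dense, and scoutfs_inc_key() saturates at the maximal key rather than wrapping. A standalone sketch of that compare/increment behaviour follows; the struct and function names are hypothetical and host-endian fields stand in for the on-disk __le64 values.

/*
 * Sketch of the removed fixed-size key helpers: compare by
 * (type, inode, offset) and increment with saturation at the
 * maximal key so right-spine parent keys never wrap to zero.
 */
#include <stdint.h>
#include <stdio.h>

struct key {
	uint64_t inode;
	uint8_t type;
	uint64_t offset;
};

static int u64_cmp(uint64_t a, uint64_t b)
{
	return a < b ? -1 : a > b ? 1 : 0;
}

/* type is most significant so a scan of one class skips no others */
static int key_cmp(const struct key *a, const struct key *b)
{
	int cmp = (int)a->type - (int)b->type;

	if (cmp)
		return cmp;
	cmp = u64_cmp(a->inode, b->inode);
	return cmp ? cmp : u64_cmp(a->offset, b->offset);
}

/* saturate at (~0, ~0, ~0) instead of wrapping back to zero */
static void inc_key(struct key *key)
{
	if (key->inode == UINT64_MAX && key->type == UINT8_MAX &&
	    key->offset == UINT64_MAX)
		return;

	if (++key->offset == 0 && ++key->type == 0)
		key->inode++;
}

int main(void)
{
	struct key a = { .inode = 1, .type = 2, .offset = UINT64_MAX };
	struct key b = a;

	inc_key(&b);
	printf("cmp %d\n", key_cmp(&a, &b)); /* -1: b advanced to type 3 */
	return 0;
}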
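The name hash removed with name.c builds a 64-bit value from crc32c over the first and last halves of the name (the halves overlap by one byte for odd lengths); the in-tree comment already flagged it as a quick hack to be replaced by something stronger like siphash. Here is a userspace sketch of the same construction, with a minimal bitwise CRC-32C standing in for the kernel's crc32c(), so the values are not guaranteed to match what the kernel helper produced. Covering the name as two halves keeps every byte in the hash while bounding each crc pass to half the name.

/*
 * Userspace sketch of the removed name hash: two crc32c passes over
 * the overlapping halves of the name, packed into one 64-bit value.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* minimal bitwise CRC-32C (Castagnoli), reflected polynomial */
static uint32_t crc32c_sw(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= p[i];
		for (bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ (0x82f63b78U & -(crc & 1U));
	}
	return crc;
}

static uint64_t name_hash(const char *name, unsigned int len)
{
	unsigned int half = (len + 1) / 2;

	return (uint64_t)crc32c_sw(~0U, name, half) |
	       ((uint64_t)crc32c_sw(~0U, name + len - half, half) << 32);
}

static int names_equal(const char *a, int len_a, const char *b, int len_b)
{
	return len_a == len_b && !memcmp(a, b, len_a);
}

int main(void)
{
	const char *name = "example";
	unsigned int len = (unsigned int)strlen(name);

	printf("hash %llx equal %d\n",
	       (unsigned long long)name_hash(name, len),
	       names_equal(name, len, name, len));
	return 0;
}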
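The scoutfs_advance_dirty_super() hunk above only shows the blkno increment and the comparison against SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR; the sketch below assumes the counter wraps back to the first super slot (an inference from the truncated context) and uses placeholder constants rather than the on-disk values.

/*
 * Sketch of rotating dirty-super writes through a small ring of
 * block locations.  SUPER_BLKNO and SUPER_NR are placeholders and
 * the wrap-around is an assumption, not taken from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define SUPER_BLKNO 1ULL	/* placeholder first super location */
#define SUPER_NR 2		/* placeholder number of super slots */

static uint64_t advance_super_blkno(uint64_t blkno)
{
	if (++blkno == SUPER_BLKNO + SUPER_NR)
		blkno = SUPER_BLKNO;
	return blkno;
}

int main(void)
{
	uint64_t blkno = SUPER_BLKNO;
	int i;

	/* each commit writes the next slot in the ring */
	for (i = 0; i < 4; i++) {
		blkno = advance_super_blkno(blkno);
		printf("write super to blkno %llu\n",
		       (unsigned long long)blkno);
	}
	return 0;
}

Rotating through more than one super location means the previously written super stays intact until the next commit completes, which is the usual motivation for this kind of ring.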