/*
 * Copyright (C) 2016 Versity Software, Inc.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/buffer_head.h>

#include "super.h"
#include "key.h"
#include "segment.h"
#include "manifest.h"
#include "block.h"
#include "chunk.h"
#include "ring.h"
#include "bloom.h"
#include "skip.h"

/*
 * scoutfs log segments are large multi-block structures that contain
 * key/value items.  This file implements manipulations of the items.
 *
 * Each log segment starts with a bloom filter to support quickly
 * testing for a key without having to search the whole segment.
 *
 * After the bloom filter come the packed structures that describe the
 * items that are present in the segment.  They're sorted in a skip
 * list to support reasonably efficient insertion, sorted iteration,
 * and deletion.
 *
 * Finally the item values are stored at the end of the segment.  This
 * lets us discover that an item's key isn't present by reading only
 * the item structs, not the values.
 *
 * All told, should we choose to, we can have three large portions of
 * the blocks resident for searching.  It's likely that we'll keep the
 * bloom filters hot but that the items and especially the values may
 * age out of the cache.
 */

void scoutfs_put_ref(struct scoutfs_item_ref *ref)
{
	if (ref->item_bh)
		brelse(ref->item_bh);
	if (ref->val_bh)
		brelse(ref->val_bh);
	memset(ref, 0, sizeof(struct scoutfs_item_ref));
}

/* private to here */
struct scoutfs_item_iter {
	struct list_head list;
	struct buffer_head *bh;
	struct scoutfs_item *item;
	u64 blkno;
	bool restart_after;
};

void scoutfs_put_iter_list(struct list_head *list)
{
	struct scoutfs_item_iter *iter;
	struct scoutfs_item_iter *pos;

	list_for_each_entry_safe(iter, pos, list, list) {
		list_del_init(&iter->list);
		brelse(iter->bh);
		kfree(iter);
	}
}

/*
 * The caller has a pointer to an item and a reference to its block.
 * We read the value block and populate the reference.
 *
 * The item references get their own buffer head references so that the
 * caller doesn't have to play funny games.  The caller always has to
 * release its bh.  If this succeeds then it also needs to put the ref.
 */
static int populate_ref(struct super_block *sb, u64 blkno,
			struct buffer_head *item_bh,
			struct scoutfs_item *item,
			struct scoutfs_item_ref *ref)
{
	struct buffer_head *bh;

	bh = scoutfs_read_block_off(sb, blkno, le32_to_cpu(item->offset));
	if (!bh)
		return -EIO;

	ref->key = &item->key;
	ref->val_len = le16_to_cpu(item->len);
	ref->val = bh->b_data + (le32_to_cpu(item->offset) & SCOUTFS_BLOCK_MASK);

	get_bh(item_bh);
	ref->item_bh = item_bh;
	ref->val_bh = bh;

	return 0;
}
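/*
 * A rough sketch of the segment (chunk) layout described at the top of
 * this file; it's illustrative only, the real boundaries come from the
 * SCOUTFS_* constants and every block in the chunk begins with a
 * struct scoutfs_block_header:
 *
 *   | bloom filter blocks | item block hdr | item structs -> ... <- values |
 *   0                                                       SCOUTFS_CHUNK_SIZE
 *
 * Item structs are allocated forward from just after the item block
 * header and values are allocated backward from the end of the chunk;
 * the segment is full when the two offsets meet.
 */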
/*
 * Return a reference to the item at the given key.  We walk the
 * manifest to find segments that might contain the key from most
 * recent to oldest.  To find the key in each log segment we test its
 * bloom filter and then search through the item keys.  The first
 * matching item we find is returned.
 *
 * XXX lock the dirty log segment?
 *
 * -ENOENT is returned if the item isn't present.  The caller needs to
 * put the ref if we return success.
 */
int scoutfs_read_item(struct super_block *sb, struct scoutfs_key *key,
		      struct scoutfs_item_ref *ref)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_ring_manifest_entry ment;
	struct scoutfs_item *item = NULL;
	struct scoutfs_bloom_bits bits;
	struct buffer_head *bh;
	int ret;

	/* XXX hold manifest */

	scoutfs_calc_bloom_bits(&bits, key, sbi->super.bloom_salts);

	item = NULL;
	ret = -ENOENT;
	memset(&ment, 0, sizeof(struct scoutfs_ring_manifest_entry));
	while (scoutfs_foreach_range_segment(sb, key, key, &ment)) {

		/* XXX read-ahead all bloom blocks */
		ret = scoutfs_test_bloom_bits(sb, le64_to_cpu(ment.blkno),
					      &bits);
		if (ret < 0)
			break;
		if (!ret) {
			ret = -ENOENT;
			continue;
		}

		/* XXX read-ahead all item header blocks */
		ret = scoutfs_skip_lookup(sb, le64_to_cpu(ment.blkno), key,
					  &bh, &item);
		if (ret) {
			if (ret == -ENOENT)
				continue;
			break;
		}

		break;
	}

	/* XXX release manifest */

	/* XXX read-ahead all value blocks? */

	if (!ret) {
		ret = populate_ref(sb, le64_to_cpu(ment.blkno), bh, item, ref);
		brelse(bh);
	}

	return ret;
}

/*
 * The dirty_item_off points to the byte offset after the last item.
 * Advance it past block tails and initial block headers until there's
 * room for an item with the given skip list element height.  Then set
 * the dirty_item_off past the item offset we return.
 */
static int add_item_off(struct scoutfs_sb_info *sbi, int height)
{
	int len = offsetof(struct scoutfs_item, skip_next[height]);
	int off = sbi->dirty_item_off;
	int tail_free;

	/* items can't cross a block boundary */
	tail_free = SCOUTFS_BLOCK_SIZE - (off & SCOUTFS_BLOCK_MASK);
	if (tail_free < len)
		off += tail_free + sizeof(struct scoutfs_block_header);

	sbi->dirty_item_off = off + len;

	return off;
}

/*
 * The dirty_val_off points to the first byte of the last value that
 * was allocated.  Subtract the length to make room for a new value of
 * the given length.  If that crosses a block boundary or wanders into
 * a block header then pull it back into the tail of the previous
 * block.
 */
static int sub_val_off(struct scoutfs_sb_info *sbi, int len)
{
	int off = sbi->dirty_val_off - len;
	int block_off;
	int tail_free;

	/* values can't start in a block header */
	block_off = off & SCOUTFS_BLOCK_MASK;
	if (block_off < sizeof(struct scoutfs_block_header))
		off -= (block_off + 1);

	/* values can't cross a block boundary */
	tail_free = SCOUTFS_BLOCK_SIZE - (off & SCOUTFS_BLOCK_MASK);
	if (tail_free < len)
		off -= len - tail_free;

	sbi->dirty_val_off = off;

	return off;
}
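/*
 * A minimal sketch of the read side of the reference contract above,
 * for illustration only; example_copy_item_value() is not part of
 * scoutfs and isn't called from anywhere.  It assumes the caller owns
 * "buf" and just copies out as much of the value as fits before
 * dropping the reference.
 */
static void __maybe_unused example_copy_item_value(struct super_block *sb,
						   struct scoutfs_key *key,
						   void *buf, unsigned len)
{
	struct scoutfs_item_ref ref;
	int ret;

	ret = scoutfs_read_item(sb, key, &ref);
	if (ret)
		return;

	memcpy(buf, ref.val, min_t(unsigned, len, ref.val_len));

	/* successful lookups must put the ref */
	scoutfs_put_ref(&ref);
}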
/*
 * Initialize the buffers for the next dirty segment.  We have to
 * initialize the bloom filter bits and the item block header.
 *
 * XXX we need to really pin the blocks somehow
 */
static int start_dirty_segment(struct super_block *sb, u64 blkno)
{
	struct scoutfs_bloom_block *blm;
	struct scoutfs_item_block *iblk;
	struct buffer_head *bh;
	int ret = 0;
	int i;

	for (i = 0; i < SCOUTFS_BLOCKS_PER_CHUNK; i++) {
		bh = scoutfs_new_block(sb, blkno + i);
		if (!bh) {
			ret = -EIO;
			break;
		}

		if (i < SCOUTFS_BLOOM_BLOCKS) {
			blm = (void *)bh->b_data;
			memset(blm->bits, 0, SCOUTFS_BLOCK_SIZE -
			       offsetof(struct scoutfs_bloom_block, bits));
		}

		if (i == SCOUTFS_BLOOM_BLOCKS) {
			iblk = (void *)bh->b_data;
			/* also zero first unused item slot */
			memset(&iblk->skip_root, 0, sizeof(iblk->skip_root) +
			       sizeof(struct scoutfs_item));
		}

		/* bh is pinned by sbi->dirty_blkno */
	}

	while (ret && i--) {
		/* unwind pinned blocks on failure */
		bh = sb_getblk(sb, blkno + i);
		if (bh) {
			brelse(bh);
			brelse(bh);
		}
	}

	return ret;
}

/*
 * Zero the portion of this block that intersects with the free space
 * in the middle of the segment.  @start and @end are chunk-relative
 * byte offsets of the inclusive start and exclusive end of the free
 * region.
 */
static void zero_unused_block(struct super_block *sb, struct buffer_head *bh,
			      u32 start, u32 end)
{
	u32 off = bh->b_blocknr << SCOUTFS_BLOCK_SHIFT;

	/* see if the free range falls outside our block */
	if (start >= off + SCOUTFS_BLOCK_SIZE || end <= off)
		return;

	/* convert the chunk offsets to our block offsets */
	start = max(start, off) - off;
	end = min(off + SCOUTFS_BLOCK_SIZE, end) - off;

	/* don't zero block headers */
	start = max_t(u32, start, sizeof(struct scoutfs_block_header));
	end = max_t(u32, end, sizeof(struct scoutfs_block_header));

	if (start < end)
		memset(bh->b_data + start, 0, end - start);
}

/*
 * Finish off a dirty segment if we have one.  Calculate the checksums
 * of all the blocks, mark them dirty, and drop their pinned reference.
 */
int scoutfs_finish_dirty_segment(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
	struct buffer_head *bh;
	u64 blkno;
	int ret = 0;
	u64 i;

	/* XXX sync doesn't lock this test? */
	blkno = sbi->dirty_blkno;
	if (!blkno)
		return 0;

	for (i = 0; i < SCOUTFS_BLOCKS_PER_CHUNK; i++) {
		bh = scoutfs_read_block(sb, blkno + i);
		/* should have been pinned */
		if (WARN_ON_ONCE(!bh)) {
			ret = -EIO;
			break;
		}

		zero_unused_block(sb, bh, sbi->dirty_item_off,
				  sbi->dirty_val_off);
		scoutfs_calc_hdr_crc(bh);
		mark_buffer_dirty(bh);
		brelse(bh);
		/* extra release to unpin */
		brelse(bh);
	}

	/*
	 * XXX the manifest entry for this log segment has a key range
	 * that is much too large.  We should shrink it here to reflect
	 * the real keys.  That would reduce the number of blocks
	 * involved in merging it into level 1.
	 */

	/*
	 * Try to kick off a background write of the finished segment.
	 * Callers can wait for the buffers in writeback if they need to.
	 */
	if (!ret) {
		filemap_fdatawrite_range(mapping, blkno << SCOUTFS_CHUNK_SHIFT,
					 ((blkno + 1) << SCOUTFS_CHUNK_SHIFT) - 1);
		sbi->dirty_blkno = 0;
	}

	return ret;
}
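/*
 * Taken together, the helpers above and scoutfs_create_item() below
 * give the dirty segment a simple lifecycle (a rough summary, not a
 * spec): a chunk is allocated and its blocks pinned and initialized by
 * start_dirty_segment(), a wide level 0 manifest entry is added so
 * readers can find it, items and values are packed in from either end,
 * and once the two offsets meet scoutfs_finish_dirty_segment() zeroes
 * the unused middle, checksums the blocks, and kicks off writeback.
 */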
/*
 * Return a reference to a newly allocated and initialized item in a
 * block in the currently dirty log segment.
 *
 * Item creation is purposely kept very simple.  Item and value offset
 * allocation proceed from either end of the log segment.  Once they
 * intersect the log segment is full and written out.  Deleted dirty
 * items don't reclaim their space.  The free space will be reclaimed
 * by the level 0 -> level 1 merge that happens anyway.  Not reclaiming
 * free space makes item location more rigid and lets us relax the
 * locking requirements of item references.  An item reference doesn't
 * have to worry about unrelated item modification moving its item
 * around to, say, defragment free space.
 */
int scoutfs_create_item(struct super_block *sb, struct scoutfs_key *key,
			unsigned bytes, struct scoutfs_item_ref *ref)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_ring_manifest_entry ment;
	struct scoutfs_bloom_bits bits;
	struct scoutfs_item *item;
	struct buffer_head *bh;
	int item_off;
	int val_off;
	int height;
	u64 blkno;
	int ret = 0;

	/* XXX how big should items really get? */
	if (WARN_ON_ONCE(bytes == 0 || bytes > 4096))
		return -EINVAL;

	height = scoutfs_skip_random_height();

	mutex_lock(&sbi->dirty_mutex);

next_chunk:
	if (!sbi->dirty_blkno) {
		ret = scoutfs_alloc_chunk(sb, &blkno);
		if (ret)
			goto out;

		/* XXX free blkno on error? */
		ret = start_dirty_segment(sb, blkno);
		if (ret)
			goto out;

		/*
		 * We need a local manifest entry in memory to find
		 * items as we insert them in the dirty segment.  We
		 * don't know what keys are going to be used so we
		 * cover the whole key space.
		 *
		 * XXX But we're also adding it to the ring here.  We
		 * should add it as it's finalized and its item range
		 * is collapsed.
		 */
		ment.blkno = cpu_to_le64(blkno);
		ment.seq = sbi->super.hdr.seq;
		ment.level = 0;
		memset(&ment.first, 0, sizeof(ment.first));
		memset(&ment.last, ~0, sizeof(ment.last));

		ret = scoutfs_new_manifest(sb, &ment);
		if (ret)
			goto out;

		sbi->dirty_blkno = blkno;
		sbi->dirty_item_off = (SCOUTFS_BLOCK_SIZE *
				       SCOUTFS_BLOOM_BLOCKS) +
				      sizeof(struct scoutfs_item_block);
		sbi->dirty_val_off = SCOUTFS_CHUNK_SIZE;
	}

	item_off = add_item_off(sbi, height);
	val_off = sub_val_off(sbi, bytes);
	if (item_off > val_off) {
		ret = scoutfs_finish_dirty_segment(sb);
		if (ret)
			goto out;
		goto next_chunk;
	}

	/* XXX fix up this error handling in general */

	bh = scoutfs_read_block_off(sb, sbi->dirty_blkno, item_off);
	if (!bh) {
		ret = -EIO;
		goto out;
	}

	/* populate iblk first and last? better than in manifest? */

	item = (void *)bh->b_data + (item_off & SCOUTFS_BLOCK_MASK);
	item->key = *key;
	item->offset = cpu_to_le32(val_off);
	item->len = cpu_to_le16(bytes);
	item->skip_height = height;

	ret = scoutfs_skip_insert(sb, sbi->dirty_blkno, item, item_off);
	if (ret)
		goto out;

	ret = populate_ref(sb, sbi->dirty_blkno, bh, item, ref);
	brelse(bh);
	if (ret)
		goto out;

	/* XXX delete skip on failure? */

	/* set the bloom bits last because we can't unset them */
	scoutfs_calc_bloom_bits(&bits, key, sbi->super.bloom_salts);
	ret = scoutfs_set_bloom_bits(sb, sbi->dirty_blkno, &bits);

out:
	WARN_ON_ONCE(ret); /* XXX error paths are not robust */
	mutex_unlock(&sbi->dirty_mutex);

	return ret;
}
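/*
 * A minimal sketch of the write side described above, for illustration
 * only; example_write_item() is not part of scoutfs and isn't called
 * from anywhere.  The value space returned in the ref is assumed to be
 * uninitialized, so the sketch fills all "bytes" of ref.val before
 * dropping the reference.
 */
static int __maybe_unused example_write_item(struct super_block *sb,
					     struct scoutfs_key *key,
					     void *val, unsigned bytes)
{
	struct scoutfs_item_ref ref;
	int ret;

	ret = scoutfs_create_item(sb, key, bytes, &ref);
	if (ret)
		return ret;

	memcpy(ref.val, val, bytes);
	scoutfs_put_ref(&ref);

	return 0;
}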
/*
 * Ensure that there is a dirty item with the given key in the current
 * dirty segment.
 *
 * The caller locks access to the item, prevents sync, and has made
 * sure that there's enough free space in the segment for its dirty
 * inodes.
 *
 * This is better than getting -EEXIST from create_item because that
 * would leave the allocated item and value dangling in the block when
 * it returns the error.
 */
int scoutfs_dirty_item(struct super_block *sb, struct scoutfs_key *key,
		       unsigned bytes, struct scoutfs_item_ref *ref)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_item *item;
	struct buffer_head *bh;
	bool create = false;
	int ret;

	mutex_lock(&sbi->dirty_mutex);

	if (sbi->dirty_blkno) {
		ret = scoutfs_skip_lookup(sb, sbi->dirty_blkno, key,
					  &bh, &item);
		if (ret == -ENOENT) {
			create = true;
		} else if (!ret) {
			ret = populate_ref(sb, sbi->dirty_blkno, bh, item, ref);
			brelse(bh);
		}
	} else {
		create = true;
	}

	mutex_unlock(&sbi->dirty_mutex);

	if (create)
		ret = scoutfs_create_item(sb, key, bytes, ref);

	return ret;
}

/*
 * This is a really cheesy temporary delete method.  It only works on
 * items that are stored in dirty blocks.  The caller is responsible
 * for dropping the ref.  XXX be less bad.
 */
int scoutfs_delete_item(struct super_block *sb, struct scoutfs_item_ref *ref)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	u64 blkno;
	int ret;

	blkno = round_down(ref->item_bh->b_blocknr, SCOUTFS_BLOCKS_PER_CHUNK);
	if (WARN_ON_ONCE(blkno != sbi->dirty_blkno))
		return -EINVAL;

	ret = scoutfs_skip_delete(sb, blkno, ref->key);
	WARN_ON_ONCE(ret);

	return ret;
}

/*
 * Return a reference to the next item in the inclusive search range.
 * The caller should have access to the search key range.
 *
 * We walk the manifest to find all the log segments that could contain
 * the start of the range.  We hold cursors on the blocks in the
 * segments.  Each next item iteration comes from finding the least of
 * the next items at all these cursors.
 *
 * If we exhaust a segment at a given level we may need to search the
 * next segment in that level to find the next item.  The manifest may
 * have changed under us while we walked our old set of segments.  So
 * we restart the entire search to get another consistent collection of
 * segments to search.
 *
 * We put the segment references and iteration cursors in a list in the
 * caller so that they can find many next items by advancing the
 * cursors without having to walk the manifest and perform initial
 * binary searches in each segment.
 *
 * The caller is responsible for putting the item ref if we return
 * success.  -ENOENT is returned if there are no more items in the
 * search range.
 *
 * XXX this is wonky.  We don't want to search the manifest for the
 * range, just the initial value.  Then we record the last key in
 * segments we finish and only restart if the least is > that or there
 * is no least.  We have to advance the first key when restarting the
 * search.
 */
int scoutfs_next_item(struct super_block *sb, struct scoutfs_key *first,
		      struct scoutfs_key *last, struct list_head *iter_list,
		      struct scoutfs_item_ref *ref)
{
	struct scoutfs_ring_manifest_entry ment;
	struct scoutfs_item_iter *least;
	struct scoutfs_item_iter *iter;
	struct scoutfs_item_iter *pos;
	int ret;

restart:
	if (list_empty(iter_list)) {
		/*
		 * Find all the segments that intersect the search
		 * range and find the next item in each block from the
		 * start of the range.
		 */
		memset(&ment, 0, sizeof(struct scoutfs_ring_manifest_entry));
		while (scoutfs_foreach_range_segment(sb, first, last, &ment)) {
			iter = kzalloc(sizeof(struct scoutfs_item_iter),
				       GFP_NOFS);
			if (!iter) {
				ret = -ENOMEM;
				goto out;
			}

			/*
			 * We will restart the walk of the manifest
			 * blocks if we iterate over all the items in
			 * this block without exhausting the search
			 * range.
			 */
			if (ment.level > 0 &&
			    scoutfs_key_cmp(&ment.last, last) < 0)
				iter->restart_after = true;

			iter->blkno = le64_to_cpu(ment.blkno);
			list_add_tail(&iter->list, iter_list);
		}

		if (list_empty(iter_list)) {
			ret = -ENOENT;
			goto out;
		}
	}

	least = NULL;
	ret = 0;
	list_for_each_entry_safe(iter, pos, iter_list, list) {

		/* search towards the first key if we haven't yet */
		if (!iter->item) {
			ret = scoutfs_skip_search(sb, iter->blkno, first,
						  &iter->bh, &iter->item);
		}

		/* then iterate until we find or pass the first key */
		while (!ret && scoutfs_key_cmp(&iter->item->key, first) < 0) {
			ret = scoutfs_skip_next(sb, iter->blkno,
						&iter->bh, &iter->item);
		}

		/* we're done with this block if we passed the last key */
		while (!ret && scoutfs_key_cmp(&iter->item->key, last) > 0) {
			brelse(iter->bh);
			iter->bh = NULL;
			iter->item = NULL;
			ret = -ENOENT;
		}

		if (ret == -ENOENT) {
			if (iter->restart_after) {
				/* need the next block at this level */
				scoutfs_put_iter_list(iter_list);
				goto restart;
			} else {
				/* this level is done */
				list_del_init(&iter->list);
				brelse(iter->bh);
				kfree(iter);
				continue;
			}
		}
		if (ret)
			goto out;

		/* remember the most recent item with the smallest key */
		if (!least || scoutfs_key_cmp(&iter->item->key,
					      &least->item->key) < 0)
			least = iter;
	}

	if (least)
		ret = populate_ref(sb, least->blkno, least->bh, least->item,
				   ref);
	else
		ret = -ENOENT;

out:
	if (ret)
		scoutfs_put_iter_list(iter_list);
	return ret;
}
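/*
 * A minimal sketch of the iteration API above, for illustration only;
 * example_find_next_key() is not part of scoutfs and isn't called from
 * anywhere.  It finds the first item at or after *key within the
 * inclusive [*key, *last] range, copies its key back to the caller,
 * and drops the references.  Real callers keep the iter list around
 * and advance the first key between calls so that many next items can
 * be found without re-walking the manifest.
 */
static int __maybe_unused example_find_next_key(struct super_block *sb,
						struct scoutfs_key *key,
						struct scoutfs_key *last)
{
	struct scoutfs_item_ref ref;
	LIST_HEAD(iter_list);
	int ret;

	ret = scoutfs_next_item(sb, key, last, &iter_list, &ref);
	if (!ret) {
		*key = *ref.key;
		scoutfs_put_ref(&ref);
	}

	/* next_item puts the list on error; put it on success too */
	scoutfs_put_iter_list(&iter_list);

	return ret;
}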