scoutfs/kmod/src/block.c

commit 161063c8d6 (Zach Brown): scoutfs: remove very noisy bh ref tracing
This wasn't adding much value and was exceptionally noisy.

Signed-off-by: Zach Brown <zab@versity.com>
2016-09-21 10:04:07 -07:00

/*
 * Copyright (C) 2016 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "super.h"
#include "format.h"
#include "block.h"
#include "crc.h"
#include "counters.h"
#include "buddy.h"

/*
 * scoutfs uses a fixed 4k block size for its metadata blocks. This
 * lets us consistently use buffer heads without worrying about having
 * a block size greater than the page size.
 *
 * This block interface does the work to cow dirty blocks, track dirty
 * blocks, generate checksums as they're written, only write them in
 * transactions, verify checksums on read, and invalidate and retry
 * reads of stale cached blocks. (That last bit only has a hint of an
 * implementation.)
 *
 * XXX
 *  - tear down dirty blocks left by write errors on unmount
 *  - should invalidate dirty blocks if freed
 */

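/*
 * A minimal sketch of how a caller might drive this interface from
 * inside a transaction. The transaction helpers and the modify()
 * callback here are hypothetical placeholders, not functions from
 * this module:
 *
 *	struct buffer_head *bh;
 *
 *	bh = scoutfs_block_dirty_ref(sb, ref);	// cow to a dirty copy
 *	if (IS_ERR(bh))
 *		return PTR_ERR(bh);
 *
 *	modify(bh_data(bh));			// hypothetical caller edit
 *	scoutfs_block_put(bh);
 *
 * At commit time the transaction machinery calls
 * scoutfs_block_write_dirty(sb) to checksum and submit every block in
 * the dirty tree.
 */
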
struct block_bh_private {
	struct super_block *sb;
	struct buffer_head *bh;
	struct rb_node node;
};

enum {
	BH_ScoutfsVerified = BH_PrivateStart,
};

BUFFER_FNS(ScoutfsVerified, scoutfs_verified)

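/*
 * The BUFFER_FNS() macro above generates the buffer_scoutfs_verified(),
 * set_buffer_scoutfs_verified(), and clear_buffer_scoutfs_verified()
 * helpers which test and update our private flag in bh->b_state.
 */
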
static int verify_block_header(struct scoutfs_sb_info *sbi,
			       struct buffer_head *bh)
{
	struct scoutfs_super_block *super = &sbi->super;
	struct scoutfs_block_header *hdr = (void *)bh->b_data;
	u32 crc = scoutfs_crc_block(hdr);
	int ret = -EIO;

	if (le32_to_cpu(hdr->crc) != crc) {
		printk("blkno %llu hdr crc %x != calculated %x\n",
		       (u64)bh->b_blocknr, le32_to_cpu(hdr->crc), crc);
	} else if (super->hdr.fsid && hdr->fsid != super->hdr.fsid) {
		printk("blkno %llu fsid %llx != super fsid %llx\n",
		       (u64)bh->b_blocknr, le64_to_cpu(hdr->fsid),
		       le64_to_cpu(super->hdr.fsid));
	} else if (le64_to_cpu(hdr->blkno) != bh->b_blocknr) {
		printk("blkno %llu invalid hdr blkno %llx\n",
		       (u64)bh->b_blocknr, le64_to_cpu(hdr->blkno));
	} else {
		ret = 0;
	}

	return ret;
}

static struct buffer_head *bh_from_bhp_node(struct rb_node *node)
{
	struct block_bh_private *bhp;

	bhp = container_of(node, struct block_bh_private, node);
	return bhp->bh;
}

static struct scoutfs_sb_info *sbi_from_bh(struct buffer_head *bh)
{
	struct block_bh_private *bhp = bh->b_private;

	return SCOUTFS_SB(bhp->sb);
}

static void insert_bhp_rb(struct rb_root *root, struct buffer_head *ins)
{
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;
	struct block_bh_private *bhp;
	struct buffer_head *bh;

	while (*node) {
		parent = *node;
		bh = bh_from_bhp_node(*node);

		if (ins->b_blocknr < bh->b_blocknr)
			node = &(*node)->rb_left;
		else
			node = &(*node)->rb_right;
	}

	bhp = ins->b_private;
	rb_link_node(&bhp->node, parent, node);
	rb_insert_color(&bhp->node, root);
}

/*
 * Track a dirty block by allocating private data and inserting it into
 * the dirty rbtree in the super block.
 *
 * Callers are in transactions that prevent metadata writeback so blocks
 * won't be written and cleaned while we're trying to dirty them. We
 * serialize racing attempts to add dirty tracking to the same block in
 * case the caller didn't.
 *
 * Presence in the dirty tree holds a bh ref.
 */
static int insert_bhp(struct super_block *sb, struct buffer_head *bh)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct block_bh_private *bhp;
	unsigned long flags;
	int ret = 0;

	/* cheap unlocked check, then re-check under the buffer lock */
	if (bh->b_private)
		return 0;

	lock_buffer(bh);
	if (bh->b_private)
		goto out;

	bhp = kmalloc(sizeof(*bhp), GFP_NOFS);
	if (!bhp) {
		ret = -ENOMEM;
		goto out;
	}

	bhp->sb = sb;
	bhp->bh = bh;
	get_bh(bh);
	bh->b_private = bhp;

	spin_lock_irqsave(&sbi->block_lock, flags);
	insert_bhp_rb(&sbi->block_dirty_tree, bh);
	spin_unlock_irqrestore(&sbi->block_lock, flags);

	trace_printk("blkno %llu bh %p\n", (u64)bh->b_blocknr, bh);
out:
	unlock_buffer(bh);
	return ret;
}

static void erase_bhp(struct buffer_head *bh)
{
	struct block_bh_private *bhp = bh->b_private;
	struct scoutfs_sb_info *sbi = sbi_from_bh(bh);
	unsigned long flags;

	spin_lock_irqsave(&sbi->block_lock, flags);
	rb_erase(&bhp->node, &sbi->block_dirty_tree);
	spin_unlock_irqrestore(&sbi->block_lock, flags);

	put_bh(bh);
	kfree(bhp);
	bh->b_private = NULL;

	trace_printk("blkno %llu bh %p\n", (u64)bh->b_blocknr, bh);
}

/*
 * Read an existing block from the device and verify its metadata header.
 * The buffer head is returned unlocked and uptodate.
 */
struct buffer_head *scoutfs_block_read(struct super_block *sb, u64 blkno)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct buffer_head *bh;
	int ret;

	bh = sb_bread(sb, blkno);
	if (!bh) {
		bh = ERR_PTR(-EIO);
		goto out;
	}

	if (!buffer_scoutfs_verified(bh)) {
		lock_buffer(bh);
		if (!buffer_scoutfs_verified(bh)) {
			ret = verify_block_header(sbi, bh);
			if (!ret)
				set_buffer_scoutfs_verified(bh);
		} else {
			ret = 0;
		}
		unlock_buffer(bh);

		if (ret < 0) {
			scoutfs_block_put(bh);
			bh = ERR_PTR(ret);
		}
	}

out:
	return bh;
}

/*
 * Read an existing block from the device described by the caller's
 * reference.
 *
 * If the reference sequence numbers don't match then we could be racing
 * with another writer. We back off and try again. If it happens too
 * many times the caller assumes that we've hit persistent corruption
 * and returns an error.
 *
 * XXX how does this race with
 *  - reads that span transactions?
 *  - writers creating a new dirty block?
 */
struct buffer_head *scoutfs_block_read_ref(struct super_block *sb,
					   struct scoutfs_block_ref *ref)
{
	struct scoutfs_block_header *hdr;
	struct buffer_head *bh;

	bh = scoutfs_block_read(sb, le64_to_cpu(ref->blkno));
	if (!IS_ERR(bh)) {
		hdr = bh_data(bh);
		if (WARN_ON_ONCE(hdr->seq != ref->seq)) {
			clear_buffer_uptodate(bh);
			brelse(bh);
			bh = ERR_PTR(-EAGAIN);
		}
	}

	return bh;
}

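/*
 * A sketch of the caller retry loop described above; the retry limit
 * and surrounding flow are assumptions for illustration, not code
 * from this module:
 *
 *	int retries = 10;	// hypothetical back-off limit
 *	struct buffer_head *bh;
 *
 *	do {
 *		bh = scoutfs_block_read_ref(sb, ref);
 *	} while (IS_ERR(bh) && PTR_ERR(bh) == -EAGAIN && retries-- > 0);
 *
 *	if (IS_ERR(bh))
 *		return PTR_ERR(bh);	// treated as persistent corruption
 */
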
/*
 * We stop tracking dirty metadata blocks when their IO succeeds. This
 * happens in the context of transaction commit which excludes other
 * metadata dirtying paths.
 */
static void block_write_end_io(struct buffer_head *bh, int uptodate)
{
	struct scoutfs_sb_info *sbi = sbi_from_bh(bh);

	trace_printk("bh %p uptodate %d\n", bh, uptodate);

	/* XXX */
	unlock_buffer(bh);

	if (uptodate) {
		erase_bhp(bh);
	} else {
		/* don't care if this is racy? */
		if (!sbi->block_write_err)
			sbi->block_write_err = -EIO;
	}

	if (atomic_dec_and_test(&sbi->block_writes))
		wake_up(&sbi->block_wq);
}

/*
 * Submit writes for all the buffer heads in the dirty block tree. The
 * write transaction machinery ensures that the dirty blocks form a
 * consistent image and excludes future dirtying while we're working.
 *
 * Presence in the dirty tree holds a reference. Blocks are only
 * removed from the tree, dropping the ref, when IO completes.
 *
 * Blocks that see write errors remain in the dirty tree and will be
 * written again in the next transaction commit.
 *
 * Reads can traverse the blocks while they're in flight.
 */
int scoutfs_block_write_dirty(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct buffer_head *bh;
	struct rb_node *node;
	struct blk_plug plug;
	unsigned long flags;
	int ret;

	/* hold a bias count so completions can't hit zero while we submit */
	atomic_set(&sbi->block_writes, 1);
	sbi->block_write_err = 0;
	ret = 0;

	blk_start_plug(&plug);

	spin_lock_irqsave(&sbi->block_lock, flags);
	node = rb_first(&sbi->block_dirty_tree);
	while (node) {
		bh = bh_from_bhp_node(node);
		node = rb_next(node);
		spin_unlock_irqrestore(&sbi->block_lock, flags);

		atomic_inc(&sbi->block_writes);
		scoutfs_block_set_crc(bh);

		/*
		 * XXX submit_bh() forces us to lock the block while IO is
		 * in flight. This is unfortunate because we use the buffer
		 * head lock to serialize access to btree block contents.
		 * We should fix that and only use the buffer head lock
		 * when the APIs force us to.
		 */
		lock_buffer(bh);
		bh->b_end_io = block_write_end_io;
		ret = submit_bh(WRITE, bh); /* doesn't actually fail? */

		spin_lock_irqsave(&sbi->block_lock, flags);
		if (ret)
			break;
	}
	spin_unlock_irqrestore(&sbi->block_lock, flags);

	blk_finish_plug(&plug);

	/* drop the bias and wait for all io to drain */
	atomic_dec(&sbi->block_writes);
	wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0);

	trace_printk("ret %d\n", ret);
	return ret;
}

/*
 * The caller knows that it's not racing with writers.
 */
int scoutfs_block_has_dirty(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

	return !RB_EMPTY_ROOT(&sbi->block_dirty_tree);
}

/*
 * Give the caller a dirty block that they can safely modify. If the
 * reference refers to a stable clean block then we allocate a new block
 * and update the reference.
 *
 * Blocks are dirtied and modified within a transaction that has a given
 * sequence number which we use to determine if the block is currently
 * dirty or not.
 *
 * For now we're using the dirty super block in the sb_info to track the
 * dirty seq. That'll be different when we have multiple btrees.
 *
 * Callers are responsible for serializing modification to the reference
 * which is probably embedded in some other dirty persistent structure.
 */
struct buffer_head *scoutfs_block_dirty_ref(struct super_block *sb,
					    struct scoutfs_block_ref *ref)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_block_header *hdr;
	struct buffer_head *copy_bh = NULL;
	struct buffer_head *bh;
	u64 blkno = 0;
	int ret;
	int err;

	bh = scoutfs_block_read(sb, le64_to_cpu(ref->blkno));
	if (IS_ERR(bh) || ref->seq == sbi->super.hdr.seq)
		return bh;

	ret = scoutfs_buddy_alloc_same(sb, &blkno, 0, le64_to_cpu(ref->blkno));
	if (ret < 0)
		goto out;

	copy_bh = scoutfs_block_dirty(sb, blkno);
	if (IS_ERR(copy_bh)) {
		ret = PTR_ERR(copy_bh);
		goto out;
	}

	ret = scoutfs_buddy_free(sb, bh->b_blocknr, 0);
	if (ret)
		goto out;

	memcpy(copy_bh->b_data, bh->b_data, SCOUTFS_BLOCK_SIZE);

	hdr = bh_data(copy_bh);
	hdr->blkno = cpu_to_le64(blkno);
	hdr->seq = sbi->super.hdr.seq;

	ref->blkno = hdr->blkno;
	ref->seq = hdr->seq;
	ret = 0;
out:
	scoutfs_block_put(bh);
	if (ret) {
		if (!IS_ERR_OR_NULL(copy_bh)) {
			err = scoutfs_buddy_free(sb, copy_bh->b_blocknr, 0);
			WARN_ON_ONCE(err); /* freeing dirty must work */
		}
		scoutfs_block_put(copy_bh);
		copy_bh = ERR_PTR(ret);
	}

	return copy_bh;
}

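/*
 * The cow flow from a caller's point of view, assuming a hypothetical
 * parent structure that embeds the ref; the names are illustrative
 * only:
 *
 *	struct parent {
 *		struct scoutfs_block_ref ref;	// hypothetical container
 *	} *parent;
 *	struct buffer_head *bh;
 *
 *	bh = scoutfs_block_dirty_ref(sb, &parent->ref);
 *	if (IS_ERR(bh))
 *		return PTR_ERR(bh);
 *
 * If the referenced block was clean this allocated a new blkno, copied
 * the contents into it, and updated parent->ref to point at the dirty
 * copy; the structure holding the ref is presumably itself dirty so
 * that the updated ref is persistent.
 */
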
/*
 * Return a dirty metadata block with an updated block header to match
 * the current dirty seq. Callers are responsible for serializing
 * access to the block and for zeroing unwritten block contents.
 */
struct buffer_head *scoutfs_block_dirty(struct super_block *sb, u64 blkno)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_block_header *hdr;
	struct buffer_head *bh;
	int ret;

	/* allocate a new block and try to insert it */
	bh = sb_getblk(sb, blkno);
	if (!bh) {
		bh = ERR_PTR(-ENOMEM);
		goto out;
	}

	ret = insert_bhp(sb, bh);
	if (ret < 0) {
		scoutfs_block_put(bh);
		bh = ERR_PTR(ret);
		goto out;
	}

	hdr = bh_data(bh);
	*hdr = sbi->super.hdr;
	hdr->blkno = cpu_to_le64(blkno);
	hdr->seq = sbi->super.hdr.seq;
	set_buffer_uptodate(bh);
	set_buffer_scoutfs_verified(bh);
out:
	return bh;
}

/*
 * Allocate a new dirty writable block. The caller must be in a
 * transaction so that we can assign the dirty seq.
 */
struct buffer_head *scoutfs_block_dirty_alloc(struct super_block *sb)
{
	struct buffer_head *bh;
	u64 blkno;
	int ret;
	int err;

	ret = scoutfs_buddy_alloc(sb, &blkno, 0);
	if (ret < 0)
		return ERR_PTR(ret);

	bh = scoutfs_block_dirty(sb, blkno);
	if (IS_ERR(bh)) {
		err = scoutfs_buddy_free(sb, blkno, 0);
		WARN_ON_ONCE(err); /* freeing dirty must work */
	}

	return bh;
}

void scoutfs_block_set_crc(struct buffer_head *bh)
{
	struct scoutfs_block_header *hdr = bh_data(bh);

	hdr->crc = cpu_to_le32(scoutfs_crc_block(hdr));
}

void scoutfs_block_zero(struct buffer_head *bh, size_t off)
{
	if (WARN_ON_ONCE(off > SCOUTFS_BLOCK_SIZE))
		return;

	if (off < SCOUTFS_BLOCK_SIZE)
		memset((char *)bh->b_data + off, 0, SCOUTFS_BLOCK_SIZE - off);
}