mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-09 05:13:18 +00:00
/*
 * Copyright (C) 2016 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/slab.h>

#include "super.h"
#include "format.h"
#include "block.h"
#include "crc.h"
#include "counters.h"
#include "buddy.h"

/*
 * scoutfs has a fixed 4k small block size for metadata blocks. This
 * lets us consistently use buffer heads without worrying about having a
 * block size greater than the page size.
 *
 * This block interface does the work to cow dirty blocks, track dirty
 * blocks, generate checksums as they're written, only write them in
 * transactions, verify checksums on read, and invalidate and retry
 * reads of stale cached blocks. (That last bit only has a hint of an
 * implementation.)
 *
 * XXX
 *  - tear down dirty blocks left by write errors on unmount
 *  - should invalidate dirty blocks if freed
 */
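
/*
 * A rough sketch of the expected calling pattern, for orientation only
 * (the real call sites live in the btree and transaction code and may
 * differ in detail):
 *
 *	(inside a transaction)
 *	bh = scoutfs_block_dirty_ref(sb, ref);	cow the block behind a ref
 *	... modify bh_data(bh) ...		caller serializes access
 *	scoutfs_block_put(bh);
 *
 *	scoutfs_block_write_dirty(sb);		at commit, write all dirty blocks
 */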

struct block_bh_private {
	struct super_block *sb;
	struct buffer_head *bh;
	struct rb_node node;
};

enum {
	BH_ScoutfsVerified = BH_PrivateStart,
};
BUFFER_FNS(ScoutfsVerified, scoutfs_verified)
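
/*
 * Check a freshly read block's header: the stored crc must match the
 * calculated crc, the fsid must match the super's (when the super has
 * one), and the header's blkno must match the location it was read
 * from.  Returns -EIO on any mismatch.
 */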
static int verify_block_header(struct scoutfs_sb_info *sbi,
			       struct buffer_head *bh)
{
	struct scoutfs_super_block *super = &sbi->super;
	struct scoutfs_block_header *hdr = (void *)bh->b_data;
	u32 crc = scoutfs_crc_block(hdr);
	int ret = -EIO;

	if (le32_to_cpu(hdr->crc) != crc) {
		printk("blkno %llu hdr crc %x != calculated %x\n",
		       (u64)bh->b_blocknr, le32_to_cpu(hdr->crc), crc);
	} else if (super->hdr.fsid && hdr->fsid != super->hdr.fsid) {
		printk("blkno %llu fsid %llx != super fsid %llx\n",
		       (u64)bh->b_blocknr, le64_to_cpu(hdr->fsid),
		       le64_to_cpu(super->hdr.fsid));
	} else if (le64_to_cpu(hdr->blkno) != bh->b_blocknr) {
		printk("blkno %llu invalid hdr blkno %llx\n",
		       (u64)bh->b_blocknr, le64_to_cpu(hdr->blkno));
	} else {
		ret = 0;
	}

	return ret;
}
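
/* map a node in the dirty rbtree back to its tracking struct's bh */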
static struct buffer_head *bh_from_bhp_node(struct rb_node *node)
{
	struct block_bh_private *bhp;

	bhp = container_of(node, struct block_bh_private, node);
	return bhp->bh;
}

static struct scoutfs_sb_info *sbi_from_bh(struct buffer_head *bh)
{
	struct block_bh_private *bhp = bh->b_private;

	return SCOUTFS_SB(bhp->sb);
}
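
/*
 * Link a tracked buffer head into the dirty rbtree, ordered by block
 * number.  The caller has already attached the block_bh_private struct
 * to the bh and holds the lock that protects the tree.
 */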
static void insert_bhp_rb(struct rb_root *root, struct buffer_head *ins)
{
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;
	struct block_bh_private *bhp;
	struct buffer_head *bh;

	while (*node) {
		parent = *node;
		bh = bh_from_bhp_node(*node);

		if (ins->b_blocknr < bh->b_blocknr)
			node = &(*node)->rb_left;
		else
			node = &(*node)->rb_right;
	}

	bhp = ins->b_private;
	rb_link_node(&bhp->node, parent, node);
	rb_insert_color(&bhp->node, root);
}

/*
 * Track a dirty block by allocating private data and inserting it into
 * the dirty rbtree in the super block.
 *
 * Callers are in transactions that prevent metadata writeback so blocks
 * won't be written and cleaned while we're trying to dirty them.  We
 * serialize racing to add dirty tracking to the same block in case the
 * caller didn't.
 *
 * Presence in the dirty tree holds a bh ref.
 */
static int insert_bhp(struct super_block *sb, struct buffer_head *bh)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct block_bh_private *bhp;
	unsigned long flags;
	int ret = 0;

	if (bh->b_private)
		return 0;

	lock_buffer(bh);
	if (bh->b_private)
		goto out;

	bhp = kmalloc(sizeof(*bhp), GFP_NOFS);
	if (!bhp) {
		ret = -ENOMEM;
		goto out;
	}

	bhp->sb = sb;
	bhp->bh = bh;
	get_bh(bh);
	bh->b_private = bhp;

	spin_lock_irqsave(&sbi->block_lock, flags);
	insert_bhp_rb(&sbi->block_dirty_tree, bh);
	spin_unlock_irqrestore(&sbi->block_lock, flags);

	trace_printk("blkno %llu bh %p\n", (u64)bh->b_blocknr, bh);
out:
	unlock_buffer(bh);
	return ret;
}
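
/*
 * Stop tracking a dirty block: remove it from the dirty rbtree, free
 * its private tracking struct, and drop the bh ref that the tree held.
 */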
static void erase_bhp(struct buffer_head *bh)
{
	struct block_bh_private *bhp = bh->b_private;
	struct scoutfs_sb_info *sbi = sbi_from_bh(bh);
	unsigned long flags;

	spin_lock_irqsave(&sbi->block_lock, flags);
	rb_erase(&bhp->node, &sbi->block_dirty_tree);
	spin_unlock_irqrestore(&sbi->block_lock, flags);

	put_bh(bh);
	kfree(bhp);
	bh->b_private = NULL;

	trace_printk("blkno %llu bh %p\n", (u64)bh->b_blocknr, bh);
}

/*
 * Read an existing block from the device and verify its metadata header.
 * The buffer head is returned unlocked and uptodate.
 */
struct buffer_head *scoutfs_block_read(struct super_block *sb, u64 blkno)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct buffer_head *bh;
	int ret;

	bh = sb_bread(sb, blkno);
	if (!bh) {
		bh = ERR_PTR(-EIO);
		goto out;
	}
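
	/*
	 * Verify the header only once per cached buffer: test the
	 * Verified bit, then retest it under the buffer lock so racing
	 * readers don't verify the same block twice.
	 */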
	if (!buffer_scoutfs_verified(bh)) {
		lock_buffer(bh);
		if (!buffer_scoutfs_verified(bh)) {
			ret = verify_block_header(sbi, bh);
			if (!ret)
				set_buffer_scoutfs_verified(bh);
		} else {
			ret = 0;
		}
		unlock_buffer(bh);
		if (ret < 0) {
			scoutfs_block_put(bh);
			bh = ERR_PTR(ret);
		}
	}

out:
	return bh;
}

/*
 * Read an existing block from the device described by the caller's
 * reference.
 *
 * If the reference sequence numbers don't match then we could be racing
 * with another writer.  We back off and try again.  If it happens too
 * many times the caller assumes that we've hit persistent corruption
 * and returns an error.
 *
 * XXX how does this race with
 *  - reads that span transactions?
 *  - writers creating a new dirty block?
 */
struct buffer_head *scoutfs_block_read_ref(struct super_block *sb,
					   struct scoutfs_block_ref *ref)
{
	struct scoutfs_block_header *hdr;
	struct buffer_head *bh;

	bh = scoutfs_block_read(sb, le64_to_cpu(ref->blkno));
	if (!IS_ERR(bh)) {
		hdr = bh_data(bh);
		if (WARN_ON_ONCE(hdr->seq != ref->seq)) {
			clear_buffer_uptodate(bh);
			brelse(bh);
			bh = ERR_PTR(-EAGAIN);
		}
	}

	return bh;
}

/*
 * We stop tracking dirty metadata blocks when their IO succeeds.  This
 * happens in the context of transaction commit which excludes other
 * metadata dirtying paths.
 */
static void block_write_end_io(struct buffer_head *bh, int uptodate)
{
	struct scoutfs_sb_info *sbi = sbi_from_bh(bh);

	trace_printk("bh %p uptodate %d\n", bh, uptodate);

	/* XXX */
	unlock_buffer(bh);

	if (uptodate) {
		erase_bhp(bh);
	} else {
		/* don't care if this is racey? */
		if (!sbi->block_write_err)
			sbi->block_write_err = -EIO;
	}

	if (atomic_dec_and_test(&sbi->block_writes))
		wake_up(&sbi->block_wq);
}

/*
 * Submit writes for all the buffer heads in the dirty block tree.  The
 * write transaction machinery ensures that the dirty blocks form a
 * consistent image and excludes future dirtying while we're working.
 *
 * Presence in the dirty tree holds a reference.  Blocks are only
 * removed from the tree, which drops the ref, when IO completes.
 *
 * Blocks that see write errors remain in the dirty tree and will try to
 * be written again in the next transaction commit.
 *
 * Reads can traverse the blocks while they're in flight.
 */
int scoutfs_block_write_dirty(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct buffer_head *bh;
	struct rb_node *node;
	struct blk_plug plug;
	unsigned long flags;
	int ret;
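
	/*
	 * Start the in-flight count at 1 so completions can't drop it
	 * to zero and wake the waiter before we've finished submitting
	 * all of the dirty blocks.
	 */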
	atomic_set(&sbi->block_writes, 1);
	sbi->block_write_err = 0;
	ret = 0;

	blk_start_plug(&plug);

	spin_lock_irqsave(&sbi->block_lock, flags);
	node = rb_first(&sbi->block_dirty_tree);
	while (node) {
		bh = bh_from_bhp_node(node);
		node = rb_next(node);
		spin_unlock_irqrestore(&sbi->block_lock, flags);

		atomic_inc(&sbi->block_writes);
		scoutfs_block_set_crc(bh);

		/*
		 * XXX submit_bh() forces us to lock the block while IO is
		 * in flight.  This is unfortunate because we use the buffer
		 * head lock to serialize access to btree block contents.
		 * We should fix that and only use the buffer head lock
		 * when the APIs force us to.
		 */
		lock_buffer(bh);

		bh->b_end_io = block_write_end_io;
		ret = submit_bh(WRITE, bh); /* doesn't actually fail? */

		spin_lock_irqsave(&sbi->block_lock, flags);
		if (ret)
			break;
	}
	spin_unlock_irqrestore(&sbi->block_lock, flags);

	blk_finish_plug(&plug);

	/* wait for all io to drain */
	atomic_dec(&sbi->block_writes);
	wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0);

	trace_printk("ret %d\n", ret);
	return ret;
}

/*
 * The caller knows that it's not racing with writers.
 */
int scoutfs_block_has_dirty(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

	return !RB_EMPTY_ROOT(&sbi->block_dirty_tree);
}

/*
 * Give the caller a dirty block that they can safely modify.  If the
 * reference refers to a stable clean block then we allocate a new block
 * and update the reference.
 *
 * Blocks are dirtied and modified within a transaction that has a given
 * sequence number which we use to determine if the block is currently
 * dirty or not.
 *
 * For now we're using the dirty super block in the sb_info to track the
 * dirty seq.  That'll be different when we have multiple btrees.
 *
 * Callers are responsible for serializing modification to the reference
 * which is probably embedded in some other dirty persistent structure.
 */
struct buffer_head *scoutfs_block_dirty_ref(struct super_block *sb,
					    struct scoutfs_block_ref *ref)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_block_header *hdr;
	struct buffer_head *copy_bh = NULL;
	struct buffer_head *bh;
	u64 blkno = 0;
	int ret;
	int err;

	bh = scoutfs_block_read(sb, le64_to_cpu(ref->blkno));
	if (IS_ERR(bh) || ref->seq == sbi->super.hdr.seq)
		return bh;

	ret = scoutfs_buddy_alloc_same(sb, &blkno, 0, le64_to_cpu(ref->blkno));
	if (ret < 0)
		goto out;

	copy_bh = scoutfs_block_dirty(sb, blkno);
	if (IS_ERR(copy_bh)) {
		ret = PTR_ERR(copy_bh);
		goto out;
	}

	ret = scoutfs_buddy_free(sb, bh->b_blocknr, 0);
	if (ret)
		goto out;
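
	/*
	 * The new block starts as a copy of the stable block, then its
	 * header and the caller's ref are updated to point at the new
	 * location.
	 */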
	memcpy(copy_bh->b_data, bh->b_data, SCOUTFS_BLOCK_SIZE);

	hdr = bh_data(copy_bh);
	hdr->blkno = cpu_to_le64(blkno);
	hdr->seq = sbi->super.hdr.seq;
	ref->blkno = hdr->blkno;
	ref->seq = hdr->seq;

	ret = 0;
out:
	scoutfs_block_put(bh);
	if (ret) {
		if (!IS_ERR_OR_NULL(copy_bh)) {
			err = scoutfs_buddy_free(sb, copy_bh->b_blocknr, 0);
			WARN_ON_ONCE(err); /* freeing dirty must work */
		}
		scoutfs_block_put(copy_bh);
		copy_bh = ERR_PTR(ret);
	}

	return copy_bh;
}

/*
 * Return a dirty metadata block with an updated block header to match
 * the current dirty seq.  Callers are responsible for serializing
 * access to the block and for zeroing unwritten block contents.
 */
struct buffer_head *scoutfs_block_dirty(struct super_block *sb, u64 blkno)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_block_header *hdr;
	struct buffer_head *bh;
	int ret;

	/* allocate a new block and try to insert it */
	bh = sb_getblk(sb, blkno);
	if (!bh) {
		bh = ERR_PTR(-ENOMEM);
		goto out;
	}

	ret = insert_bhp(sb, bh);
	if (ret < 0) {
		scoutfs_block_put(bh);
		bh = ERR_PTR(ret);
		goto out;
	}
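
	/*
	 * Initialize the new block's header from the dirty super's
	 * header so it carries the current fsid and dirty seq.
	 */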
	hdr = bh_data(bh);
	*hdr = sbi->super.hdr;
	hdr->blkno = cpu_to_le64(blkno);
	hdr->seq = sbi->super.hdr.seq;

	set_buffer_uptodate(bh);
	set_buffer_scoutfs_verified(bh);
out:
	return bh;
}

/*
 * Allocate a new dirty writable block.  The caller must be in a
 * transaction so that we can assign the dirty seq.
 */
struct buffer_head *scoutfs_block_dirty_alloc(struct super_block *sb)
{
	struct buffer_head *bh;
	u64 blkno;
	int ret;
	int err;

	ret = scoutfs_buddy_alloc(sb, &blkno, 0);
	if (ret < 0)
		return ERR_PTR(ret);

	bh = scoutfs_block_dirty(sb, blkno);
	if (IS_ERR(bh)) {
		err = scoutfs_buddy_free(sb, blkno, 0);
		WARN_ON_ONCE(err); /* freeing dirty must work */
	}
	return bh;
}
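
/*
 * Stamp the block header's crc just before the block is written so the
 * crc covers its final contents.
 */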
void scoutfs_block_set_crc(struct buffer_head *bh)
{
	struct scoutfs_block_header *hdr = bh_data(bh);

	hdr->crc = cpu_to_le32(scoutfs_crc_block(hdr));
}
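
/*
 * Zero the block's contents from the given offset to the end of the
 * block.  An offset past the block size is a caller bug.
 */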
void scoutfs_block_zero(struct buffer_head *bh, size_t off)
{
	if (WARN_ON_ONCE(off > SCOUTFS_BLOCK_SIZE))
		return;

	if (off < SCOUTFS_BLOCK_SIZE)
		memset((char *)bh->b_data + off, 0, SCOUTFS_BLOCK_SIZE - off);
}