mirror of
https://github.com/versity/scoutfs.git
synced 2026-05-01 10:25:43 +00:00
As subsystems were built I tended to use interruptible waits in the hope that we'd let users break out of most waits. The reality is that we have significant code paths that have trouble unwinding. Final inode deletion during iput->evict in a task is a good example: it's madness to have a pending signal turn an inode deletion from an efficient inline operation into a deferred background orphan inode scan deletion. It also happens that golang built pre-emptive thread scheduling around signals. Under load we see a surprising amount of signal spam, and it has created error cases that would otherwise have been fine.

This changes waits to expect that IOs (including network commands) will complete reasonably promptly. We remove all interruptible waits, with the notable exception of breaking out of a pending mount. That requires shuffling setup around a little bit so that the first network message we wait for is the lock for getting the root inode.

Signed-off-by: Zach Brown <zab@versity.com>
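
As a rough illustration of the pattern this commit moves away from (a minimal sketch, not code from the diff; binf->waitq and uptodate_or_error() are names from the block cache code below):

	/* before: a pending signal aborts the wait and the caller has to unwind */
	ret = wait_event_interruptible(binf->waitq, uptodate_or_error(bp));
	if (ret == -ERESTARTSYS)
		goto unwind;

	/* after: wait for the IO to complete; completions are expected to be prompt */
	wait_event(binf->waitq, uptodate_or_error(bp));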
1288 lines
34 KiB
C
/*
 * Copyright (C) 2019 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/rhashtable.h>
#include <linux/random.h>

#include "format.h"
#include "super.h"
#include "block.h"
#include "counters.h"
#include "msg.h"
#include "scoutfs_trace.h"
#include "alloc.h"
#include "triggers.h"

/*
 * The scoutfs block cache manages metadata blocks that can be larger
 * than the page size. Callers can have their own contexts for tracking
 * dirty blocks that are written together. We pin dirty blocks in
 * memory and only checksum them all as they're all written.
 *
 * Memory reclaim is driven by maintaining two very coarse groups of
 * blocks. As we access blocks we mark them with an increasing counter
 * to discourage them from being reclaimed. We then define a threshold
 * at the current counter minus half the population. Recent blocks have
 * a counter greater than the threshold, and all other blocks with
 * counters less than it are considered older and are candidates for
 * reclaim. This results in access updates rarely modifying an atomic
 * counter as blocks need to be moved into the recent group, and shrink
 * can randomly scan blocks looking for the half of the population that
 * will be in the old group. It's not particularly precise, but it is
 * reasonably efficient and avoids contention between concurrent
 * accesses and shrinking.
 */
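
/*
 * Editorial sketch, not part of the original source: with roughly 1000
 * inserted blocks and an access counter at 5000, the recency threshold is
 * 5000 - (1000 / 2) = 4500.  A block last tagged at 4700 counts as recent
 * and is skipped by the shrinker; one tagged at 3000 falls in the older
 * half and is a reclaim candidate.  The helper below just restates that
 * comparison with hypothetical names.
 */
static inline bool example_is_reclaim_candidate(u64 accessed, u64 counter, u64 population)
{
	return accessed < counter - (population >> 1);
}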

struct block_info {
	struct super_block *sb;
	atomic_t total_inserted;
	atomic64_t access_counter;
	struct rhashtable ht;
	wait_queue_head_t waitq;
	struct shrinker shrinker;
	struct work_struct free_work;
	struct llist_head free_llist;
};

#define DECLARE_BLOCK_INFO(sb, name) \
	struct block_info *name = SCOUTFS_SB(sb)->block_info

enum block_status_bits {
	BLOCK_BIT_UPTODATE = 0,	/* contents consistent with media */
	BLOCK_BIT_NEW,		/* newly allocated, contents undefined */
	BLOCK_BIT_DIRTY,	/* dirty, writer will write */
	BLOCK_BIT_IO_BUSY,	/* bios are in flight */
	BLOCK_BIT_ERROR,	/* saw IO error */
	BLOCK_BIT_PAGE_ALLOC,	/* page (possibly high order) allocation */
	BLOCK_BIT_VIRT,		/* mapped virt allocation */
	BLOCK_BIT_CRC_VALID,	/* crc has been verified */
};

/*
 * We want to tie atomic changes in refcounts to whether or not the
 * block is still visible in the hash table, so we store the hash
 * table's reference up at a known high bit. We could naturally set the
 * inserted bit through excessive refcount increments. We don't do
 * anything about that but at least warn if we get close.
 *
 * We're avoiding the high byte for no real good reason, just out of a
 * historical fear of implementations that don't provide the full
 * precision.
 */
#define BLOCK_REF_INSERTED	(1U << 23)
#define BLOCK_REF_FULL		(BLOCK_REF_INSERTED >> 1)
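
/*
 * Editorial sketch of the refcount encoding described above, with
 * hypothetical helper names: a block held by two users that is also
 * inserted in the hash table has a refcount of BLOCK_REF_INSERTED + 2
 * (0x800002).  Clearing the high bit removes the hash table's reference
 * and leaves the plain holder count behind.
 */
static inline bool example_ref_is_inserted(int cnt)
{
	return (cnt & BLOCK_REF_INSERTED) != 0;
}

static inline int example_ref_holders(int cnt)
{
	return cnt & ~BLOCK_REF_INSERTED;
}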

struct block_private {
	struct scoutfs_block bl;
	struct super_block *sb;
	atomic_t refcount;
	u64 accessed;
	struct rhash_head ht_head;
	struct list_head dirty_entry;
	struct llist_node free_node;
	unsigned long bits;
	atomic_t io_count;
	union {
		struct page *page;
		void *virt;
	};
};

#define TRACE_BLOCK(which, bp) \
do { \
	__typeof__(bp) _bp = (bp); \
	trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, atomic_read(&_bp->refcount), \
				    atomic_read(&_bp->io_count), _bp->bits, _bp->accessed); \
} while (0)

#define BLOCK_PRIVATE(_bl) \
	container_of((_bl), struct block_private, bl)

static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
{
	int off = offsetof(struct scoutfs_block_header, crc) +
		  FIELD_SIZEOF(struct scoutfs_block_header, crc);
	u32 calc = crc32c(~0, (char *)hdr + off, size - off);

	return cpu_to_le32(calc);
}
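
/*
 * Editorial note: the checksum above deliberately skips the stored crc
 * field itself and covers the rest of the block after it, so a block can
 * be checksummed in place and verified later with the same helper.  A
 * hypothetical verification wrapper looks like the sketch below; this is
 * how scoutfs_block_read_ref() later in the file uses it.
 */
static inline bool example_block_crc_ok(struct scoutfs_block_header *hdr, u32 size)
{
	return hdr->crc == block_calc_crc(hdr, size);
}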

static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
{
	struct block_private *bp;
	unsigned int noio_flags;

	/*
	 * If we had multiple blocks per page we'd need to be a little
	 * more careful with a partial page allocator when allocating
	 * blocks.
	 */
	BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_LG_SIZE);

	bp = kzalloc(sizeof(struct block_private), GFP_NOFS);
	if (!bp)
		goto out;

	bp->page = alloc_pages(GFP_NOFS | __GFP_NOWARN,
			       SCOUTFS_BLOCK_LG_PAGE_ORDER);
	if (bp->page) {
		scoutfs_inc_counter(sb, block_cache_alloc_page_order);
		set_bit(BLOCK_BIT_PAGE_ALLOC, &bp->bits);
		bp->bl.data = page_address(bp->page);
	} else {
		/*
		 * __vmalloc doesn't pass the gfp flags down to pte
		 * allocs, they're done with user alloc flags.
		 * Unfortunately, some lockdep doesn't know that
		 * PF_NOMEMALLOC prevents __GFP_FS reclaim and generates
		 * spurious reclaim-on dependencies and warnings.
		 */
		lockdep_off();
		noio_flags = memalloc_noio_save();
		bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
		memalloc_noio_restore(noio_flags);
		lockdep_on();

		if (!bp->virt) {
			kfree(bp);
			bp = NULL;
			goto out;
		}

		scoutfs_inc_counter(sb, block_cache_alloc_virt);
		set_bit(BLOCK_BIT_VIRT, &bp->bits);
		bp->bl.data = bp->virt;
	}

	bp->bl.blkno = blkno;
	bp->sb = sb;
	atomic_set(&bp->refcount, 1);
	INIT_LIST_HEAD(&bp->dirty_entry);
	set_bit(BLOCK_BIT_NEW, &bp->bits);
	atomic_set(&bp->io_count, 0);

	TRACE_BLOCK(allocate, bp);

out:
	if (!bp)
		scoutfs_inc_counter(sb, block_cache_alloc_failure);
	return bp;
}

static void block_free(struct super_block *sb, struct block_private *bp)
{
	scoutfs_inc_counter(sb, block_cache_free);

	TRACE_BLOCK(free, bp);

	if (test_bit(BLOCK_BIT_PAGE_ALLOC, &bp->bits))
		__free_pages(bp->page, SCOUTFS_BLOCK_LG_PAGE_ORDER);
	else if (test_bit(BLOCK_BIT_VIRT, &bp->bits))
		vfree(bp->virt);
	else
		BUG();

	/* ok to tear down dirty blocks when forcing unmount */
	WARN_ON_ONCE(!scoutfs_forcing_unmount(sb) && !list_empty(&bp->dirty_entry));

	WARN_ON_ONCE(atomic_read(&bp->refcount));
	WARN_ON_ONCE(atomic_read(&bp->io_count));
	kfree(bp);
}

/*
 * Free all the blocks that were put in the free_llist. We have to wait
 * for rcu grace periods to expire to ensure that no more rcu hash list
 * lookups can see the blocks.
 */
static void block_free_work(struct work_struct *work)
{
	struct block_info *binf = container_of(work, struct block_info, free_work);
	struct super_block *sb = binf->sb;
	struct block_private *bp;
	struct block_private *tmp;
	struct llist_node *deleted;

	scoutfs_inc_counter(sb, block_cache_free_work);

	deleted = llist_del_all(&binf->free_llist);
	synchronize_rcu();

	llist_for_each_entry_safe(bp, tmp, deleted, free_node) {
		block_free(sb, bp);
	}
}

/*
 * Get a reference to a block while holding an existing reference.
 */
static void block_get(struct block_private *bp)
{
	WARN_ON_ONCE((atomic_read(&bp->refcount) & ~BLOCK_REF_INSERTED) <= 0);

	atomic_inc(&bp->refcount);
}

/*
 * Get a reference to a block as long as it's been inserted in the hash
 * table and hasn't been removed.
 */
static struct block_private *block_get_if_inserted(struct block_private *bp)
{
	int cnt;

	do {
		cnt = atomic_read(&bp->refcount);
		WARN_ON_ONCE(cnt & BLOCK_REF_FULL);
		if (!(cnt & BLOCK_REF_INSERTED))
			return NULL;

	} while (atomic_cmpxchg(&bp->refcount, cnt, cnt + 1) != cnt);

	return bp;
}

/*
 * Drop the caller's reference. If this was the final reference we
 * queue the block to be freed once the rcu period ends. Readers can be
 * racing to try to get references to these blocks, but they won't get a
 * reference because the block isn't present in the hash table any more.
 */
static void block_put(struct super_block *sb, struct block_private *bp)
{
	DECLARE_BLOCK_INFO(sb, binf);
	int cnt;

	if (!IS_ERR_OR_NULL(bp)) {
		cnt = atomic_dec_return(&bp->refcount);
		if (cnt == 0) {
			llist_add(&bp->free_node, &binf->free_llist);
			schedule_work(&binf->free_work);
		} else {
			WARN_ON_ONCE(cnt < 0);
		}
	}
}

static const struct rhashtable_params block_ht_params = {
	.key_len = member_sizeof(struct block_private, bl.blkno),
	.key_offset = offsetof(struct block_private, bl.blkno),
	.head_offset = offsetof(struct block_private, ht_head),
};

/*
 * Insert a new block into the hash table. Once it is inserted in the
 * hash table readers can start getting references. The caller may have
 * multiple refs but the block can't already be inserted.
 */
static int block_insert(struct super_block *sb, struct block_private *bp)
{
	DECLARE_BLOCK_INFO(sb, binf);
	int ret;

	WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);

retry:
	atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
	ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
	if (ret < 0) {
		atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
		if (ret == -EBUSY) {
			/* wait for pending rebalance to finish */
			synchronize_rcu();
			goto retry;
		}
	} else {
		atomic_inc(&binf->total_inserted);
		TRACE_BLOCK(insert, bp);
	}

	return ret;
}

static u64 accessed_recently(struct block_info *binf)
{
	return atomic64_read(&binf->access_counter) - (atomic_read(&binf->total_inserted) >> 1);
}

/*
 * Make sure that a block that is being accessed is less likely to be
 * reclaimed if it is seen by the shrinker. If the block hasn't been
 * accessed recently we update its accessed value.
 */
static void block_accessed(struct super_block *sb, struct block_private *bp)
{
	DECLARE_BLOCK_INFO(sb, binf);

	if (bp->accessed == 0 || bp->accessed < accessed_recently(binf)) {
		scoutfs_inc_counter(sb, block_cache_access_update);
		bp->accessed = atomic64_inc_return(&binf->access_counter);
	}
}

/*
 * The caller wants to remove the block from the hash table and has an
 * idea what the refcount should be. If the refcount does still
 * indicate that the block is hashed, and we're able to clear that bit,
 * then we can remove it from the hash table.
 *
 * The caller makes sure that it's safe to be referencing this block,
 * either with their own held reference (most everything) or by being in
 * an rcu grace period (shrink).
 */
static bool block_remove_cnt(struct super_block *sb, struct block_private *bp, int cnt)
{
	DECLARE_BLOCK_INFO(sb, binf);
	int ret;

	if ((cnt & BLOCK_REF_INSERTED) &&
	    (atomic_cmpxchg(&bp->refcount, cnt, cnt & ~BLOCK_REF_INSERTED) == cnt)) {

		TRACE_BLOCK(remove, bp);
		ret = rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
		WARN_ON_ONCE(ret); /* must have been inserted */
		atomic_dec(&binf->total_inserted);
		return true;
	}

	return false;
}

/*
 * Try to remove the block from the hash table as long as the refcount
 * indicates that it is still in the hash table. This can be racing
 * with normal refcount changes so it might have to retry.
 */
static void block_remove(struct super_block *sb, struct block_private *bp)
{
	int cnt;

	do {
		cnt = atomic_read(&bp->refcount);
	} while ((cnt & BLOCK_REF_INSERTED) && !block_remove_cnt(sb, bp, cnt));
}

/*
 * Take one shot at removing the block from the hash table if it's still
 * in the hash table and the caller has the only other reference.
 */
static bool block_remove_solo(struct super_block *sb, struct block_private *bp)
{
	return block_remove_cnt(sb, bp, BLOCK_REF_INSERTED | 1);
}

static bool io_busy(struct block_private *bp)
{
	smp_rmb(); /* test after adding to wait queue */
	return test_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
}

/*
 * Called during shutdown with no other users.
 */
static void block_remove_all(struct super_block *sb)
{
	DECLARE_BLOCK_INFO(sb, binf);
	struct rhashtable_iter iter;
	struct block_private *bp;

	rhashtable_walk_enter(&binf->ht, &iter);
	rhashtable_walk_start(&iter);

	for (;;) {
		bp = rhashtable_walk_next(&iter);
		if (bp == NULL)
			break;
		if (bp == ERR_PTR(-EAGAIN))
			continue;

		if (block_get_if_inserted(bp)) {
			block_remove(sb, bp);
			WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
			block_put(sb, bp);
		}
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);

	WARN_ON_ONCE(atomic_read(&binf->total_inserted) != 0);
}

/*
 * XXX The io_count and sb fields in the block_private are only used
 * during IO. We don't need to have them sitting around for the entire
 * lifetime of each cached block.
 *
 * This is happening in interrupt context so we do as little work as
 * possible. Final freeing, verifying checksums, and unlinking errored
 * blocks are all done by future users of the blocks.
 */
static void block_end_io(struct super_block *sb, int rw,
			 struct block_private *bp, int err)
{
	DECLARE_BLOCK_INFO(sb, binf);
	bool is_read = !(rw & WRITE);

	if (err) {
		scoutfs_inc_counter(sb, block_cache_end_io_error);
		set_bit(BLOCK_BIT_ERROR, &bp->bits);
	}

	if (!atomic_dec_and_test(&bp->io_count))
		return;

	if (is_read && !test_bit(BLOCK_BIT_ERROR, &bp->bits))
		set_bit(BLOCK_BIT_UPTODATE, &bp->bits);

	clear_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
	block_put(sb, bp);

	/* make sure set and cleared bits are visible to woken */
	smp_mb();

	if (waitqueue_active(&binf->waitq))
		wake_up(&binf->waitq);
}

static void block_bio_end_io(struct bio *bio, int err)
{
	struct block_private *bp = bio->bi_private;
	struct super_block *sb = bp->sb;

	TRACE_BLOCK(end_io, bp);
	block_end_io(sb, bio->bi_rw, bp, err);
	bio_put(bio);
}

/*
 * Kick off IO for a single block.
 */
static int block_submit_bio(struct super_block *sb, struct block_private *bp,
			    int rw)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct bio *bio = NULL;
	struct blk_plug plug;
	struct page *page;
	unsigned long off;
	sector_t sector;
	int ret = 0;

	if (scoutfs_forcing_unmount(sb))
		return -EIO;

	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);

	WARN_ON_ONCE(bp->bl.blkno == U64_MAX);
	WARN_ON_ONCE(sector == U64_MAX || sector == 0);

	/* don't let racing end_io during submission think block is complete */
	atomic_inc(&bp->io_count);
	set_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
	block_get(bp);

	blk_start_plug(&plug);

	for (off = 0; off < SCOUTFS_BLOCK_LG_SIZE; off += PAGE_SIZE) {
		if (!bio) {
			bio = bio_alloc(GFP_NOFS, SCOUTFS_BLOCK_LG_PAGES_PER);
			if (!bio) {
				ret = -ENOMEM;
				break;
			}

			bio->bi_sector = sector + (off >> 9);
			bio->bi_bdev = sbi->meta_bdev;
			bio->bi_end_io = block_bio_end_io;
			bio->bi_private = bp;

			atomic_inc(&bp->io_count);

			TRACE_BLOCK(submit, bp);
		}

		if (test_bit(BLOCK_BIT_PAGE_ALLOC, &bp->bits))
			page = virt_to_page((char *)bp->bl.data + off);
		else if (test_bit(BLOCK_BIT_VIRT, &bp->bits))
			page = vmalloc_to_page((char *)bp->bl.data + off);
		else
			BUG();

		if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
			submit_bio(rw, bio);
			bio = NULL;
		}
	}

	if (bio)
		submit_bio(rw, bio);

	blk_finish_plug(&plug);

	/* let racing end_io know we're done */
	block_end_io(sb, rw, bp, ret);

	return ret;
}
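
/*
 * Editorial sketch of the sector arithmetic used in block_submit_bio()
 * above.  Assuming, purely for illustration, 64KiB large blocks
 * (SCOUTFS_BLOCK_LG_SHIFT == 16) and 4KiB pages: blkno 10 starts at
 * 512-byte sector 10 << (16 - 9) = 1280, and the page at byte offset 8192
 * within the block is submitted at sector 1280 + (8192 >> 9) = 1296.
 */
static inline sector_t example_block_page_sector(u64 blkno, unsigned long off)
{
	return (blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9)) + (off >> 9);
}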

static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
{
	DECLARE_BLOCK_INFO(sb, binf);
	struct block_private *bp;

	rcu_read_lock();
	bp = rhashtable_lookup(&binf->ht, &blkno, block_ht_params);
	if (bp)
		bp = block_get_if_inserted(bp);
	rcu_read_unlock();

	return bp;
}

/*
 * Return a reference to a cached block found in the hash table. If one
 * isn't found then we try and allocate and insert a new one. Its
 * contents are undefined if it's newly allocated.
 *
 * Our hash table lookups during rcu can be racing with shrinking and
 * removal from the hash table. We only atomically get a reference if
 * the refcount indicates that the block is still present in the hash
 * table.
 */
static struct block_private *block_lookup_create(struct super_block *sb,
						 u64 blkno)
{
	struct block_private *bp;
	int ret;

restart:
	bp = block_lookup(sb, blkno);

	/* drop failed reads that interrupted waiters abandoned */
	if (bp && (test_bit(BLOCK_BIT_ERROR, &bp->bits) &&
		   !test_bit(BLOCK_BIT_DIRTY, &bp->bits))) {
		block_remove(sb, bp);
		block_put(sb, bp);
		bp = NULL;
	}

	if (!bp) {
		bp = block_alloc(sb, blkno);
		if (bp == NULL) {
			ret = -ENOMEM;
			goto out;
		}

		ret = block_insert(sb, bp);
		if (ret < 0) {
			if (ret == -EEXIST) {
				block_put(sb, bp);
				goto restart;
			}
			goto out;
		}
	}

	block_accessed(sb, bp);
	ret = 0;

out:
	if (ret < 0) {
		block_put(sb, bp);
		return ERR_PTR(ret);
	}

	return bp;
}

static bool uptodate_or_error(struct block_private *bp)
{
	smp_rmb(); /* test after adding to wait queue */
	return test_bit(BLOCK_BIT_UPTODATE, &bp->bits) ||
	       test_bit(BLOCK_BIT_ERROR, &bp->bits);
}

static bool block_is_dirty(struct block_private *bp)
{
	return test_bit(BLOCK_BIT_DIRTY, &bp->bits) != 0;
}

static struct block_private *block_read(struct super_block *sb, u64 blkno)
{
	DECLARE_BLOCK_INFO(sb, binf);
	struct block_private *bp = NULL;
	int ret;

	bp = block_lookup_create(sb, blkno);
	if (IS_ERR(bp)) {
		ret = PTR_ERR(bp);
		goto out;
	}

	if (!test_bit(BLOCK_BIT_UPTODATE, &bp->bits) &&
	    test_and_clear_bit(BLOCK_BIT_NEW, &bp->bits)) {
		ret = block_submit_bio(sb, bp, READ);
		if (ret < 0)
			goto out;
	}

	wait_event(binf->waitq, uptodate_or_error(bp));
	if (test_bit(BLOCK_BIT_ERROR, &bp->bits))
		ret = -EIO;
	else
		ret = 0;

out:
	if (ret < 0) {
		block_put(sb, bp);
		return ERR_PTR(ret);
	}

	return bp;
}

/*
 * Read a referenced metadata block.
 *
 * The caller may be following a stale reference to a block location
 * that has since been rewritten. We check that destination block
 * header fields match the reference. If they don't, we return -ESTALE
 * and the caller can choose to retry with newer references or return an
 * error. -ESTALE can be a sign of block corruption when the refs are
 * current.
 *
 * Once the caller has the cached block it won't be modified. A writer
 * trying to dirty a block at that location will remove our existing
 * block and insert a new block with modified headers.
 */
int scoutfs_block_read_ref(struct super_block *sb, struct scoutfs_block_ref *ref, u32 magic,
			   struct scoutfs_block **bl_ret)
{
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_block_header *hdr;
	struct block_private *bp = NULL;
	bool retried = false;
	int ret;

retry:
	bp = block_read(sb, le64_to_cpu(ref->blkno));
	if (IS_ERR(bp)) {
		ret = PTR_ERR(bp);
		goto out;
	}
	hdr = bp->bl.data;

	/* corrupted writes might be a sign of a stale reference */
	if (!test_bit(BLOCK_BIT_CRC_VALID, &bp->bits)) {
		if (hdr->crc != block_calc_crc(hdr, SCOUTFS_BLOCK_LG_SIZE)) {
			ret = -ESTALE;
			goto out;
		}

		set_bit(BLOCK_BIT_CRC_VALID, &bp->bits);
	}

	if (hdr->magic != cpu_to_le32(magic) || hdr->fsid != super->hdr.fsid ||
	    hdr->seq != ref->seq || hdr->blkno != ref->blkno) {
		ret = -ESTALE;
		goto out;
	}

	ret = 0;
out:
	if ((ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE)) &&
	    !retried && !block_is_dirty(bp)) {
		retried = true;
		scoutfs_inc_counter(sb, block_cache_remove_stale);
		block_remove(sb, bp);
		block_put(sb, bp);
		bp = NULL;
		goto retry;
	}

	if (ret < 0) {
		block_put(sb, bp);
		bp = NULL;
	}

	*bl_ret = bp ? &bp->bl : NULL;
	return ret;
}
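
/*
 * Editorial sketch of the -ESTALE contract described above, with a
 * hypothetical caller and refresh callback: retry the read after
 * reloading the ref, and only treat -ESTALE as corruption once the ref
 * is known to be current.
 */
static inline int example_read_current_ref(struct super_block *sb, struct scoutfs_block_ref *ref,
					   u32 magic, struct scoutfs_block **bl_ret,
					   int (*refresh_ref)(struct super_block *sb,
							      struct scoutfs_block_ref *ref))
{
	int ret;

	ret = scoutfs_block_read_ref(sb, ref, magic, bl_ret);
	if (ret == -ESTALE && refresh_ref(sb, ref) == 0)
		ret = scoutfs_block_read_ref(sb, ref, magic, bl_ret);

	return ret;
}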

void scoutfs_block_put(struct super_block *sb, struct scoutfs_block *bl)
{
	if (!IS_ERR_OR_NULL(bl))
		block_put(sb, BLOCK_PRIVATE(bl));
}

void scoutfs_block_writer_init(struct super_block *sb,
			       struct scoutfs_block_writer *wri)
{
	spin_lock_init(&wri->lock);
	INIT_LIST_HEAD(&wri->dirty_list);
	wri->nr_dirty_blocks = 0;
}

/*
 * Mark a given block dirty. The caller serializes all dirtying calls
 * with writer write calls. As it happens we dirty in allocation order
 * and allocate with an advancing cursor so we always dirty in block
 * offset order and can walk our list to submit nice ordered IO.
 */
static void block_mark_dirty(struct super_block *sb, struct scoutfs_block_writer *wri,
			     struct scoutfs_block *bl)
{
	struct block_private *bp = BLOCK_PRIVATE(bl);

	if (!test_and_set_bit(BLOCK_BIT_DIRTY, &bp->bits)) {
		BUG_ON(!list_empty(&bp->dirty_entry));
		block_get(bp);
		spin_lock(&wri->lock);
		list_add_tail(&bp->dirty_entry, &wri->dirty_list);
		wri->nr_dirty_blocks++;
		spin_unlock(&wri->lock);
		TRACE_BLOCK(mark_dirty, bp);
	}
}

/*
 * Give the caller a dirty block that is pointed to by their ref.
 *
 * The ref may already refer to a cached dirty block. In that case the
 * dirty block is returned.
 *
 * If the ref doesn't refer to a dirty block, then a new block is always
 * allocated and returned. If the ref refers to an existing block then
 * its contents are copied into the new block.
 *
 * If a new blkno is allocated then the ref is updated and any existing
 * blkno is freed.
 *
 * A newly allocated block that we insert into the cache and return
 * might already have an old stale copy inserted in the cache and it
 * might be actively in use by readers. Future readers may also try to
 * read their old block from our newly allocated block. We always
 * remove any existing blocks and insert our new block only after
 * modifying its headers and marking it dirty. Readers will never have
 * their blocks modified and they can always identify new mismatched
 * cached blocks.
 *
 * The dirty_blkno and ref_blkno arguments are used by the metadata
 * allocator to avoid recursing into itself. dirty_blkno provides the
 * blkno of the new dirty block to avoid calling _alloc_meta and
 * ref_blkno is set to the old blkno instead of freeing it with
 * _free_meta.
 */
int scoutfs_block_dirty_ref(struct super_block *sb, struct scoutfs_alloc *alloc,
			    struct scoutfs_block_writer *wri, struct scoutfs_block_ref *ref,
			    u32 magic, struct scoutfs_block **bl_ret,
			    u64 dirty_blkno, u64 *ref_blkno)
{
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_block *cow_bl = NULL;
	struct scoutfs_block *bl = NULL;
	struct block_private *exist_bp = NULL;
	struct block_private *cow_bp = NULL;
	struct block_private *bp = NULL;
	struct scoutfs_block_header *hdr;
	bool undo_alloc = false;
	u64 blkno;
	int ret;
	int err;

	/* read existing referenced block, if any */
	blkno = le64_to_cpu(ref->blkno);
	if (blkno) {
		ret = scoutfs_block_read_ref(sb, ref, magic, bl_ret);
		if (ret < 0)
			goto out;
		bl = *bl_ret;
		bp = BLOCK_PRIVATE(bl);

		if (block_is_dirty(bp)) {
			ret = 0;
			goto out;
		}
	}

	/* allocate new blkno if caller didn't give us one */
	if (dirty_blkno == 0) {
		ret = scoutfs_alloc_meta(sb, alloc, wri, &dirty_blkno);
		if (ret < 0)
			goto out;
		undo_alloc = true;
	}

	/* allocate new cached block */
	cow_bp = block_alloc(sb, dirty_blkno);
	if (cow_bp == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	cow_bl = &cow_bp->bl;

	set_bit(BLOCK_BIT_UPTODATE, &cow_bp->bits);
	set_bit(BLOCK_BIT_CRC_VALID, &cow_bp->bits);

	/* free original referenced blkno, or give it to the caller to deal with */
	if (ref_blkno) {
		*ref_blkno = blkno;
	} else if (blkno) {
		ret = scoutfs_free_meta(sb, alloc, wri, blkno);
		if (ret < 0)
			goto out;
	}

	/* copy original block contents, or initialize */
	if (bl)
		memcpy(cow_bl->data, bl->data, SCOUTFS_BLOCK_LG_SIZE);
	else
		memset(cow_bl->data, 0, SCOUTFS_BLOCK_LG_SIZE);
	scoutfs_block_put(sb, bl);
	bl = cow_bl;
	cow_bl = NULL;
	bp = cow_bp;
	cow_bp = NULL;

	hdr = bl->data;
	hdr->magic = cpu_to_le32(magic);
	hdr->fsid = super->hdr.fsid;
	hdr->blkno = cpu_to_le64(bl->blkno);
	prandom_bytes(&hdr->seq, sizeof(hdr->seq));

	trace_scoutfs_block_dirty_ref(sb, le64_to_cpu(ref->blkno), le64_to_cpu(ref->seq),
				      le64_to_cpu(hdr->blkno), le64_to_cpu(hdr->seq));

	/* mark the block dirty before it's visible */
	block_mark_dirty(sb, wri, bl);

	/* insert the new block, maybe removing any existing blocks */
	while ((ret = block_insert(sb, bp)) == -EEXIST) {
		exist_bp = block_lookup(sb, dirty_blkno);
		if (exist_bp) {
			block_remove(sb, exist_bp);
			block_put(sb, exist_bp);
		}
	}
	if (ret < 0)
		goto out;

	/* set the ref now that the block is visible */
	ref->blkno = hdr->blkno;
	ref->seq = hdr->seq;

	ret = 0;

out:
	scoutfs_block_put(sb, cow_bl);
	if (ret < 0 && undo_alloc) {
		err = scoutfs_free_meta(sb, alloc, wri, dirty_blkno);
		BUG_ON(err); /* inconsistent */
	}

	if (ret < 0) {
		scoutfs_block_put(sb, bl);
		bl = NULL;
	}
	*bl_ret = bl;

	return ret;
}
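
/*
 * Editorial sketch of the typical cow pattern enabled by
 * scoutfs_block_dirty_ref() above, with a hypothetical caller: passing 0
 * and NULL for dirty_blkno and ref_blkno lets the normal metadata
 * allocator paths run, and the returned block is a private dirty copy
 * already referenced by *ref.
 */
static inline int example_dirty_and_release(struct super_block *sb, struct scoutfs_alloc *alloc,
					    struct scoutfs_block_writer *wri,
					    struct scoutfs_block_ref *ref, u32 magic)
{
	struct scoutfs_block *bl;
	int ret;

	ret = scoutfs_block_dirty_ref(sb, alloc, wri, ref, magic, &bl, 0, NULL);
	if (ret == 0) {
		/* modify bl->data here; it stays pinned until the writer writes it */
		scoutfs_block_put(sb, bl);
	}

	return ret;
}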

/*
 * Submit writes for all the dirty blocks in the writer's dirty list and
 * wait for them to complete. The caller must serialize this with
 * attempts to dirty blocks in the writer. If we return an error then
 * all the blocks will still be considered dirty. This can be called
 * again to attempt to write all the blocks again.
 */
int scoutfs_block_writer_write(struct super_block *sb,
			       struct scoutfs_block_writer *wri)
{
	DECLARE_BLOCK_INFO(sb, binf);
	struct scoutfs_block_header *hdr;
	struct block_private *bp;
	struct blk_plug plug;
	int ret = 0;

	if (wri->nr_dirty_blocks == 0)
		return 0;

	/* checksum everything to reduce time between io submission merging */
	list_for_each_entry(bp, &wri->dirty_list, dirty_entry) {
		hdr = bp->bl.data;
		hdr->crc = block_calc_crc(hdr, SCOUTFS_BLOCK_LG_SIZE);
	}

	blk_start_plug(&plug);

	list_for_each_entry(bp, &wri->dirty_list, dirty_entry) {
		/* retry previous write errors */
		clear_bit(BLOCK_BIT_ERROR, &bp->bits);

		ret = block_submit_bio(sb, bp, WRITE);
		if (ret < 0)
			break;
	}

	blk_finish_plug(&plug);

	list_for_each_entry(bp, &wri->dirty_list, dirty_entry) {
		/* XXX should this be interruptible? */
		wait_event(binf->waitq, !io_busy(bp));
		if (ret == 0 && test_bit(BLOCK_BIT_ERROR, &bp->bits)) {
			clear_bit(BLOCK_BIT_ERROR, &bp->bits);
			ret = -EIO;
		}
	}

	if (ret == 0)
		scoutfs_block_writer_forget_all(sb, wri);

	return ret;
}
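
/*
 * Editorial sketch of the retry semantics described above, with a
 * hypothetical caller: a failed write leaves every block dirty, so the
 * caller can simply call the writer again to retry the whole set.
 */
static inline int example_write_with_one_retry(struct super_block *sb,
					       struct scoutfs_block_writer *wri)
{
	int ret;

	ret = scoutfs_block_writer_write(sb, wri);
	if (ret < 0)
		ret = scoutfs_block_writer_write(sb, wri);

	return ret;
}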

static void block_forget(struct super_block *sb,
			 struct scoutfs_block_writer *wri,
			 struct block_private *bp)
{
	assert_spin_locked(&wri->lock);

	clear_bit(BLOCK_BIT_DIRTY, &bp->bits);
	list_del_init(&bp->dirty_entry);
	wri->nr_dirty_blocks--;
	TRACE_BLOCK(forget, bp);
	block_put(sb, bp);
}

/*
 * Clear the dirty status of all the blocks in the writer. The blocks
 * remain clean in cache but can be freed by reclaim and then re-read
 * from disk, losing whatever modifications made them dirty.
 */
void scoutfs_block_writer_forget_all(struct super_block *sb,
				     struct scoutfs_block_writer *wri)
{
	struct block_private *tmp;
	struct block_private *bp;

	spin_lock(&wri->lock);

	list_for_each_entry_safe(bp, tmp, &wri->dirty_list, dirty_entry)
		block_forget(sb, wri, bp);

	spin_unlock(&wri->lock);
}

/*
 * Forget that the given block was dirty. It won't be written in the
 * future. Its contents remain in the cache. This is typically used
 * as a block is freed. If it is allocated and re-used then its contents
 * will be re-initialized.
 *
 * The caller should ensure that we don't try and mark and forget the
 * same block, but this is racing with marking and forgetting other
 * blocks.
 */
void scoutfs_block_writer_forget(struct super_block *sb,
				 struct scoutfs_block_writer *wri,
				 struct scoutfs_block *bl)
{
	struct block_private *bp = BLOCK_PRIVATE(bl);

	if (test_bit(BLOCK_BIT_DIRTY, &bp->bits)) {
		scoutfs_inc_counter(sb, block_cache_forget);
		spin_lock(&wri->lock);
		if (test_bit(BLOCK_BIT_DIRTY, &bp->bits))
			block_forget(sb, wri, bp);
		spin_unlock(&wri->lock);
	}
}

/*
 * The caller has ensured that no more dirtying will take place. This
 * lets the caller avoid doing a bunch of preparatory work before calling
 * into the writer when there are no dirty blocks to write.
 */
bool scoutfs_block_writer_has_dirty(struct super_block *sb,
				    struct scoutfs_block_writer *wri)
{
	return wri->nr_dirty_blocks != 0;
}

/*
 * This is a best-effort guess. It's only used for heuristics so it's OK
 * if it goes a little bonkers sometimes.
 */
u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
				     struct scoutfs_block_writer *wri)
{
	return wri->nr_dirty_blocks * SCOUTFS_BLOCK_LG_SIZE;
}

/*
 * Remove a number of cached blocks that haven't been used recently.
 *
 * We don't maintain a strictly ordered LRU to avoid the contention of
 * accesses always moving blocks around in some precise global
 * structure.
 *
 * Instead we use counters to divide the blocks into two roughly equal
 * groups by how recently they were accessed. We randomly walk all
 * inserted blocks looking for any blocks in the older half to remove
 * and free. The random walk and there being two groups means that we
 * typically only walk a small multiple of the number we're looking for
 * before we find them all.
 *
 * Our rcu walk of blocks can see blocks in all stages of their life
 * cycle, from dirty blocks to those with 0 references that are queued
 * for freeing. We only want to free idle inserted blocks so we
 * atomically remove blocks when the only references are ours and the
 * hash table.
 */
static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	struct block_info *binf = container_of(shrink, struct block_info,
					       shrinker);
	struct super_block *sb = binf->sb;
	struct rhashtable_iter iter;
	struct block_private *bp;
	unsigned long nr;
	u64 recently;

	nr = sc->nr_to_scan;
	if (nr == 0)
		goto out;

	scoutfs_inc_counter(sb, block_cache_shrink);

	nr = DIV_ROUND_UP(nr, SCOUTFS_BLOCK_LG_PAGES_PER);

restart:
	recently = accessed_recently(binf);
	rhashtable_walk_enter(&binf->ht, &iter);
	rhashtable_walk_start(&iter);

	/*
	 * This isn't great but I don't see a better way. We want to
	 * walk the hash from a random point so that we're not
	 * constantly walking over the same region that we've already
	 * freed old blocks within. The interface doesn't let us do
	 * this explicitly, but this seems to work? The difference this
	 * makes is enormous, around a few orders of magnitude fewer
	 * _nexts per shrink.
	 */
	if (iter.walker.tbl)
		iter.slot = prandom_u32_max(iter.walker.tbl->size);

	while (nr > 0) {
		bp = rhashtable_walk_next(&iter);
		if (bp == NULL)
			break;
		if (bp == ERR_PTR(-EAGAIN)) {
			/* hard exit to wait for rcu rebalance to finish */
			rhashtable_walk_stop(&iter);
			rhashtable_walk_exit(&iter);
			scoutfs_inc_counter(sb, block_cache_shrink_restart);
			synchronize_rcu();
			goto restart;
		}

		scoutfs_inc_counter(sb, block_cache_shrink_next);

		if (bp->accessed >= recently) {
			scoutfs_inc_counter(sb, block_cache_shrink_recent);
			continue;
		}

		if (block_get_if_inserted(bp)) {
			if (block_remove_solo(sb, bp)) {
				scoutfs_inc_counter(sb, block_cache_shrink_remove);
				TRACE_BLOCK(shrink, bp);
				nr--;
			}
			block_put(sb, bp);
		}
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
out:
	return min_t(u64, (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER,
		     INT_MAX);
}
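
/*
 * Editorial note on the scan units above: nr_to_scan arrives in page-sized
 * units and is rounded up to whole large blocks before scanning, and the
 * return value reports the cache size in the same page units.  Assuming,
 * for illustration only, 16 pages per large block (64KiB blocks with 4KiB
 * pages), a request to scan 40 objects becomes DIV_ROUND_UP(40, 16) = 3
 * blocks.
 */
static inline unsigned long example_blocks_to_scan(unsigned long nr_pages)
{
	return DIV_ROUND_UP(nr_pages, SCOUTFS_BLOCK_LG_PAGES_PER);
}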

struct sm_block_completion {
	struct completion comp;
	int err;
};

static void sm_block_bio_end_io(struct bio *bio, int err)
{
	struct sm_block_completion *sbc = bio->bi_private;

	sbc->err = err;
	complete(&sbc->comp);
	bio_put(bio);
}

/*
 * Perform a private synchronous read or write of a small fixed size 4K
 * block. We allocate a private page and bio and copy to or from the
 * caller's buffer.
 *
 * The interface is a little weird because our blocks always start with
 * a block header that contains a crc of the entire block. We're the
 * only layer that sees the full block buffer so we pass the calculated
 * crc to the caller for them to check in their context.
 */
static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw, u64 blkno,
		       struct scoutfs_block_header *hdr, size_t len,
		       __le32 *blk_crc)
{
	struct scoutfs_block_header *pg_hdr;
	struct sm_block_completion sbc;
	struct page *page;
	struct bio *bio;
	int ret;

	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);

	if (scoutfs_forcing_unmount(sb))
		return -EIO;

	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
	    WARN_ON_ONCE(!(rw & WRITE) && !blk_crc))
		return -EINVAL;

	page = alloc_page(GFP_NOFS);
	if (!page)
		return -ENOMEM;

	pg_hdr = page_address(page);

	if (rw & WRITE) {
		memcpy(pg_hdr, hdr, len);
		if (len < SCOUTFS_BLOCK_SM_SIZE)
			memset((char *)pg_hdr + len, 0,
			       SCOUTFS_BLOCK_SM_SIZE - len);
		pg_hdr->crc = block_calc_crc(pg_hdr, SCOUTFS_BLOCK_SM_SIZE);
	}

	bio = bio_alloc(GFP_NOFS, 1);
	if (!bio) {
		ret = -ENOMEM;
		goto out;
	}

	bio->bi_sector = blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9);
	bio->bi_bdev = bdev;
	bio->bi_end_io = sm_block_bio_end_io;
	bio->bi_private = &sbc;
	bio_add_page(bio, page, SCOUTFS_BLOCK_SM_SIZE, 0);

	init_completion(&sbc.comp);
	sbc.err = 0;

	submit_bio((rw & WRITE) ? WRITE_SYNC : READ_SYNC, bio);

	wait_for_completion(&sbc.comp);
	ret = sbc.err;

	if (ret == 0 && !(rw & WRITE)) {
		memcpy(hdr, pg_hdr, len);
		*blk_crc = block_calc_crc(pg_hdr, SCOUTFS_BLOCK_SM_SIZE);
	}
out:
	__free_page(page);
	return ret;
}

int scoutfs_block_read_sm(struct super_block *sb,
			  struct block_device *bdev, u64 blkno,
			  struct scoutfs_block_header *hdr, size_t len,
			  __le32 *blk_crc)
{
	return sm_block_io(sb, bdev, READ, blkno, hdr, len, blk_crc);
}
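
/*
 * Editorial sketch of the crc handshake described above in sm_block_io(),
 * with a hypothetical caller: the read helper hands back the crc it
 * computed over the whole small block so the caller can compare it
 * against the stored header crc in its own context.
 */
static inline int example_read_sm_and_check(struct super_block *sb, struct block_device *bdev,
					    u64 blkno, struct scoutfs_block_header *hdr,
					    size_t len)
{
	__le32 crc;
	int ret;

	ret = scoutfs_block_read_sm(sb, bdev, blkno, hdr, len, &crc);
	if (ret == 0 && hdr->crc != crc)
		ret = -EIO;

	return ret;
}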

int scoutfs_block_write_sm(struct super_block *sb,
			   struct block_device *bdev, u64 blkno,
			   struct scoutfs_block_header *hdr, size_t len)
{
	return sm_block_io(sb, bdev, WRITE, blkno, hdr, len, NULL);
}

int scoutfs_block_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct block_info *binf;
	int ret;

	binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
	if (!binf) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rhashtable_init(&binf->ht, &block_ht_params);
	if (ret < 0) {
		kfree(binf);
		goto out;
	}

	binf->sb = sb;
	atomic_set(&binf->total_inserted, 0);
	atomic64_set(&binf->access_counter, 0);
	init_waitqueue_head(&binf->waitq);
	binf->shrinker.shrink = block_shrink;
	binf->shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&binf->shrinker);
	INIT_WORK(&binf->free_work, block_free_work);
	init_llist_head(&binf->free_llist);

	sbi->block_info = binf;

	ret = 0;
out:
	if (ret)
		scoutfs_block_destroy(sb);

	return ret;
}

void scoutfs_block_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct block_info *binf = SCOUTFS_SB(sb)->block_info;

	if (binf) {
		unregister_shrinker(&binf->shrinker);
		block_remove_all(sb);
		flush_work(&binf->free_work);
		rhashtable_destroy(&binf->ht);

		kfree(binf);
		sbi->block_info = NULL;
	}
}