/*
 * Copyright (C) 2020 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/random.h>

#include "super.h"
#include "block.h"
#include "btree.h"
#include "trans.h"
#include "alloc.h"
#include "counters.h"
#include "scoutfs_trace.h"
/*
 * The core allocator uses extent items in btrees rooted in the super.
 * Each free extent is stored in two items. The first item is indexed
 * by block location and is used to merge adjacent extents when freeing.
 * The second item is indexed by the order of the length and is used to
 * find large extents to allocate from.
 *
 * Allocation always consumes the front of the largest free extent. This
 * attempts to discourage fragmentation by giving smaller freed extents
 * time for an adjacent free to merge before we attempt to re-use them.
 *
 * The metadata btrees that store extents are updated with cow. This
 * requires allocation during extent item modification on behalf of
 * allocation. Avoiding this recursion motivates the second structure:
 * persistent singly linked lists of individual blknos.
 *
 * The alloc lists are used for metadata allocation during a
 * transaction. Before each transaction, lists of blknos are prepared
 * for use during the transaction. This ensures a small predictable
 * number of cows needed to fully dirty the metadata allocator
 * structures during the transaction. As the transaction proceeds,
 * allocations are made from a list of available meta blknos, and frees
 * are performed by adding blknos to another list of freed blknos.
 * After transactions these lists are merged back into extents.
 *
 * Data allocations are performed directly on a btree of extent items,
 * with a bit of caching to stream small file data allocations from
 * memory instead of performing multiple btree calls per block
 * allocation.
 *
 * Every transaction has exclusive access to its metadata list blocks
 * and data extent trees which are prepared by the server. For client
 * metadata and srch transactions, the server moves extents and blocks
 * into persistent items that are communicated between the client and
 * server. For server metadata transactions, the server has to prepare
 * structures for itself. To avoid modifying the same structure both
 * explicitly (refilling an allocator) and implicitly (using the current
 * allocator for cow allocations), it double buffers list blocks. It
 * uses current blocks to modify the next blocks, and swaps them at each
 * transaction.
 */

/*
 * Return the order of the length of a free extent, which we define as
 * floor(log8(len)): 0..7 = 0, 8..63 = 1, etc.
 */
static u64 free_extent_order(u64 len)
{
	return (fls64(len | 1) - 1) / 3;
}

/*
 * The smallest (non-zero) length that will be mapped to the same order
 * as the given length.
 */
static u64 smallest_order_length(u64 len)
{
	return 1ULL << (free_extent_order(len) * 3);
}
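
/*
 * For illustration (an added note, not from the original code), some
 * sample mappings under the definitions above:
 *
 *	free_extent_order(1)  == 0	smallest_order_length(1)  == 1
 *	free_extent_order(7)  == 0	smallest_order_length(7)  == 1
 *	free_extent_order(8)  == 1	smallest_order_length(8)  == 8
 *	free_extent_order(63) == 1	smallest_order_length(63) == 8
 *	free_extent_order(64) == 2	smallest_order_length(64) == 64
 */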

/*
 * Moving an extent between trees can dirty blocks in several ways. This
 * function calculates the worst case number of blocks across these scenarios.
 * We treat the alloc and free counts independently, so the values below are
 * max(allocated, freed), not the sum.
 *
 * We track extents with two separate btree items: by block number and by size.
 *
 * If we're removing an extent from the btree (allocating), we can dirty
 * two blocks if the keys are in different leaves. If we wind up merging
 * leaves because we fall below the low water mark, we can wind up freeing
 * three leaves.
 *
 * That sequence is as follows, assuming the original keys are removed from
 * blocks A and B:
 *
 *	Allocate new dirty A' and B'
 *	Free old stable A and B
 *	B' has fallen below the low water mark, so copy B' into A'
 *	Free B'
 *
 * An extent insertion (freeing an extent) can dirty up to five distinct items
 * in the btree as it adds and removes the blkno and size sorted items for the
 * old and new lengths of the extent:
 *
 * In the by-blkno portion of the btree, we can dirty (allocate for COW) up
 * to two blocks: either by merging adjacent extents, which can cause us to
 * join leaf blocks; or by an insertion that causes a split.
 *
 * In the by-size portion, we never merge extents, so normally we just dirty
 * a single item with a size insertion. But if we merged adjacent extents in
 * the by-blkno portion of the tree, we might be working with three by-size
 * items: removing the two old ones that were combined in the merge; and
 * adding the new one for the larger, merged size.
 *
 * Finally, dirtying the paths to these leaves can grow the tree and grow/shrink
 * neighbours at each level, so we multiply by the height of the tree after
 * accounting for a possible new level.
 */
static u32 extent_mod_blocks(u32 height)
{
	return ((1 + height) * 3) * 5;
}
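
/*
 * Illustrative arithmetic (an added note, not from the original): for a
 * btree of height 3, extent_mod_blocks() allows for (1 + 3) levels after
 * a possible new root, times 3 dirtied/freed blocks per level, times the
 * 5 items an extent move can touch: (1 + 3) * 3 * 5 = 60 blocks.
 */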

/*
 * Free extents don't have flags and are stored in two indexes sorted by
 * block location and by length order, largest first. The location key
 * field is set to the final block in the extent so that we can find
 * intersections by calling _next() with the start of the range we're
 * searching for.
 *
 * We never store 0 length extents but we do build keys for searching
 * the order index from 0,0 without having to map it to a real extent.
 */
static void init_ext_key(struct scoutfs_key *key, int zone, u64 start, u64 len)
{
	*key = (struct scoutfs_key) {
		.sk_zone = zone,
	};

	if (len == 0) {
		/* we only use 0 len extents for magic 0,0 order lookups */
		WARN_ON_ONCE(zone != SCOUTFS_FREE_EXTENT_ORDER_ZONE || start != 0);
		return;
	}

	if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
		key->skfb_end = cpu_to_le64(start + len - 1);
		key->skfb_len = cpu_to_le64(len);
	} else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) {
		key->skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
		key->skfo_end = cpu_to_le64(start + len - 1);
		key->skfo_len = cpu_to_le64(len);
	} else {
		BUG();
	}
}

static void ext_from_key(struct scoutfs_extent *ext, struct scoutfs_key *key)
{
	if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
		ext->start = le64_to_cpu(key->skfb_end) -
			     le64_to_cpu(key->skfb_len) + 1;
		ext->len = le64_to_cpu(key->skfb_len);
	} else {
		ext->start = le64_to_cpu(key->skfo_end) -
			     le64_to_cpu(key->skfo_len) + 1;
		ext->len = le64_to_cpu(key->skfo_len);
	}
	ext->map = 0;
	ext->flags = 0;

	/* we never store 0 length extents */
	WARN_ON_ONCE(ext->len == 0);
}
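
/*
 * A worked example (added for illustration): a free extent with
 * start == 100 and len == 8 is keyed by its final block, 107. In the
 * blkno zone that gives skfb_end == 107, skfb_len == 8. In the order
 * zone, free_extent_order(8) == 1, so skfo_revord == U64_MAX - 1,
 * which sorts larger orders (bigger extents) first.
 */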

struct alloc_ext_args {
	struct scoutfs_alloc *alloc;
	struct scoutfs_block_writer *wri;
	struct scoutfs_alloc_root *root;
	int zone;
};

static int alloc_ext_next(struct super_block *sb, void *arg,
			  u64 start, u64 len, struct scoutfs_extent *ext)
{
	struct alloc_ext_args *args = arg;
	SCOUTFS_BTREE_ITEM_REF(iref);
	struct scoutfs_key key;
	int ret;

	init_ext_key(&key, args->zone, start, len);

	ret = scoutfs_btree_next(sb, &args->root->root, &key, &iref);
	if (ret == 0) {
		if (iref.val_len != 0)
			ret = -EIO;
		else if (iref.key->sk_zone != args->zone)
			ret = -ENOENT;
		else
			ext_from_key(ext, iref.key);
		scoutfs_btree_put_iref(&iref);
	}

	if (ret < 0)
		memset(ext, 0, sizeof(struct scoutfs_extent));

	return ret;
}

static int other_zone(int zone)
{
	if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
		return SCOUTFS_FREE_EXTENT_ORDER_ZONE;
	else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
		return SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
	else
		BUG();
}

/*
 * Insert an extent along with its matching item in the other index,
 * keyed by order if ours is keyed by blkno and vice versa. If we
 * succeed we update the root's record of the total length of all the
 * stored extents.
 */
static int alloc_ext_insert(struct super_block *sb, void *arg,
			    u64 start, u64 len, u64 map, u8 flags)
{
	struct alloc_ext_args *args = arg;
	struct scoutfs_key other;
	struct scoutfs_key key;
	int ret;
	int err;

	/* allocator extents don't have mappings or flags */
	if (WARN_ON_ONCE(map || flags))
		return -EINVAL;

	init_ext_key(&key, args->zone, start, len);
	init_ext_key(&other, other_zone(args->zone), start, len);

	ret = scoutfs_btree_insert(sb, args->alloc, args->wri,
				   &args->root->root, &key, NULL, 0);
	if (ret == 0) {
		ret = scoutfs_btree_insert(sb, args->alloc, args->wri,
					   &args->root->root, &other, NULL, 0);
		if (ret < 0) {
			err = scoutfs_btree_delete(sb, args->alloc, args->wri,
						   &args->root->root, &key);
			BUG_ON(err);
		} else {
			le64_add_cpu(&args->root->total_len, len);
		}
	}

	return ret;
}

static int alloc_ext_remove(struct super_block *sb, void *arg,
			    u64 start, u64 len, u64 map, u8 flags)
{
	struct alloc_ext_args *args = arg;
	struct scoutfs_key other;
	struct scoutfs_key key;
	int ret;
	int err;

	init_ext_key(&key, args->zone, start, len);
	init_ext_key(&other, other_zone(args->zone), start, len);

	ret = scoutfs_btree_delete(sb, args->alloc, args->wri,
				   &args->root->root, &key);
	if (ret == 0) {
		ret = scoutfs_btree_delete(sb, args->alloc, args->wri,
					   &args->root->root, &other);
		if (ret < 0) {
			err = scoutfs_btree_insert(sb, args->alloc, args->wri,
						   &args->root->root, &key,
						   NULL, 0);
			BUG_ON(err);
		} else {
			le64_add_cpu(&args->root->total_len, -len);
		}
	}

	return ret;
}

static struct scoutfs_ext_ops alloc_ext_ops = {
	.next = alloc_ext_next,
	.insert = alloc_ext_insert,
	.remove = alloc_ext_remove,
	.insert_overlap_warn = true,
};

static bool invalid_extent(u64 start, u64 end, u64 first, u64 last)
{
	return start > end || start < first || end > last;
}

static bool invalid_meta_blkno(struct super_block *sb, u64 blkno)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	u64 last_meta = (i_size_read(sbi->meta_bdev->bd_inode) >> SCOUTFS_BLOCK_LG_SHIFT) - 1;

	return invalid_extent(blkno, blkno, SCOUTFS_META_DEV_START_BLKNO, last_meta);
}

static bool invalid_data_extent(struct super_block *sb, u64 start, u64 len)
{
	u64 last_data = (i_size_read(sb->s_bdev->bd_inode) >> SCOUTFS_BLOCK_SM_SHIFT) - 1;

	return invalid_extent(start, start + len - 1, SCOUTFS_DATA_DEV_START_BLKNO, last_data);
}

void scoutfs_alloc_init(struct scoutfs_alloc *alloc,
			struct scoutfs_alloc_list_head *avail,
			struct scoutfs_alloc_list_head *freed)
{
	memset(alloc, 0, sizeof(struct scoutfs_alloc));

	seqlock_init(&alloc->seqlock);
	mutex_init(&alloc->mutex);
	alloc->avail = *avail;
	alloc->freed = *freed;
}

/*
 * We're about to commit the transaction that used this allocator, so
 * drop its block references.
 */
int scoutfs_alloc_prepare_commit(struct super_block *sb,
				 struct scoutfs_alloc *alloc,
				 struct scoutfs_block_writer *wri)
{
	scoutfs_block_put(sb, alloc->dirty_avail_bl);
	alloc->dirty_avail_bl = NULL;
	scoutfs_block_put(sb, alloc->dirty_freed_bl);
	alloc->dirty_freed_bl = NULL;

	return 0;
}

static u32 list_block_space(__le32 nr)
{
	return SCOUTFS_ALLOC_LIST_MAX_BLOCKS - le32_to_cpu(nr);
}

static u64 list_block_peek(struct scoutfs_alloc_list_block *lblk,
			   unsigned int skip)
{
	BUG_ON(skip >= le32_to_cpu(lblk->nr));

	return le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + skip]);
}

/*
 * Add a blkno to the array. Typically we append to the end of the
 * array, but we can also prepend once there's no more room at the end.
 * Consumers of the blocks sort before removing them.
 */
static void list_block_add(struct scoutfs_alloc_list_head *lhead,
			   struct scoutfs_alloc_list_block *lblk, u64 blkno)
{
	u32 start = le32_to_cpu(lblk->start);
	u32 nr = le32_to_cpu(lblk->nr);

	BUG_ON(lhead->ref.blkno != lblk->hdr.blkno);
	BUG_ON(list_block_space(lblk->nr) == 0);

	if (start + nr < SCOUTFS_ALLOC_LIST_MAX_BLOCKS) {
		lblk->blknos[start + nr] = cpu_to_le64(blkno);
	} else {
		start--;
		lblk->blknos[start] = cpu_to_le64(blkno);
		lblk->start = cpu_to_le32(start);
	}

	le32_add_cpu(&lblk->nr, 1);
	le64_add_cpu(&lhead->total_nr, 1);
	le32_add_cpu(&lhead->first_nr, 1);
}
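
/*
 * An added illustrative trace of list_block_add(), assuming
 * SCOUTFS_ALLOC_LIST_MAX_BLOCKS == 8: with start == 6 and nr == 1 the
 * add appends at index 7; with start == 6 and nr == 2 the tail is full,
 * so the add prepends at index 5 and sets start = 5, nr = 3. Consumers
 * sort the blknos before removing them, so the mixed append/prepend
 * order doesn't matter.
 */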

/*
 * Remove blknos from the start of the array.
 */
static void list_block_remove(struct scoutfs_alloc_list_head *lhead,
			      struct scoutfs_alloc_list_block *lblk,
			      unsigned int count)
{
	BUG_ON(lhead->ref.blkno != lblk->hdr.blkno);
	BUG_ON(count > SCOUTFS_ALLOC_LIST_MAX_BLOCKS);
	BUG_ON(le32_to_cpu(lblk->nr) < count);

	le32_add_cpu(&lblk->nr, -count);
	if (lblk->nr == 0)
		lblk->start = 0;
	else
		le32_add_cpu(&lblk->start, count);
	le64_add_cpu(&lhead->total_nr, -(u64)count);
	le32_add_cpu(&lhead->first_nr, -count);
}

static int cmp_le64(const void *A, const void *B)
{
	const __le64 *a = A;
	const __le64 *b = B;

	return scoutfs_cmp_u64s(le64_to_cpu(*a), le64_to_cpu(*b));
}

static void swap_le64(void *A, void *B, int size)
{
	__le64 *a = A;
	__le64 *b = B;

	swap(*a, *b);
}

static void list_block_sort(struct scoutfs_alloc_list_block *lblk)
{
	sort(&lblk->blknos[le32_to_cpu(lblk->start)], le32_to_cpu(lblk->nr),
	     sizeof(lblk->blknos[0]), cmp_le64, swap_le64);
}

/*
 * We're always reading blocks that we own, so we shouldn't see stale
 * references, but we could retry reads after dropping stale cached
 * blocks. If we do see a stale error then we've hit persistent
 * corruption.
 */
static int read_list_block(struct super_block *sb, struct scoutfs_block_ref *ref,
			   struct scoutfs_block **bl_ret)
{
	int ret;

	ret = scoutfs_block_read_ref(sb, ref, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST, bl_ret);
	if (ret < 0) {
		if (ret == -ESTALE) {
			scoutfs_inc_counter(sb, alloc_stale_list_block);
			ret = -EIO;
		}
	}

	return ret;
}

/*
 * Give the caller a dirty list block, always allocating a new block if
 * the ref is empty.
 *
 * If the caller gives us an allocated blkno for the cow then we know
 * that they're taking care of allocating and freeing the blknos; if
 * not, we call meta alloc and free.
 */
static int dirty_list_block(struct super_block *sb,
			    struct scoutfs_alloc *alloc,
			    struct scoutfs_block_writer *wri,
			    struct scoutfs_block_ref *ref,
			    u64 dirty, u64 *old,
			    struct scoutfs_block **bl_ret)
{
	return scoutfs_block_dirty_ref(sb, alloc, wri, ref, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST,
				       bl_ret, dirty, old);
}

/* Allocate a new dirty list block if we fill up more than 3/4 of the block. */
#define EMPTY_FREED_THRESH	(SCOUTFS_ALLOC_LIST_MAX_BLOCKS / 4)

/*
 * Get dirty avail and freed list blocks that will be used for meta
 * allocations during our transaction. We peek at the next avail blknos
 * for the cow allocations and manually record the cow frees rather than
 * recursively calling into alloc_meta and free_meta.
 *
 * In the client, the server will have emptied the freed list, so the
 * client will always allocate a new first empty block for frees. But
 * the server might have long lists of frees that it's trying to merge
 * into extents over multiple transactions. If the head of the freed
 * list doesn't have room we add a new empty block.
 */
static int dirty_alloc_blocks(struct super_block *sb,
			      struct scoutfs_alloc *alloc,
			      struct scoutfs_block_writer *wri)
{
	struct scoutfs_block_ref orig_freed;
	struct scoutfs_alloc_list_block *lblk;
	struct scoutfs_block *av_bl = NULL;
	struct scoutfs_block *fr_bl = NULL;
	struct scoutfs_block *bl;
	bool link_orig = false;
	u64 av_peek;
	u64 av_old;
	u64 fr_peek;
	u64 fr_old;
	int ret;

	if (alloc->dirty_avail_bl != NULL)
		return 0;

	mutex_lock(&alloc->mutex);

	/* undo dirty freed if we get an error after */
	orig_freed = alloc->freed.ref;

	if (alloc->dirty_avail_bl != NULL) {
		ret = 0;
		goto out;
	}

	/* caller must ensure that transactions commit before running out */
	if (WARN_ON_ONCE(alloc->avail.ref.blkno == 0) ||
	    WARN_ON_ONCE(le32_to_cpu(alloc->avail.first_nr) < 2)) {
		ret = -ENOSPC;
		goto out;
	}

	ret = read_list_block(sb, &alloc->avail.ref, &bl);
	if (ret < 0)
		goto out;

	lblk = bl->data;
	av_peek = list_block_peek(lblk, 0);
	fr_peek = list_block_peek(lblk, 1);
	scoutfs_block_put(sb, bl);
	lblk = NULL;

	if (alloc->freed.ref.blkno &&
	    list_block_space(alloc->freed.first_nr) < EMPTY_FREED_THRESH) {
		/* zero ref to force alloc of new block... */
		memset(&alloc->freed.ref, 0, sizeof(alloc->freed.ref));
		alloc->freed.first_nr = 0;
		link_orig = true;
	}

	/* dirty the first block of the freed list */
	ret = dirty_list_block(sb, alloc, wri, &alloc->freed.ref,
			       fr_peek, &fr_old, &fr_bl);
	if (ret < 0)
		goto out;

	if (link_orig) {
		/* ... and point the new block at the rest of the list */
		lblk = fr_bl->data;
		lblk->next = orig_freed;
		lblk = NULL;
	}

	ret = dirty_list_block(sb, alloc, wri, &alloc->avail.ref,
			       av_peek, &av_old, &av_bl);
	if (ret < 0)
		goto out;

	list_block_remove(&alloc->avail, av_bl->data, 2);
	/* sort dirty avail to encourage contiguous sorted meta blocks */
	list_block_sort(av_bl->data);

	if (av_old)
		list_block_add(&alloc->freed, fr_bl->data, av_old);
	if (fr_old)
		list_block_add(&alloc->freed, fr_bl->data, fr_old);

	alloc->dirty_avail_bl = av_bl;
	av_bl = NULL;
	alloc->dirty_freed_bl = fr_bl;
	fr_bl = NULL;
	ret = 0;

out:
	if (ret < 0 && alloc->freed.ref.blkno != orig_freed.blkno) {
		if (fr_bl)
			scoutfs_block_writer_forget(sb, wri, fr_bl);
		alloc->freed.ref = orig_freed;
	}

	mutex_unlock(&alloc->mutex);
	scoutfs_block_put(sb, av_bl);
	scoutfs_block_put(sb, fr_bl);
	return ret;
}
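
/*
 * An added sketch of the blkno flow above, with made-up numbers:
 * suppose the avail list block holds blknos {10, 11, ...}. We peek
 * av_peek == 10 and fr_peek == 11, cow the freed list block into blkno
 * 11 and the avail list block into blkno 10, remove both blknos from
 * avail, and record the two old (pre-cow) block locations in the freed
 * list. The transaction then allocates and frees through these two
 * dirty blocks without any further cow of the allocator itself.
 */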

/*
 * Alloc a metadata block for a transaction in either the client or the
 * server. The list block in the allocator was prepared for the transaction.
 */
int scoutfs_alloc_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
		       struct scoutfs_block_writer *wri, u64 *blkno)
{
	struct scoutfs_alloc_list_block *lblk;
	int ret;

	ret = dirty_alloc_blocks(sb, alloc, wri);
	if (ret < 0)
		goto out;

	write_seqlock(&alloc->seqlock);

	lblk = alloc->dirty_avail_bl->data;
	if (WARN_ON_ONCE(lblk->nr == 0)) {
		/* shouldn't happen, transaction should commit first */
		ret = -ENOSPC;
	} else {
		*blkno = list_block_peek(lblk, 0);
		list_block_remove(&alloc->avail, lblk, 1);
		ret = 0;
	}

	write_sequnlock(&alloc->seqlock);

out:
	if (ret < 0)
		*blkno = 0;
	scoutfs_inc_counter(sb, alloc_alloc_meta);
	trace_scoutfs_alloc_alloc_meta(sb, *blkno, ret);
	return ret;
}

int scoutfs_free_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
		      struct scoutfs_block_writer *wri, u64 blkno)
{
	struct scoutfs_alloc_list_block *lblk;
	int ret;

	if (WARN_ON_ONCE(invalid_meta_blkno(sb, blkno)))
		return -EINVAL;

	ret = dirty_alloc_blocks(sb, alloc, wri);
	if (ret < 0)
		goto out;

	write_seqlock(&alloc->seqlock);

	lblk = alloc->dirty_freed_bl->data;
	if (WARN_ON_ONCE(list_block_space(lblk->nr) == 0)) {
		/* shouldn't happen, transaction should commit first */
		ret = -EIO;
	} else {
		list_block_add(&alloc->freed, lblk, blkno);
		ret = 0;
	}

	write_sequnlock(&alloc->seqlock);

out:
	scoutfs_inc_counter(sb, alloc_free_meta);
	trace_scoutfs_alloc_free_meta(sb, blkno, ret);
	return ret;
}

void scoutfs_dalloc_init(struct scoutfs_data_alloc *dalloc,
			 struct scoutfs_alloc_root *data_avail)
{
	dalloc->root = *data_avail;
	memset(&dalloc->cached, 0, sizeof(dalloc->cached));
	atomic64_set(&dalloc->total_len, le64_to_cpu(dalloc->root.total_len));
}

void scoutfs_dalloc_get_root(struct scoutfs_data_alloc *dalloc,
			     struct scoutfs_alloc_root *data_avail)
{
	*data_avail = dalloc->root;
}

static void dalloc_update_total_len(struct scoutfs_data_alloc *dalloc)
{
	atomic64_set(&dalloc->total_len, le64_to_cpu(dalloc->root.total_len) +
		     dalloc->cached.len);
}

u64 scoutfs_dalloc_total_len(struct scoutfs_data_alloc *dalloc)
{
	return atomic64_read(&dalloc->total_len);
}

/*
 * Return the current in-memory cached free extent to extent items in
 * the avail root. This should be locked by the caller just like
 * _alloc_data and _free_data.
 */
int scoutfs_dalloc_return_cached(struct super_block *sb,
				 struct scoutfs_alloc *alloc,
				 struct scoutfs_block_writer *wri,
				 struct scoutfs_data_alloc *dalloc)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
		.root = &dalloc->root,
		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
	};
	int ret = 0;

	if (dalloc->cached.len) {
		ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
					 dalloc->cached.start,
					 dalloc->cached.len, 0, 0);
		if (ret == 0)
			memset(&dalloc->cached, 0, sizeof(dalloc->cached));
	}

	return ret;
}

/*
 * Allocate a data extent. An extent that's smaller than the requested
 * size can be returned.
 *
 * The caller can provide a cached extent that can satisfy allocations
 * and will be refilled by allocations. The caller is responsible for
 * freeing any remaining cached extent back into persistent items before
 * committing.
 *
 * Unlike meta allocations, the caller is expected to serialize
 * allocations from the root.
 *
 * ENOBUFS is returned if the data allocator ran out of space and we can
 * probably refill it from the server. The caller is expected to back
 * out, commit the transaction, and try again.
 *
 * ENOSPC is returned if the data allocator ran out of space but we have
 * a flag from the server telling us that there's no more space
 * available. This is a hard error and should be returned.
 */
int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
		       struct scoutfs_block_writer *wri,
		       struct scoutfs_data_alloc *dalloc, u64 count,
		       u64 *blkno_ret, u64 *count_ret)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
		.root = &dalloc->root,
		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
	};
	struct scoutfs_extent ext;
	u64 len;
	int ret;

	/* large allocations come straight from the allocator */
	if (count >= SCOUTFS_ALLOC_DATA_LG_THRESH) {
		ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args,
					0, 0, count, &ext);
		if (ret < 0)
			goto out;

		*blkno_ret = ext.start;
		*count_ret = ext.len;
		ret = 0;
		goto out;
	}

	/* smaller allocations come from a cached extent */
	if (dalloc->cached.len == 0) {
		ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args, 0, 0,
					SCOUTFS_ALLOC_DATA_LG_THRESH,
					&dalloc->cached);
		if (ret < 0)
			goto out;
	}

	len = min(count, dalloc->cached.len);

	*blkno_ret = dalloc->cached.start;
	*count_ret = len;

	dalloc->cached.start += len;
	dalloc->cached.len -= len;
	ret = 0;
out:
	if (ret < 0) {
		if (ret == -ENOENT) {
			if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW)
				ret = -ENOSPC;
			else
				ret = -ENOBUFS;
		}

		*blkno_ret = 0;
		*count_ret = 0;
	} else {
		dalloc_update_total_len(dalloc);
	}

	scoutfs_inc_counter(sb, alloc_alloc_data);
	trace_scoutfs_alloc_alloc_data(sb, count, *blkno_ret, *count_ret, ret);
	return ret;
}
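
/*
 * An added, hypothetical caller sketch of the error convention above:
 * a writer backs off on -ENOBUFS by committing its transaction, which
 * lets the data allocator be refilled from the server, and retries,
 * while -ENOSPC is returned as a hard error. The helper name here is
 * illustrative and not part of this file.
 *
 *	for (;;) {
 *		ret = scoutfs_alloc_data(sb, alloc, wri, dalloc, count,
 *					 &blkno, &got);
 *		if (ret != -ENOBUFS)
 *			break;
 *		ret = example_commit_and_reopen_trans(sb);
 *		if (ret < 0)
 *			break;
 *	}
 */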

/*
 * Free data extents into the freed tree that will be reclaimed by the
 * server and made available for future allocators only if our
 * transaction succeeds. We don't want to overwrite existing data if
 * our transaction fails.
 *
 * Unlike meta allocations, the caller is expected to serialize data
 * allocations.
 */
int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
		      struct scoutfs_block_writer *wri,
		      struct scoutfs_alloc_root *root, u64 blkno, u64 count)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
	};
	int ret;

	if (WARN_ON_ONCE(invalid_data_extent(sb, blkno, count)))
		return -EINVAL;

	ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, blkno, count, 0, 0);
	scoutfs_inc_counter(sb, alloc_free_data);
	trace_scoutfs_alloc_free_data(sb, blkno, count, ret);
	return ret;
}

/*
 * Return the first zone bit that the extent intersects with.
 */
static int first_extent_zone(struct scoutfs_extent *ext, __le64 *zones, u64 zone_blocks)
{
	int first;
	int last;
	int nr;

	first = div64_u64(ext->start, zone_blocks);
	last = div64_u64(ext->start + ext->len - 1, zone_blocks);

	nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, first);
	if (nr <= last)
		return nr;

	return SCOUTFS_DATA_ALLOC_MAX_ZONES;
}
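
/*
 * Added example (not in the original): with zone_blocks == 1000, an
 * extent with start == 2500 and len == 1000 covers zones 2 and 3. If
 * only bit 3 is set in the caller's bitmap, find_next_bit_le() skips
 * zone 2 and first_extent_zone() returns 3; if neither bit is set it
 * returns SCOUTFS_DATA_ALLOC_MAX_ZONES.
 */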

/*
 * Find an extent in specific zones to satisfy an allocation. We use
 * the order items to search for the largest extent that intersects with
 * the zones whose bits are set in the caller's bitmap.
 */
static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *root,
			    __le64 *zones, u64 zone_blocks,
			    struct scoutfs_extent *found_ret, u64 count,
			    struct scoutfs_extent *ext_ret)
{
	struct alloc_ext_args args = {
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
	};
	struct scoutfs_extent found;
	struct scoutfs_extent ext;
	u64 start;
	u64 len;
	int nr;
	int ret;

	/* don't bother when there are no bits set */
	if (find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0) ==
	    SCOUTFS_DATA_ALLOC_MAX_ZONES)
		return -ENOENT;

	/* start searching for largest extent from the first zone */
	len = smallest_order_length(SCOUTFS_BLOCK_SM_MAX);
	nr = 0;

	for (;;) {
		/* search for extents in the next zone at our order */
		nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, nr);
		if (nr >= SCOUTFS_DATA_ALLOC_MAX_ZONES) {
			/* wrap down to next smaller order if we run out of bits */
			len >>= 3;
			if (len == 0) {
				ret = -ENOENT;
				break;
			}
			nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0);
		}

		start = (u64)nr * zone_blocks;

		ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, start, len, &found);
		if (ret < 0)
			break;

		/* see if the next extent intersects any zones */
		nr = first_extent_zone(&found, zones, zone_blocks);
		if (nr < SCOUTFS_DATA_ALLOC_MAX_ZONES) {
			start = (u64)nr * zone_blocks;

			ext.start = max(start, found.start);
			ext.len = min(count, found.start + found.len - ext.start);

			*found_ret = found;
			*ext_ret = ext;
			ret = 0;
			break;
		}

		/* continue searching past extent */
		nr = div64_u64(found.start + found.len - 1, zone_blocks) + 1;
		len = smallest_order_length(found.len);
	}

	return ret;
}

/*
 * Move extent items adding up to the requested total length from the
 * src to the dst tree. The caller is responsible for locking the
 * trees, usually because they're also looking at total_len to decide
 * how much to move.
 *
 * -ENOENT is returned if we run out of extents in the source tree
 * before moving the total.
 *
 * If meta_budget is non-zero then -EINPROGRESS can be returned if the
 * caller's budget is consumed in the allocator during this call
 * (though not necessarily by us, we don't have per-thread tracking of
 * allocator consumption :/). The call can still have made progress and
 * the caller is expected to commit the dirty trees and examine the
 * resulting modified trees to see if they need to continue moving
 * extents.
 *
 * The caller can specify that extents in the source tree should first
 * be found based on their zone bitmaps. We'll first try to find
 * extents in the exclusive zones, then vacant zones, and then we'll
 * fall back to normal allocation that ignores zones.
 *
 * This first pass is not optimal because it performs full btree walks
 * per extent. We could optimize this with more clever btree item
 * manipulation functions which can iterate through src and dst blocks
 * and let callbacks indicate how to change items.
 */
int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
		       struct scoutfs_block_writer *wri,
		       struct scoutfs_alloc_root *dst,
		       struct scoutfs_alloc_root *src, u64 total,
		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
	};
	struct scoutfs_extent found;
	struct scoutfs_extent ext;
	u32 avail_start = 0;
	u32 freed_start = 0;
	u64 moved = 0;
	u64 count;
	int ret = 0;
	int err;

	if (zone_blocks == 0) {
		exclusive = NULL;
		vacant = NULL;
	}

	if (meta_budget != 0)
		scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start);

	while (moved < total) {
		count = total - moved;

		if (exclusive) {
			/* first try to find extents in our exclusive zones */
			ret = find_zone_extent(sb, src, exclusive, zone_blocks,
					       &found, count, &ext);
			if (ret == -ENOENT) {
				exclusive = NULL;
				continue;
			}
		} else if (vacant) {
			/* then try to find extents in vacant zones */
			ret = find_zone_extent(sb, src, vacant, zone_blocks,
					       &found, count, &ext);
			if (ret == -ENOENT) {
				vacant = NULL;
				continue;
			}
		} else {
			/* otherwise fall back to finding extents anywhere */
			args.root = src;
			args.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
			ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, 0, 0, &found);
			if (ret == 0) {
				ext.start = found.start;
				ext.len = min(count, found.len);
			}
		}
		if (ret < 0)
			break;

		if (meta_budget != 0 &&
		    scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start, meta_budget,
						 extent_mod_blocks(src->root.height) +
						 extent_mod_blocks(dst->root.height))) {
			ret = -EINPROGRESS;
			break;
		}

		/* return partial if the server alloc can't dirty any more */
		if (scoutfs_alloc_meta_low(sb, alloc, 50 + extent_mod_blocks(src->root.height) +
					   extent_mod_blocks(dst->root.height))) {
			if (WARN_ON_ONCE(!moved))
				ret = -ENOSPC;
			else
				ret = 0;
			break;
		}

		/* searching set start/len, finish initializing alloced extent */
		ext.map = found.map ? ext.start - found.start + found.map : 0;
		ext.flags = found.flags;

		/* remove the allocation from the found extent */
		args.root = src;
		args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
		ret = scoutfs_ext_remove(sb, &alloc_ext_ops, &args, ext.start, ext.len);
		if (ret < 0)
			break;

		/* insert the allocated extent into the dest */
		args.root = dst;
		args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
		ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, ext.start,
					 ext.len, ext.map, ext.flags);
		if (ret < 0) {
			/* and put it back in src if insertion failed */
			args.root = src;
			args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
			err = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
						 ext.start, ext.len, ext.map,
						 ext.flags);
			BUG_ON(err); /* inconsistent */
			break;
		}

		moved += ext.len;
		scoutfs_inc_counter(sb, alloc_moved_extent);

		trace_scoutfs_alloc_move_extent(sb, &ext);
	}

	scoutfs_inc_counter(sb, alloc_move);
	trace_scoutfs_alloc_move(sb, total, moved, ret);

	return ret;
}

/*
 * Add new free space to an allocator. _ext_insert will make sure that it doesn't
 * overlap with any existing extents. This is done by the server in a transaction that
 * also updates total_*_blocks in the super so we don't verify.
 */
int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
			 u64 start, u64 len)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
	};

	return scoutfs_ext_insert(sb, &alloc_ext_ops, &args, start, len, 0, 0);
}

int scoutfs_alloc_remove(struct super_block *sb, struct scoutfs_alloc *alloc,
			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
			 u64 start, u64 len)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
	};

	return scoutfs_ext_remove(sb, &alloc_ext_ops, &args, start, len);
}

/*
 * We only trim one block, instead of looping trimming all, because the
 * caller is assuming that we do a fixed amount of work when they check
 * that their allocator has enough remaining free blocks for us.
 */
static int trim_empty_first_block(struct super_block *sb,
				  struct scoutfs_alloc *alloc,
				  struct scoutfs_block_writer *wri,
				  struct scoutfs_alloc_list_head *lhead)
{
	struct scoutfs_alloc_list_block *one = NULL;
	struct scoutfs_alloc_list_block *two = NULL;
	struct scoutfs_block *one_bl = NULL;
	struct scoutfs_block *two_bl = NULL;
	int ret;

	if (WARN_ON_ONCE(lhead->ref.blkno == 0) ||
	    WARN_ON_ONCE(lhead->first_nr != 0))
		return 0;

	ret = read_list_block(sb, &lhead->ref, &one_bl);
	if (ret < 0)
		goto out;
	one = one_bl->data;

	if (one->next.blkno) {
		ret = read_list_block(sb, &one->next, &two_bl);
		if (ret < 0)
			goto out;
		two = two_bl->data;
	}

	ret = scoutfs_free_meta(sb, alloc, wri, le64_to_cpu(lhead->ref.blkno));
	if (ret < 0)
		goto out;

	lhead->ref = one->next;
	lhead->first_nr = two ? two->nr : 0;
	ret = 0;
out:
	scoutfs_block_put(sb, one_bl);
	scoutfs_block_put(sb, two_bl);
	return ret;
}

/*
 * True if the allocator has enough blocks in the avail list and space
 * in the freed list to be able to perform the caller's operations. If
 * false the caller should back off and return partial progress rather
 * than completely exhausting the avail list or overflowing the freed
 * list.
 *
 * The caller tells us how many extents they're about to modify and how
 * many other additional blocks they may cow manually. And finally, the
 * caller could be the first to dirty the avail and freed blocks in the
 * allocator.
 */
static bool list_has_blocks(struct super_block *sb, struct scoutfs_alloc *alloc,
			    struct scoutfs_alloc_root *root, u32 extents, u32 addl_blocks)
{
	u32 tree_blocks = extent_mod_blocks(root->root.height) * extents;
	u32 most = 1 + tree_blocks + addl_blocks;

	if (le32_to_cpu(alloc->avail.first_nr) < most) {
		scoutfs_inc_counter(sb, alloc_list_avail_lo);
		return false;
	}

	if (list_block_space(alloc->freed.first_nr) < most) {
		scoutfs_inc_counter(sb, alloc_list_freed_hi);
		return false;
	}

	return true;
}

static bool lhead_in_alloc(struct scoutfs_alloc *alloc,
			   struct scoutfs_alloc_list_head *lhead)
{
	return lhead == &alloc->avail || lhead == &alloc->freed;
}

/*
 * Move free blocks from extent items in the root into only the first
 * block in the list, towards the target, if the list has fallen below
 * the lo threshold. This can return success without necessarily moving
 * as much as was requested if its meta allocator runs low; the caller
 * is expected to check the counts and act accordingly.
 *
 * -ENOSPC is returned if the root runs out of extents before the list
 * reaches the target.
 */
int scoutfs_alloc_fill_list(struct super_block *sb,
			    struct scoutfs_alloc *alloc,
			    struct scoutfs_block_writer *wri,
			    struct scoutfs_alloc_list_head *lhead,
			    struct scoutfs_alloc_root *root,
			    u64 lo, u64 target)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
	};
	struct scoutfs_alloc_list_block *lblk;
	struct scoutfs_block *bl = NULL;
	struct scoutfs_extent ext;
	int ret = 0;
	int i;

	if (WARN_ON_ONCE(target < lo) ||
	    WARN_ON_ONCE(lo > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) ||
	    WARN_ON_ONCE(target > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) ||
	    WARN_ON_ONCE(lhead_in_alloc(alloc, lhead)))
		return -EINVAL;

	if (le32_to_cpu(lhead->first_nr) >= lo)
		return 0;

	ret = dirty_list_block(sb, alloc, wri, &lhead->ref, 0, NULL, &bl);
	if (ret < 0)
		goto out;
	lblk = bl->data;

	while (le32_to_cpu(lblk->nr) < target && list_has_blocks(sb, alloc, root, 1, 0)) {

		ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args, 0, 0,
					target - le32_to_cpu(lblk->nr), &ext);
		if (ret < 0) {
			if (ret == -ENOENT)
				ret = -ENOSPC;
			break;
		}

		for (i = 0; i < ext.len; i++)
			list_block_add(lhead, lblk, ext.start + i);

		trace_scoutfs_alloc_fill_extent(sb, &ext);
	}

out:
	scoutfs_block_put(sb, bl);
	return ret;
}

/*
 * Move blknos from all the blocks in the list into extents in the root,
 * removing empty blocks as we go. This can return success and leave blocks
 * on the list if its metadata alloc runs out of space.
 */
int scoutfs_alloc_empty_list(struct super_block *sb,
			     struct scoutfs_alloc *alloc,
			     struct scoutfs_block_writer *wri,
			     struct scoutfs_alloc_root *root,
			     struct scoutfs_alloc_list_head *lhead)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
	};
	struct scoutfs_alloc_list_block *lblk = NULL;
	struct scoutfs_block *bl = NULL;
	struct scoutfs_extent ext;
	int ret = 0;

	if (WARN_ON_ONCE(lhead_in_alloc(alloc, lhead)))
		return -EINVAL;

	while (lhead->ref.blkno && list_has_blocks(sb, alloc, args.root, 1, 1)) {

		if (lhead->first_nr == 0) {
			ret = trim_empty_first_block(sb, alloc, wri, lhead);
			if (ret < 0)
				break;

			scoutfs_block_put(sb, bl);
			bl = NULL;
			continue;
		}

		if (bl == NULL) {
			ret = dirty_list_block(sb, alloc, wri, &lhead->ref,
					       0, NULL, &bl);
			if (ret < 0)
				break;
			lblk = bl->data;

			/* sort to encourage forming extents */
			list_block_sort(lblk);
		}

		/* combine free blknos into extents and insert them */
		ext.start = list_block_peek(lblk, 0);
		ext.len = 1;
		while ((le32_to_cpu(lblk->nr) > ext.len) &&
		       (list_block_peek(lblk, ext.len) == ext.start + ext.len))
			ext.len++;

		ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
					 ext.start, ext.len, 0, 0);
		if (ret < 0)
			break;

		list_block_remove(lhead, lblk, ext.len);

		trace_scoutfs_alloc_empty_extent(sb, &ext);
	}

	scoutfs_block_put(sb, bl);

	return ret;
}
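
/*
 * Added illustration of the coalescing loop above: with sorted blknos
 * {7, 8, 9, 12} in the first list block, the first pass peeks 7 and
 * extends while the following blknos are consecutive, inserting the
 * extent [start 7, len 3] and removing those three blknos; the next
 * pass inserts [start 12, len 1].
 */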

/*
 * Insert the source list at the head of the destination list, leaving
 * the source empty.
 *
 * This looks bad because the lists are singly-linked and we have to cow
 * the entire src list to update its tail block's next ref to the start
 * of the dst list.
 *
 * In practice, this isn't a problem because the server only calls this
 * with small lists that it's going to use soon.
 */
int scoutfs_alloc_splice_list(struct super_block *sb,
			      struct scoutfs_alloc *alloc,
			      struct scoutfs_block_writer *wri,
			      struct scoutfs_alloc_list_head *dst,
			      struct scoutfs_alloc_list_head *src)
{
	struct scoutfs_alloc_list_block *lblk;
	struct scoutfs_block_ref *ref;
	struct scoutfs_block *prev = NULL;
	struct scoutfs_block *bl = NULL;
	int ret = 0;

	if (WARN_ON_ONCE(lhead_in_alloc(alloc, dst)) ||
	    WARN_ON_ONCE(lhead_in_alloc(alloc, src)))
		return -EINVAL;

	if (src->ref.blkno == 0)
		return 0;

	ref = &src->ref;
	while (ref->blkno) {
		ret = dirty_list_block(sb, alloc, wri, ref, 0, NULL, &bl);
		if (ret < 0)
			goto out;

		lblk = bl->data;
		ref = &lblk->next;

		scoutfs_block_put(sb, prev);
		prev = bl;
		bl = NULL;
	}

	*ref = dst->ref;
	dst->ref = src->ref;
	dst->first_nr = src->first_nr;
	le64_add_cpu(&dst->total_nr, le64_to_cpu(src->total_nr));

	memset(src, 0, sizeof(struct scoutfs_alloc_list_head));
	ret = 0;
out:
	scoutfs_block_put(sb, prev);
	scoutfs_block_put(sb, bl);
	return ret;
}

/*
 * Returns true if the meta avail list and freed list don't have room
 * for the given number of allocations or frees. This is called at a
 * significantly higher frequency than allocations as writers try to
 * enter transactions. This is the only reader of the seqlock, which
 * gives read-mostly sampling instead of bouncing a spinlock around all
 * the cores.
 */
bool scoutfs_alloc_meta_low(struct super_block *sb,
			    struct scoutfs_alloc *alloc, u32 nr)
{
	unsigned int seq;
	bool lo;

	do {
		seq = read_seqbegin(&alloc->seqlock);
		lo = le32_to_cpu(alloc->avail.first_nr) < nr ||
		     list_block_space(alloc->freed.first_nr) < nr;
	} while (read_seqretry(&alloc->seqlock, seq));

	return lo;
}

/*
 * Sample the current counts under the seqlock: the number of blknos
 * remaining in the avail list head and the space left in the freed
 * list head.
 */
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&alloc->seqlock);
		*avail_total = le32_to_cpu(alloc->avail.first_nr);
		*freed_space = list_block_space(alloc->freed.first_nr);
	} while (read_seqretry(&alloc->seqlock, seq));
}

/*
 * Returns true if the caller's consumption of nr from either avail or
 * freed would end up exceeding their budget relative to the starting
 * remaining snapshot they took.
 */
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
				  u32 budget, u32 nr)
{
	u32 avail_use;
	u32 freed_use;
	u32 avail;
	u32 freed;

	scoutfs_alloc_meta_remaining(alloc, &avail, &freed);

	avail_use = avail_start - avail;
	freed_use = freed_start - freed;

	return ((avail_use + nr) > budget) || ((freed_use + nr) > budget);
}
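
/*
 * Added arithmetic example: with avail_start == 100 sampled at the
 * start of an operation, avail == 90 now, budget == 15, and nr == 10,
 * avail_use is 10 and avail_use + nr == 20 > 15, so the caller has run
 * through its budget and should stop and commit its progress.
 */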

bool scoutfs_alloc_test_flag(struct super_block *sb,
			     struct scoutfs_alloc *alloc, u32 flag)
{
	unsigned int seq;
	bool set;

	do {
		seq = read_seqbegin(&alloc->seqlock);
		set = !!(le32_to_cpu(alloc->avail.flags) & flag);
	} while (read_seqretry(&alloc->seqlock, seq));

	return set;
}

/*
 * Iterate over the allocator structures referenced by the caller's
 * super and call the caller's callback with summaries of the blocks
 * found in each structure.
 *
 * The caller's responsible for the stability of the referenced blocks.
 * If the blocks could be stale the caller must deal with retrying when
 * it sees ESTALE.
 */
int scoutfs_alloc_foreach_super(struct super_block *sb, struct scoutfs_super_block *super,
				scoutfs_alloc_foreach_cb_t cb, void *arg)
{
	struct scoutfs_srch_compact *sc;
	struct scoutfs_log_merge_request *lmreq;
	struct scoutfs_log_merge_complete *lmcomp;
	struct scoutfs_log_trees lt;
	SCOUTFS_BTREE_ITEM_REF(iref);
	struct scoutfs_key key;
	int expected;
	u64 avail_tot;
	u64 freed_tot;
	u64 id;
	int ret;

	sc = kmalloc(sizeof(struct scoutfs_srch_compact), GFP_NOFS);
	if (!sc) {
		ret = -ENOMEM;
		goto out;
	}

	/* all the server allocators */
	ret = cb(sb, arg, SCOUTFS_ALLOC_OWNER_SERVER, 0, true, true,
		 le64_to_cpu(super->meta_alloc[0].total_len)) ?:
	      cb(sb, arg, SCOUTFS_ALLOC_OWNER_SERVER, 0, true, true,
		 le64_to_cpu(super->meta_alloc[1].total_len)) ?:
	      cb(sb, arg, SCOUTFS_ALLOC_OWNER_SERVER, 0, false, true,
		 le64_to_cpu(super->data_alloc.total_len)) ?:
	      cb(sb, arg, SCOUTFS_ALLOC_OWNER_SERVER, 1, true, true,
		 le64_to_cpu(super->server_meta_avail[0].total_nr)) ?:
	      cb(sb, arg, SCOUTFS_ALLOC_OWNER_SERVER, 1, true, true,
		 le64_to_cpu(super->server_meta_avail[1].total_nr)) ?:
	      cb(sb, arg, SCOUTFS_ALLOC_OWNER_SERVER, 1, true, false,
		 le64_to_cpu(super->server_meta_freed[0].total_nr)) ?:
	      cb(sb, arg, SCOUTFS_ALLOC_OWNER_SERVER, 1, true, false,
		 le64_to_cpu(super->server_meta_freed[1].total_nr));
	if (ret < 0)
		goto out;

	/* mount fs transaction allocators */
	scoutfs_key_init_log_trees(&key, 0, 0);
	for (;;) {
		ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
		if (ret == -ENOENT)
			break;
		if (ret < 0)
			goto out;

		if (iref.val_len == sizeof(lt)) {
			key = *iref.key;
			memcpy(&lt, iref.val, sizeof(lt));
		} else {
			ret = -EIO;
		}
		scoutfs_btree_put_iref(&iref);
		if (ret < 0)
			goto out;

		ret = cb(sb, arg, SCOUTFS_ALLOC_OWNER_MOUNT,
			 le64_to_cpu(key.sklt_rid), true, true,
			 le64_to_cpu(lt.meta_avail.total_nr)) ?:
		      cb(sb, arg, SCOUTFS_ALLOC_OWNER_MOUNT,
			 le64_to_cpu(key.sklt_rid), true, false,
			 le64_to_cpu(lt.meta_freed.total_nr)) ?:
		      cb(sb, arg, SCOUTFS_ALLOC_OWNER_MOUNT,
			 le64_to_cpu(key.sklt_rid), false, true,
			 le64_to_cpu(lt.data_avail.total_len)) ?:
		      cb(sb, arg, SCOUTFS_ALLOC_OWNER_MOUNT,
			 le64_to_cpu(key.sklt_rid), false, false,
			 le64_to_cpu(lt.data_freed.total_len));
		if (ret < 0)
			goto out;

		scoutfs_key_inc(&key);
	}

	/* srch compaction allocators */
	memset(&key, 0, sizeof(key));
	key.sk_zone = SCOUTFS_SRCH_ZONE;
	key.sk_type = SCOUTFS_SRCH_PENDING_TYPE;

	for (;;) {
		/* _PENDING_ and _BUSY_ are last, _next won't see other types */
		ret = scoutfs_btree_next(sb, &super->srch_root, &key, &iref);
		if (ret == -ENOENT)
			break;
		if (ret == 0) {
			if (iref.val_len == sizeof(*sc)) {
				key = *iref.key;
				memcpy(sc, iref.val, iref.val_len);
			} else {
				ret = -EIO;
			}
			scoutfs_btree_put_iref(&iref);
		}
		if (ret < 0)
			goto out;

		ret = cb(sb, arg, SCOUTFS_ALLOC_OWNER_SRCH,
			 le64_to_cpu(sc->id), true, true,
			 le64_to_cpu(sc->meta_avail.total_nr)) ?:
		      cb(sb, arg, SCOUTFS_ALLOC_OWNER_SRCH,
			 le64_to_cpu(sc->id), true, false,
			 le64_to_cpu(sc->meta_freed.total_nr));
		if (ret < 0)
			goto out;

		scoutfs_key_inc(&key);
	}

	/* log merge allocators */
	memset(&key, 0, sizeof(key));
	key.sk_zone = SCOUTFS_LOG_MERGE_REQUEST_ZONE;
	expected = sizeof(*lmreq);
	id = 0;
	avail_tot = 0;
	freed_tot = 0;

	for (;;) {
		ret = scoutfs_btree_next(sb, &super->log_merge, &key, &iref);
		if (ret == 0) {
			if (iref.key->sk_zone != key.sk_zone) {
				ret = -ENOENT;
			} else if (iref.val_len == expected) {
				key = *iref.key;
				if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) {
					lmreq = iref.val;
					id = le64_to_cpu(lmreq->rid);
					avail_tot = le64_to_cpu(lmreq->meta_avail.total_nr);
					freed_tot = le64_to_cpu(lmreq->meta_freed.total_nr);
				} else {
					lmcomp = iref.val;
					id = le64_to_cpu(lmcomp->rid);
					avail_tot = le64_to_cpu(lmcomp->meta_avail.total_nr);
					freed_tot = le64_to_cpu(lmcomp->meta_freed.total_nr);
				}
			} else {
				ret = -EIO;
			}
			scoutfs_btree_put_iref(&iref);
		}
		if (ret == -ENOENT) {
			if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) {
				memset(&key, 0, sizeof(key));
				key.sk_zone = SCOUTFS_LOG_MERGE_COMPLETE_ZONE;
				expected = sizeof(*lmcomp);
				continue;
			}
			break;
		}
		if (ret < 0)
			goto out;

		ret = cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, true, avail_tot) ?:
		      cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, false, freed_tot);
		if (ret < 0)
			goto out;

		scoutfs_key_inc(&key);
	}

	ret = 0;
out:

	kfree(sc);
	return ret;
}

/*
 * Read the current on-disk super and use it to walk the allocators and
 * call the caller's callback. This assumes that the super it's reading
 * could be stale and will retry if it encounters stale blocks.
 */
int scoutfs_alloc_foreach(struct super_block *sb, scoutfs_alloc_foreach_cb_t cb, void *arg)
{
	struct scoutfs_super_block *super = NULL;
	DECLARE_SAVED_REFS(saved);
	int ret;

	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
	if (!super) {
		ret = -ENOMEM;
		goto out;
	}

	do {
		ret = scoutfs_read_super(sb, super);
		if (ret < 0)
			goto out;

		ret = scoutfs_alloc_foreach_super(sb, super, cb, arg);

		ret = scoutfs_block_check_stale(sb, ret, &saved, &super->logs_root.ref,
						&super->srch_root.ref);
	} while (ret == -ESTALE);

out:
	kfree(super);
	return ret;
}

struct foreach_cb_args {
	scoutfs_alloc_extent_cb_t cb;
	void *cb_arg;
};

static int alloc_btree_extent_item_cb(struct super_block *sb, struct scoutfs_key *key, u64 seq,
				      u8 flags, void *val, int val_len, void *arg)
{
	struct foreach_cb_args *cba = arg;
	struct scoutfs_extent ext;

	if (key->sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
		return -ENOENT;

	ext_from_key(&ext, key);
	cba->cb(sb, cba->cb_arg, &ext);

	return 0;
}

/*
 * Call the caller's callback on each extent stored in the allocator's
 * btree. The callback sees the extents in order by starting blkno.
 */
int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
			     scoutfs_alloc_extent_cb_t cb, void *cb_arg)
{
	struct foreach_cb_args cba = {
		.cb = cb,
		.cb_arg = cb_arg,
	};
	struct scoutfs_key start;
	struct scoutfs_key end;
	struct scoutfs_key key;
	int ret;

	init_ext_key(&key, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);

	for (;;) {
		/* will stop at order items before getting stuck in final block */
		BUILD_BUG_ON(SCOUTFS_FREE_EXTENT_BLKNO_ZONE > SCOUTFS_FREE_EXTENT_ORDER_ZONE);
		init_ext_key(&start, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);
		init_ext_key(&end, SCOUTFS_FREE_EXTENT_ORDER_ZONE, 0, 1);

		ret = scoutfs_btree_read_items(sb, &root->root, &key, &start, &end,
					       alloc_btree_extent_item_cb, &cba);
		if (ret < 0 || end.sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
			if (ret == -ENOENT)
				ret = 0;
			break;
		}

		key = end;
		scoutfs_key_inc(&key);
	}

	return ret;
}
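
/*
 * An added, hypothetical usage sketch of scoutfs_alloc_extents_cb():
 * a callback that sums the free extent lengths in a root. The callback
 * signature is inferred from the call site above, and the names here
 * are illustrative, not part of this file.
 *
 *	static void example_sum_extent(struct super_block *sb, void *arg,
 *				       struct scoutfs_extent *ext)
 *	{
 *		u64 *sum = arg;
 *
 *		*sum += ext->len;
 *	}
 *
 *	u64 sum = 0;
 *	int ret = scoutfs_alloc_extents_cb(sb, root, example_sum_extent, &sum);
 */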