Merge pull request #254 from versity/zab/shrink_cleanup

Zab/shrink cleanup
Zach Brown
2025-10-30 08:56:33 -07:00
committed by GitHub
16 changed files with 509 additions and 393 deletions

View File

@@ -425,3 +425,48 @@ endif
ifneq (,$(shell grep 'int ..remap_pages..struct vm_area_struct' include/linux/mm.h))
ccflags-y += -DKC_MM_REMAP_PAGES
endif
#
# v3.19-4742-g503c358cf192
#
# list_lru_shrink_count() and list_lru_shrink_walk() introduced
#
ifneq (,$(shell grep 'list_lru_shrink_count.*struct list_lru' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_SHRINK_COUNT_WALK
endif
#
# v3.19-4757-g3f97b163207c
#
# lru_list_walk_cb lru arg added
#
ifneq (,$(shell grep 'struct list_head \*item, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_WALK_CB_ITEM_LOCK
endif
#
# v6.7-rc4-153-g0a97c01cd20b
#
# list_lru_{add,del} -> list_lru_{add,del}_obj
#
ifneq (,$(shell grep '^bool list_lru_add_obj' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_ADD_OBJ
endif
#
# v6.12-rc6-227-gda0c02516c50
#
# lru_list_walk_cb lock arg removed
#
ifneq (,$(shell grep 'struct list_lru_one \*list, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_WALK_CB_LIST_LOCK
endif
#
# v5.1-rc4-273-ge9b98e162aa5
#
# introduce stack trace helpers
#
ifneq (,$(shell grep '^unsigned int stack_trace_save' include/linux/stacktrace.h))
ccflags-y += -DKC_STACK_TRACE_SAVE
endif
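Each block above only adds a -DKC_* define; the shims that consume them live in kernelcompat.h/.c later in this diff. A hedged sketch of the pattern for the list_lru_{add,del}_obj rename follows (the wrapper function name is purely illustrative):

/*
 * Sketch only: the grep sets -DKC_LIST_LRU_ADD_OBJ when the renamed API
 * exists, and a fallback define papers over the difference so callers
 * always use the new names.  The real shim is in kernelcompat.h below.
 */
#include <linux/list_lru.h>

#ifndef KC_LIST_LRU_ADD_OBJ
#define list_lru_add_obj	list_lru_add
#define list_lru_del_obj	list_lru_del
#endif

/* illustrative caller: the same call compiles on old and new kernels */
static void cache_example_insert(struct list_lru *lru, struct list_head *entry)
{
	list_lru_add_obj(lru, entry);
}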

View File

@@ -22,6 +22,8 @@
#include <linux/rhashtable.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/list_lru.h>
#include <linux/stacktrace.h>
#include "format.h"
#include "super.h"
@@ -38,26 +40,12 @@
* than the page size. Callers can have their own contexts for tracking
* dirty blocks that are written together. We pin dirty blocks in
* memory and only checksum them all as they're all written.
*
* Memory reclaim is driven by maintaining two very coarse groups of
* blocks. As we access blocks we mark them with an increasing counter
* to discourage them from being reclaimed. We then define a threshold
* at the current counter minus half the population. Recent blocks have
* a counter greater than the threshold, and all other blocks with
* counters less than it are considered older and are candidates for
* reclaim. This results in access updates rarely modifying an atomic
* counter as blocks need to be moved into the recent group, and shrink
* can randomly scan blocks looking for the half of the population that
* will be in the old group. It's reasonably effective, but is
* particularly efficient and avoids contention between concurrent
* accesses and shrinking.
*/
struct block_info {
struct super_block *sb;
atomic_t total_inserted;
atomic64_t access_counter;
struct rhashtable ht;
struct list_lru lru;
wait_queue_head_t waitq;
KC_DEFINE_SHRINKER(shrinker);
struct work_struct free_work;
@@ -76,28 +64,15 @@ enum block_status_bits {
BLOCK_BIT_PAGE_ALLOC, /* page (possibly high order) allocation */
BLOCK_BIT_VIRT, /* mapped virt allocation */
BLOCK_BIT_CRC_VALID, /* crc has been verified */
BLOCK_BIT_ACCESSED, /* seen by lookup since last lru add/walk */
};
/*
* We want to tie atomic changes in refcounts to whether or not the
* block is still visible in the hash table, so we store the hash
* table's reference up at a known high bit. We could naturally set the
* inserted bit through excessive refcount increments. We don't do
* anything about that but at least warn if we get close.
*
* We're avoiding the high byte for no real good reason, just out of a
* historical fear of implementations that don't provide the full
* precision.
*/
#define BLOCK_REF_INSERTED (1U << 23)
#define BLOCK_REF_FULL (BLOCK_REF_INSERTED >> 1)
struct block_private {
struct scoutfs_block bl;
struct super_block *sb;
atomic_t refcount;
u64 accessed;
struct rhash_head ht_head;
struct list_head lru_head;
struct list_head dirty_entry;
struct llist_node free_node;
unsigned long bits;
@@ -106,13 +81,15 @@ struct block_private {
struct page *page;
void *virt;
};
unsigned int stack_len;
unsigned long stack[10];
};
#define TRACE_BLOCK(which, bp) \
do { \
__typeof__(bp) _bp = (bp); \
trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, atomic_read(&_bp->refcount), \
atomic_read(&_bp->io_count), _bp->bits, _bp->accessed); \
atomic_read(&_bp->io_count), _bp->bits); \
} while (0)
#define BLOCK_PRIVATE(_bl) \
@@ -126,7 +103,17 @@ static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
return cpu_to_le32(calc);
}
static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
static noinline void save_block_stack(struct block_private *bp)
{
bp->stack_len = stack_trace_save(bp->stack, ARRAY_SIZE(bp->stack), 2);
}
static void print_block_stack(struct block_private *bp)
{
stack_trace_print(bp->stack, bp->stack_len, 1);
}
static noinline struct block_private *block_alloc(struct super_block *sb, u64 blkno)
{
struct block_private *bp;
unsigned int nofs_flags;
@@ -176,11 +163,13 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
bp->bl.blkno = blkno;
bp->sb = sb;
atomic_set(&bp->refcount, 1);
INIT_LIST_HEAD(&bp->lru_head);
INIT_LIST_HEAD(&bp->dirty_entry);
set_bit(BLOCK_BIT_NEW, &bp->bits);
atomic_set(&bp->io_count, 0);
TRACE_BLOCK(allocate, bp);
save_block_stack(bp);
out:
if (!bp)
@@ -233,32 +222,85 @@ static void block_free_work(struct work_struct *work)
}
/*
* Get a reference to a block while holding an existing reference.
*/
static void block_get(struct block_private *bp)
{
WARN_ON_ONCE((atomic_read(&bp->refcount) & ~BLOCK_REF_INSERTED) <= 0);
atomic_inc(&bp->refcount);
}
/*
* Users of blocks hold a refcount. If putting a refcount drops to zero
* then the block is freed.
*
* Acquiring new references and claiming the exclusive right to tear
* down a block is built around this LIVE_REFCOUNT_BASE refcount value.
* As blocks are initially cached they have the live base added to their
* refcount. Lookups will only increment the refcount and return blocks
* for reference holders while the refcount is >= the base.
*
* To remove a block from the cache and eventually free it, either by
* the lru walk in the shrinker, or by reference holders, the live base
* is removed and turned into a normal refcount increment that will be
* put by the caller. This can only be done once for a block, and once
* it's done lookup will not return any more references.
*/
#define LIVE_REFCOUNT_BASE (INT_MAX ^ (INT_MAX >> 1))
/*
* Inc the refcount while holding an incremented refcount. We can't
* have so many individual reference holders that they pass the live
* base.
*/
static void block_get(struct block_private *bp)
{
int now = atomic_inc_return(&bp->refcount);
BUG_ON(now <= 1);
BUG_ON(now == LIVE_REFCOUNT_BASE);
}
/*
* Get a reference to a block as long as it's been inserted in the hash
* table and hasn't been removed.
*/
static struct block_private *block_get_if_inserted(struct block_private *bp)
{
int cnt;
do {
cnt = atomic_read(&bp->refcount);
WARN_ON_ONCE(cnt & BLOCK_REF_FULL);
if (!(cnt & BLOCK_REF_INSERTED))
return NULL;
} while (atomic_cmpxchg(&bp->refcount, cnt, cnt + 1) != cnt);
return bp;
}
/*
* if (*v >= u) {
* *v += a;
* return true;
* }
*/
static bool atomic_add_unless_less(atomic_t *v, int a, int u)
{
int c;
do {
c = atomic_read(v);
if (c < u)
return false;
} while (atomic_cmpxchg(v, c, c + a) != c);
return true;
}
static bool block_get_if_live(struct block_private *bp)
{
return atomic_add_unless_less(&bp->refcount, 1, LIVE_REFCOUNT_BASE);
}
/*
* If the refcount still has the live base, subtract it and increment
* the caller's refcount that they'll put.
*/
static bool block_get_remove_live(struct block_private *bp)
{
return atomic_add_unless_less(&bp->refcount, (1 - LIVE_REFCOUNT_BASE), LIVE_REFCOUNT_BASE);
}
/*
* Only get the live base refcount if it is the only refcount remaining.
* This means that there are no active refcount holders and the block
* can't be dirty or under IO, which both hold references.
*/
static bool block_get_remove_live_only(struct block_private *bp)
{
int c;
do {
c = atomic_read(&bp->refcount);
if (c != LIVE_REFCOUNT_BASE)
return false;
} while (atomic_cmpxchg(&bp->refcount, c, c - LIVE_REFCOUNT_BASE + 1) != c);
return true;
}
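The live-base arithmetic is easy to sanity check outside the kernel. Below is a hedged userspace sketch (C11 atomics rather than the kernel's atomic_t, and not scoutfs code) of the lifecycle the comments above describe: insert adds the base, lookups only succeed while the base is present, and exactly one path gets to trade the base for a normal reference that it then puts.

/* Userspace sketch of the live-base refcount arithmetic (not scoutfs code). */
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LIVE_REFCOUNT_BASE (INT_MAX ^ (INT_MAX >> 1))	/* 0x40000000 */

/* add a to *v only while *v >= u, mirroring atomic_add_unless_less() */
static bool add_unless_less(atomic_int *v, int a, int u)
{
	int c = atomic_load(v);

	do {
		if (c < u)
			return false;
	} while (!atomic_compare_exchange_weak(v, &c, c + a));
	return true;
}

int main(void)
{
	atomic_int refcount = 1;			/* block_alloc()'s reference */

	atomic_fetch_add(&refcount, LIVE_REFCOUNT_BASE);	/* block_insert() */
	printf("lookup may get ref:  %d\n",
	       add_unless_less(&refcount, 1, LIVE_REFCOUNT_BASE));
	/* tear down: trade the live base for one normal ref, only once */
	printf("first remove wins:   %d\n",
	       add_unless_less(&refcount, 1 - LIVE_REFCOUNT_BASE, LIVE_REFCOUNT_BASE));
	printf("second remove fails: %d\n",
	       add_unless_less(&refcount, 1 - LIVE_REFCOUNT_BASE, LIVE_REFCOUNT_BASE));
	return 0;
}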
/*
@@ -290,104 +332,73 @@ static const struct rhashtable_params block_ht_params = {
};
/*
* Insert a new block into the hash table. Once it is inserted in the
* hash table readers can start getting references. The caller may have
* multiple refs but the block can't already be inserted.
*/
/*
* Insert the block into the cache so that it's visible for lookups.
* The caller can hold references (including for a dirty block).
*
* We make sure the base is added and the block is in the lru once it's
* in the hash. If hash table insertion fails it'll be briefly visible
* in the lru, but won't be isolated/evicted because we hold an
* incremented refcount in addition to the live base.
*/
static int block_insert(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
int ret;
WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);
BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
atomic_add(LIVE_REFCOUNT_BASE, &bp->refcount);
smp_mb__after_atomic(); /* make sure live base is visible to list_lru walk */
list_lru_add_obj(&binf->lru, &bp->lru_head);
retry:
atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
if (ret < 0) {
atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
if (ret == -EBUSY) {
/* wait for pending rebalance to finish */
synchronize_rcu();
goto retry;
} else {
atomic_sub(LIVE_REFCOUNT_BASE, &bp->refcount);
BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
list_lru_del_obj(&binf->lru, &bp->lru_head);
}
} else {
atomic_inc(&binf->total_inserted);
TRACE_BLOCK(insert, bp);
}
return ret;
}
static u64 accessed_recently(struct block_info *binf)
{
return atomic64_read(&binf->access_counter) - (atomic_read(&binf->total_inserted) >> 1);
}
/*
* Make sure that a block that is being accessed is less likely to be
* reclaimed if it is seen by the shrinker. If the block hasn't been
* accessed recently we update its accessed value.
*/
static void block_accessed(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
if (bp->accessed == 0 || bp->accessed < accessed_recently(binf)) {
scoutfs_inc_counter(sb, block_cache_access_update);
bp->accessed = atomic64_inc_return(&binf->access_counter);
}
}
/*
* Indicate to the lru walker that this block has been accessed since it
* was added or last walked.
*/
static void block_accessed(struct super_block *sb, struct block_private *bp)
{
if (!test_and_set_bit(BLOCK_BIT_ACCESSED, &bp->bits))
scoutfs_inc_counter(sb, block_cache_access_update);
}
/*
* The caller wants to remove the block from the hash table and has an
* idea what the refcount should be. If the refcount does still
* indicate that the block is hashed, and we're able to clear that bit,
* then we can remove it from the hash table.
*
* The caller makes sure that it's safe to be referencing this block,
* either with their own held reference (most everything) or by being in
* an rcu grace period (shrink).
*/
static bool block_remove_cnt(struct super_block *sb, struct block_private *bp, int cnt)
{
DECLARE_BLOCK_INFO(sb, binf);
int ret;
if ((cnt & BLOCK_REF_INSERTED) &&
(atomic_cmpxchg(&bp->refcount, cnt, cnt & ~BLOCK_REF_INSERTED) == cnt)) {
TRACE_BLOCK(remove, bp);
ret = rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
WARN_ON_ONCE(ret); /* must have been inserted */
atomic_dec(&binf->total_inserted);
return true;
}
return false;
}
/*
* Try to remove the block from the hash table as long as the refcount
* indicates that it is still in the hash table. This can be racing
* with normal refcount changes so it might have to retry.
*/
static void block_remove(struct super_block *sb, struct block_private *bp)
{
int cnt;
do {
cnt = atomic_read(&bp->refcount);
} while ((cnt & BLOCK_REF_INSERTED) && !block_remove_cnt(sb, bp, cnt));
}
/*
* Take one shot at removing the block from the hash table if it's still
* in the hash table and the caller has the only other reference.
*/
static bool block_remove_solo(struct super_block *sb, struct block_private *bp)
{
return block_remove_cnt(sb, bp, BLOCK_REF_INSERTED | 1);
}
/*
* Remove the block from the cache. When this returns the block won't
* be visible for additional references from lookup.
*
* We always try and remove from the hash table. It's safe to remove a
* block that isn't hashed, it just returns -ENOENT.
*
* This is racing with the lru walk in the shrinker also trying to
* remove idle blocks from the cache. They both try to remove the live
* refcount base and perform their removal and put if they get it.
*/
static void block_remove(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
if (block_get_remove_live(bp)) {
list_lru_del_obj(&binf->lru, &bp->lru_head);
block_put(sb, bp);
}
}
static bool io_busy(struct block_private *bp)
@@ -396,37 +407,6 @@ static bool io_busy(struct block_private *bp)
return test_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
}
/*
* Called during shutdown with no other users.
*/
static void block_remove_all(struct super_block *sb)
{
DECLARE_BLOCK_INFO(sb, binf);
struct rhashtable_iter iter;
struct block_private *bp;
rhashtable_walk_enter(&binf->ht, &iter);
rhashtable_walk_start(&iter);
for (;;) {
bp = rhashtable_walk_next(&iter);
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN))
continue;
if (block_get_if_inserted(bp)) {
block_remove(sb, bp);
WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
block_put(sb, bp);
}
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
WARN_ON_ONCE(atomic_read(&binf->total_inserted) != 0);
}
/*
* XXX The io_count and sb fields in the block_private are only used
@@ -543,6 +523,10 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
return ret;
}
/*
* Return a block with an elevated refcount if it was present in the
* hash table and its refcount didn't indicate that it was being freed.
*/
static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
{
DECLARE_BLOCK_INFO(sb, binf);
@@ -550,8 +534,8 @@ static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
rcu_read_lock();
bp = rhashtable_lookup(&binf->ht, &blkno, block_ht_params);
if (bp)
bp = block_get_if_inserted(bp);
if (bp && !block_get_if_live(bp))
bp = NULL;
rcu_read_unlock();
return bp;
@@ -1078,102 +1062,108 @@ static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_
struct super_block *sb = binf->sb;
scoutfs_inc_counter(sb, block_cache_count_objects);
return shrinker_min_long(atomic_read(&binf->total_inserted));
return list_lru_shrink_count(&binf->lru, sc);
}
struct isolate_args {
struct super_block *sb;
struct list_head dispose;
};
#define DECLARE_ISOLATE_ARGS(sb_, name_) \
struct isolate_args name_ = { \
.sb = sb_, \
.dispose = LIST_HEAD_INIT(name_.dispose), \
}
static enum lru_status isolate_lru_block(struct list_head *item, struct list_lru_one *list,
void *cb_arg)
{
struct block_private *bp = container_of(item, struct block_private, lru_head);
struct isolate_args *ia = cb_arg;
TRACE_BLOCK(isolate, bp);
/* rotate accessed blocks to the tail of the list (lazy promotion) */
if (test_and_clear_bit(BLOCK_BIT_ACCESSED, &bp->bits)) {
scoutfs_inc_counter(ia->sb, block_cache_isolate_rotate);
return LRU_ROTATE;
}
/* any refs, including dirty/io, stop us from acquiring lru refcount */
if (!block_get_remove_live_only(bp)) {
scoutfs_inc_counter(ia->sb, block_cache_isolate_skip);
return LRU_SKIP;
}
scoutfs_inc_counter(ia->sb, block_cache_isolate_removed);
list_lru_isolate_move(list, &bp->lru_head, &ia->dispose);
return LRU_REMOVED;
}
static void shrink_dispose_blocks(struct super_block *sb, struct list_head *dispose)
{
struct block_private *bp;
struct block_private *bp__;
list_for_each_entry_safe(bp, bp__, dispose, lru_head) {
list_del_init(&bp->lru_head);
block_remove(sb, bp);
block_put(sb, bp);
}
}
/*
* Remove a number of cached blocks that haven't been used recently.
*
* We don't maintain a strictly ordered LRU to avoid the contention of
* accesses always moving blocks around in some precise global
* structure.
*
* Instead we use counters to divide the blocks into two roughly equal
* groups by how recently they were accessed. We randomly walk all
* inserted blocks looking for any blocks in the older half to remove
* and free. The random walk and there being two groups means that we
* typically only walk a small multiple of the number we're looking for
* before we find them all.
*
* Our rcu walk of blocks can see blocks in all stages of their life
* cycle, from dirty blocks to those with 0 references that are queued
* for freeing. We only want to free idle inserted blocks so we
* atomically remove blocks when the only references are ours and the
* hash table.
*/
static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
{
struct block_info *binf = KC_SHRINKER_CONTAINER_OF(shrink, struct block_info);
struct super_block *sb = binf->sb;
struct rhashtable_iter iter;
struct block_private *bp;
DECLARE_ISOLATE_ARGS(sb, ia);
unsigned long freed;
bool stop = false;
unsigned long freed = 0;
unsigned long nr = sc->nr_to_scan;
u64 recently;
scoutfs_inc_counter(sb, block_cache_scan_objects);
recently = accessed_recently(binf);
rhashtable_walk_enter(&binf->ht, &iter);
freed = kc_list_lru_shrink_walk(&binf->lru, sc, isolate_lru_block, &ia);
shrink_dispose_blocks(sb, &ia.dispose);
rhashtable_walk_start(&iter);
/*
* This isn't great but I don't see a better way. We want to
* walk the hash from a random point so that we're not
* constantly walking over the same region that we've already
* freed old blocks within. The interface doesn't let us do
* this explicitly, but this seems to work? The difference this
* makes is enormous, around a few orders of magnitude fewer
* _nexts per shrink.
*/
if (iter.walker.tbl)
iter.slot = prandom_u32_max(iter.walker.tbl->size);
while (nr > 0) {
bp = rhashtable_walk_next(&iter);
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN)) {
/*
* We can be called from reclaim in the allocation
* to resize the hash table itself. We have to
* return so that the caller can proceed and
* enable hash table iteration again.
*/
scoutfs_inc_counter(sb, block_cache_shrink_stop);
stop = true;
break;
}
scoutfs_inc_counter(sb, block_cache_shrink_next);
if (bp->accessed >= recently) {
scoutfs_inc_counter(sb, block_cache_shrink_recent);
continue;
}
if (block_get_if_inserted(bp)) {
if (block_remove_solo(sb, bp)) {
scoutfs_inc_counter(sb, block_cache_shrink_remove);
TRACE_BLOCK(shrink, bp);
freed++;
nr--;
}
block_put(sb, bp);
}
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
if (stop)
return SHRINK_STOP;
else
return freed;
}
static enum lru_status dump_lru_block(struct list_head *item, struct list_lru_one *list,
void *cb_arg)
{
struct block_private *bp = container_of(item, struct block_private, lru_head);
printk("blkno %llu refcount 0x%x io_count %d bits 0x%lx\n",
bp->bl.blkno, atomic_read(&bp->refcount), atomic_read(&bp->io_count),
bp->bits);
print_block_stack(bp);
return LRU_SKIP;
}
/*
* Called during shutdown with no other users. The isolating walk must
* find blocks on the lru that only have references for presence on the
* lru and in the hash table.
*/
static void block_shrink_all(struct super_block *sb)
{
DECLARE_BLOCK_INFO(sb, binf);
DECLARE_ISOLATE_ARGS(sb, ia);
long count;
count = DIV_ROUND_UP(list_lru_count(&binf->lru), 128) * 2;
do {
kc_list_lru_walk(&binf->lru, isolate_lru_block, &ia, 128);
shrink_dispose_blocks(sb, &ia.dispose);
} while (list_lru_count(&binf->lru) > 0 && --count > 0);
count = list_lru_count(&binf->lru);
if (count > 0) {
scoutfs_err(sb, "failed to isolate/dispose %ld blocks", count);
kc_list_lru_walk(&binf->lru, dump_lru_block, sb, count);
}
}
struct sm_block_completion {
struct completion comp;
int err;
@@ -1276,7 +1266,7 @@ int scoutfs_block_write_sm(struct super_block *sb,
int scoutfs_block_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct block_info *binf;
struct block_info *binf = NULL;
int ret;
binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
@@ -1285,15 +1275,15 @@ int scoutfs_block_setup(struct super_block *sb)
goto out;
}
ret = rhashtable_init(&binf->ht, &block_ht_params);
if (ret < 0) {
kfree(binf);
goto out;
}
ret = list_lru_init(&binf->lru);
if (ret < 0)
goto out;
ret = rhashtable_init(&binf->ht, &block_ht_params);
if (ret < 0)
goto out;
binf->sb = sb;
atomic_set(&binf->total_inserted, 0);
atomic64_set(&binf->access_counter, 0);
init_waitqueue_head(&binf->waitq);
KC_INIT_SHRINKER_FUNCS(&binf->shrinker, block_count_objects,
block_scan_objects);
@@ -1305,8 +1295,10 @@ int scoutfs_block_setup(struct super_block *sb)
ret = 0;
out:
if (ret)
scoutfs_block_destroy(sb);
if (ret < 0 && binf) {
list_lru_destroy(&binf->lru);
kfree(binf);
}
return ret;
}
@@ -1318,9 +1310,10 @@ void scoutfs_block_destroy(struct super_block *sb)
if (binf) {
KC_UNREGISTER_SHRINKER(&binf->shrinker);
block_remove_all(sb);
block_shrink_all(sb);
flush_work(&binf->free_work);
rhashtable_destroy(&binf->ht);
list_lru_destroy(&binf->lru);
kfree(binf);
sbi->block_info = NULL;

View File

@@ -26,17 +26,15 @@
EXPAND_COUNTER(block_cache_alloc_page_order) \
EXPAND_COUNTER(block_cache_alloc_virt) \
EXPAND_COUNTER(block_cache_end_io_error) \
EXPAND_COUNTER(block_cache_isolate_removed) \
EXPAND_COUNTER(block_cache_isolate_rotate) \
EXPAND_COUNTER(block_cache_isolate_skip) \
EXPAND_COUNTER(block_cache_forget) \
EXPAND_COUNTER(block_cache_free) \
EXPAND_COUNTER(block_cache_free_work) \
EXPAND_COUNTER(block_cache_remove_stale) \
EXPAND_COUNTER(block_cache_count_objects) \
EXPAND_COUNTER(block_cache_scan_objects) \
EXPAND_COUNTER(block_cache_shrink) \
EXPAND_COUNTER(block_cache_shrink_next) \
EXPAND_COUNTER(block_cache_shrink_recent) \
EXPAND_COUNTER(block_cache_shrink_remove) \
EXPAND_COUNTER(block_cache_shrink_stop) \
EXPAND_COUNTER(btree_compact_values) \
EXPAND_COUNTER(btree_compact_values_enomem) \
EXPAND_COUNTER(btree_delete) \
@@ -118,10 +116,11 @@
EXPAND_COUNTER(item_pcpu_page_hit) \
EXPAND_COUNTER(item_pcpu_page_miss) \
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
EXPAND_COUNTER(item_read_pages_barrier) \
EXPAND_COUNTER(item_read_pages_retry) \
EXPAND_COUNTER(item_read_pages_split) \
EXPAND_COUNTER(item_shrink_page) \
EXPAND_COUNTER(item_shrink_page_dirty) \
EXPAND_COUNTER(item_shrink_page_reader) \
EXPAND_COUNTER(item_shrink_page_trylock) \
EXPAND_COUNTER(item_update) \
EXPAND_COUNTER(item_write_dirty) \

View File

@@ -86,6 +86,8 @@ struct item_cache_info {
/* often walked, but per-cpu refs are fast path */
rwlock_t rwlock;
struct rb_root pg_root;
/* stop readers from caching stale items behind reclaimed cleaned written items */
u64 read_dirty_barrier;
/* page-granular modification by writers, then exclusive to commit */
spinlock_t dirty_lock;
@@ -96,10 +98,6 @@ struct item_cache_info {
spinlock_t lru_lock;
struct list_head lru_list;
unsigned long lru_pages;
/* written by page readers, read by shrink */
spinlock_t active_lock;
struct list_head active_list;
};
#define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -1285,78 +1283,6 @@ static int cache_empty_page(struct super_block *sb,
return 0;
}
/*
* Readers operate independently from dirty items and transactions.
* They read a set of persistent items and insert them into the cache
* when there aren't already pages whose key range contains the items.
* This naturally prefers cached dirty items over stale read items.
*
* We have to deal with the case where dirty items are written and
* invalidated while a read is in flight. The reader won't have seen
* the items that were dirty in their persistent roots as they started
* reading. By the time they insert their read pages the previously
* dirty items have been reclaimed and are not in the cache. The old
* stale items will be inserted in their place, effectively corrupting
* by having the dirty items disappear.
*
* We fix this by tracking the max seq of items in pages. As readers
* start they record the current transaction seq. Invalidation skips
* pages with a max seq greater than the first reader seq because the
* items in the page have to stick around to prevent the readers stale
* items from being inserted.
*
* This naturally only affects a small set of pages with items that were
* written relatively recently. If we're in memory pressure then we
* probably have a lot of pages and they'll naturally have items that
* were visible to any readers. We don't bother with the complicated and
* expensive further refinement of tracking the ranges that are being
* read and comparing those with pages to invalidate.
*/
struct active_reader {
struct list_head head;
u64 seq;
};
#define INIT_ACTIVE_READER(rdr) \
struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
static void add_active_reader(struct super_block *sb, struct active_reader *active)
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
BUG_ON(!list_empty(&active->head));
active->seq = scoutfs_trans_sample_seq(sb);
spin_lock(&cinf->active_lock);
list_add_tail(&active->head, &cinf->active_list);
spin_unlock(&cinf->active_lock);
}
static u64 first_active_reader_seq(struct item_cache_info *cinf)
{
struct active_reader *active;
u64 first;
/* only the calling task adds or deletes this active */
spin_lock(&cinf->active_lock);
active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
first = active ? active->seq : U64_MAX;
spin_unlock(&cinf->active_lock);
return first;
}
static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
{
/* only the calling task adds or deletes this active */
if (!list_empty(&active->head)) {
spin_lock(&cinf->active_lock);
list_del_init(&active->head);
spin_unlock(&cinf->active_lock);
}
}
/*
* Add a newly read item to the pages that we're assembling for
* insertion into the cache. These pages are private, they only exist
@@ -1450,24 +1376,34 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, u64 s
* and duplicates, we insert any resulting pages which don't overlap
* with existing cached pages.
*
* We only insert uncached regions because this is called with cluster
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read. Invalidation is careful not
* to drop pages that have items that we couldn't see because they were
* dirty when we started reading.
*
* The forest item reader is reading stable trees that could be
* overwritten. It can return -ESTALE which we return to the caller who
* will retry the operation and work with a new set of more recent
* btrees.
*
* We only insert uncached regions because this is called with cluster
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read.
*
* Using the presence of locally written dirty pages to override stale
* read pages only works if, well, the more recent locally written pages
* are still present. Readers are totally decoupled from writers and
* can have a set of items that is very old indeed. In the mean time
* more recent items would have been dirtied locally, committed,
* cleaned, and reclaimed. We have a coarse barrier which ensures that
* readers can't insert items read from old roots from before local data
* was written. If a write completes while a read is in progress the
* read will have to retry. The retried read can use cached blocks so
* we're relying on reads being much faster than writes to reduce the
* overhead to mostly cpu work of recollecting the items from cached
* blocks via a more recent root from the server.
*/
static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct scoutfs_key *key, struct scoutfs_lock *lock)
{
struct rb_root root = RB_ROOT;
INIT_ACTIVE_READER(active);
struct cached_page *right = NULL;
struct cached_page *pg;
struct cached_page *rd;
@@ -1480,6 +1416,7 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct rb_node *par;
struct rb_node *pg_tmp;
struct rb_node *item_tmp;
u64 rdbar;
int pgi;
int ret;
@@ -1493,8 +1430,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
pg->end = lock->end;
rbtree_insert(&pg->node, NULL, &root.rb_node, &root);
/* set active reader seq before reading persistent roots */
add_active_reader(sb, &active);
read_lock(&cinf->rwlock);
rdbar = cinf->read_dirty_barrier;
read_unlock(&cinf->rwlock);
start = lock->start;
end = lock->end;
@@ -1533,6 +1471,13 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
retry:
write_lock(&cinf->rwlock);
/* can't insert if write has cleaned since we read */
if (cinf->read_dirty_barrier != rdbar) {
scoutfs_inc_counter(sb, item_read_pages_barrier);
ret = -ESTALE;
goto unlock;
}
while ((rd = first_page(&root))) {
pg = page_rbtree_walk(sb, &cinf->pg_root, &rd->start, &rd->end,
@@ -1570,12 +1515,12 @@ retry:
}
}
ret = 0;
unlock:
write_unlock(&cinf->rwlock);
ret = 0;
out:
del_active_reader(cinf, &active);
/* free any pages we left dangling on error */
for_each_page_safe(&root, rd, pg_tmp) {
rbtree_erase(&rd->node, &root);
@@ -1635,6 +1580,7 @@ retry:
ret = read_pages(sb, cinf, key, lock);
if (ret < 0 && ret != -ESTALE)
goto out;
scoutfs_inc_counter(sb, item_read_pages_retry);
goto retry;
}
@@ -2415,6 +2361,11 @@ int scoutfs_item_write_done(struct super_block *sb)
struct cached_item *tmp;
struct cached_page *pg;
/* don't let read_pages miss written+cleaned items */
write_lock(&cinf->rwlock);
cinf->read_dirty_barrier++;
write_unlock(&cinf->rwlock);
spin_lock(&cinf->dirty_lock);
while ((pg = list_first_entry_or_null(&cinf->dirty_list, struct cached_page, dirty_head))) {
if (write_trylock(&pg->rwlock)) {
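The read_dirty_barrier handshake added in the hunks above can be modeled compactly. The sketch below is a hedged userspace approximation (a pthread rwlock instead of the item cache's rwlock_t, and hypothetical helper names; it is not scoutfs code): the writer bumps the barrier under the write lock once dirty items have been written and may be reclaimed, and a reader snapshots the barrier before reading stable roots, then rechecks it under the write lock and refuses to insert if it changed, which is the moral equivalent of returning -ESTALE so the caller retries.

/* Simplified userspace model of the read_dirty_barrier handshake (not scoutfs code). */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

static pthread_rwlock_t cache_rwlock = PTHREAD_RWLOCK_INITIALIZER;
static uint64_t read_dirty_barrier;

/* writer side: called after dirty items are written and can be reclaimed */
static void write_done(void)
{
	pthread_rwlock_wrlock(&cache_rwlock);
	read_dirty_barrier++;
	pthread_rwlock_unlock(&cache_rwlock);
}

/* reader side: returns false ("stale") if a write completed mid-read */
static bool read_and_insert(void (*read_stable_items)(void),
			    void (*insert_pages)(void))
{
	uint64_t rdbar;
	bool ok;

	pthread_rwlock_rdlock(&cache_rwlock);
	rdbar = read_dirty_barrier;		/* snapshot before reading roots */
	pthread_rwlock_unlock(&cache_rwlock);

	read_stable_items();			/* slow work done without the lock */

	pthread_rwlock_wrlock(&cache_rwlock);
	ok = (read_dirty_barrier == rdbar);	/* recheck before inserting */
	if (ok)
		insert_pages();
	pthread_rwlock_unlock(&cache_rwlock);

	return ok;				/* caller retries when false */
}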
@@ -2593,24 +2544,15 @@ static unsigned long item_cache_scan_objects(struct shrinker *shrink,
struct cached_page *tmp;
struct cached_page *pg;
unsigned long freed = 0;
u64 first_reader_seq;
int nr = sc->nr_to_scan;
scoutfs_inc_counter(sb, item_cache_scan_objects);
/* can't invalidate pages with items that weren't visible to first reader */
first_reader_seq = first_active_reader_seq(cinf);
write_lock(&cinf->rwlock);
spin_lock(&cinf->lru_lock);
list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {
if (first_reader_seq <= pg->max_seq) {
scoutfs_inc_counter(sb, item_shrink_page_reader);
continue;
}
if (!write_trylock(&pg->rwlock)) {
scoutfs_inc_counter(sb, item_shrink_page_trylock);
continue;
@@ -2677,8 +2619,6 @@ int scoutfs_item_setup(struct super_block *sb)
atomic_set(&cinf->dirty_pages, 0);
spin_lock_init(&cinf->lru_lock);
INIT_LIST_HEAD(&cinf->lru_list);
spin_lock_init(&cinf->active_lock);
INIT_LIST_HEAD(&cinf->active_list);
cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
if (!cinf->pcpu_pages)
@@ -2711,8 +2651,6 @@ void scoutfs_item_destroy(struct super_block *sb)
int cpu;
if (cinf) {
BUG_ON(!list_empty(&cinf->active_list));
#ifdef KC_CPU_NOTIFIER
unregister_hotcpu_notifier(&cinf->notifier);
#endif

View File

@@ -81,3 +81,69 @@ kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
return written ? written : status;
}
#endif
#include <linux/list_lru.h>
#ifdef KC_LIST_LRU_WALK_CB_ITEM_LOCK
static enum lru_status kc_isolate(struct list_head *item, spinlock_t *lock, void *cb_arg)
{
struct kc_isolate_args *args = cb_arg;
/* isolate doesn't use list, nr_items updated in caller */
return args->isolate(item, NULL, args->cb_arg);
}
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
}
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
}
#endif
#ifdef KC_LIST_LRU_WALK_CB_LIST_LOCK
static enum lru_status kc_isolate(struct list_head *item, struct list_lru_one *list,
spinlock_t *lock, void *cb_arg)
{
struct kc_isolate_args *args = cb_arg;
return args->isolate(item, list, args->cb_arg);
}
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
}
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
}
#endif

View File

@@ -410,4 +410,77 @@ static inline vm_fault_t vmf_error(int err)
}
#endif
#include <linux/list_lru.h>
#ifndef KC_LIST_LRU_SHRINK_COUNT_WALK
/* we don't bother with sc->{nid,memcg} (which doesn't exist in oldest kernels) */
static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
struct shrink_control *sc)
{
return list_lru_count(lru);
}
static inline unsigned long
list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
list_lru_walk_cb isolate, void *cb_arg)
{
return list_lru_walk(lru, isolate, cb_arg, sc->nr_to_scan);
}
#endif
#ifndef KC_LIST_LRU_ADD_OBJ
#define list_lru_add_obj list_lru_add
#define list_lru_del_obj list_lru_del
#endif
#if defined(KC_LIST_LRU_WALK_CB_LIST_LOCK) || defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
struct list_lru_one;
typedef enum lru_status (*kc_list_lru_walk_cb_t)(struct list_head *item, struct list_lru_one *list,
void *cb_arg);
struct kc_isolate_args {
kc_list_lru_walk_cb_t isolate;
void *cb_arg;
};
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk);
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg);
#else
#define kc_list_lru_shrink_walk list_lru_shrink_walk
#endif
#if defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
/* isolate moved by hand, nr_items updated in walk as _REMOVE returned */
static inline void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head)
{
list_move(item, head);
}
#endif
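With the shims above, callers write a single isolate callback against the kc_ signature and the wrappers adapt it to whichever list_lru walk API the kernel provides. A hedged sketch of a caller follows (the entry type and function names are hypothetical; block.c in this diff is the real user):

/*
 * Hypothetical caller of the unified kc_ walk interface; see
 * isolate_lru_block() in block.c for the real thing.
 */
#include <linux/list_lru.h>

struct example_entry {
	struct list_head lru_head;
};

static enum lru_status example_isolate(struct list_head *item, struct list_lru_one *list,
				       void *cb_arg)
{
	struct example_entry *ent = container_of(item, struct example_entry, lru_head);
	struct list_head *dispose = cb_arg;

	/* move idle entries to a private list, free them after the walk */
	list_lru_isolate_move(list, &ent->lru_head, dispose);
	return LRU_REMOVED;
}

static unsigned long example_scan(struct list_lru *lru, struct shrink_control *sc)
{
	LIST_HEAD(dispose);

	/* the same call works on every supported kernel; the shims adapt the signature */
	return kc_list_lru_shrink_walk(lru, sc, example_isolate, &dispose);
}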
#ifndef KC_STACK_TRACE_SAVE
#include <linux/stacktrace.h>
static inline unsigned int stack_trace_save(unsigned long *store, unsigned int size,
unsigned int skipnr)
{
struct stack_trace trace = {
.entries = store,
.max_entries = size,
.skip = skipnr,
};
save_stack_trace(&trace);
return trace.nr_entries;
}
static inline void stack_trace_print(unsigned long *entries, unsigned int nr_entries, int spaces)
{
struct stack_trace trace = {
.entries = entries,
.nr_entries = nr_entries,
};
print_stack_trace(&trace, spaces);
}
#endif
#endif

View File

@@ -2526,8 +2526,8 @@ TRACE_EVENT(scoutfs_block_stale,
DECLARE_EVENT_CLASS(scoutfs_block_class,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno, int refcount, int io_count,
unsigned long bits, __u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed),
unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, bp)
@@ -2535,7 +2535,6 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
__field(int, refcount)
__field(int, io_count)
__field(long, bits)
__field(__u64, accessed)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
@@ -2544,71 +2543,65 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
__entry->refcount = refcount;
__entry->io_count = io_count;
__entry->bits = bits;
__entry->accessed = accessed;
),
TP_printk(SCSBF" bp %p blkno %llu refcount %d io_count %d bits 0x%lx accessed %llu",
SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount,
__entry->io_count, __entry->bits, __entry->accessed)
TP_printk(SCSBF" bp %p blkno %llu refcount %x io_count %d bits 0x%lx",
SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount,
__entry->io_count, __entry->bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_allocate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_free,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_insert,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_remove,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_end_io,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_submit,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_invalidate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_mark_dirty,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_forget,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_isolate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DECLARE_EVENT_CLASS(scoutfs_ext_next_class, DECLARE_EVENT_CLASS(scoutfs_ext_next_class,

View File

@@ -2134,7 +2134,7 @@ static int server_srch_commit_compact(struct super_block *sb,
&super->srch_root, rid, sc,
&av, &fr);
mutex_unlock(&server->srch_mutex);
if (ret < 0) /* XXX very bad, leaks allocators */
if (ret < 0)
goto apply;
/* reclaim allocators if they were set by _srch_commit_ */
@@ -2144,10 +2144,10 @@ static int server_srch_commit_compact(struct super_block *sb,
scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
server->other_freed, &fr);
mutex_unlock(&server->alloc_mutex);
WARN_ON(ret < 0); /* XXX leaks allocators */
apply:
ret = server_apply_commit(sb, &hold, ret);
out:
WARN_ON(ret < 0); /* XXX leaks allocators */
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
}

View File

@@ -1406,7 +1406,7 @@ int scoutfs_srch_commit_compact(struct super_block *sb,
ret = -EIO;
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) /* XXX leaks allocators */
if (ret < 0)
goto out;
/* restore busy to pending if the operation failed */
@@ -1426,10 +1426,8 @@ int scoutfs_srch_commit_compact(struct super_block *sb,
/* update file references if we finished compaction (!deleting) */
if (!(res->flags & SCOUTFS_SRCH_COMPACT_FLAG_DELETE)) {
ret = commit_files(sb, alloc, wri, root, res);
if (ret < 0) {
/* XXX we can't commit, shutdown? */
goto out;
}
if (ret < 0)
goto out;
/* transition flags for deleting input files */
for (i = 0; i < res->nr; i++) {
@@ -1456,7 +1454,7 @@ update:
le64_to_cpu(pending->id), 0);
ret = scoutfs_btree_insert(sb, alloc, wri, root, &key,
pending, sizeof(*pending));
if (ret < 0)
if (WARN_ON_ONCE(ret < 0)) /* XXX inconsistency */
goto out;
}
@@ -1469,7 +1467,6 @@ update:
BUG_ON(err); /* both busy and pending present */
}
out:
WARN_ON_ONCE(ret < 0); /* XXX inconsistency */
kfree(busy);
return ret;
}

View File

@@ -90,6 +90,7 @@ done
# set some T_ defaults
T_TRACE_DUMP="0"
T_TRACE_PRINTK="0"
T_PORT_START="19700"
# array declarations to be able to use array ops
declare -a T_TRACE_GLOB
@@ -265,6 +266,17 @@ for e in T_META_DEVICE T_DATA_DEVICE T_EX_META_DEV T_EX_DATA_DEV T_KMOD T_RESULT
eval $e=\"$(readlink -f "${!e}")\"
done
# try and check ports, but not necessary
T_TEST_PORT="$T_PORT_START"
T_SCRATCH_PORT="$((T_PORT_START + 100))"
T_DEV_PORT="$((T_PORT_START + 200))"
read local_start local_end < /proc/sys/net/ipv4/ip_local_port_range
if [ -n "$local_start" -a -n "$local_end" -a "$local_start" -lt "$local_end" ]; then
if [ ! "$T_DEV_PORT" -lt "$local_start" -a ! "$T_TEST_PORT" -gt "$local_end" ]; then
die "listening port range $T_TEST_PORT - $T_DEV_PORT is within local dynamic port range $local_start - $local_end in /proc/sys/net/ipv4/ip_local_port_range"
fi
fi
# permute sequence?
T_SEQUENCE=sequence
if [ -n "$T_SHUF" ]; then
@@ -363,7 +375,7 @@ fi
quo="" quo=""
if [ -n "$T_MKFS" ]; then if [ -n "$T_MKFS" ]; then
for i in $(seq -0 $((T_QUORUM - 1))); do for i in $(seq -0 $((T_QUORUM - 1))); do
quo="$quo -Q $i,127.0.0.1,$((42000 + i))" quo="$quo -Q $i,127.0.0.1,$((T_TEST_PORT + i))"
done done
msg "making new filesystem with $T_QUORUM quorum members" msg "making new filesystem with $T_QUORUM quorum members"

View File

@@ -15,7 +15,7 @@ echo "== prepare devices, mount point, and logs"
SCR="$T_TMPDIR/mnt.scratch" SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR" mkdir -p "$SCR"
> $T_TMP.mount.out > $T_TMP.mount.out
scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \ scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
|| t_fail "mkfs failed" || t_fail "mkfs failed"
echo "== bad devices, bad options" echo "== bad devices, bad options"

View File

truncate -s $sz "$T_TMP.equal"
truncate -s $large_sz "$T_TMP.large"
echo "== make scratch fs"
t_quiet scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV"
t_quiet scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV"
SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR"

View File

@@ -57,7 +57,7 @@ test "$before" == "$after" || \
# XXX this is all pretty manual, would be nice to have helpers # XXX this is all pretty manual, would be nice to have helpers
echo "== make small meta fs" echo "== make small meta fs"
# meta device just big enough for reserves and the metadata we'll fill # meta device just big enough for reserves and the metadata we'll fill
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \ scoutfs mkfs -A -f -Q 0,127.0.0.1,$T_SCRATCH_PORT -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
t_fail "mkfs failed" t_fail "mkfs failed"
SCR="$T_TMPDIR/mnt.scratch" SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR" mkdir -p "$SCR"

View File

@@ -89,7 +89,7 @@ for vers in $(seq $MIN $((MAX - 1))); do
old_module="$builds/$vers/scoutfs.ko" old_module="$builds/$vers/scoutfs.ko"
echo "mkfs $vers" >> "$T_TMP.log" echo "mkfs $vers" >> "$T_TMP.log"
t_quiet $old_scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" \ t_quiet $old_scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV" \
|| t_fail "mkfs $vers failed" || t_fail "mkfs $vers failed"
echo "mount $vers with $vers" >> "$T_TMP.log" echo "mount $vers with $vers" >> "$T_TMP.log"

View File

quarter_data=$(echo "$size_data / 4" | bc)
# XXX this is all pretty manual, would be nice to have helpers
echo "== make initial small fs"
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m $quarter_meta -d $quarter_data \
scoutfs mkfs -A -f -Q 0,127.0.0.1,$T_SCRATCH_PORT -m $quarter_meta -d $quarter_data \
"$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
t_fail "mkfs failed"
SCR="$T_TMPDIR/mnt.scratch"

View File

t_quiet sync
cat << EOF > local.config
export FSTYP=scoutfs
export MKFS_OPTIONS="-f"
export MKFS_TEST_OPTIONS="-Q 0,127.0.0.1,42000"
export MKFS_SCRATCH_OPTIONS="-Q 0,127.0.0.1,43000"
export MKFS_DEV_OPTIONS="-Q 0,127.0.0.1,44000"
export MKFS_TEST_OPTIONS="-Q 0,127.0.0.1,$T_TEST_PORT"
export MKFS_SCRATCH_OPTIONS="-Q 0,127.0.0.1,$T_SCRATCH_PORT"
export MKFS_DEV_OPTIONS="-Q 0,127.0.0.1,$T_DEV_PORT"
export TEST_DEV=$T_DB0
export TEST_DIR=$T_M0
export SCRATCH_META_DEV=$T_EX_META_DEV