Dump block alloc stacks for leaked blocks

The typical pattern of spinning while isolating a list_lru results in a
livelock if there are blocks with leaked refcounts.  We see this rarely
in testing.

We can add a modest array to each block that records the stack of the
caller that initially allocated the block, and dump that stack for any
blocks that we're unable to shrink/isolate.  Instead of spinning on
shrinking, we give it a good try, then print the blocks that remain and
carry on with unmount, leaking a few blocks.  (Past events have had 2
blocks.)

Signed-off-by: Zach Brown <zab@versity.com>
Zach Brown
2025-10-27 10:36:46 -07:00
parent 38a2ffe0c7
commit 6a70ee03b5
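
As a rough illustration of the pattern before the diff itself: the
kernel's <linux/stacktrace.h> helpers record a fixed-size array of
return addresses at allocation time and print them later when the
object turns out to be leaked.  This is a minimal sketch only; struct
obj, obj_alloc(), and obj_report_leak() are made-up names, not the
scoutfs block code, and the exact number of frames to skip depends on
inlining (which is why the real patch marks its helpers noinline).

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>

/* illustrative object, not struct block_private */
struct obj {
	unsigned int stack_len;
	unsigned long stack[10];
};

static struct obj *obj_alloc(gfp_t gfp)
{
	struct obj *o = kzalloc(sizeof(*o), gfp);

	/*
	 * skip roughly one frame (obj_alloc itself) so the saved trace
	 * starts near the allocating caller
	 */
	if (o)
		o->stack_len = stack_trace_save(o->stack,
						ARRAY_SIZE(o->stack), 1);
	return o;
}

static void obj_report_leak(struct obj *o)
{
	pr_err("leaked obj %p, allocated from:\n", o);
	stack_trace_print(o->stack, o->stack_len, 1);
}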


@@ -23,6 +23,7 @@
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 #include <linux/list_lru.h>
+#include <linux/stacktrace.h>
 
 #include "format.h"
 #include "super.h"
@@ -80,6 +81,8 @@ struct block_private {
 		struct page *page;
 		void *virt;
 	};
+	unsigned int stack_len;
+	unsigned long stack[10];
 };
 
 #define TRACE_BLOCK(which, bp) \
@@ -100,7 +103,17 @@ static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
 	return cpu_to_le32(calc);
 }
 
-static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
+static noinline void save_block_stack(struct block_private *bp)
+{
+	bp->stack_len = stack_trace_save(bp->stack, ARRAY_SIZE(bp->stack), 2);
+}
+
+static void print_block_stack(struct block_private *bp)
+{
+	stack_trace_print(bp->stack, bp->stack_len, 1);
+}
+
+static noinline struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 {
 	struct block_private *bp;
 	unsigned int nofs_flags;
@@ -156,6 +169,7 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 	atomic_set(&bp->io_count, 0);
 
 	TRACE_BLOCK(allocate, bp);
+	save_block_stack(bp);
 
 out:
 	if (!bp)
@@ -1113,6 +1127,19 @@ static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_c
 	return freed;
 }
 
+static enum lru_status dump_lru_block(struct list_head *item, struct list_lru_one *list,
+				      void *cb_arg)
+{
+	struct block_private *bp = container_of(item, struct block_private, lru_head);
+
+	printk("blkno %llu refcount 0x%x io_count %d bits 0x%lx\n",
+	       bp->bl.blkno, atomic_read(&bp->refcount), atomic_read(&bp->io_count),
+	       bp->bits);
+	print_block_stack(bp);
+
+	return LRU_SKIP;
+}
+
 /*
  * Called during shutdown with no other users.  The isolating walk must
  * find blocks on the lru that only have references for presence on the
@@ -1122,11 +1149,19 @@ static void block_shrink_all(struct super_block *sb)
 {
 	DECLARE_BLOCK_INFO(sb, binf);
 	DECLARE_ISOLATE_ARGS(sb, ia);
+	long count;
 
+	count = DIV_ROUND_UP(list_lru_count(&binf->lru), 128) * 2;
 	do {
 		kc_list_lru_walk(&binf->lru, isolate_lru_block, &ia, 128);
 		shrink_dispose_blocks(sb, &ia.dispose);
-	} while (list_lru_count(&binf->lru) > 0);
+	} while (list_lru_count(&binf->lru) > 0 && --count > 0);
+
+	count = list_lru_count(&binf->lru);
+	if (count > 0) {
+		scoutfs_err(sb, "failed to isolate/dispose %ld blocks", count);
+		kc_list_lru_walk(&binf->lru, dump_lru_block, sb, count);
+	}
 }
 
 struct sm_block_completion {