diff --git a/kmod/src/block.c b/kmod/src/block.c index 146925cb..1c2e4ae8 100644 --- a/kmod/src/block.c +++ b/kmod/src/block.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include "format.h" #include "super.h" @@ -33,22 +35,25 @@ * dirty blocks that are written together. We pin dirty blocks in * memory and only checksum them all as they're all written. * - * An LRU is maintained so the VM can reclaim the oldest presumably - * unlikely to be used blocks. But we don't maintain a perfect record - * of access order. We only move accessed blocks to the tail of the rcu - * if they weren't in the most recently moved fraction of the total - * population. This means that reclaim will walk through waves of that - * fraction of the population. It's close enough and removes lru - * maintenance locking from the fast path. + * Memory reclaim is driven by maintaining two very coarse groups of + * blocks. As we access blocks we mark them with an increasing counter + * to discourage them from being reclaimed. We then define a threshold + * at the current counter minus half the population. Recent blocks have + * a counter greater than the threshold; blocks below it are considered + * older and are candidates for reclaim. This means that access + * updates only modify the atomic counter when a block has to be moved + * back into the recent group, and shrink can randomly scan blocks + * looking for the half of the population that falls in the old group. + * It's only a rough approximation of LRU, but it's reasonably + * effective, particularly efficient, and avoids contention between + * concurrent accesses and shrinking. */ struct block_info { struct super_block *sb; - spinlock_t lock; - struct radix_tree_root radix; - struct list_head lru_list; - u64 lru_nr; - u64 lru_move_counter; + atomic_t total_inserted; + atomic64_t access_counter; + struct rhashtable ht; wait_queue_head_t waitq; struct shrinker shrinker; struct work_struct free_work; @@ -64,22 +69,33 @@ enum block_status_bits { BLOCK_BIT_DIRTY, /* dirty, writer will write */ BLOCK_BIT_IO_BUSY, /* bios are in flight */ BLOCK_BIT_ERROR, /* saw IO error */ - BLOCK_BIT_DELETED, /* has been deleted from radix tree */ BLOCK_BIT_PAGE_ALLOC, /* page (possibly high order) allocation */ BLOCK_BIT_VIRT, /* mapped virt allocation */ BLOCK_BIT_CRC_VALID, /* crc has been verified */ }; +/* + * We want to tie atomic changes in refcounts to whether or not the + * block is still visible in the hash table, so we store the hash + * table's reference up at a known high bit. Excessive refcount + * increments could accidentally set the inserted bit on their own; we + * don't prevent that, but we at least warn if the count gets close. + * + * We're avoiding the high byte for no real good reason, just out of a + * historical fear of implementations that don't provide the full + * precision.
+ */ +#define BLOCK_REF_INSERTED (1U << 23) +#define BLOCK_REF_FULL (BLOCK_REF_INSERTED >> 1) + struct block_private { struct scoutfs_block bl; struct super_block *sb; atomic_t refcount; - union { - struct list_head lru_entry; - struct llist_node free_node; - }; - u64 lru_moved; + u64 accessed; + struct rhash_head ht_head; struct list_head dirty_entry; + struct llist_node free_node; unsigned long bits; atomic_t io_count; union { @@ -88,13 +104,11 @@ struct block_private { }; }; -#define TRACE_BLOCK(which, bp) \ -do { \ - __typeof__(bp) _bp = (bp); \ - trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, \ - atomic_read(&_bp->refcount), \ - atomic_read(&_bp->io_count), \ - _bp->bits, _bp->lru_moved); \ +#define TRACE_BLOCK(which, bp) \ +do { \ + __typeof__(bp) _bp = (bp); \ + trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, atomic_read(&_bp->refcount), \ + atomic_read(&_bp->io_count), _bp->bits, _bp->accessed); \ } while (0) #define BLOCK_PRIVATE(_bl) \ @@ -136,7 +150,7 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno) /* * If we had multiple blocks per page we'd need to be a little * more careful with a partial page allocator when allocating - * blocks and would make the lru per-page instead of per-block. + * blocks. */ BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_LG_SIZE); @@ -167,7 +181,6 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno) bp->bl.blkno = blkno; bp->sb = sb; atomic_set(&bp->refcount, 1); - INIT_LIST_HEAD(&bp->lru_entry); INIT_LIST_HEAD(&bp->dirty_entry); set_bit(BLOCK_BIT_NEW, &bp->bits); atomic_set(&bp->io_count, 0); @@ -193,7 +206,6 @@ static void block_free(struct super_block *sb, struct block_private *bp) else BUG(); - /* lru_entry could have been clobbered by union member free_node */ WARN_ON_ONCE(!list_empty(&bp->dirty_entry)); WARN_ON_ONCE(atomic_read(&bp->refcount)); WARN_ON_ONCE(atomic_read(&bp->io_count)); @@ -201,109 +213,179 @@ static void block_free(struct super_block *sb, struct block_private *bp) } /* - * We free blocks in task context so we can free kernel virtual mappings. + * Free all the blocks that were put in the free_llist. We have to wait + * for rcu grace periods to expire to ensure that no more rcu hash list + * lookups can see the blocks. */ static void block_free_work(struct work_struct *work) { - struct block_info *binf = container_of(work, struct block_info, - free_work); + struct block_info *binf = container_of(work, struct block_info, free_work); struct super_block *sb = binf->sb; struct block_private *bp; + struct block_private *tmp; struct llist_node *deleted; - deleted = llist_del_all(&binf->free_llist); + scoutfs_inc_counter(sb, block_cache_free_work); - llist_for_each_entry(bp, deleted, free_node) { + deleted = llist_del_all(&binf->free_llist); + synchronize_rcu(); + + llist_for_each_entry_safe(bp, tmp, deleted, free_node) { block_free(sb, bp); } } /* - * After we've dropped the final ref kick off the final free in task - * context. This happens in the relatively rare cases of IO errors, - * stale cached data, memory pressure, and unmount. + * Get a reference to a block while holding an existing reference. + */ +static void block_get(struct block_private *bp) +{ + WARN_ON_ONCE((atomic_read(&bp->refcount) & ~BLOCK_REF_INSERTED) <= 0); + + atomic_inc(&bp->refcount); +} + +/* + * Get a reference to a block as long as it's been inserted in the hash + * table and hasn't been removed. 
+ */ +static struct block_private *block_get_if_inserted(struct block_private *bp) +{ + int cnt; + + do { + cnt = atomic_read(&bp->refcount); + WARN_ON_ONCE(cnt & BLOCK_REF_FULL); + if (!(cnt & BLOCK_REF_INSERTED)) + return NULL; + + } while (atomic_cmpxchg(&bp->refcount, cnt, cnt + 1) != cnt); + + return bp; +} + +/* + * Drop the caller's reference. If this was the final reference we + * queue the block to be freed once the rcu period ends. Readers can be + * racing to try to get references to these blocks, but they won't get a + * reference because the block isn't present in the hash table any more. */ static void block_put(struct super_block *sb, struct block_private *bp) { DECLARE_BLOCK_INFO(sb, binf); + int cnt; - if (!IS_ERR_OR_NULL(bp) && atomic_dec_and_test(&bp->refcount)) { - WARN_ON_ONCE(!list_empty(&bp->lru_entry)); - llist_add(&bp->free_node, &binf->free_llist); - schedule_work(&binf->free_work); + if (!IS_ERR_OR_NULL(bp)) { + cnt = atomic_dec_return(&bp->refcount); + if (cnt == 0) { + llist_add(&bp->free_node, &binf->free_llist); + schedule_work(&binf->free_work); + } else { + WARN_ON_ONCE(cnt < 0); + } } } +static const struct rhashtable_params block_ht_params = { + .key_len = member_sizeof(struct block_private, bl.blkno), + .key_offset = offsetof(struct block_private, bl.blkno), + .head_offset = offsetof(struct block_private, ht_head), +}; + /* - * Add a new block into the cache. The caller holds the lock and has - * preloaded the radix. + * Insert a new block into the hash table. Once it is inserted in the + * hash table readers can start getting references. The caller only has + * its initial ref so inserted can't be set and there can be no other + * references. */ -static void block_insert(struct super_block *sb, struct block_private *bp, - u64 blkno) +static int block_insert(struct super_block *sb, struct block_private *bp) { DECLARE_BLOCK_INFO(sb, binf); + int ret; - assert_spin_locked(&binf->lock); - BUG_ON(!list_empty(&bp->lru_entry)); + WARN_ON_ONCE(atomic_read(&bp->refcount) != 1); - atomic_inc(&bp->refcount); - radix_tree_insert(&binf->radix, blkno, bp); - list_add_tail(&bp->lru_entry, &binf->lru_list); - bp->lru_moved = ++binf->lru_move_counter; - binf->lru_nr++; + atomic_add(BLOCK_REF_INSERTED, &bp->refcount); + ret = rhashtable_insert_fast(&binf->ht, &bp->ht_head, block_ht_params); + if (ret < 0) { + atomic_sub(BLOCK_REF_INSERTED, &bp->refcount); + } else { + atomic_inc(&binf->total_inserted); + TRACE_BLOCK(insert, bp); + } - TRACE_BLOCK(insert, bp); + return ret; +} + +static u64 accessed_recently(struct block_info *binf) +{ + return atomic64_read(&binf->access_counter) - (atomic_read(&binf->total_inserted) >> 1); } /* - * Only move the block to the tail of the LRU if it's outside of the - * small fraction of the lru population that has been most recently - * used. This gives us a reasonable number of most recently accessed - * blocks which will be reclaimed after the rest of the least recently - * used blocks while reducing per-access locking overhead of maintaining - * the LRU. We don't care about unlikely non-atomic u64 accesses racing - * and messing up LRU position. - * - * This can race with blocks being removed from the cache (shrinking, - * stale, errors) so we're careful to only move the entry if it's still - * on the list after we acquire the lock. We still hold a reference so it's - * lru_entry hasn't transitioned to being used as the free_node. 
+ * Make sure that a block that is being accessed is less likely to be + * reclaimed if it is seen by the shrinker. If the block hasn't been + * accessed recently we update its accessed value. */ static void block_accessed(struct super_block *sb, struct block_private *bp) { DECLARE_BLOCK_INFO(sb, binf); - u64 recent = binf->lru_nr >> 3; - scoutfs_inc_counter(sb, block_cache_access); - - if (bp->lru_moved < (binf->lru_move_counter - recent)) { - spin_lock(&binf->lock); - if (!list_empty(&bp->lru_entry)) { - list_move_tail(&bp->lru_entry, &binf->lru_list); - bp->lru_moved = ++binf->lru_move_counter; - scoutfs_inc_counter(sb, block_cache_lru_move); - } - spin_unlock(&binf->lock); + if (bp->accessed == 0 || bp->accessed < accessed_recently(binf)) { + scoutfs_inc_counter(sb, block_cache_access_update); + bp->accessed = atomic64_inc_return(&binf->access_counter); } } /* - * Remove a block from the cache and drop its reference. We only remove - * the block once as the deleted bit is first set. + * The caller wants to remove the block from the hash table and has an + * idea what the refcount should be. If the refcount does still + * indicate that the block is hashed, and we're able to clear that bit, + * then we can remove it from the hash table. + * + * The caller makes sure that it's safe to be referencing this block, + * either with their own held reference (most everything) or by being in + * an rcu grace period (shrink). + */ +static bool block_remove_cnt(struct super_block *sb, struct block_private *bp, int cnt) +{ + DECLARE_BLOCK_INFO(sb, binf); + int ret; + + if ((cnt & BLOCK_REF_INSERTED) && + (atomic_cmpxchg(&bp->refcount, cnt, cnt & ~BLOCK_REF_INSERTED) == cnt)) { + + TRACE_BLOCK(remove, bp); + ret = rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params); + WARN_ON_ONCE(ret); /* must have been inserted */ + atomic_dec(&binf->total_inserted); + return true; + } + + return false; +} + +/* + * Try to remove the block from the hash table as long as the refcount + * indicates that it is still in the hash table. This can be racing + * with normal refcount changes so it might have to retry. */ static void block_remove(struct super_block *sb, struct block_private *bp) { - DECLARE_BLOCK_INFO(sb, binf); + int cnt; - assert_spin_locked(&binf->lock); + do { + cnt = atomic_read(&bp->refcount); + } while ((cnt & BLOCK_REF_INSERTED) && !block_remove_cnt(sb, bp, cnt)); +} - if (!test_and_set_bit(BLOCK_BIT_DELETED, &bp->bits)) { - BUG_ON(list_empty(&bp->lru_entry)); - radix_tree_delete(&binf->radix, bp->bl.blkno); - list_del_init(&bp->lru_entry); - binf->lru_nr--; - block_put(sb, bp); - } +/* + * Take one shot at removing the block from the hash table if it's still + * in the hash table and the caller has the only other reference. 
+ */ +static bool block_remove_solo(struct super_block *sb, struct block_private *bp) +{ + return block_remove_cnt(sb, bp, BLOCK_REF_INSERTED | 1); } static bool io_busy(struct block_private *bp) @@ -318,20 +400,29 @@ static bool io_busy(struct block_private *bp) static void block_remove_all(struct super_block *sb) { DECLARE_BLOCK_INFO(sb, binf); + struct rhashtable_iter iter; struct block_private *bp; - spin_lock(&binf->lock); + rhashtable_walk_enter(&binf->ht, &iter); + rhashtable_walk_start(&iter); - while (radix_tree_gang_lookup(&binf->radix, (void **)&bp, 0, 1) == 1) { - wait_event(binf->waitq, !io_busy(bp)); - block_remove(sb, bp); + for (;;) { + bp = rhashtable_walk_next(&iter); + if (bp == NULL) + break; + if (bp == ERR_PTR(-EAGAIN)) + continue; + + if (block_get_if_inserted(bp)) { + block_remove(sb, bp); + block_put(sb, bp); + } } - spin_unlock(&binf->lock); + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); - WARN_ON_ONCE(!list_empty(&binf->lru_list)); - WARN_ON_ONCE(binf->lru_nr != 0); - WARN_ON_ONCE(binf->radix.rnode != NULL); + WARN_ON_ONCE(atomic_read(&binf->total_inserted) != 0); } /* @@ -402,7 +493,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp, /* don't let racing end_io during submission think block is complete */ atomic_inc(&bp->io_count); set_bit(BLOCK_BIT_IO_BUSY, &bp->bits); - atomic_inc(&bp->refcount); + block_get(bp); blk_start_plug(&plug); @@ -449,29 +540,33 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp, } /* - * Return a reference to a cached block in the system, allocating a new - * block if one isn't found in the radix. Its contents are undefined if - * it's newly allocated. + * Return a reference to a cached block found in the hash table. If one + * isn't found then we try and allocate and insert a new one. Its + * contents are undefined if it's newly allocated. + * + * Our hash table lookups during rcu can be racing with shrinking and + * removal from the hash table. We only atomically get a reference if + * the refcount indicates that the block is still present in the hash + * table. 
*/ -static struct block_private *block_get(struct super_block *sb, u64 blkno) +static struct block_private *block_lookup_create(struct super_block *sb, + u64 blkno) { DECLARE_BLOCK_INFO(sb, binf); - struct block_private *found; struct block_private *bp; int ret; +restart: rcu_read_lock(); - bp = radix_tree_lookup(&binf->radix, blkno); + bp = rhashtable_lookup(&binf->ht, &blkno, block_ht_params); if (bp) - atomic_inc(&bp->refcount); + bp = block_get_if_inserted(bp); rcu_read_unlock(); /* drop failed reads that interrupted waiters abandoned */ if (bp && (test_bit(BLOCK_BIT_ERROR, &bp->bits) && !test_bit(BLOCK_BIT_DIRTY, &bp->bits))) { - spin_lock(&binf->lock); block_remove(sb, bp); - spin_unlock(&binf->lock); block_put(sb, bp); bp = NULL; } @@ -483,24 +578,13 @@ static struct block_private *block_get(struct super_block *sb, u64 blkno) goto out; } - ret = radix_tree_preload(GFP_NOFS); - if (ret) + ret = block_insert(sb, bp); + if (ret < 0) { + if (ret == -EEXIST) { + block_put(sb, bp); + goto restart; + } goto out; - - /* could use slot instead of lookup/insert */ - spin_lock(&binf->lock); - found = radix_tree_lookup(&binf->radix, blkno); - if (found) { - atomic_inc(&found->refcount); - } else { - block_insert(sb, bp, blkno); - } - spin_unlock(&binf->lock); - radix_tree_preload_end(); - - if (found) { - block_put(sb, bp); - bp = found; } } @@ -524,7 +608,7 @@ struct scoutfs_block *scoutfs_block_create(struct super_block *sb, u64 blkno) { struct block_private *bp; - bp = block_get(sb, blkno); + bp = block_lookup_create(sb, blkno); if (IS_ERR(bp)) return ERR_CAST(bp); @@ -547,7 +631,7 @@ struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno) struct block_private *bp = NULL; int ret; - bp = block_get(sb, blkno); + bp = block_lookup_create(sb, blkno); if (IS_ERR(bp)) { ret = PTR_ERR(bp); goto out; @@ -580,14 +664,11 @@ out: */ void scoutfs_block_invalidate(struct super_block *sb, struct scoutfs_block *bl) { - DECLARE_BLOCK_INFO(sb, binf); struct block_private *bp = BLOCK_PRIVATE(bl); if (!WARN_ON_ONCE(test_bit(BLOCK_BIT_DIRTY, &bp->bits))) { scoutfs_inc_counter(sb, block_cache_invalidate); - spin_lock(&binf->lock); block_remove(sb, bp); - spin_unlock(&binf->lock); TRACE_BLOCK(invalidate, bp); } } @@ -642,12 +723,11 @@ void scoutfs_block_writer_mark_dirty(struct super_block *sb, if (!test_and_set_bit(BLOCK_BIT_DIRTY, &bp->bits)) { BUG_ON(!list_empty(&bp->dirty_entry)); - atomic_inc(&bp->refcount); + block_get(bp); spin_lock(&wri->lock); list_add_tail(&bp->dirty_entry, &wri->dirty_list); wri->nr_dirty_blocks++; spin_unlock(&wri->lock); - TRACE_BLOCK(mark_dirty, bp); } } @@ -792,53 +872,94 @@ u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb, } /* - * Remove a number of least recently accessed blocks and free them. We - * don't take locking hit of removing blocks from the lru as they're - * used so this is racing with accesses holding an elevated refcount. - * We check the refcount to attempt to not free a block that snuck in - * and is being accessed while the block is still at the head of the - * LRU. + * Remove a number of cached blocks that haven't been used recently. * - * Dirty blocks will always have an elevated refcount (and will be - * likely be towards the tail of the LRU). Even if we do remove them - * from the LRU their dirty refcount will keep them live until IO - * completes and their dirty refcount is dropped. 
+ * We don't maintain a strictly ordered LRU to avoid the contention of + * accesses always moving blocks around in some precise global + * structure. + * + * Instead we use counters to divide the blocks into two roughly equal + * groups by how recently they were accessed. We randomly walk all + * inserted blocks looking for any blocks in the older half to remove + * and free. The random walk and there being two groups mean that we + * typically only walk a small multiple of the number we're looking for + * before we find them all. + * + * Our rcu walk of blocks can see blocks in all stages of their life + * cycle, from dirty blocks to those with 0 references that are queued + * for freeing. We only want to free idle inserted blocks, so we + * atomically remove blocks when the only references are ours and the + * hash table's. */ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct block_info *binf = container_of(shrink, struct block_info, shrinker); struct super_block *sb = binf->sb; - struct block_private *tmp; + struct rhashtable_iter iter; struct block_private *bp; unsigned long nr; - LIST_HEAD(list); + u64 recently; nr = sc->nr_to_scan; - if (!nr) + if (nr == 0) goto out; - spin_lock(&binf->lock); + scoutfs_inc_counter(sb, block_cache_shrink); - list_for_each_entry_safe(bp, tmp, &binf->lru_list, lru_entry) { + nr = DIV_ROUND_UP(nr, SCOUTFS_BLOCK_LG_PAGES_PER); - if (atomic_read(&bp->refcount) > 1) - continue; +restart: + recently = accessed_recently(binf); + rhashtable_walk_enter(&binf->ht, &iter); + rhashtable_walk_start(&iter); - if (nr-- == 0) + /* + * This isn't great but I don't see a better way. We want to + * walk the hash from a random point so that we're not + * constantly walking over the same region that we've already + * freed old blocks within. The interface doesn't let us do + * this explicitly, but this seems to work? The difference this + * makes is enormous, around a few orders of magnitude fewer + * _nexts per shrink.
+ */ + if (iter.walker.tbl) + iter.slot = prandom_u32_max(iter.walker.tbl->size); + + while (nr > 0) { + bp = rhashtable_walk_next(&iter); + if (bp == NULL) break; + if (bp == ERR_PTR(-EAGAIN)) { + /* hard reset to not hold rcu grace period across retries */ + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + scoutfs_inc_counter(sb, block_cache_shrink_restart); + goto restart; + } - TRACE_BLOCK(shrink, bp); + scoutfs_inc_counter(sb, block_cache_shrink_next); - scoutfs_inc_counter(sb, block_cache_shrink); - block_remove(sb, bp); + if (bp->accessed >= recently) { + scoutfs_inc_counter(sb, block_cache_shrink_recent); + continue; + } + if (block_get_if_inserted(bp)) { + if (block_remove_solo(sb, bp)) { + scoutfs_inc_counter(sb, block_cache_shrink_remove); + TRACE_BLOCK(shrink, bp); + nr--; + } + block_put(sb, bp); + } } - spin_unlock(&binf->lock); - + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); out: - return min_t(u64, binf->lru_nr * SCOUTFS_BLOCK_LG_PAGES_PER, INT_MAX); + return min_t(u64, (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER, + INT_MAX); } struct sm_block_completion { @@ -945,27 +1066,23 @@ int scoutfs_block_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct block_info *binf; - loff_t size; int ret; - /* we store blknos in longs in the radix */ - size = i_size_read(sb->s_bdev->bd_inode); - if ((size >> SCOUTFS_BLOCK_LG_SHIFT) >= LONG_MAX) { - scoutfs_err(sb, "Cant reference all blocks in %llu byte device with %u bit long radix tree indexes", - size, BITS_PER_LONG); - return -EINVAL; - } - binf = kzalloc(sizeof(struct block_info), GFP_KERNEL); if (!binf) { ret = -ENOMEM; goto out; } + ret = rhashtable_init(&binf->ht, &block_ht_params); + if (ret < 0) { + kfree(binf); + goto out; + } + binf->sb = sb; - spin_lock_init(&binf->lock); - INIT_RADIX_TREE(&binf->radix, GFP_ATOMIC); /* insertion preloads */ - INIT_LIST_HEAD(&binf->lru_list); + atomic_set(&binf->total_inserted, 0); + atomic64_set(&binf->access_counter, 0); init_waitqueue_head(&binf->waitq); binf->shrinker.shrink = block_shrink; binf->shrinker.seeks = DEFAULT_SEEKS; @@ -992,10 +1109,9 @@ void scoutfs_block_destroy(struct super_block *sb) unregister_shrinker(&binf->shrinker); block_remove_all(sb); flush_work(&binf->free_work); + rhashtable_destroy(&binf->ht); - WARN_ON_ONCE(!llist_empty(&binf->free_llist)); kfree(binf); - sbi->block_info = NULL; } } diff --git a/kmod/src/counters.h b/kmod/src/counters.h index f6aa6b3b..37d08191 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -21,16 +21,20 @@ EXPAND_COUNTER(alloc_move) \ EXPAND_COUNTER(alloc_moved_extent) \ EXPAND_COUNTER(alloc_stale_cached_list_block) \ - EXPAND_COUNTER(block_cache_access) \ + EXPAND_COUNTER(block_cache_access_update) \ EXPAND_COUNTER(block_cache_alloc_failure) \ EXPAND_COUNTER(block_cache_alloc_page_order) \ EXPAND_COUNTER(block_cache_alloc_virt) \ EXPAND_COUNTER(block_cache_end_io_error) \ EXPAND_COUNTER(block_cache_forget) \ EXPAND_COUNTER(block_cache_free) \ + EXPAND_COUNTER(block_cache_free_work) \ EXPAND_COUNTER(block_cache_invalidate) \ - EXPAND_COUNTER(block_cache_lru_move) \ EXPAND_COUNTER(block_cache_shrink) \ + EXPAND_COUNTER(block_cache_shrink_next) \ + EXPAND_COUNTER(block_cache_shrink_recent) \ + EXPAND_COUNTER(block_cache_shrink_remove) \ + EXPAND_COUNTER(block_cache_shrink_restart) \ EXPAND_COUNTER(btree_compact_values) \ EXPAND_COUNTER(btree_compact_values_enomem) \ EXPAND_COUNTER(btree_delete) \ diff --git a/kmod/src/scoutfs_trace.h 
b/kmod/src/scoutfs_trace.h index d440a228..563f95e3 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2057,17 +2057,17 @@ TRACE_EVENT(scoutfs_forest_init_our_log, ); DECLARE_EVENT_CLASS(scoutfs_block_class, - TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved), + TP_PROTO(struct super_block *sb, void *bp, u64 blkno, int refcount, int io_count, + unsigned long bits, __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(void *, bp) __field(__u64, blkno) __field(int, refcount) __field(int, io_count) - __field(unsigned long, bits) - __field(__u64, lru_moved) + __field(long, bits) + __field(__u64, accessed) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); @@ -2076,57 +2076,71 @@ DECLARE_EVENT_CLASS(scoutfs_block_class, __entry->refcount = refcount; __entry->io_count = io_count; __entry->bits = bits; - __entry->lru_moved = lru_moved; + __entry->accessed = accessed; ), - TP_printk(SCSBF" bp %p blkno %llu refcount %d io_count %d bits 0x%lx lru_moved %llu", - SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, - __entry->refcount, __entry->io_count, __entry->bits, - __entry->lru_moved) + TP_printk(SCSBF" bp %p blkno %llu refcount %d io_count %d bits 0x%lx accessed %llu", + SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount, + __entry->io_count, __entry->bits, __entry->accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_allocate, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_free, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_insert, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) +); +DEFINE_EVENT(scoutfs_block_class, scoutfs_block_remove, + TP_PROTO(struct super_block *sb, void *bp, u64 blkno, + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_end_io, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_submit, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + 
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_invalidate, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_mark_dirty, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_forget, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink, TP_PROTO(struct super_block *sb, void *bp, u64 blkno, - int refcount, int io_count, unsigned long bits, u64 lru_moved), - TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved) + int refcount, int io_count, unsigned long bits, + __u64 accessed), + TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed) ); DECLARE_EVENT_CLASS(scoutfs_ext_next_class,
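
As a rough standalone illustration of the coarse two-group aging described in the block.c comment above, here is a minimal userspace sketch in C11. It is not part of the patch: the struct and variable names are stand-ins, and the kernel code uses atomic_t/atomic64_t and per-block u64 stamps rather than stdatomic.

/*
 * Illustrative model only: approximates the roles of accessed_recently()
 * and block_accessed() in the patch using C11 atomics.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct cache_model {
	atomic_ulong access_counter;	/* bumped when a block joins the recent group */
	atomic_uint total_inserted;	/* current cached population */
};

struct block_model {
	unsigned long accessed;		/* counter value stamped at last promotion */
};

/* blocks stamped below this threshold are "old" and are reclaim candidates */
static unsigned long accessed_recently(struct cache_model *c)
{
	return atomic_load(&c->access_counter) -
	       (atomic_load(&c->total_inserted) >> 1);
}

/* most accesses are read-only; the counter only moves when a block ages out */
static void block_accessed(struct cache_model *c, struct block_model *b)
{
	if (b->accessed == 0 || b->accessed < accessed_recently(c))
		b->accessed = atomic_fetch_add(&c->access_counter, 1) + 1;
}

static bool block_is_old(struct cache_model *c, struct block_model *b)
{
	return b->accessed < accessed_recently(c);
}

int main(void)
{
	struct cache_model c = { 0 };
	struct block_model b[8] = { { 0 } };
	int round, i;

	atomic_store(&c.total_inserted, 8);

	block_accessed(&c, &b[0]);		/* touched once, early on */
	for (round = 0; round < 3; round++)	/* the others stay busy */
		for (i = 1; i < 8; i++)
			block_accessed(&c, &b[i]);

	printf("b[0] old=%d b[7] old=%d\n",
	       block_is_old(&c, &b[0]), block_is_old(&c, &b[7]));
	return 0;
}

Only accesses that find their block below the moving threshold bump the shared counter, which is why the hot path rarely writes shared state while the shrinker can still find roughly half the population as candidates.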
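In the same spirit, a hedged sketch of the refcount pattern behind block_get_if_inserted(), block_remove_cnt() and block_remove_solo(): the hash table's reference is stored at a high bit, so a compare-and-swap can make taking or dropping references conditional on the block still being inserted. This is illustrative userspace C only, not the kernel implementation.

/*
 * Illustrative model only: the refcount carries the hash table's
 * reference at a high bit, tying reference acquisition and removal
 * to the block still being visible in the table.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define REF_INSERTED	(1u << 23)

struct ref_model {
	atomic_uint refcount;	/* low bits: holders, REF_INSERTED: hash table */
};

/* lookup side: only take a reference while the table still holds its own */
static bool get_if_inserted(struct ref_model *r)
{
	unsigned int cnt = atomic_load(&r->refcount);

	do {
		if (!(cnt & REF_INSERTED))
			return false;
		/* on failure the cmpxchg reloads cnt, so the bit is re-checked */
	} while (!atomic_compare_exchange_weak(&r->refcount, &cnt, cnt + 1));

	return true;
}

/* removal side: clear the table's reference only at an exact expected count */
static bool remove_if_count(struct ref_model *r, unsigned int expected)
{
	return (expected & REF_INSERTED) &&
	       atomic_compare_exchange_strong(&r->refcount, &expected,
					      expected & ~REF_INSERTED);
}

/* shrinker-style "solo" removal: only the table's ref and ours remain */
static bool remove_solo(struct ref_model *r)
{
	return remove_if_count(r, REF_INSERTED | 1);
}

int main(void)
{
	struct ref_model r = { REF_INSERTED };	/* inserted, no holders yet */
	bool got = get_if_inserted(&r);		/* count is now REF_INSERTED | 1 */
	bool removed = got && remove_solo(&r);	/* clears REF_INSERTED */

	return (got && removed) ? 0 : 1;
}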