From 355c4d04676f5b804eeaa222f4fa76cd88e3f18d Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Mon, 6 Apr 2026 12:45:57 +0300 Subject: [PATCH] db: set_skip_when_empty() for rare error-path metrics Add .set_skip_when_empty() to four metrics in the db module that are only incremented on very rare error paths and are almost always zero: - cache::pinned_dirty_memory_overload: described as 'should sit constantly at 0, nonzero is indicative of a bug' - corrupt_data::entries_reported: only fires on actual data corruption - hints::corrupted_files: only fires on on-disk hint file corruption - rate_limiter::failed_allocations: only fires when the rate limiter hash table is completely full and gives up allocating, requiring extreme cardinality pressure These metrics create unnecessary reporting overhead when they are perpetually zero. set_skip_when_empty() suppresses them from metrics output until they become non-zero. AI-Assisted: yes Signed-off-by: Yaniv Kaul --- db/corrupt_data_handler.cc | 2 +- db/hints/manager.cc | 2 +- db/rate_limiter.cc | 2 +- db/row_cache.cc | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/db/corrupt_data_handler.cc b/db/corrupt_data_handler.cc index 441a596867..a444d13549 100644 --- a/db/corrupt_data_handler.cc +++ b/db/corrupt_data_handler.cc @@ -22,7 +22,7 @@ corrupt_data_handler::corrupt_data_handler(register_metrics rm) { _metrics.add_group("corrupt_data", { sm::make_counter("entries_reported", _stats.corrupt_data_reported, sm::description("Counts the number of corrupt data instances reported to the corrupt data handler. " - "A non-zero value indicates that the database suffered data corruption.")) + "A non-zero value indicates that the database suffered data corruption.")).set_skip_when_empty() }); } } diff --git a/db/hints/manager.cc b/db/hints/manager.cc index 15a0911cce..e5762365db 100644 --- a/db/hints/manager.cc +++ b/db/hints/manager.cc @@ -186,7 +186,7 @@ void manager::register_metrics(const sstring& group_name) { sm::description("Number of unexpected errors during sending, sending will be retried later")), sm::make_counter("corrupted_files", _stats.corrupted_files, - sm::description("Number of hints files that were discarded during sending because the file was corrupted.")), + sm::description("Number of hints files that were discarded during sending because the file was corrupted.")).set_skip_when_empty(), sm::make_gauge("pending_drains", sm::description("Number of tasks waiting in the queue for draining hints"), diff --git a/db/rate_limiter.cc b/db/rate_limiter.cc index 79c3dc61f3..f88051e328 100644 --- a/db/rate_limiter.cc +++ b/db/rate_limiter.cc @@ -206,7 +206,7 @@ void rate_limiter_base::register_metrics() { sm::description("Number of times a lookup returned an already allocated entry.")), sm::make_counter("failed_allocations", _metrics.failed_allocations, - sm::description("Number of times the rate limiter gave up trying to allocate.")), + sm::description("Number of times the rate limiter gave up trying to allocate.")).set_skip_when_empty(), sm::make_counter("probe_count", _metrics.probe_count, sm::description("Number of probes made during lookups.")), diff --git a/db/row_cache.cc b/db/row_cache.cc index 216906441d..4f8c4acbf5 100644 --- a/db/row_cache.cc +++ b/db/row_cache.cc @@ -174,7 +174,7 @@ cache_tracker::setup_metrics() { sm::make_counter("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations), sm::make_counter("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips), sm::make_counter("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips), - sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload), + sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload).set_skip_when_empty(), sm::make_counter("rows_processed_from_memtable", _stats.rows_processed_from_memtable, sm::description("total number of rows in memtables which were processed during cache update on memtable flush")), sm::make_counter("rows_dropped_from_memtable", _stats.rows_dropped_from_memtable,