db: set_skip_when_empty() for rare error-path metrics

Add .set_skip_when_empty() to four metrics in the db module that are
only incremented on very rare error paths and are almost always zero:

- cache::pinned_dirty_memory_overload: its description states that it
  'should sit constantly at 0, and any number different than 0 is
  indicative of a bug'
- corrupt_data::entries_reported: only fires on actual data corruption
- hints::corrupted_files: only fires when a hints file is discarded
  during sending because the file is corrupted
- rate_limiter::failed_allocations: only fires when the rate limiter
  hash table is completely full and gives up allocating, requiring
  extreme cardinality pressure

Reporting these metrics adds unnecessary overhead while they are
perpetually zero. set_skip_when_empty() suppresses each of them from the
metrics output until it becomes non-zero.

AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
This commit is contained in:
Yaniv Michael Kaul
2026-04-06 12:45:57 +03:00
parent b4f652b7c1
commit 355c4d0467
4 changed files with 4 additions and 4 deletions

View File

@@ -22,7 +22,7 @@ corrupt_data_handler::corrupt_data_handler(register_metrics rm) {
_metrics.add_group("corrupt_data", {
sm::make_counter("entries_reported", _stats.corrupt_data_reported,
sm::description("Counts the number of corrupt data instances reported to the corrupt data handler. "
"A non-zero value indicates that the database suffered data corruption."))
"A non-zero value indicates that the database suffered data corruption.")).set_skip_when_empty()
});
}
}

View File

@@ -186,7 +186,7 @@ void manager::register_metrics(const sstring& group_name) {
sm::description("Number of unexpected errors during sending, sending will be retried later")),
sm::make_counter("corrupted_files", _stats.corrupted_files,
sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
sm::description("Number of hints files that were discarded during sending because the file was corrupted.")).set_skip_when_empty(),
sm::make_gauge("pending_drains",
sm::description("Number of tasks waiting in the queue for draining hints"),

View File

@@ -206,7 +206,7 @@ void rate_limiter_base::register_metrics() {
sm::description("Number of times a lookup returned an already allocated entry.")),
sm::make_counter("failed_allocations", _metrics.failed_allocations,
sm::description("Number of times the rate limiter gave up trying to allocate.")),
sm::description("Number of times the rate limiter gave up trying to allocate.")).set_skip_when_empty(),
sm::make_counter("probe_count", _metrics.probe_count,
sm::description("Number of probes made during lookups.")),

View File

@@ -174,7 +174,7 @@ cache_tracker::setup_metrics() {
sm::make_counter("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations),
sm::make_counter("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips),
sm::make_counter("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips),
sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload),
sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload).set_skip_when_empty(),
sm::make_counter("rows_processed_from_memtable", _stats.rows_processed_from_memtable,
sm::description("total number of rows in memtables which were processed during cache update on memtable flush")),
sm::make_counter("rows_dropped_from_memtable", _stats.rows_dropped_from_memtable,