From fa7e904cd6e55f67c360c49fc4910fc517757bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= Date: Thu, 26 Jan 2023 11:33:53 +0100 Subject: [PATCH] commitlog: fix total_size_on_disk accounting after segment file removal Currently, segment file removal first calls `f.remove_file()` and does `total_size_on_disk -= f.known_size()` later. However, `remove_file()` resets `known_size` to 0, so in effect the freed space in not accounted for. `total_size_on_disk` is not just a metric. It is also responsible for deciding whether a segment should be recycled -- it is recycled only if `total_size_on_disk - known_size < max_disk_size`. Therefore this bug has dire performance consequences: if `total_size_on_disk - known_size` ever exceeds `max_disk_size`, the recycling of commitlog segments will stop permanently, because `total_size_on_disk - known_size` will never go back below `max_disk_size` due to the accounting bug. All new segments from this point will be allocated from scratch. The bug was uncovered by a QA performance test. It isn't easy to trigger -- it took the test 7 hours of constant high load to step into it. However, the fact that the effect is permanent, and degrades the performance of the cluster silently, makes the bug potentially quite severe. The bug can be easily spotted with Prometheus as infinitely rising `commitlog_total_size_on_disk` on the affected shards. Fixes #12645 Closes #12646 --- db/commitlog/commitlog.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/commitlog/commitlog.cc b/db/commitlog/commitlog.cc index 8dea8575a4..7363d1c132 100644 --- a/db/commitlog/commitlog.cc +++ b/db/commitlog/commitlog.cc @@ -2116,6 +2116,9 @@ future<> db::commitlog::segment_manager::do_pending_deletes() { clogger.debug("Discarding segments {}", ftd); for (auto& [f, mode] : ftd) { + // `f.remove_file()` resets known_size to 0, so remember the size here, + // in order to subtract it from total_size_on_disk accurately. + size_t size = f.known_size(); try { if (f) { co_await f.close(); @@ -2132,7 +2135,6 @@ future<> db::commitlog::segment_manager::do_pending_deletes() { } } - auto size = f.known_size(); auto usage = totals.total_size_on_disk; auto next_usage = usage - size; @@ -2165,7 +2167,7 @@ future<> db::commitlog::segment_manager::do_pending_deletes() { // or had such an exception that we consider the file dead // anyway. In either case we _remove_ the file size from // footprint, because it is no longer our problem. - totals.total_size_on_disk -= f.known_size(); + totals.total_size_on_disk -= size; } // #8376 - if we had an error in recycling (disk rename?), and no elements