From c3a9415e0c9032719ef21c1993b8e4cdc680a27f Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Wed, 29 Oct 2025 16:07:02 +0100 Subject: [PATCH 1/2] db: batchlog_manager: update _last_replay only if all batches were replayed Currently, if flushing hints falls within the repair cache timeout, then the flush_time is set to batchlog_manager::_last_replay. _last_replay is updated on each replay, even if some batches weren't replayed. Due to that, we risk the data resurrection. Update _last_replay only if all batches were replayed. Fixes: https://github.com/scylladb/scylladb/issues/24415. (cherry picked from commit 4d0de1126fb7727a0dd2baa82387ffc2dcad7f03) --- db/batchlog_manager.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/db/batchlog_manager.cc b/db/batchlog_manager.cc index c391adb022..32577dc1cc 100644 --- a/db/batchlog_manager.cc +++ b/db/batchlog_manager.cc @@ -77,9 +77,11 @@ future db::batchlog_manager::do_batch_log_replay(post_ }); }); } - co_await bm.container().invoke_on_all([last_replay] (auto& bm) { - bm._last_replay = last_replay; - }); + if (all_replayed == all_batches_replayed::yes) { + co_await bm.container().invoke_on_all([last_replay] (auto& bm) { + bm._last_replay = last_replay; + }); + } blogger.debug("Batchlog replay on shard {}: done", dest); co_return all_replayed; }); From cd5508a690516bc8747faa22b9ee1b71cfc534ea Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Fri, 14 Nov 2025 14:18:04 +0100 Subject: [PATCH 2/2] test: extend test_batchlog_replay_failure_during_repair Modify test_batchlog_replay_failure_during_repair to also check that there isn't data resurrection if flushing hints falls within the repair cache timeout. (cherry picked from commit e3dcb7e827810989ac09fa738749be8b45ff9318) --- db/batchlog_manager.cc | 1 + test/cluster/test_batchlog_manager.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/db/batchlog_manager.cc b/db/batchlog_manager.cc index 32577dc1cc..9421a756ba 100644 --- a/db/batchlog_manager.cc +++ b/db/batchlog_manager.cc @@ -190,6 +190,7 @@ future db::batchlog_manager::replay_all_failed_batches if (utils::get_local_injector().is_enabled("skip_batch_replay")) { blogger.debug("Skipping batch replay due to skip_batch_replay injection"); + all_replayed = all_batches_replayed::no; co_return stop_iteration::no; } diff --git a/test/cluster/test_batchlog_manager.py b/test/cluster/test_batchlog_manager.py index ababad2a8d..3c4ee70e5e 100644 --- a/test/cluster/test_batchlog_manager.py +++ b/test/cluster/test_batchlog_manager.py @@ -295,7 +295,8 @@ async def test_drop_mutations_for_dropped_table(manager: ManagerClient) -> None: @pytest.mark.asyncio @skip_mode("release", "error injections are not supported in release mode") -async def test_batchlog_replay_failure_during_repair(manager: ManagerClient) -> None: +@pytest.mark.parametrize("repair_cache", [True, False]) +async def test_batchlog_replay_failure_during_repair(manager: ManagerClient, repair_cache: bool) -> None: """ We want to verify that repair_time will not be updated if batchlog replay fails. @@ -315,7 +316,7 @@ async def test_batchlog_replay_failure_during_repair(manager: ManagerClient) -> cmdline=['--enable-cache', '0', "--hinted-handoff-enabled", "0", "--logger-log-level", "batchlog_manager=trace:repair=debug", - "--repair-hints-batchlog-flush-cache-time-in-ms", "0"] + "--repair-hints-batchlog-flush-cache-time-in-ms", "1000000" if repair_cache else "0"] config = {"error_injections_at_startup": ["short_batchlog_manager_replay_interval"], "write_request_timeout_in_ms": 2000, 'group0_tombstone_gc_refresh_interval_in_ms': 1000, @@ -390,6 +391,9 @@ async def test_batchlog_replay_failure_during_repair(manager: ManagerClient) -> # be triggered from now on (if it's present). await disable_injection("skip_batch_replay") + await s1_log.wait_for("Batchlog replay on shard 0: done", timeout=60, from_mark=s1_mark) + await s1_log.wait_for("Batchlog replay on shard 1: done", timeout=60, from_mark=s1_mark) + await manager.api.repair(s1.ip_addr, ks, "my_table") await manager.api.flush_keyspace(s1.ip_addr, ks)