From dd1d3dd1ee62b7922aee9f661eb23e45d3e2fad9 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 23 Mar 2026 16:37:44 +0100 Subject: [PATCH 1/2] strong_consistency: adjust limits for snapshots Raft snapshots are not implemented yet for strong consistency. Adjust the current raft group config to make them much less likely to occur: - snapshot_threshold config option decides how many log entries need to be applied after the last snapshot. Set it to the maximum value for size_t in order to effectively disable it. - snapshot_threshold_log_size defines a threshold for the log memory usage over which a snapshot is created. Increase it from the default 2MB to 10MB. - max_log_size defines the threshold for the log memory usage over which new requests stop being admitted until the log is shrunk back by a snapshot. Set it to 20MB, as this option is recommended to be at least twice as large as snapshot_threshold_log_size. Refs: SCYLLADB-1115 --- service/strong_consistency/groups_manager.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/service/strong_consistency/groups_manager.cc b/service/strong_consistency/groups_manager.cc index 45501b7ec4..701e40a76a 100644 --- a/service/strong_consistency/groups_manager.cc +++ b/service/strong_consistency/groups_manager.cc @@ -137,6 +137,12 @@ future<> groups_manager::start_raft_group(global_tablet_id tablet, auto& persistence_ref = *storage; auto config = raft::server::configuration { + // Snapshotting is not implemented yet for strong consistency, + // so effectively disable periodic snapshotting. 
+ // TODO: Revert after snapshots are implemented + .snapshot_threshold = std::numeric_limits<size_t>::max(), + .snapshot_threshold_log_size = 10 * 1024 * 1024, // 10MB + .max_log_size = 20 * 1024 * 1024, // 20MB .enable_forwarding = false, .on_background_error = [tablet, group_id](std::exception_ptr e) { on_internal_error(logger, From 63067f594da2fe0f5ae24d62723de5b8b84dbfc6 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 23 Mar 2026 16:09:55 +0100 Subject: [PATCH 2/2] strong_consistency: fake taking and dropping snapshots Snapshots are not implemented yet for strong consistency - attempting to take, transfer or drop a snapshot results in an exception. However, the logic of our state machine forces snapshot transfer even if there are no lagging replicas - every raft::server::configuration::snapshot_threshold log entries. We have actually encountered an issue in our benchmarks where snapshots were being taken even though the cluster was not under any disruption, and this is one of the possible causes. It turns out that we can safely allow for taking snapshots right now - we can just implement it as a no-op and return a random UUID. Conversely, dropping a snapshot can also be a no-op. This is safe because snapshot transfer still throws an exception - as long as the taken/recovered snapshots are never attempted to be transferred. --- service/strong_consistency/state_machine.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/service/strong_consistency/state_machine.cc b/service/strong_consistency/state_machine.cc index e8e80d0214..0154df33b7 100644 --- a/service/strong_consistency/state_machine.cc +++ b/service/strong_consistency/state_machine.cc @@ -79,11 +79,16 @@ public: } future<raft::snapshot_id> take_snapshot() override { - throw std::runtime_error("take_snapshot() not implemented"); + // Until snapshot transfer is fully implemented, return a fake ID + // and don't actually do anything. 
As long as we don't do snapshot + // transfers (attempting to do that throws an exception), we should + // be safe. + return make_ready_future<raft::snapshot_id>(raft::snapshot_id(utils::make_random_uuid())); } void drop_snapshot(raft::snapshot_id id) override { - throw std::runtime_error("drop_snapshot() not implemented"); + // Taking a snapshot is a no-op, so dropping a snapshot is also a no-op. + (void) id; } future<> load_snapshot(raft::snapshot_id id) override {