From 4913acd74276520d5b627873f29cffd036fc834a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Mon, 23 Mar 2026 16:28:20 +0100 Subject: [PATCH 1/2] Revert "raft: read_barrier: update local commit_idx to read_idx when it's safe" This reverts commit 1ae2ae50a64f50fb736049ff4ef4f86e4e0e8fad. The reverted change turned out to be incorrect. The Raft member requesting a read barrier could incorrectly advance its commit_idx and break linearizability. More details in https://scylladb.atlassian.net/browse/SCYLLADB-998?focusedCommentId=42935 --- raft/fsm.cc | 8 -------- raft/fsm.hh | 9 --------- raft/server.cc | 1 - 3 files changed, 18 deletions(-) diff --git a/raft/fsm.cc b/raft/fsm.cc index 4e4025db64..54a1caeb5e 100644 --- a/raft/fsm.cc +++ b/raft/fsm.cc @@ -1114,14 +1114,6 @@ std::optional> fsm::start_read_barrier(server_id req return std::make_pair(id, _commit_idx); } -void fsm::maybe_update_commit_idx_for_read(index_t read_idx) { - // read_idx from the leader might not be replicated to the local node yet. - const bool in_local_log = read_idx <= _log.last_idx(); - if (in_local_log && log_term_for(read_idx) == get_current_term()) { - advance_commit_idx(read_idx); - } -} - void fsm::stop() { if (is_leader()) { // Become follower to stop accepting requests diff --git a/raft/fsm.hh b/raft/fsm.hh index 6e14140d86..9d1ab9980f 100644 --- a/raft/fsm.hh +++ b/raft/fsm.hh @@ -480,15 +480,6 @@ public: std::optional> start_read_barrier(server_id requester); - // Update the commit index to the read index (a read barrier result from the leader) if the local entry with the - // read index belongs to the current term. - // - // Satisfying the condition above guarantees that the local log matches the current leader's log up to the read - // index (the Log Matching Property), so the current leader won't drop the local entry with the read index. - // Moreover, this entry has been committed by the leader, so future leaders also won't drop it (the Leader - // Completeness Property). Hence, updating the commit index is safe. - void maybe_update_commit_idx_for_read(index_t read_idx); - size_t in_memory_log_size() const { return _log.in_memory_size(); } diff --git a/raft/server.cc b/raft/server.cc index 86c6712df5..dbe5d1f513 100644 --- a/raft/server.cc +++ b/raft/server.cc @@ -1571,7 +1571,6 @@ future<> server_impl::read_barrier(seastar::abort_source* as) { co_return stop_iteration::no; } read_idx = std::get(res); - _fsm->maybe_update_commit_idx_for_read(read_idx); co_return stop_iteration::yes; }); From ba54b2272b47b3a3f0f8c942fe56b6b7a688e5e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Mon, 23 Mar 2026 16:28:17 +0100 Subject: [PATCH 2/2] raft: speed up read barrier requested by non-voters We achieve this by making the leader replicate to the non-voting requester of a read barrier if its commit_idx is behind. There are some corner cases where the new `replicate_to(*opt_progress, true);` call will be a no-op, while the corresponding call in `tick_leader()` would result in sending the AppendEntries RPC to the follower. These cases are: - `progress.state == follower_progress::state::PROBE && progress.probe_sent`, - `progress.state == follower_progress::state::PIPELINE && progress.in_flight == follower_progress::max_in_flight`. We could try to improve the optimization by including some of the cases above, but it would only complicate the code without noticeable benefits (at least for group0). Note: this is the second attempt for this optimization. The first approach turned out to be incorrect and was reverted in the previous commit. The performance improvement is the same as in the previous case. --- raft/fsm.cc | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/raft/fsm.cc b/raft/fsm.cc index 54a1caeb5e..0bbeec5ec7 100644 --- a/raft/fsm.cc +++ b/raft/fsm.cc @@ -1098,7 +1098,8 @@ std::optional> fsm::start_read_barrier(server_id req // Make sure that only a leader or a node that is part of the config can request read barrier // Nodes outside of the config may never get the data, so they will not be able to read it. - if (requester != _my_id && leader_state().tracker.find(requester) == nullptr) { + follower_progress* opt_progress = leader_state().tracker.find(requester); + if (requester != _my_id && opt_progress == nullptr) { throw std::runtime_error(fmt::format("Read barrier requested by a node outside of the configuration {}", requester)); } @@ -1109,6 +1110,18 @@ std::optional> fsm::start_read_barrier(server_id req return {}; } + // Optimization for read barriers requested on non-voters. A non-voter doesn't receive the read_quorum message, so + // it might update its commit index only after another leader tick, which would slow down wait_for_apply() at the + // end of the read barrier. Prevent that by replicating to the non-voting requester here. + if (requester != _my_id && opt_progress->commit_idx < _commit_idx && opt_progress->match_idx == _log.last_idx() + && !opt_progress->can_vote) { + logger.trace("start_read_barrier[{}]: replicate to {} because follower commit_idx={} < commit_idx={}, " + "follower match_idx={} == last_idx={}, and follower can_vote={}", + _my_id, requester, opt_progress->commit_idx, _commit_idx, opt_progress->match_idx, + _log.last_idx(), opt_progress->can_vote); + replicate_to(*opt_progress, true); + } + read_id id = next_read_id(); logger.trace("start_read_barrier[{}] starting read barrier with id {}", _my_id, id); return std::make_pair(id, _commit_idx);