mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-20 00:20:47 +00:00
3f7ee3ce5d introduced system.batchlog_v2, with a schema designed to speed up batchlog replays and make post-replay cleanups much more effective. It did not introduce a cluster feature for the new table, because it is a node-local table, so the cluster can switch to the new table gradually, one node at a time. However, https://github.com/scylladb/scylladb/issues/27886 showed that the switching causes timeouts during upgrades, in mixed clusters. Furthermore, switching to the new table unconditionally on upgraded nodes means that on rollback, the batches saved into the v2 table are lost. This PR re-introduces v1 (`system.batchlog`) support and guards the use of the v2 table with a cluster feature, so mixed clusters keep using v1 and thus remain rollback-compatible. The re-introduced v1 support doesn't support post-replay cleanups for simplicity. The cleanup in v1 was never particularly effective anyway and we ended up disabling it for heavy batchlog users, so I don't think the lack of support for cleanup is a problem. 
Fixes: https://github.com/scylladb/scylladb/issues/27886 Needs backport to 2026.1, to fix upgrades for clusters using batches Closes scylladb/scylladb#28736 * github.com:scylladb/scylladb: test/boost/batchlog_manager_test: add tests for v1 batchlog test/boost/batchlog_manager_test: make prepare_batches() work with both v1 and v2 test/boost/batchlog_manager_test: fix indentation test/boost/batchlog_manager_test: extract prepare_batches() method test/lib/cql_assertions: is_rows(): add dump parameter tools/scylla-sstable: extract query result printers tools/scylla-sstable: add std::ostream& arg to query result printers repair/row_level: repair_flush_hints_batchlog_handler(): add all_replayed to finish log db/batchlog_manager: re-add v1 support db/batchlog_manager: return all_replayed from process_batch() db/batchlog_manager: process_bath() fix indentation db/batchlog_manager: make batch() a standalone function db/batchlog_manager: make structs stats public db/batchlog_manager: allocate limiter on the stack db/batchlog_manager: add feature_service dependency gms/feature_service: add batchlog_v2 feature (cherry picked from commit a83ee6cf66) Closes scylladb/scylladb#28853
125 lines
3.4 KiB
C++
125 lines
3.4 KiB
C++
/*
|
|
* Copyright (C) 2015-present ScyllaDB
|
|
*
|
|
* Modified by ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <seastar/core/future.hh>
|
|
#include <seastar/core/sharded.hh>
|
|
#include <seastar/core/gate.hh>
|
|
#include <seastar/core/metrics_registration.hh>
|
|
#include <seastar/core/abort_source.hh>
|
|
|
|
#include "db_clock.hh"
|
|
|
|
#include <chrono>
|
|
#include <limits>
|
|
|
|
namespace cql3 {
|
|
|
|
class query_processor;
|
|
|
|
} // namespace cql3
|
|
|
|
namespace gms {
|
|
|
|
class feature_service;
|
|
|
|
} // namespace gms
|
|
|
|
namespace db {
|
|
|
|
// Forward declaration; batchlog_manager only holds a reference to it.
class system_keyspace;

// Strongly-typed bool returned by the replay entry points below: did the
// replay pass manage to replay every stored batch? (Seastar bool_class
// prevents accidental mixing with other boolean results.)
using all_batches_replayed = bool_class<struct all_batches_replayed_tag>;
|
|
|
|
struct batchlog_manager_config {
|
|
db_clock::duration replay_timeout;
|
|
uint64_t replay_rate = std::numeric_limits<uint64_t>::max();
|
|
std::chrono::milliseconds delay = std::chrono::milliseconds(0);
|
|
unsigned replay_cleanup_after_replays;
|
|
};
|
|
|
|
// Stage a stored batch is in, used to distinguish freshly written batches
// from ones that already failed a replay attempt.
// NOTE(review): the explicit int8_t underlying type suggests these values are
// persisted (e.g. as a tinyint column) — confirm before reordering/renumbering.
enum class batchlog_stage : int8_t {
    initial,
    failed_replay
};
|
|
|
|
// Per-shard service that periodically replays batches stored in the system
// batchlog table(s) (v1 `system.batchlog` and v2 `system.batchlog_v2`) and
// exposes manual replay/drain entry points. Sharded via
// peering_sharded_service so each shard can reach its peers.
class batchlog_manager : public peering_sharded_service<batchlog_manager> {
public:
    // Strongly-typed flag: should a replay round be followed by a cleanup pass?
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

    // Counters exported via metrics; public so tests can inspect them.
    struct stats {
        uint64_t write_attempts = 0;
    };

private:
    // How often the background loop attempts a replay round.
    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
    // Timeout for the mutations written while replaying a batch.
    static constexpr std::chrono::seconds write_timeout = std::chrono::seconds(300);

    using clock_type = lowres_clock;

    stats _stats;

    seastar::metrics::metric_groups _metrics;

    cql3::query_processor& _qp;
    db::system_keyspace& _sys_ks;
    // Used to check the batchlog_v2 cluster feature: mixed clusters keep
    // using the v1 table so rollback does not lose batches.
    gms::feature_service& _fs;
    db_clock::duration _replay_timeout;
    uint64_t _replay_rate;
    std::chrono::milliseconds _delay;
    // Run a post-replay cleanup every this many replay rounds (v2 only).
    unsigned _replay_cleanup_after_replays = 100;
    // Serializes replay rounds on this shard.
    semaphore _sem{1};
    // Tracks in-flight background work so stop()/drain() can wait for it.
    seastar::named_gate _gate;
    unsigned _cpu = 0;
    seastar::abort_source _stop;
    // Future of the background replay loop; resolved once the loop exits.
    future<> _loop_done;

    gc_clock::time_point _last_replay;

    // Was the v1 -> v2 migration already done since last restart?
    // The migration is attempted once after each restart. This is redundant but
    // keeps things simple. Once no upgrade path exists from a ScyllaDB version
    // which can still produce v1 entries, this migration code can be removed.
    bool _migration_done = false;

    future<> maybe_migrate_v1_to_v2();

    // v1 replay does not support post-replay cleanup; the parameter is kept
    // so the two versions share a signature.
    future<all_batches_replayed> replay_all_failed_batches_v1(post_replay_cleanup cleanup);
    future<all_batches_replayed> replay_all_failed_batches_v2(post_replay_cleanup cleanup);
    // Dispatches to the v1 or v2 implementation (based on the cluster
    // feature — confirm in batchlog_manager.cc).
    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
public:
    // Takes a QP, not a distributed one. Because this object is supposed
    // to be per shard and does no dispatching beyond delegating to the
    // shard qp (which is what you feed here).
    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config);

    // abort the replay loop and return its future.
    future<> drain();
    future<> stop();

    // Run one replay round now; returns whether every stored batch was replayed.
    future<all_batches_replayed> do_batch_log_replay(post_replay_cleanup cleanup);

    future<size_t> count_all_batches() const;
    // Time of the last completed replay round on this shard.
    gc_clock::time_point get_last_replay() const {
        return _last_replay;
    }

    const stats& get_stats() const {
        return _stats;
    }
private:
    // Body of the background fiber: sleeps and replays until aborted.
    future<> batchlog_replay_loop();
};
|
|
|
|
}
|