mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-24 02:20:37 +00:00
Before this PR, writes to the previous CDC generations would
always be rejected. After this PR, they will be accepted if the
write's timestamp is greater than `now - generation_leeway`.
This change was proposed around 3 years ago. The motivation was
to improve user experience. If a client generates timestamps by
itself and its clock is desynchronized with the clock of the node
the client is connected to, there could be a period during
generation switching when writes fail. We didn't consider this
problem critical because the client could simply retry a failed
write with a higher timestamp. Eventually, it would succeed. This
approach is safe because these failed writes cannot have any side
effects. However, it can be inconvenient. Writing to previous
generations was proposed to improve it.
The idea was rejected 3 years ago. Recently, it turned out that
there is a case when the client cannot retry a write with the
increased timestamp. It happens when a table uses CDC and LWT,
which makes timestamps permanent. Once Paxos commits an entry
with a given timestamp, Scylla will keep trying to apply that entry
until it succeeds, with the same timestamp. Applying the entry
involves writing to the CDC log table. If it fails, we get stuck.
It's a major bug with an unknown perfect solution.
Allowing writes to previous generations for `generation_leeway` is
a probabilistic fix that should solve the problem in practice.
Apart from this change, this PR adds tests for it and updates
the documentation.
This PR is sufficient to enable writes to the previous generations
only in the gossiper-based topology. The Raft-based topology
needs some adjustments in loading and cleaning CDC generations.
These changes won't interfere with the changes introduced in this
PR, so they are left for a follow-up.
Fixes scylladb/scylladb#7251
Fixes scylladb/scylladb#15260
Closes scylladb/scylladb#17134
* github.com:scylladb/scylladb:
docs: using-scylla: cdc: remove info about failing writes to old generations
docs: dev: cdc: document writing to previous CDC generations
test: add test_writes_to_previous_cdc_generations
cdc: generation: allow increasing generation_leeway through error injection
cdc: metadata: allow sending writes to the previous generations
(cherry picked from commit 9bb4482ad0)
Backport note: in tests, replaced `servers_add` with loop of `server_add`
207 lines
8.3 KiB
C++
207 lines
8.3 KiB
C++
/*
|
|
* Copyright (C) 2019-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#include "dht/token-sharding.hh"
|
|
#include "utils/exceptions.hh"
|
|
#include "exceptions/exceptions.hh"
|
|
|
|
#include "cdc/generation.hh"
|
|
#include "cdc/metadata.hh"
|
|
|
|
extern logging::logger cdc_log;
|
|
|
|
static api::timestamp_type to_ts(db_clock::time_point tp) {
|
|
// This assumes that timestamp_clock and db_clock have the same epochs.
|
|
return std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch()).count();
|
|
}
|
|
|
|
static cdc::stream_id get_stream(
|
|
const cdc::token_range_description& entry,
|
|
dht::token tok) {
|
|
// The ith stream is the stream for the ith shard.
|
|
auto shard_cnt = entry.streams.size();
|
|
auto shard_id = dht::shard_of(shard_cnt, entry.sharding_ignore_msb, tok);
|
|
|
|
if (shard_id >= shard_cnt) {
|
|
on_internal_error(cdc_log, "get_stream: shard_id out of bounds");
|
|
}
|
|
|
|
return entry.streams[shard_id];
|
|
}
|
|
|
|
// non-static for testing
|
|
cdc::stream_id get_stream(
|
|
const utils::chunked_vector<cdc::token_range_description>& entries,
|
|
dht::token tok) {
|
|
if (entries.empty()) {
|
|
on_internal_error(cdc_log, "get_stream: entries empty");
|
|
}
|
|
|
|
auto it = std::lower_bound(entries.begin(), entries.end(), tok,
|
|
[] (const cdc::token_range_description& e, dht::token t) { return e.token_range_end < t; });
|
|
if (it == entries.end()) {
|
|
it = entries.begin();
|
|
}
|
|
|
|
return get_stream(*it, tok);
|
|
}
|
|
|
|
cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::timestamp_type ts) const {
|
|
auto it = _gens.upper_bound(ts);
|
|
if (it == _gens.begin()) {
|
|
// All known generations have higher timestamps than `ts`.
|
|
return _gens.end();
|
|
}
|
|
|
|
return std::prev(it);
|
|
}
|
|
|
|
bool cdc::metadata::streams_available() const {
|
|
auto now = api::new_timestamp();
|
|
auto it = gen_used_at(now);
|
|
return it != _gens.end();
|
|
}
|
|
|
|
cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
|
|
auto now = api::new_timestamp();
|
|
if (ts > now + get_generation_leeway().count()) {
|
|
throw exceptions::invalid_request_exception(format(
|
|
"cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
|
|
" With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
|
|
" know what streams will be used at that time.\n"
|
|
"We *do* allow sending writes into the near future, but our ability to do that is limited."
|
|
" If you really must use your own timestamps, then make sure your clocks are well-synchronized"
|
|
" with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
|
|
// Note that we might still send a write to a wrong generation, if we learn about the current
|
|
// generation too late (we might think that an earlier generation is the current one).
|
|
// Nothing protects us from that until we start using transactions for generation switching.
|
|
}
|
|
|
|
auto it = gen_used_at(now - get_generation_leeway().count());
|
|
|
|
if (it != _gens.end()) {
|
|
// Garbage-collect generations that will no longer be used.
|
|
it = _gens.erase(_gens.begin(), it);
|
|
}
|
|
|
|
if (ts <= now - get_generation_leeway().count()) {
|
|
// We reject the write if `ts <= now - generation_leeway` and the write is not to the current generation, which
|
|
// happens iff one of the following is true:
|
|
// - the write is to no generation,
|
|
// - the write is to a generation older than the generation under `it`,
|
|
// - the write is to the generation under `it` and that generation is not the current generation.
|
|
// Note that we cannot distinguish the first and second cases because we garbage-collect obsolete generations,
|
|
// but we can check if one of them takes place (`it == _gens.end() || ts < it->first`). These three conditions
|
|
// are sufficient. The write with `ts <= now - generation_leeway` cannot be to one of the generations following
|
|
// the generation under `it` because that generation was operating at `now - generation_leeway`.
|
|
bool is_previous_gen = it != _gens.end() && std::next(it) != _gens.end() && std::next(it)->first <= now;
|
|
if (it == _gens.end() || ts < it->first || is_previous_gen) {
|
|
throw exceptions::invalid_request_exception(format(
|
|
"cdc: attempted to get a stream \"from the past\" ({}; current server time: {})."
|
|
" With CDC you cannot send writes with timestamps too far into the past, because that would break"
|
|
" consistency properties.\n"
|
|
"We *do* allow sending writes into the near past, but our ability to do that is limited."
|
|
" Are you using client-side timestamps? Make sure your clocks are well-synchronized"
|
|
" with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
|
|
}
|
|
}
|
|
|
|
it = _gens.begin();
|
|
if (it == _gens.end() || ts < it->first) {
|
|
throw std::runtime_error(format(
|
|
"cdc::metadata::get_stream: could not find any CDC stream for timestamp {}."
|
|
" Are we in the middle of a cluster upgrade?", format_timestamp(ts)));
|
|
}
|
|
|
|
// Find the generation operating at `ts`.
|
|
{
|
|
auto next_it = std::next(it);
|
|
while (next_it != _gens.end() && next_it->first <= ts) {
|
|
it = next_it++;
|
|
}
|
|
}
|
|
// Note: if there is a next generation that `ts` belongs to, but we don't know about it,
|
|
// then too bad. This is no different from the situation in which we didn't manage to learn
|
|
// about the current generation in time. We won't be able to prevent it until we introduce transactions.
|
|
|
|
if (!it->second) {
|
|
throw std::runtime_error(format(
|
|
"cdc: attempted to get a stream from a generation that we know about, but weren't able to retrieve"
|
|
" (generation timestamp: {}, write timestamp: {}). Make sure that the replicas which contain"
|
|
" this generation's data are alive and reachable from this node.", format_timestamp(it->first), format_timestamp(ts)));
|
|
}
|
|
|
|
auto& gen = *it->second;
|
|
auto ret = ::get_stream(gen.entries(), tok);
|
|
_last_stream_timestamp = ts;
|
|
return ret;
|
|
}
|
|
|
|
bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
|
|
auto ts = to_ts(tp);
|
|
auto it = _gens.lower_bound(ts);
|
|
|
|
if (it == _gens.end()) {
|
|
// No known generations with timestamp >= ts.
|
|
return false;
|
|
}
|
|
|
|
if (it->first == ts) {
|
|
if (it->second) {
|
|
// We already inserted this particular generation.
|
|
return true;
|
|
}
|
|
++it;
|
|
}
|
|
|
|
// Check if the generation is obsolete.
|
|
return it != _gens.end() && it->first <= api::new_timestamp() - get_generation_leeway().count();
|
|
}
|
|
|
|
bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
|
|
if (known_or_obsolete(tp)) {
|
|
return false;
|
|
}
|
|
|
|
auto now = api::new_timestamp();
|
|
auto it = gen_used_at(now - get_generation_leeway().count());
|
|
|
|
if (it != _gens.end()) {
|
|
// Garbage-collect generations that will no longer be used.
|
|
it = _gens.erase(_gens.begin(), it);
|
|
|
|
}
|
|
|
|
_gens.insert_or_assign(to_ts(tp), std::move(gen));
|
|
return true;
|
|
}
|
|
|
|
bool cdc::metadata::prepare(db_clock::time_point tp) {
|
|
if (known_or_obsolete(tp)) {
|
|
return false;
|
|
}
|
|
|
|
auto ts = to_ts(tp);
|
|
auto emplaced = _gens.emplace(to_ts(tp), std::nullopt).second;
|
|
|
|
if (_last_stream_timestamp != api::missing_timestamp) {
|
|
auto last_correct_gen = gen_used_at(_last_stream_timestamp);
|
|
if (emplaced && last_correct_gen != _gens.end() && last_correct_gen->first == ts) {
|
|
cdc_log.error(
|
|
"just learned about a CDC generation newer than the one used the last time"
|
|
" streams were retrieved. This generation, or some newer one, should have"
|
|
" been used instead (new generation's timestamp: {}, last time streams were retrieved: {})."
|
|
" The new generation probably arrived too late due to a network partition"
|
|
" and we've made a write using the wrong set streams.",
|
|
format_timestamp(ts), format_timestamp(_last_stream_timestamp));
|
|
}
|
|
}
|
|
|
|
return emplaced;
|
|
}
|