mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-02 13:06:57 +00:00
Merge 'tablets: fix update_tablet_metadata failures during bootstrap' from Aleksandra Martyniuk
When partition_split_builder splits a tablet metadata partition into
multiple mutations, the first mutation gets the partition tombstone
and/or static row while subsequent mutations contain only clustered
rows. The hint logic would correctly clear tokens (marking a full
partition read) upon seeing the tombstone in the first mutation, but
then re-add tokens when processing the subsequent row-only mutations.
This caused update_tablet_metadata to attempt a point update via
mutate_tablet_map_async on a tablet map that doesn't exist yet during
bootstrap, throwing no_such_tablet_map and failing the snapshot transfer.
Fix by adding a full_read flag to table_hint. Once a full partition read
is decided (due to partition tombstone, range tombstone, static row, or
row deletion), the flag prevents subsequent mutations for the same table
from re-adding tokens. Additionally, fall back to a full partition read
when the tablet map is missing locally, which happens when the joining
node receives tablet metadata for a table it has never seen before.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-2303.
Needs backports to 2026.1+. 2026.1 introduces the regression with b17a36c071
Closes scylladb/scylladb#30115
* github.com:scylladb/scylladb:
tablets: fall back to full partition read when tablet map is missing
tablets: fix hint re-adding tokens after full partition read decision
This commit is contained in:
@@ -1039,6 +1039,9 @@ struct tablet_metadata_change_hint {
|
||||
struct table_hint {
|
||||
table_id table_id;
|
||||
std::vector<token> tokens;
|
||||
// When true, the entire partition must be read regardless of tokens.
|
||||
// Once set, subsequent mutations for the same table must not re-add tokens.
|
||||
bool full_read = false;
|
||||
|
||||
bool operator==(const table_hint&) const = default;
|
||||
};
|
||||
|
||||
@@ -640,13 +640,23 @@ static void do_update_tablet_metadata_change_hint(locator::tablet_metadata_chang
|
||||
auto it = hint.tables.try_emplace(table_id, locator::tablet_metadata_change_hint::table_hint{table_id, {}}).first;
|
||||
|
||||
const auto& mp = m.partition();
|
||||
auto& tokens = it->second.tokens;
|
||||
auto& tbl_hint = it->second;
|
||||
|
||||
// Once a full partition read is decided, don't re-add tokens from
|
||||
// subsequent mutations for the same table. This can happen when
|
||||
// partition_split_builder splits a partition into multiple mutations:
|
||||
// the first one gets the partition tombstone/static row (triggering
|
||||
// full_read), while later ones contain only clustered rows.
|
||||
if (tbl_hint.full_read) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (mp.partition_tombstone() || !mp.row_tombstones().empty() || !mp.static_row().empty()) {
|
||||
// If there is a partition tombstone, range tombstone or static row,
|
||||
// update the entire partition. Also clear any row hints that might be
|
||||
// present to force a full read of the partition.
|
||||
tokens.clear();
|
||||
tbl_hint.tokens.clear();
|
||||
tbl_hint.full_read = true;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -654,10 +664,11 @@ static void do_update_tablet_metadata_change_hint(locator::tablet_metadata_chang
|
||||
// TODO: we do not handle deletions yet, will revisit when tablet count
|
||||
// reduction is worked out.
|
||||
if (row.row().deleted_at()) {
|
||||
tokens.clear();
|
||||
tbl_hint.tokens.clear();
|
||||
tbl_hint.full_read = true;
|
||||
return;
|
||||
}
|
||||
tokens.push_back(to_tablet_metadata_row_key(s, row.key()));
|
||||
tbl_hint.tokens.push_back(to_tablet_metadata_row_key(s, row.key()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1051,7 +1062,7 @@ future<> update_tablet_metadata(replica::database& db, cql3::query_processor& qp
|
||||
|
||||
try {
|
||||
for (const auto& [_, table_hint] : hint.tables) {
|
||||
if (table_hint.tokens.empty()) {
|
||||
if (table_hint.full_read || table_hint.tokens.empty() || !tm.has_tablet_map(table_hint.table_id)) {
|
||||
co_await do_update_tablet_metadata_partition(qp, tm, table_hint, builder);
|
||||
} else {
|
||||
co_await tm.mutate_tablet_map_async(table_hint.table_id, [&] (tablet_map& tmap) -> future<> {
|
||||
|
||||
@@ -1153,7 +1153,8 @@ SEASTAR_TEST_CASE(test_tablet_metadata_hint) {
|
||||
auto make_hint = [&] (std::initializer_list<std::pair<table_id, std::vector<token>>> tablets) {
|
||||
locator::tablet_metadata_change_hint hint;
|
||||
for (const auto& [tid, tokens] : tablets) {
|
||||
hint.tables.emplace(tid, locator::tablet_metadata_change_hint::table_hint{.table_id = tid, .tokens = tokens});
|
||||
hint.tables.emplace(tid, locator::tablet_metadata_change_hint::table_hint{
|
||||
.table_id = tid, .tokens = tokens, .full_read = tokens.empty()});
|
||||
}
|
||||
return hint;
|
||||
};
|
||||
@@ -1281,6 +1282,58 @@ SEASTAR_TEST_CASE(test_tablet_metadata_hint) {
|
||||
}, tablet_cql_test_config());
|
||||
}
|
||||
|
||||
// Regression test: when a partition tombstone or static row is in the first
|
||||
// split mutation, subsequent split mutations (rows-only) must not re-add tokens
|
||||
// to the hint, undoing the full-partition-read decision.
|
||||
SEASTAR_TEST_CASE(test_tablet_metadata_hint_split_with_tombstone) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
auto h2 = host_id(utils::UUID_gen::get_time_UUID());
|
||||
|
||||
auto table1 = add_table(e).get();
|
||||
|
||||
tablet_metadata tm = read_tablet_metadata(e.local_qp()).get();
|
||||
auto ts = current_timestamp(e);
|
||||
|
||||
const auto& tmap = tm.get_tablet_map(table1);
|
||||
auto tokens = tmap.get_sorted_tokens().get();
|
||||
BOOST_REQUIRE_GE(tokens.size(), 2);
|
||||
|
||||
// Simulate what partition_split_builder does: first mutation has a
|
||||
// partition tombstone (or static row) but no clustered rows; subsequent
|
||||
// mutations have only clustered rows.
|
||||
auto s = db::system_keyspace::tablets();
|
||||
|
||||
// First split chunk: partition tombstone only
|
||||
mutation mut1(s, partition_key::from_single_value(*s,
|
||||
data_value(table1.uuid()).serialize_nonnull()));
|
||||
mut1.partition().apply(tombstone(ts++, gc_clock::now()));
|
||||
|
||||
// Second split chunk: rows only (no tombstone, no static row)
|
||||
replica::tablet_mutation_builder builder(ts++, table1);
|
||||
builder.set_replicas(tokens[0],
|
||||
tablet_replica_set {
|
||||
tablet_replica {h2, 0},
|
||||
}
|
||||
);
|
||||
builder.set_replicas(tokens[1],
|
||||
tablet_replica_set {
|
||||
tablet_replica {h2, 1},
|
||||
}
|
||||
);
|
||||
auto mut2 = builder.build();
|
||||
|
||||
// Build hint incrementally, as schema_applier::prepare() does
|
||||
locator::tablet_metadata_change_hint hint;
|
||||
replica::update_tablet_metadata_change_hint(hint, mut1);
|
||||
replica::update_tablet_metadata_change_hint(hint, mut2);
|
||||
|
||||
BOOST_REQUIRE(hint.tables.contains(table1));
|
||||
BOOST_REQUIRE(hint.tables.at(table1).tokens.empty());
|
||||
BOOST_REQUIRE(hint.tables.at(table1).full_read);
|
||||
}, tablet_cql_test_config());
|
||||
}
|
||||
|
||||
|
||||
SEASTAR_TEST_CASE(test_get_shard) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
auto h1 = host_id(utils::UUID_gen::get_time_UUID());
|
||||
|
||||
Reference in New Issue
Block a user