tablets: fix hint re-adding tokens after full partition read decision

When partition_split_builder splits a tablet metadata partition into
multiple mutations, the first mutation gets the partition tombstone and/or
static row while subsequent mutations contain only clustered rows.

The tablet metadata change hint logic would correctly clear tokens (marking
a full partition read) upon seeing the tombstone in the first mutation,
but then re-add tokens when processing the subsequent row-only mutations.
This caused update_tablet_metadata to attempt a point update via
mutate_tablet_map_async on a tablet map that doesn't exist yet during
bootstrap, throwing no_such_tablet_map and failing the snapshot transfer.

Fix by adding a full_read flag to table_hint. Once a full partition read
is decided (due to partition tombstone, range tombstone, static row, or
row deletion), the flag prevents subsequent mutations for the same table
from re-adding tokens.

(cherry picked from commit d6c1707a04)
This commit is contained in:
Aleksandra Martyniuk
2026-05-27 12:17:06 +02:00
committed by scylladbbot
parent d4bbc144a4
commit 101237d7cf
3 changed files with 73 additions and 6 deletions

View File

@@ -1025,6 +1025,9 @@ struct tablet_metadata_change_hint {
struct table_hint {
table_id table_id;
std::vector<token> tokens;
// When true, the entire partition must be read regardless of tokens.
// Once set, subsequent mutations for the same table must not re-add tokens.
bool full_read = false;
bool operator==(const table_hint&) const = default;
};

View File

@@ -597,13 +597,23 @@ static void do_update_tablet_metadata_change_hint(locator::tablet_metadata_chang
auto it = hint.tables.try_emplace(table_id, locator::tablet_metadata_change_hint::table_hint{table_id, {}}).first;
const auto& mp = m.partition();
auto& tokens = it->second.tokens;
auto& tbl_hint = it->second;
// Once a full partition read is decided, don't re-add tokens from
// subsequent mutations for the same table. This can happen when
// partition_split_builder splits a partition into multiple mutations:
// the first one gets the partition tombstone/static row (triggering
// full_read), while later ones contain only clustered rows.
if (tbl_hint.full_read) {
return;
}
if (mp.partition_tombstone() || !mp.row_tombstones().empty() || !mp.static_row().empty()) {
// If there is a partition tombstone, range tombstone or static row,
// update the entire partition. Also clear any row hints that might be
// present to force a full read of the partition.
tokens.clear();
tbl_hint.tokens.clear();
tbl_hint.full_read = true;
return;
}
@@ -611,10 +621,11 @@ static void do_update_tablet_metadata_change_hint(locator::tablet_metadata_chang
// TODO: we do not handle deletions yet, will revisit when tablet count
// reduction is worked out.
if (row.row().deleted_at()) {
tokens.clear();
tbl_hint.tokens.clear();
tbl_hint.full_read = true;
return;
}
tokens.push_back(to_tablet_metadata_row_key(s, row.key()));
tbl_hint.tokens.push_back(to_tablet_metadata_row_key(s, row.key()));
}
}
@@ -1003,7 +1014,7 @@ future<> update_tablet_metadata(replica::database& db, cql3::query_processor& qp
try {
for (const auto& [_, table_hint] : hint.tables) {
if (table_hint.tokens.empty()) {
if (table_hint.full_read || table_hint.tokens.empty()) {
co_await do_update_tablet_metadata_partition(qp, tm, table_hint, builder);
} else {
co_await tm.mutate_tablet_map_async(table_hint.table_id, [&] (tablet_map& tmap) -> future<> {

View File

@@ -1153,7 +1153,8 @@ SEASTAR_TEST_CASE(test_tablet_metadata_hint) {
auto make_hint = [&] (std::initializer_list<std::pair<table_id, std::vector<token>>> tablets) {
locator::tablet_metadata_change_hint hint;
for (const auto& [tid, tokens] : tablets) {
hint.tables.emplace(tid, locator::tablet_metadata_change_hint::table_hint{.table_id = tid, .tokens = tokens});
hint.tables.emplace(tid, locator::tablet_metadata_change_hint::table_hint{
.table_id = tid, .tokens = tokens, .full_read = tokens.empty()});
}
return hint;
};
@@ -1281,6 +1282,58 @@ SEASTAR_TEST_CASE(test_tablet_metadata_hint) {
}, tablet_cql_test_config());
}
// Regression test: when a partition tombstone or static row is in the first
// split mutation, subsequent split mutations (rows-only) must not re-add tokens
// to the hint, undoing the full-partition-read decision.
SEASTAR_TEST_CASE(test_tablet_metadata_hint_split_with_tombstone) {
return do_with_cql_env_thread([] (cql_test_env& e) {
auto h2 = host_id(utils::UUID_gen::get_time_UUID());
auto table1 = add_table(e).get();
tablet_metadata tm = read_tablet_metadata(e.local_qp()).get();
auto ts = current_timestamp(e);
const auto& tmap = tm.get_tablet_map(table1);
auto tokens = tmap.get_sorted_tokens().get();
BOOST_REQUIRE_GE(tokens.size(), 2);
// Simulate what partition_split_builder does: first mutation has a
// partition tombstone (or static row) but no clustered rows; subsequent
// mutations have only clustered rows.
auto s = db::system_keyspace::tablets();
// First split chunk: partition tombstone only
mutation mut1(s, partition_key::from_single_value(*s,
data_value(table1.uuid()).serialize_nonnull()));
mut1.partition().apply(tombstone(ts++, gc_clock::now()));
// Second split chunk: rows only (no tombstone, no static row)
replica::tablet_mutation_builder builder(ts++, table1);
builder.set_replicas(tokens[0],
tablet_replica_set {
tablet_replica {h2, 0},
}
);
builder.set_replicas(tokens[1],
tablet_replica_set {
tablet_replica {h2, 1},
}
);
auto mut2 = builder.build();
// Build hint incrementally, as schema_applier::prepare() does
locator::tablet_metadata_change_hint hint;
replica::update_tablet_metadata_change_hint(hint, mut1);
replica::update_tablet_metadata_change_hint(hint, mut2);
BOOST_REQUIRE(hint.tables.contains(table1));
BOOST_REQUIRE(hint.tables.at(table1).tokens.empty());
BOOST_REQUIRE(hint.tables.at(table1).full_read);
}, tablet_cql_test_config());
}
SEASTAR_TEST_CASE(test_get_shard) {
return do_with_cql_env_thread([] (cql_test_env& e) {
auto h1 = host_id(utils::UUID_gen::get_time_UUID());