From d6c1707a0453b222f3d4dcba2b187817da95d6dc Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Wed, 27 May 2026 12:17:06 +0200 Subject: [PATCH 1/2] tablets: fix hint re-adding tokens after full partition read decision When partition_split_builder splits a tablet metadata partition into multiple mutations, the first mutation gets the partition tombstone and/or static row while subsequent mutations contain only clustered rows. The tablet metadata change hint logic would correctly clear tokens (marking a full partition read) upon seeing the tombstone in the first mutation, but then re-add tokens when processing the subsequent row-only mutations. This caused update_tablet_metadata to attempt a point update via mutate_tablet_map_async on a tablet map that doesn't exist yet during bootstrap, throwing no_such_tablet_map and failing the snapshot transfer. Fix by adding a full_read flag to table_hint. Once a full partition read is decided (due to partition tombstone, range tombstone, static row, or row deletion), the flag prevents subsequent mutations for the same table from re-adding tokens. --- locator/tablets.hh | 3 +++ replica/tablets.cc | 21 +++++++++++---- test/boost/tablets_test.cc | 55 +++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/locator/tablets.hh b/locator/tablets.hh index 763414415b..e910cf5d19 100644 --- a/locator/tablets.hh +++ b/locator/tablets.hh @@ -1039,6 +1039,9 @@ struct tablet_metadata_change_hint { struct table_hint { table_id table_id; std::vector tokens; + // When true, the entire partition must be read regardless of tokens. + // Once set, subsequent mutations for the same table must not re-add tokens. + bool full_read = false; bool operator==(const table_hint&) const = default; }; diff --git a/replica/tablets.cc b/replica/tablets.cc index aa7f1ddb14..00374344c3 100644 --- a/replica/tablets.cc +++ b/replica/tablets.cc @@ -640,13 +640,23 @@ static void do_update_tablet_metadata_change_hint(locator::tablet_metadata_chang auto it = hint.tables.try_emplace(table_id, locator::tablet_metadata_change_hint::table_hint{table_id, {}}).first; const auto& mp = m.partition(); - auto& tokens = it->second.tokens; + auto& tbl_hint = it->second; + + // Once a full partition read is decided, don't re-add tokens from + // subsequent mutations for the same table. This can happen when + // partition_split_builder splits a partition into multiple mutations: + // the first one gets the partition tombstone/static row (triggering + // full_read), while later ones contain only clustered rows. + if (tbl_hint.full_read) { + return; + } if (mp.partition_tombstone() || !mp.row_tombstones().empty() || !mp.static_row().empty()) { // If there is a partition tombstone, range tombstone or static row, // update the entire partition. Also clear any row hints that might be // present to force a full read of the partition. - tokens.clear(); + tbl_hint.tokens.clear(); + tbl_hint.full_read = true; return; } @@ -654,10 +664,11 @@ static void do_update_tablet_metadata_change_hint(locator::tablet_metadata_chang // TODO: we do not handle deletions yet, will revisit when tablet count // reduction is worked out. if (row.row().deleted_at()) { - tokens.clear(); + tbl_hint.tokens.clear(); + tbl_hint.full_read = true; return; } - tokens.push_back(to_tablet_metadata_row_key(s, row.key())); + tbl_hint.tokens.push_back(to_tablet_metadata_row_key(s, row.key())); } } @@ -1051,7 +1062,7 @@ future<> update_tablet_metadata(replica::database& db, cql3::query_processor& qp try { for (const auto& [_, table_hint] : hint.tables) { - if (table_hint.tokens.empty()) { + if (table_hint.full_read || table_hint.tokens.empty()) { co_await do_update_tablet_metadata_partition(qp, tm, table_hint, builder); } else { co_await tm.mutate_tablet_map_async(table_hint.table_id, [&] (tablet_map& tmap) -> future<> { diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index 035f5836a4..3646cd44dd 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -1153,7 +1153,8 @@ SEASTAR_TEST_CASE(test_tablet_metadata_hint) { auto make_hint = [&] (std::initializer_list>> tablets) { locator::tablet_metadata_change_hint hint; for (const auto& [tid, tokens] : tablets) { - hint.tables.emplace(tid, locator::tablet_metadata_change_hint::table_hint{.table_id = tid, .tokens = tokens}); + hint.tables.emplace(tid, locator::tablet_metadata_change_hint::table_hint{ + .table_id = tid, .tokens = tokens, .full_read = tokens.empty()}); } return hint; }; @@ -1281,6 +1282,58 @@ SEASTAR_TEST_CASE(test_tablet_metadata_hint) { }, tablet_cql_test_config()); } +// Regression test: when a partition tombstone or static row is in the first +// split mutation, subsequent split mutations (rows-only) must not re-add tokens +// to the hint, undoing the full-partition-read decision. +SEASTAR_TEST_CASE(test_tablet_metadata_hint_split_with_tombstone) { + return do_with_cql_env_thread([] (cql_test_env& e) { + auto h2 = host_id(utils::UUID_gen::get_time_UUID()); + + auto table1 = add_table(e).get(); + + tablet_metadata tm = read_tablet_metadata(e.local_qp()).get(); + auto ts = current_timestamp(e); + + const auto& tmap = tm.get_tablet_map(table1); + auto tokens = tmap.get_sorted_tokens().get(); + BOOST_REQUIRE_GE(tokens.size(), 2); + + // Simulate what partition_split_builder does: first mutation has a + // partition tombstone (or static row) but no clustered rows; subsequent + // mutations have only clustered rows. + auto s = db::system_keyspace::tablets(); + + // First split chunk: partition tombstone only + mutation mut1(s, partition_key::from_single_value(*s, + data_value(table1.uuid()).serialize_nonnull())); + mut1.partition().apply(tombstone(ts++, gc_clock::now())); + + // Second split chunk: rows only (no tombstone, no static row) + replica::tablet_mutation_builder builder(ts++, table1); + builder.set_replicas(tokens[0], + tablet_replica_set { + tablet_replica {h2, 0}, + } + ); + builder.set_replicas(tokens[1], + tablet_replica_set { + tablet_replica {h2, 1}, + } + ); + auto mut2 = builder.build(); + + // Build hint incrementally, as schema_applier::prepare() does + locator::tablet_metadata_change_hint hint; + replica::update_tablet_metadata_change_hint(hint, mut1); + replica::update_tablet_metadata_change_hint(hint, mut2); + + BOOST_REQUIRE(hint.tables.contains(table1)); + BOOST_REQUIRE(hint.tables.at(table1).tokens.empty()); + BOOST_REQUIRE(hint.tables.at(table1).full_read); + }, tablet_cql_test_config()); +} + + SEASTAR_TEST_CASE(test_get_shard) { return do_with_cql_env_thread([] (cql_test_env& e) { auto h1 = host_id(utils::UUID_gen::get_time_UUID()); From 491db28fbf759d447f24961115600f98524c4015 Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Wed, 27 May 2026 14:35:18 +0200 Subject: [PATCH 2/2] tablets: fall back to full partition read when tablet map is missing When update_tablet_metadata receives a hint with non-empty tokens for a table whose tablet map doesn't exist locally yet, it would call mutate_tablet_map_async which throws no_such_tablet_map. This happens during bootstrap when the joining node receives tablet metadata for a table it has never seen before. Fix by checking has_tablet_map() before attempting the point update. If the map is missing, fall back to do_update_tablet_metadata_partition which reads the full partition from system.tablets and creates the map. --- replica/tablets.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/replica/tablets.cc b/replica/tablets.cc index 00374344c3..9787da1ab7 100644 --- a/replica/tablets.cc +++ b/replica/tablets.cc @@ -1062,7 +1062,7 @@ future<> update_tablet_metadata(replica::database& db, cql3::query_processor& qp try { for (const auto& [_, table_hint] : hint.tables) { - if (table_hint.full_read || table_hint.tokens.empty()) { + if (table_hint.full_read || table_hint.tokens.empty() || !tm.has_tablet_map(table_hint.table_id)) { co_await do_update_tablet_metadata_partition(qp, tm, table_hint, builder); } else { co_await tm.mutate_tablet_map_async(table_hint.table_id, [&] (tablet_map& tmap) -> future<> {