tablets: deallocate storage state on end_migration

When a tablet is migrated and cleaned up, deallocate the tablet storage
group state on `end_migration` stage, instead of `cleanup` stage:

* When the stage is updated from `cleanup` to `end_migration`, the
  storage group is removed on the leaving replica.
* When the table is initialized, if the tablet stage is `end_migration`
  then we don't allocate a storage group for it. This happens for
  example if the leaving replica is restarted during tablet migration.
  If it's initialized in `cleanup` stage then we allocate a storage
  group, and it will be deallocated when transitioning to
  `end_migration`.

This guarantees that the storage group is always deallocated on the
leaving replica by `end_migration`, and that it is always allocated if
the tablet wasn't cleaned up fully yet.

It is a similar case also for the pending replica when the migration is
aborted. We deallocate the state on `revert_migration` which is the
stage following `cleanup_target`.

Previously the storage group would be allocated when the tablet is
initialized on any of the tablet replicas - also on the leaving replica,
and when the tablet stage is `cleanup` or `end_migration`, and
deallocated during `cleanup`.

This fixes the following issue:

1. A migrating tablet enters cleanup stage
2. the tablet is cleaned up successfuly
3. The leaving replica is restarted, and allocates storage group
4. tablet cleanup is not called because it was already cleaned up
4. the storage group remains allocated on the leaving replica after the
   migration is completed - it's not cleaned up properly.

Fixes scylladb/scylladb#23481

(cherry picked from commit 34f15ca871)
This commit is contained in:
Michael Litvak
2025-06-07 12:29:59 +03:00
committed by GitHub Action
parent 4a18a08284
commit 0b46cfb60d
3 changed files with 31 additions and 7 deletions

View File

@@ -136,6 +136,18 @@ std::optional<tablet_replica> get_leaving_replica(const tablet_info& tinfo, cons
return *leaving.begin();
}
bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tablet_transition_info& trinfo) {
if (replica == locator::get_leaving_replica(tinfo, trinfo)) {
// we do tablet cleanup on the leaving replica in the `cleanup` stage, after which there is only the `end_migration` stage.
return trinfo.stage == locator::tablet_transition_stage::end_migration;
}
if (replica == trinfo.pending_replica) {
// we do tablet cleanup on the pending replica in the `cleanup_target` stage, after which there is only the `revert_migration` stage.
return trinfo.stage == locator::tablet_transition_stage::revert_migration;
}
return false;
}
tablet_replica_set get_new_replicas(const tablet_info& tinfo, const tablet_migration_info& mig) {
return replace_replica(tinfo.replicas, mig.src, mig.dst);
}

View File

@@ -222,6 +222,10 @@ struct tablet_transition_info {
// Returns the leaving replica for a given transition.
std::optional<tablet_replica> get_leaving_replica(const tablet_info&, const tablet_transition_info&);
// True if the tablet is transitioning and it's in a stage that follows the stage
// where we clean up the tablet on the given replica.
bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tablet_transition_info& trinfo);
/// Represents intention to move a single tablet replica from src to dst.
struct tablet_migration_info {
locator::tablet_transition_kind kind;

View File

@@ -806,12 +806,19 @@ public:
auto local_replica = locator::tablet_replica{_my_host_id, this_shard_id()};
for (auto tid : tmap.tablet_ids()) {
auto range = tmap.get_token_range(tid);
if (tmap.has_replica(tid, local_replica)) {
tlogger.debug("Tablet with id {} and range {} present for {}.{}", tid, range, schema()->ks_name(), schema()->cf_name());
ret[tid.value()] = allocate_storage_group(tmap, tid, std::move(range));
if (!tmap.has_replica(tid, local_replica)) {
continue;
}
// if the tablet was cleaned up already on this replica, don't allocate a storage group for it.
auto trinfo = tmap.get_tablet_transition_info(tid);
if (trinfo && locator::is_post_cleanup(local_replica, tmap.get_tablet_info(tid), *trinfo)) {
continue;
}
auto range = tmap.get_token_range(tid);
tlogger.debug("Tablet with id {} and range {} present for {}.{}", tid, range, schema()->ks_name(), schema()->cf_name());
ret[tid.value()] = allocate_storage_group(tmap, tid, std::move(range));
}
_storage_groups = std::move(ret);
}
@@ -2433,7 +2440,7 @@ future<> tablet_storage_group_manager::update_effective_replication_map(const lo
co_return;
}
// Allocate storage group if tablet is migrating in.
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
auto this_replica = locator::tablet_replica{
.host = erm.get_token_metadata().get_my_id(),
.shard = this_shard_id()
@@ -2449,6 +2456,8 @@ future<> tablet_storage_group_manager::update_effective_replication_map(const lo
auto range = new_tablet_map->get_token_range(tid);
_storage_groups[tid.value()] = allocate_storage_group(*new_tablet_map, tid, std::move(range));
tablet_migrating_in = true;
} else if (_storage_groups.contains(tid.value()) && locator::is_post_cleanup(this_replica, new_tablet_map->get_tablet_info(tid), transition_info)) {
remove_storage_group(tid.value());
}
}
@@ -3803,7 +3812,6 @@ future<> table::cleanup_tablet(database& db, db::system_keyspace& sys_ks, locato
co_await stop_compaction_groups(sg);
co_await utils::get_local_injector().inject("delay_tablet_compaction_groups_cleanup", std::chrono::seconds(5));
co_await cleanup_compaction_groups(db, sys_ks, tid, sg);
_sg_manager->remove_storage_group(tid.value());
}
future<> table::cleanup_tablet_without_deallocation(database& db, db::system_keyspace& sys_ks, locator::tablet_id tid) {