Merge 'service_levels: mark v2 migration complete on empty legacy table' from Alex Dathskovsky

During the raft-topology upgrade in 2026.1, service_level_controller::migrate_to_v2() returns early when system_distributed.service_levels is empty. This skips the service_level_version = 2 write, so the cluster is never marked as upgraded to service levels v2 even though there is no data to migrate. Subsequent upgrades may then fail the startup check that requires service_level_version == 2.
Remove the early return so that the migration commits the version marker even when there are no legacy service-level rows to copy.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1198

backport: should be backported to all versions that can be upgraded to 2026.2

Closes scylladb/scylladb#29333

* github.com:scylladb/scylladb:
  test/auth_cluster: cover empty legacy table in service level upgrade
  service_levels: mark v2 migration complete on empty legacy table

(cherry picked from commit 95e422db48)

Closes scylladb/scylladb#29352
This commit is contained in:
Avi Kivity
2026-04-06 14:07:48 +03:00
parent e5bd2f8679
commit d4c28ee317
2 changed files with 21 additions and 4 deletions

View File

@@ -880,10 +880,6 @@ future<> service_level_controller::migrate_to_v2(size_t nodes_count, db::system_
qs,
{},
cql3::query_processor::cache_internal::no);
if (rows->empty()) {
co_return;
}
auto col_names = schema->all_columns() | std::views::transform([] (const auto& col) {return col.name_as_cql_string(); }) | std::ranges::to<std::vector<sstring>>();
auto col_names_str = fmt::to_string(fmt::join(col_names, ", "));

View File

@@ -109,6 +109,27 @@ async def test_service_levels_upgrade(request, manager: ManagerClient, build_mod
result_with_sl_v2 = await cql.run_async(f"SELECT service_level FROM system.service_levels_v2")
assert set([sl.service_level for sl in result_with_sl_v2]) == set(sls + [sl_v2])
@pytest.mark.asyncio
async def test_service_levels_upgrade_with_empty_legacy_table(manager: ManagerClient):
    """Upgrade to raft topology with an empty legacy service levels table.

    Starts a three-node cluster whose first node is added with
    ``force_gossip_topology_changes`` enabled, verifies that
    ``system_distributed.service_levels`` has no rows, triggers the raft
    topology upgrade, and then asserts that ``service_level_version`` in
    ``system.scylla_local`` equals ``"2"``.
    """
    # Config for the first node: gossip-based topology changes are forced.
    gossip_cfg = {**auth_config, "force_gossip_topology_changes": True, "tablets_mode_for_new_keyspaces": "disabled"}
    servers = [await manager.server_add(config=gossip_cfg)]

    # The remaining two nodes use the same config minus the gossip flag.
    plain_cfg = {key: val for key, val in gossip_cfg.items() if key != "force_gossip_topology_changes"}
    for _ in range(2):
        servers.append(await manager.server_add(config=plain_cfg))

    cql = manager.get_cql()
    assert cql
    hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

    # Precondition: the legacy table holds no service levels.
    legacy_rows = await cql.run_async("SELECT service_level FROM system_distributed.service_levels")
    assert list(legacy_rows) == []

    await manager.api.upgrade_to_raft_topology(hosts[0].address)
    pending_upgrades = [
        wait_until_topology_upgrade_finishes(manager, host.address, time.time() + 60)
        for host in hosts
    ]
    await asyncio.gather(*pending_upgrades)

    # The version marker must be written even though nothing was migrated.
    version_rows = await cql.run_async("SELECT value FROM system.scylla_local WHERE key = 'service_level_version'")
    assert version_rows[0].value == "2"
@pytest.mark.asyncio
async def test_service_levels_work_during_recovery(manager: ManagerClient):
# FIXME: move this test to the Raft-based recovery procedure or remove it if unneeded.