mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-30 19:46:48 +00:00
Merge 'Handle tablet migration failure while streaming' from Pavel Emelyanov
It can happen that a node is lost during a tablet migration involving that node. The migration then gets stuck, blocking the topology state machine. To recover from this, the current procedure is for the admin to run nodetool removenode or to replace the node. This marks the node as "ignored", and the tablet state machine can pick this up and abort the migration.

This PR implements the handling for the streaming stage only and adds a test for it. Checking the other stages needs more work with failure injection, to inject failures into a specific barrier.

To handle a streaming failure, two new stages are introduced: cleanup_target and revert_migration. The former cleans up the pending replica, which could have received some data by the time streaming stopped working; the latter is like end_migration, but doesn't commit new_replicas into the replicas field.

refs: #16527
Closes scylladb/scylladb#17360

* github.com:scylladb/scylladb:
  test/topology: Add checking error paths for failed migration
  topology.tablets_migration: Handle failed streaming
  topology.tablets_migration: Add cleanup_target transition stage
  topology.tablets_migration: Add revert_migration transition stage
  storage_service: Rewrap cleanup stage checking in cleanup_tablet()
  test/topology: Move helpers to get tablet replicas to pylib
This commit is contained in:
@@ -38,6 +38,10 @@ write_replica_set_selector get_selector_for_writes(tablet_transition_stage stage
|
||||
return write_replica_set_selector::next;
|
||||
case tablet_transition_stage::cleanup:
|
||||
return write_replica_set_selector::next;
|
||||
case tablet_transition_stage::cleanup_target:
|
||||
return write_replica_set_selector::previous;
|
||||
case tablet_transition_stage::revert_migration:
|
||||
return write_replica_set_selector::previous;
|
||||
case tablet_transition_stage::end_migration:
|
||||
return write_replica_set_selector::next;
|
||||
}
|
||||
@@ -59,6 +63,10 @@ read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage)
|
||||
return read_replica_set_selector::next;
|
||||
case tablet_transition_stage::cleanup:
|
||||
return read_replica_set_selector::next;
|
||||
case tablet_transition_stage::cleanup_target:
|
||||
return read_replica_set_selector::previous;
|
||||
case tablet_transition_stage::revert_migration:
|
||||
return read_replica_set_selector::previous;
|
||||
case tablet_transition_stage::end_migration:
|
||||
return read_replica_set_selector::next;
|
||||
}
|
||||
@@ -275,6 +283,8 @@ static const std::unordered_map<tablet_transition_stage, sstring> tablet_transit
|
||||
{tablet_transition_stage::streaming, "streaming"},
|
||||
{tablet_transition_stage::use_new, "use_new"},
|
||||
{tablet_transition_stage::cleanup, "cleanup"},
|
||||
{tablet_transition_stage::cleanup_target, "cleanup_target"},
|
||||
{tablet_transition_stage::revert_migration, "revert_migration"},
|
||||
{tablet_transition_stage::end_migration, "end_migration"},
|
||||
};
|
||||
|
||||
|
||||
@@ -157,6 +157,8 @@ enum class tablet_transition_stage {
|
||||
write_both_read_new,
|
||||
use_new,
|
||||
cleanup,
|
||||
cleanup_target,
|
||||
revert_migration,
|
||||
end_migration,
|
||||
};
|
||||
|
||||
|
||||
@@ -5505,15 +5505,21 @@ future<> storage_service::cleanup_tablet(locator::global_tablet_id tablet) {
|
||||
if (!trinfo) {
|
||||
throw std::runtime_error(fmt::format("No transition info for tablet {}", tablet));
|
||||
}
|
||||
if (trinfo->stage != locator::tablet_transition_stage::cleanup) {
|
||||
throw std::runtime_error(fmt::format("Tablet {} stage is not at cleanup", tablet));
|
||||
|
||||
if (trinfo->stage == locator::tablet_transition_stage::cleanup) {
|
||||
auto& tinfo = tmap.get_tablet_info(tablet.tablet);
|
||||
locator::tablet_replica leaving_replica = locator::get_leaving_replica(tinfo, *trinfo);
|
||||
if (leaving_replica.host != tm->get_my_id()) {
|
||||
throw std::runtime_error(fmt::format("Tablet {} has leaving replica different than this one", tablet));
|
||||
}
|
||||
} else if (trinfo->stage == locator::tablet_transition_stage::cleanup_target) {
|
||||
if (trinfo->pending_replica.host != tm->get_my_id()) {
|
||||
throw std::runtime_error(fmt::format("Tablet {} has pending replica different than this one", tablet));
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error(fmt::format("Tablet {} stage is not at cleanup/cleanup_target", tablet));
|
||||
}
|
||||
|
||||
auto& tinfo = tmap.get_tablet_info(tablet.tablet);
|
||||
locator::tablet_replica leaving_replica = locator::get_leaving_replica(tinfo, *trinfo);
|
||||
if (leaving_replica.host != tm->get_my_id()) {
|
||||
throw std::runtime_error(fmt::format("Tablet {} has leaving replica different than this one", tablet));
|
||||
}
|
||||
auto shard_opt = tmap.get_shard(tablet.tablet, tm->get_my_id());
|
||||
if (!shard_opt) {
|
||||
on_internal_error(rtlogger, format("Tablet {} has no shard on this node", tablet));
|
||||
|
||||
@@ -425,6 +425,10 @@ private:
|
||||
return false;
|
||||
case tablet_transition_stage::cleanup:
|
||||
return false;
|
||||
case tablet_transition_stage::cleanup_target:
|
||||
return false;
|
||||
case tablet_transition_stage::revert_migration:
|
||||
return false;
|
||||
case tablet_transition_stage::end_migration:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -991,6 +991,23 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
}
|
||||
};
|
||||
|
||||
auto check_excluded_replicas = [&] {
|
||||
auto tsi = get_migration_streaming_info(get_token_metadata().get_topology(), tmap.get_tablet_info(gid.tablet), trinfo);
|
||||
for (auto r : tsi.read_from) {
|
||||
if (is_excluded(raft::server_id(r.host.uuid()))) {
|
||||
rtlogger.debug("Aborting streaming of {} because read-from {} is marked as ignored", gid, r);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
for (auto r : tsi.written_to) {
|
||||
if (is_excluded(raft::server_id(r.host.uuid()))) {
|
||||
rtlogger.debug("Aborting streaming of {} because written-to {} is marked as ignored", gid, r);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
switch (trinfo.stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old:
|
||||
if (do_barrier()) {
|
||||
@@ -1014,6 +1031,14 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
utils::get_local_injector().inject("stream_tablet_fail_on_drain",
|
||||
[] { throw std::runtime_error("stream_tablet failed due to error injection"); });
|
||||
}
|
||||
|
||||
if (tablet_state.streaming && tablet_state.streaming->failed()) {
|
||||
if (check_excluded_replicas()) {
|
||||
transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (advance_in_background(gid, tablet_state.streaming, "streaming", [&] {
|
||||
rtlogger.info("Initiating tablet streaming ({}) of {} to {}", trinfo.transition, gid, trinfo.pending_replica);
|
||||
auto dst = trinfo.pending_replica.host;
|
||||
@@ -1047,6 +1072,30 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
transition_to(locator::tablet_transition_stage::end_migration);
|
||||
}
|
||||
break;
|
||||
case locator::tablet_transition_stage::cleanup_target:
|
||||
if (advance_in_background(gid, tablet_state.cleanup, "cleanup_target", [&] {
|
||||
locator::tablet_replica dst = trinfo.pending_replica;
|
||||
if (is_excluded(raft::server_id(dst.host.uuid()))) {
|
||||
rtlogger.info("Tablet cleanup of {} on {} skipped because node is excluded and doesn't need to revert migration", gid, dst);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
rtlogger.info("Initiating tablet cleanup of {} on {} to revert migration", gid, dst);
|
||||
return ser::storage_service_rpc_verbs::send_tablet_cleanup(&_messaging,
|
||||
netw::msg_addr(id2ip(dst.host)), _as, raft::server_id(dst.host.uuid()), gid);
|
||||
})) {
|
||||
transition_to(locator::tablet_transition_stage::revert_migration);
|
||||
}
|
||||
break;
|
||||
case locator::tablet_transition_stage::revert_migration:
|
||||
// Need a separate stage and a barrier after cleanup RPC to cut off stale RPCs.
|
||||
// See do_tablet_operation() doc.
|
||||
if (do_barrier()) {
|
||||
_tablets.erase(gid);
|
||||
updates.emplace_back(get_mutation_builder()
|
||||
.del_transition(last_token)
|
||||
.build());
|
||||
}
|
||||
break;
|
||||
case locator::tablet_transition_stage::end_migration:
|
||||
// Need a separate stage and a barrier after cleanup RPC to cut off stale RPCs.
|
||||
// See do_tablet_operation() doc.
|
||||
|
||||
61
test/pylib/tablets.py
Normal file
61
test/pylib/tablets.py
Normal file
@@ -0,0 +1,61 @@
|
||||
#
|
||||
# Copyright (C) 2024-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
#
|
||||
|
||||
from test.pylib.util import read_barrier
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.internal_types import ServerInfo, HostID
|
||||
from typing import NamedTuple
|
||||
|
||||
class TabletReplicas(NamedTuple):
    """Replica placement of a single tablet, as read from a row of system.tablets."""
    # Value of the row's `last_token` column.
    last_token: int
    # (host ID, shard) pairs built from the row's `replicas` column.
    replicas: list[tuple[HostID, int]]
|
||||
|
||||
async def get_all_tablet_replicas(manager: ManagerClient, server: ServerInfo, keyspace_name: str, table_name: str) -> list[TabletReplicas]:
    """
    Return the replica placement of every tablet of the given table.

    A read barrier is executed on the queried node before reading, so the
    result reflects all changes previously applied to group0 tables.

    :param server: node to run the query on. Any live node will do.
    :return: one TabletReplicas entry per row of system.tablets for the table.
    """

    target_host = manager.get_cql().cluster.metadata.get_host(server.ip_addr)

    # Without the read barrier the local tablet metadata of the queried node
    # could still miss a tablet movement that has already been finalized.
    await read_barrier(manager.get_cql(), target_host)

    table_id = await manager.get_table_id(keyspace_name, table_name)
    query = f"SELECT last_token, replicas FROM system.tablets where table_id = {table_id}"
    rows = await manager.get_cql().run_async(query, host=target_host)

    tablets = []
    for row in rows:
        last_token = row.last_token
        # Host identifiers are normalized to HostID strings; shards are kept as is.
        placement = [(HostID(str(replica_host)), shard) for (replica_host, shard) in row.replicas]
        tablets.append(TabletReplicas(last_token=last_token, replicas=placement))
    return tablets
|
||||
|
||||
async def get_tablet_replicas(manager: ManagerClient, server: ServerInfo, keyspace_name: str, table_name: str, token: int) -> list[tuple[HostID, int]]:
    """
    Return the replicas of the tablet of the given table that owns `token`.

    The result reflects all changes previously applied to group0 tables.

    :param server: node to run the query on. Any live node will do.
    :return: replicas of the first returned tablet whose last_token is not
             below `token`, or an empty list when there is no such tablet.
    """
    tablets = await get_all_tablet_replicas(manager, server, keyspace_name, table_name)
    # NOTE(review): picking the first match presumably relies on rows coming
    # back ordered by last_token -- confirm against the system.tablets schema.
    return next((tablet.replicas for tablet in tablets if tablet.last_token >= token), [])
|
||||
|
||||
|
||||
async def get_tablet_replica(manager: ManagerClient, server: ServerInfo, keyspace_name: str, table_name: str, token: int) -> tuple[HostID, int]:
    """
    Return the first replica of the tablet of the given table that owns `token`.

    The result reflects all changes previously applied to group0 tables.

    :param server: node to run the query on. Any live node will do.
    :raises IndexError: when no replicas are found for `token`.
    """
    return (await get_tablet_replicas(manager, server, keyspace_name, table_name, token))[0]
|
||||
|
||||
91
test/topology_custom/test_tablets_migration.py
Normal file
91
test/topology_custom/test_tablets_migration.py
Normal file
@@ -0,0 +1,91 @@
|
||||
#
|
||||
# Copyright (C) 2024-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
#
|
||||
from cassandra.query import SimpleStatement, ConsistencyLevel
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.rest_client import HTTPError
|
||||
from test.pylib.tablets import get_all_tablet_replicas
|
||||
from test.topology.conftest import skip_mode
|
||||
import pytest
|
||||
import logging
|
||||
import asyncio
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fail_replica", ["source", "destination"])
@pytest.mark.parametrize("fail_stage", ["streaming"])
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_node_failure_during_tablet_migration(manager: ManagerClient, fail_replica, fail_stage):
    """
    Check that a tablet migration finishes when one of the involved nodes is
    stopped and removed while the migration is at the given stage.

    A three-node cluster is built, the single tablet of test.test is moved from
    servers[1] to servers[2], and either the source or the destination node
    (selected by `fail_replica`) is stopped and removed once the stage named by
    `fail_stage` has been reached. The test passes when the move request
    completes and the removed node is no longer listed among the replicas.
    """
    logger.info("Bootstrapping cluster")
    cfg = {'enable_user_defined_functions': False, 'experimental_features': ['tablets', 'consistent-topology-changes']}
    host_ids = []
    servers = []

    # Adds one node, records its ServerInfo and host ID (same index in both
    # lists) and disables tablet balancing through that node's API.
    async def make_server():
        s = await manager.server_add(config=cfg)
        servers.append(s)
        host_ids.append(await manager.get_host_id(s.server_id))
        await manager.api.disable_tablet_balancing(s.ip_addr)

    await make_server()
    cql = manager.get_cql()

    # The third node is added only after the table is created and populated;
    # the assertions below check that the tablet has no replica on it yet.
    await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1}")
    await make_server()
    await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")

    keys = range(256)
    await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys])
    await make_server()

    logger.info(f"Cluster is [{host_ids}]")

    replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test')
    logger.info(f"Tablet is on [{replicas}]")
    assert len(replicas) == 1 and len(replicas[0].replicas) == 2

    # The migration source is the replica that lives on servers[1].
    old_replica = None
    for r in replicas[0].replicas:
        assert r[0] != host_ids[2], "Tablet got migrated to node2"
        if r[0] == host_ids[1]:
            old_replica = r
    assert old_replica is not None
    # Destination is shard 0 of servers[2].
    new_replica = (host_ids[2], 0)
    logger.info(f"Moving tablet {old_replica} -> {new_replica}")

    # Index into servers/host_ids of the node that will be stopped and removed.
    fail_idx = 1 if fail_replica == "source" else 2

    logger.info(f"Will fail {fail_stage}")
    if fail_stage == "streaming":
        # NOTE(review): the injection presumably makes streaming on the destination
        # wait (see the log line awaited below) -- confirm in the C++ injection point.
        await manager.api.enable_injection(servers[2].ip_addr, "stream_mutation_fragments", one_shot=True)
        # Mark the log before starting the move so that only new lines are matched.
        s2_log = await manager.server_open_log(servers[2].server_id)
        s2_mark = await s2_log.mark()
    else:
        assert False, f"Unknown stage {fail_stage}"

    # Run the move as a background task: it is awaited only after the node
    # has been stopped and removed.
    # NOTE(review): the trailing 0 is presumably the token selecting the tablet -- confirm.
    migration_task = asyncio.create_task(
        manager.api.move_tablet(servers[0].ip_addr, "test", "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0))

    logger.info(f"Wait for {fail_stage} to happen")
    if fail_stage == "streaming":
        await s2_log.wait_for('stream_mutation_fragments: waiting', from_mark=s2_mark)
    else:
        assert False

    logger.info(f"Stop {fail_replica} {host_ids[fail_idx]}")
    await manager.server_stop(servers[fail_idx].server_id)
    logger.info(f"Remove {fail_replica} {host_ids[fail_idx]}")
    # The removal is requested through servers[0], which is never the failed node.
    await manager.remove_node(servers[0].server_id, servers[fail_idx].server_id)

    logger.info("Done, waiting for migration to finish")
    await migration_task

    # Whichever side failed, the removed node must not be among the replicas.
    replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test')
    logger.info(f"Tablet is now on [{replicas}]")
    assert len(replicas) == 1
    for r in replicas[0].replicas:
        assert r[0] != host_ids[fail_idx]
|
||||
@@ -10,16 +10,15 @@ from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.rest_client import inject_error_one_shot, HTTPError
|
||||
from test.pylib.rest_client import inject_error
|
||||
from test.pylib.util import wait_for_cql_and_get_hosts, read_barrier
|
||||
from test.pylib.tablets import get_tablet_replica, get_all_tablet_replicas
|
||||
from test.topology.conftest import skip_mode
|
||||
from test.topology.util import reconnect_driver
|
||||
from test.pylib.internal_types import HostID
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
import random
|
||||
from typing import NamedTuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -33,56 +32,6 @@ async def inject_error_on(manager, error_name, servers):
|
||||
errs = [manager.api.enable_injection(s.ip_addr, error_name, False) for s in servers]
|
||||
await asyncio.gather(*errs)
|
||||
|
||||
class TabletReplicas(NamedTuple):
|
||||
last_token: int
|
||||
replicas: list[tuple[HostID, int]]
|
||||
|
||||
async def get_all_tablet_replicas(manager: ManagerClient, server: ServerInfo, keyspace_name: str, table_name: str) -> list[TabletReplicas]:
|
||||
"""
|
||||
Retrieves the tablet distribution for a given table.
|
||||
This call is guaranteed to see all prior changes applied to group0 tables.
|
||||
|
||||
:param server: server to query. Can be any live node.
|
||||
"""
|
||||
|
||||
host = manager.get_cql().cluster.metadata.get_host(server.ip_addr)
|
||||
|
||||
# read_barrier is needed to ensure that local tablet metadata on the queried node
|
||||
# reflects the finalized tablet movement.
|
||||
await read_barrier(manager.get_cql(), host)
|
||||
|
||||
table_id = await manager.get_table_id(keyspace_name, table_name)
|
||||
rows = await manager.get_cql().run_async(f"SELECT last_token, replicas FROM system.tablets where "
|
||||
f"table_id = {table_id}", host=host)
|
||||
return [TabletReplicas(
|
||||
last_token=x.last_token,
|
||||
replicas=[(HostID(str(host)), shard) for (host, shard) in x.replicas]
|
||||
) for x in rows]
|
||||
|
||||
async def get_tablet_replicas(manager: ManagerClient, server: ServerInfo, keyspace_name: str, table_name: str, token: int) -> list[tuple[HostID, int]]:
|
||||
"""
|
||||
Gets tablet replicas of the tablet which owns a given token of a given table.
|
||||
This call is guaranteed to see all prior changes applied to group0 tables.
|
||||
|
||||
:param server: server to query. Can be any live node.
|
||||
"""
|
||||
rows = await get_all_tablet_replicas(manager, server, keyspace_name, table_name)
|
||||
for row in rows:
|
||||
if row.last_token >= token:
|
||||
return row.replicas
|
||||
return []
|
||||
|
||||
|
||||
async def get_tablet_replica(manager: ManagerClient, server: ServerInfo, keyspace_name: str, table_name: str, token: int) -> tuple[HostID, int]:
|
||||
"""
|
||||
Get the first replica of the tablet which owns a given token of a given table.
|
||||
This call is guaranteed to see all prior changes applied to group0 tables.
|
||||
|
||||
:param server: server to query. Can be any live node.
|
||||
"""
|
||||
replicas = await get_tablet_replicas(manager, server, keyspace_name, table_name, token)
|
||||
return replicas[0]
|
||||
|
||||
async def repair_on_node(manager: ManagerClient, server: ServerInfo, servers: list[ServerInfo]):
|
||||
node = server.ip_addr
|
||||
await manager.servers_see_each_other(servers)
|
||||
|
||||
Reference in New Issue
Block a user