mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-02 14:15:46 +00:00
Remove bootstrap and decommission from allowed_repair_based_node_ops.
Using RBNO over streaming for these operations has no benefits, as they
are not exposed to the out-of-date replica problem that replace,
removenode and rebuild are.
On top of that, RBNO is known to have problems with empty user tables.
Using streaming for bootstrap and decommission is safe and faster
than RBNO in all conditions, especially when the table is small.
One test needs adjustment as it relies on RBNO being used for all node
ops.
Fixes: SCYLLADB-105
Closes scylladb/scylladb#28080
(cherry picked from commit b637e17b19)
Closes scylladb/scylladb#28725
58 lines
2.5 KiB
Python
58 lines
2.5 KiB
Python
#
|
|
# Copyright (C) 2023-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
#
|
|
|
|
from test.pylib.manager_client import ManagerClient
|
|
|
|
import asyncio
|
|
import pytest
|
|
|
|
from test.pylib.util import wait_for_first_completed
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_different_group0_ids(manager: ManagerClient):
    """
    Reproducer for #14448: nodes with different group0_ids must not merge.

    Two nodes are started so that they have different group0_ids. The second
    node is then restarted with the first node among its seeds and tries to
    join the cluster consisting of the first node. The first node should
    reject the gossip_digest_syn message, so the join must not succeed.

    Repair-based node operations are used to make the outcome easy to observe.
    If the second node did join, the tokens metadata of both nodes would be
    merged and the repair service would allow decommissioning the second node.
    If it did not, decommissioning the second node fails with the
    "zero replica after the removal" exception thrown by the repair service.
    """
    # Consistent topology changes are disabled to use repair-based node
    # operations; the list of allowed RBNO operations is spelled out
    # explicitly and includes bootstrap and decommission.
    node_config = {
        'force_gossip_topology_changes': True,
        'tablets_mode_for_new_keyspaces': 'disabled',
        'allowed_repair_based_node_ops': 'bootstrap,decommission,replace,removenode,rebuild',
    }

    first_node = await manager.server_add(config=node_config)
    second_node = await manager.server_add(start=False, config=node_config)

    # First start: the second node is seeded only with itself.
    await manager.server_start(second_node.server_id, seeds=[second_node.ip_addr])

    # Restart the second node, this time with the first node among its seeds.
    await manager.server_stop(second_node.server_id)
    await manager.server_start(
        second_node.server_id,
        seeds=[first_node.ip_addr, second_node.ip_addr],
    )

    first_log = await manager.server_open_log(first_node.server_id)
    second_log = await manager.server_open_log(second_node.server_id)

    # Wait for a gossip round to finish: either outcome below ends the wait.
    # The second node joins the cluster
    second_sees_first_up = second_log.wait_for(f'InetAddress {first_node.ip_addr} is now UP')
    # The first node discards gossip from the second node
    first_rejects_gossip = first_log.wait_for(f'Group0Id mismatch')
    await wait_for_first_completed([second_sees_first_up, first_rejects_gossip])

    # Decommissioning the second node is expected to fail: the repair service
    # throws a runtime exception "zero replica after the removal" when it
    # tries to remove the only node from the cluster. If the exception is not
    # thrown, the second node successfully sent gossip to the first node and
    # they merged their tokens metadata.
    with pytest.raises(Exception, match='zero replica after the removal'):
        await manager.decommission_node(second_node.server_id)
|