Compare commits

..

1 Commits

Author SHA1 Message Date
Alex
3a901a1bf3 test/cluster/test_view_building_coordinator: start view-building nodes one by one to make the tests stronger
The test_node_operation_during_view_building() setup used servers_add() to
bring up all initial nodes concurrently. That is more aggressive than this test
needs, and it makes the setup sensitive to bootstrap/topology races and to
single-node startup failures. The server_add() documentation has notes about this case.
In the decommission case in particular, the test starts with 4 nodes and only
later exercises the node operation under test. When all 4 nodes are started
concurrently, a failure in one node during initial bootstrap can cause the whole
batch add to fail before the test even reaches the decommission step. This
showed up as a "Failed to add servers" error, with later nodes timing out while waiting
for topology/IP mapping after one of the early nodes shut down.
Switch the initial cluster setup to repeated server_add() calls. This keeps
the topology changes serialized, allows each node to fully join before the next
one starts, and matches the actual needs of the test. The change does not alter
the scenario being tested; it only makes the test setup less fragile and easier
to diagnose when a node startup problem happens.
2026-03-22 12:08:34 +02:00
9 changed files with 16 additions and 50 deletions

View File

@@ -8,9 +8,6 @@ on:
jobs:
check-fixes-prefix:
runs-on: ubuntu-latest
permissions:
contents: read
issues: write
steps:
- name: Check PR body for "Fixes" prefix patterns
uses: actions/github-script@v7

View File

@@ -1,7 +1,5 @@
name: Trigger Scylla CI Route
permissions: {}
on:
issue_comment:
types: [created]

View File

@@ -952,8 +952,6 @@ class sstring:
@staticmethod
def to_hex(data, size):
if size == 0:
return ''
inf = gdb.selected_inferior()
return bytes(inf.read_memory(data, size)).hex()
@@ -976,8 +974,6 @@ class sstring:
return self.ref['u']['external']['str']
def as_bytes(self):
if len(self) == 0:
return b''
inf = gdb.selected_inferior()
return bytes(inf.read_memory(self.data(), len(self)))
@@ -5640,8 +5636,6 @@ class scylla_sstable_summary(gdb.Command):
self.inf = gdb.selected_inferior()
def to_hex(self, data, size):
if size == 0:
return ''
return bytes(self.inf.read_memory(data, size)).hex()
def invoke(self, arg, for_tty):
@@ -5653,10 +5647,6 @@ class scylla_sstable_summary(gdb.Command):
sst = seastar_lw_shared_ptr(arg).get().dereference()
else:
sst = arg
ms_version = int(gdb.parse_and_eval('sstables::sstable_version_types::ms'))
if int(sst['_version']) >= ms_version:
gdb.write("sstable uses ms format (trie-based index); summary is not populated.\n")
return
summary = seastar_lw_shared_ptr(sst['_components']['_value']).get().dereference()['summary']
gdb.write("header: {}\n".format(summary['header']))

View File

@@ -221,16 +221,10 @@ private:
sst->set_sstable_level(0);
auto units = co_await sst_manager.dir_semaphore().get_units(1);
sstables::sstable_open_config cfg {
.unsealed_sstable = true,
.ignore_component_digest_mismatch = db.get_config().ignore_component_digest_mismatch(),
};
co_await sst->load(table.get_effective_replication_map()->get_sharder(*table.schema()), cfg);
co_await table.add_new_sstable_and_update_cache(sst, [&sst_manager, sst] (sstables::shared_sstable loading_sst) -> future<> {
if (loading_sst == sst) {
auto writer_cfg = sst_manager.configure_writer(loading_sst->get_origin());
co_await loading_sst->seal_sstable(writer_cfg.backup);
}
});
co_await table.add_sstable_and_update_cache(sst);
}
future<>
@@ -301,8 +295,7 @@ private:
sstables::sstable_state::normal,
sstables::sstable::component_basename(
_table.schema()->ks_name(), _table.schema()->cf_name(), descriptor.version, gen, descriptor.format, it->first),
sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend(),
.leave_unsealed = true});
sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend()});
auto out = co_await sstable_sink->output(foptions, stream_options);
input_stream src(co_await [this, &it, sstable, f = files.at(it->first)]() -> future<input_stream<char>> {

View File

@@ -62,11 +62,7 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
// cfg.db_config->index_cache_fraction.set(1.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();
@@ -158,11 +154,7 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
// cfg.db_config->index_cache_fraction.set(0.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();

View File

@@ -691,7 +691,7 @@ class TesterAlternator(BaseAlternator):
random.choice(nodes_for_maintenance).compact()
except NodetoolError as exc:
error_message = str(exc)
valid_errors = ["ConnectException", "Connection refused", "status code 404 Not Found"]
valid_errors = ["ConnectException", "status code 404 Not Found"]
if not any(err in error_message for err in valid_errors):
raise

View File

@@ -353,7 +353,7 @@ class TestSchemaManagement(Tester):
logger.debug("Restarting node2")
node2.start(wait_for_binary_proto=True)
session2 = self.patient_exclusive_cql_connection(node2)
session2 = self.patient_cql_connection(node2)
read_barrier(session2)
rows = session.execute(SimpleStatement("SELECT * FROM cf", consistency_level=ConsistencyLevel.ALL))
@@ -382,7 +382,7 @@ class TestSchemaManagement(Tester):
logger.debug("Restarting node2")
node2.start(wait_for_binary_proto=True)
session2 = self.patient_exclusive_cql_connection(node2)
session2 = self.patient_cql_connection(node2)
read_barrier(session2)
session.execute(SimpleStatement("INSERT INTO cf (p, v) VALUES (2, '2')", consistency_level=ConsistencyLevel.ALL))

View File

@@ -808,16 +808,7 @@ async def test_index_requires_rf_rack_valid_keyspace(manager: ManagerClient):
# Create a table with tablets and no indexes, then add a GSI - the update should fail
table_name = unique_table_name()
# The server waits 10s for schema agreement after creating a table,
# which may not be enough after a sequence of rapid schema changes
# on a multi-node cluster (see SCYLLADB-1135). Retry if needed.
for attempt in range(2):
try:
create_table_with_index(alternator, table_name, index_type=None, initial_tablets='1')
break
except ClientError as e:
if 'schema agreement' not in str(e) or attempt == 1:
raise
create_table_with_index(alternator, table_name, index_type=None, initial_tablets='1')
with pytest.raises(ClientError, match=expected_err_update_add_gsi):
alternator.meta.client.update_table(
TableName=table_name,

View File

@@ -352,9 +352,14 @@ async def test_node_operation_during_view_building(manager: ManagerClient, opera
rack_layout = ["rack1", "rack2", "rack3"]
property_file = [{"dc": "dc1", "rack": rack} for rack in rack_layout]
servers = await manager.servers_add(node_count, config={"enable_tablets": "true"},
cmdline=cmdline_loggers,
property_file=property_file)
servers = [
await manager.server_add(
config={"enable_tablets": "true"},
cmdline=cmdline_loggers,
property_file=server_property_file,
)
for server_property_file in property_file
]
cql, _ = await manager.get_ready_cql(servers)
await manager.disable_tablet_balancing()