Compare commits

..

1 Commits

Author SHA1 Message Date
Alex
3a901a1bf3 test/cluster/test_view_building_coordinator: start view-building nodes one by one to make the tests stronger
The test_node_operation_during_view_building() setup used servers_add() to
bring up all initial nodes concurrently. That is more aggressive than this test
needs, and it makes the setup sensitive to bootstrap/topology races and to
single-node startup failures. The server_add() documentation has notes about this case.
In the decommission case in particular, the test starts with 4 nodes and only
later exercises the node operation under test. When all 4 nodes are started
concurrently, a failure in one node during initial bootstrap can cause the whole
batch add to fail before the test even reaches the decommission step. This
showed up as a "Failed to add servers" error, with later nodes timing out while waiting
for topology/IP mapping after one of the early nodes shut down.
Switch the initial cluster setup to repeated server_add() calls. This keeps
the topology changes serialized, allows each node to fully join before the next
one starts, and matches the actual needs of the test. The change does not alter
the scenario being tested; it only makes the test setup less fragile and easier
to diagnose when a node startup problem happens.
2026-03-22 12:08:34 +02:00
9 changed files with 16 additions and 50 deletions

View File

@@ -8,9 +8,6 @@ on:
jobs:
check-fixes-prefix:
runs-on: ubuntu-latest
permissions:
contents: read
issues: write
steps:
- name: Check PR body for "Fixes" prefix patterns
uses: actions/github-script@v7

View File

@@ -1,7 +1,5 @@
name: Trigger Scylla CI Route
permissions: {}
on:
issue_comment:
types: [created]

View File

@@ -952,8 +952,6 @@ class sstring:
@staticmethod
def to_hex(data, size):
if size == 0:
return ''
inf = gdb.selected_inferior()
return bytes(inf.read_memory(data, size)).hex()
@@ -976,8 +974,6 @@ class sstring:
return self.ref['u']['external']['str']
def as_bytes(self):
if len(self) == 0:
return b''
inf = gdb.selected_inferior()
return bytes(inf.read_memory(self.data(), len(self)))
@@ -5640,8 +5636,6 @@ class scylla_sstable_summary(gdb.Command):
self.inf = gdb.selected_inferior()
def to_hex(self, data, size):
if size == 0:
return ''
return bytes(self.inf.read_memory(data, size)).hex()
def invoke(self, arg, for_tty):
@@ -5653,10 +5647,6 @@ class scylla_sstable_summary(gdb.Command):
sst = seastar_lw_shared_ptr(arg).get().dereference()
else:
sst = arg
ms_version = int(gdb.parse_and_eval('sstables::sstable_version_types::ms'))
if int(sst['_version']) >= ms_version:
gdb.write("sstable uses ms format (trie-based index); summary is not populated.\n")
return
summary = seastar_lw_shared_ptr(sst['_components']['_value']).get().dereference()['summary']
gdb.write("header: {}\n".format(summary['header']))

View File

@@ -221,16 +221,10 @@ private:
sst->set_sstable_level(0);
auto units = co_await sst_manager.dir_semaphore().get_units(1);
sstables::sstable_open_config cfg {
.unsealed_sstable = true,
.ignore_component_digest_mismatch = db.get_config().ignore_component_digest_mismatch(),
};
co_await sst->load(table.get_effective_replication_map()->get_sharder(*table.schema()), cfg);
co_await table.add_new_sstable_and_update_cache(sst, [&sst_manager, sst] (sstables::shared_sstable loading_sst) -> future<> {
if (loading_sst == sst) {
auto writer_cfg = sst_manager.configure_writer(loading_sst->get_origin());
co_await loading_sst->seal_sstable(writer_cfg.backup);
}
});
co_await table.add_sstable_and_update_cache(sst);
}
future<>
@@ -301,8 +295,7 @@ private:
sstables::sstable_state::normal,
sstables::sstable::component_basename(
_table.schema()->ks_name(), _table.schema()->cf_name(), descriptor.version, gen, descriptor.format, it->first),
sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend(),
.leave_unsealed = true});
sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend()});
auto out = co_await sstable_sink->output(foptions, stream_options);
input_stream src(co_await [this, &it, sstable, f = files.at(it->first)]() -> future<input_stream<char>> {

View File

@@ -62,11 +62,7 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
// cfg.db_config->index_cache_fraction.set(1.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();
@@ -158,11 +154,7 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
// cfg.db_config->index_cache_fraction.set(0.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();

View File

@@ -691,7 +691,7 @@ class TesterAlternator(BaseAlternator):
random.choice(nodes_for_maintenance).compact()
except NodetoolError as exc:
error_message = str(exc)
valid_errors = ["ConnectException", "Connection refused", "status code 404 Not Found"]
valid_errors = ["ConnectException", "status code 404 Not Found"]
if not any(err in error_message for err in valid_errors):
raise

View File

@@ -353,7 +353,7 @@ class TestSchemaManagement(Tester):
logger.debug("Restarting node2")
node2.start(wait_for_binary_proto=True)
session2 = self.patient_exclusive_cql_connection(node2)
session2 = self.patient_cql_connection(node2)
read_barrier(session2)
rows = session.execute(SimpleStatement("SELECT * FROM cf", consistency_level=ConsistencyLevel.ALL))
@@ -382,7 +382,7 @@ class TestSchemaManagement(Tester):
logger.debug("Restarting node2")
node2.start(wait_for_binary_proto=True)
session2 = self.patient_exclusive_cql_connection(node2)
session2 = self.patient_cql_connection(node2)
read_barrier(session2)
session.execute(SimpleStatement("INSERT INTO cf (p, v) VALUES (2, '2')", consistency_level=ConsistencyLevel.ALL))

View File

@@ -808,16 +808,7 @@ async def test_index_requires_rf_rack_valid_keyspace(manager: ManagerClient):
# Create a table with tablets and no indexes, then add a GSI - the update should fail
table_name = unique_table_name()
# The server waits 10s for schema agreement after creating a table,
# which may not be enough after a sequence of rapid schema changes
# on a multi-node cluster (see SCYLLADB-1135). Retry if needed.
for attempt in range(2):
try:
create_table_with_index(alternator, table_name, index_type=None, initial_tablets='1')
break
except ClientError as e:
if 'schema agreement' not in str(e) or attempt == 1:
raise
create_table_with_index(alternator, table_name, index_type=None, initial_tablets='1')
with pytest.raises(ClientError, match=expected_err_update_add_gsi):
alternator.meta.client.update_table(
TableName=table_name,

View File

@@ -352,9 +352,14 @@ async def test_node_operation_during_view_building(manager: ManagerClient, opera
rack_layout = ["rack1", "rack2", "rack3"]
property_file = [{"dc": "dc1", "rack": rack} for rack in rack_layout]
servers = await manager.servers_add(node_count, config={"enable_tablets": "true"},
cmdline=cmdline_loggers,
property_file=property_file)
servers = [
await manager.server_add(
config={"enable_tablets": "true"},
cmdline=cmdline_loggers,
property_file=server_property_file,
)
for server_property_file in property_file
]
cql, _ = await manager.get_ready_cql(servers)
await manager.disable_tablet_balancing()