Compare commits

..

68 Commits

Author SHA1 Message Date
Yaniv Michael Kaul
ba21da2e02 compaction: set_skip_when_empty() for validation_errors metric
Add .set_skip_when_empty() to compaction_manager::validation_errors.
This metric only increments when scrubbing encounters out-of-order or
invalid mutation fragments in SSTables, indicating data corruption.
It is almost always zero and creates unnecessary reporting overhead.

AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
2026-04-06 14:42:07 +03:00
Avi Kivity
b4f652b7c1 test: fix flaky test_create_ks_auth by removing bad retry timeout
get_session() was passing timeout=0.1 to patient_exclusive_cql_connection
and patient_cql_connection, leaving only 0.1 seconds for the retry loop
in retry_till_success(). Since each connection attempt can take up to 5
seconds (connect_timeout=5), the retry loop effectively got only one
attempt with no chance to retry on transient NoHostAvailable errors.

Use the default timeout=30 seconds, consistent with all other callers.

Fixes: SCYLLADB-1373

Closes scylladb/scylladb#29332
2026-04-05 19:13:15 +03:00
Jenkins Promoter
ab4a2cdde2 Update pgo profiles - aarch64 2026-04-05 16:58:02 +03:00
Jenkins Promoter
b97cf0083c Update pgo profiles - x86_64 2026-04-05 16:00:15 +03:00
Nikos Dragazis
6d50e67bd2 scylla_swap_setup: Remove Before=swap.target dependency from swap unit
When a Scylla node starts, the scylla-image-setup.service invokes the
`scylla_swap_setup` script to provision swap. This script allocates a
swap file and creates a swap systemd unit to delegate control to
systemd. By default, systemd injects a Before=swap.target dependency
into every swap unit, allowing other services to use swap.target to wait
for swap to be enabled.

On Azure, this doesn't work so well because we store the swap file on
the ephemeral disk [1] which has network dependencies (`_netdev` mount
option, configured by cloud-init [2]). This makes the swap.target
indirectly depend on the network, leading to dependency cycles such as:

swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target
-> network.target -> systemd-resolved.service -> tmp.mount -> swap.target

This patch breaks the cycle by removing the swap unit from swap.target
using DefaultDependencies=no. The swap unit will still be activated via
WantedBy=multi-user.target, just not during early boot.

Although this problem is specific to Azure, this patch applies the fix
to all clouds to keep the code simple.

Fixes #26519.
Fixes SCYLLADB-1257

[1] https://github.com/scylladb/scylla-machine-image/pull/426
[2] https://github.com/canonical/cloud-init/pull/1213#issuecomment-1026065501

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#28504
2026-04-05 15:07:50 +03:00
Tomasz Grabiec
74542be5aa test: pylib: Ignore exceptions in wait_for()
ManagerClient::get_ready_cql() calls server_sees_others(), which waits
for servers to see each other as alive in gossip. If one of the
servers is still early in boot, RESTful API call to
"gossiper/endpoint/live" may fail. It throws an exception, which
currently terminates the wait_for() and propagates up, failing the test.

Fix this by ignoring errors when polling inside wait_for. In case of
timeout, we log the last exception. This should fix the problem not
only in this case, for all uses of wait_for().

Example output:

```
pred = <function ManagerClient.server_sees_others.<locals>._sees_min_others at 0x7f022af9a140>
deadline = 1775218828.9172852, period = 1.0, before_retry = None
backoff_factor = 1.5, max_period = 1.0, label = None

    async def wait_for(
            pred: Callable[[], Awaitable[Optional[T]]],
            deadline: float,
            period: float = 0.1,
            before_retry: Optional[Callable[[], Any]] = None,
            backoff_factor: float = 1.5,
            max_period: float = 1.0,
            label: Optional[str] = None) -> T:
        tag = label or getattr(pred, '__name__', 'unlabeled')
        start = time.time()
        retries = 0
        last_exception: Exception | None = None
        while True:
            elapsed = time.time() - start
            if time.time() >= deadline:
                timeout_msg = f"wait_for({tag}) timed out after {elapsed:.2f}s ({retries} retries)"
                if last_exception is not None:
                    timeout_msg += (
                        f"; last exception: {type(last_exception).__name__}: {last_exception}"
                    )
                    raise AssertionError(timeout_msg) from last_exception
                raise AssertionError(timeout_msg)

            try:
>               res = await pred()

test/pylib/util.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    async def _sees_min_others():
>       raise Exception("asd")
E       Exception: asd

test/pylib/manager_client.py:802: Exception

The above exception was the direct cause of the following exception:

manager = <test.pylib.manager_client.ManagerClient object at 0x7f022af7e7b0>

    @pytest.mark.asyncio
    async def test_auth_after_reset(manager: ManagerClient) -> None:
        servers = await manager.servers_add(3, config=auth_config, auto_rack_dc="dc1")
>       cql, _ = await manager.get_ready_cql(servers)

test/cluster/auth_cluster/test_auth_after_reset.py:33:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/pylib/manager_client.py:137: in get_ready_cql
    await self.servers_see_each_other(servers)
test/pylib/manager_client.py:820: in servers_see_each_other
    await asyncio.gather(*others)
test/pylib/manager_client.py:806: in server_sees_others
    await wait_for(_sees_min_others, time() + interval, period=.5)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

pred = <function ManagerClient.server_sees_others.<locals>._sees_min_others at 0x7f022af9a140>
deadline = 1775218828.9172852, period = 1.0, before_retry = None
backoff_factor = 1.5, max_period = 1.0, label = None

    async def wait_for(
            pred: Callable[[], Awaitable[Optional[T]]],
            deadline: float,
            period: float = 0.1,
            before_retry: Optional[Callable[[], Any]] = None,
            backoff_factor: float = 1.5,
            max_period: float = 1.0,
            label: Optional[str] = None) -> T:
        tag = label or getattr(pred, '__name__', 'unlabeled')
        start = time.time()
        retries = 0
        last_exception: Exception | None = None
        while True:
            elapsed = time.time() - start
            if time.time() >= deadline:
                timeout_msg = f"wait_for({tag}) timed out after {elapsed:.2f}s ({retries} retries)"
                if last_exception is not None:
                    timeout_msg += (
                        f"; last exception: {type(last_exception).__name__}: {last_exception}"
                    )
>                   raise AssertionError(timeout_msg) from last_exception
E                   AssertionError: wait_for(_sees_min_others) timed out after 45.30s (46 retries); last exception: Exception: asd

test/pylib/util.py:76: AssertionError
```

Fixes a failure observed in test_auth_after_reset:

```
manager = <test.pylib.manager_client.ManagerClient object at 0x7fb3740e1630>

    @pytest.mark.asyncio
    async def test_auth_after_reset(manager: ManagerClient) -> None:
        servers = await manager.servers_add(3, config=auth_config, auto_rack_dc="dc1")
        cql, _ = await manager.get_ready_cql(servers)
        await cql.run_async("ALTER ROLE cassandra WITH PASSWORD = 'forgotten_pwd'")

        logging.info("Stopping cluster")
        await asyncio.gather(*[manager.server_stop_gracefully(server.server_id) for server in servers])

        logging.info("Deleting sstables")
        for table in ["roles", "role_members", "role_attributes", "role_permissions"]:
            await asyncio.gather(*[manager.server_wipe_sstables(server.server_id, "system", table) for server in servers])

        logging.info("Starting cluster")
        # Don't try connect to the servers yet, with deleted superuser it will be possible only after
        # quorum is reached.
        await asyncio.gather(*[manager.server_start(server.server_id, connect_driver=False) for server in servers])

        logging.info("Waiting for CQL connection")
        await repeat_until_success(lambda: manager.driver_connect(auth_provider=PlainTextAuthProvider(username="cassandra", password="cassandra")))
>       await manager.get_ready_cql(servers)

test/cluster/auth_cluster/test_auth_after_reset.py:50:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/pylib/manager_client.py:137: in get_ready_cql
    await self.servers_see_each_other(servers)
test/pylib/manager_client.py:819: in servers_see_each_other
    await asyncio.gather(*others)
test/pylib/manager_client.py:805: in server_sees_others
    await wait_for(_sees_min_others, time() + interval, period=.5)
test/pylib/util.py:71: in wait_for
    res = await pred()
test/pylib/manager_client.py:802: in _sees_min_others
    alive_nodes = await self.api.get_alive_endpoints(server_ip)
test/pylib/rest_client.py:243: in get_alive_endpoints
    data = await self.client.get_json(f"/gossiper/endpoint/live", host=node_ip)
test/pylib/rest_client.py:99: in get_json
    ret = await self._fetch("GET", resource_uri, response_type = "json", host = host,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <test.pylib.rest_client.TCPRESTClient object at 0x7fb2404a0650>
method = 'GET', resource = '/gossiper/endpoint/live', response_type = 'json'
host = '127.15.252.8', port = 10000, params = None, json = None, timeout = None
allow_failed = False

    async def _fetch(self, method: str, resource: str, response_type: Optional[str] = None,
                     host: Optional[str] = None, port: Optional[int] = None,
                     params: Optional[Mapping[str, str]] = None,
                     json: Optional[Mapping] = None, timeout: Optional[float] = None, allow_failed: bool = False) -> Any:
        # Can raise exception. See https://docs.aiohttp.org/en/latest/web_exceptions.html
        assert method in ["GET", "POST", "PUT", "DELETE"], f"Invalid HTTP request method {method}"
        assert response_type is None or response_type in ["text", "json"], \
                f"Invalid response type requested {response_type} (expected 'text' or 'json')"
        # Build the URI
        port = port if port else self.default_port if hasattr(self, "default_port") else None
        port_str = f":{port}" if port else ""
        assert host is not None or hasattr(self, "default_host"), "_fetch: missing host for " \
                "{method} {resource}"
        host_str = host if host is not None else self.default_host
        uri = self.uri_scheme + "://" + host_str + port_str + resource
        logging.debug(f"RESTClient fetching {method} {uri}")

        client_timeout = ClientTimeout(total = timeout if timeout is not None else 300)
        async with request(method, uri,
                           connector = self.connector if hasattr(self, "connector") else None,
                           params = params, json = json, timeout = client_timeout) as resp:
            if allow_failed:
                return await resp.json()
            if resp.status != 200:
                text = await resp.text()
>               raise HTTPError(uri, resp.status, params, json, text)
E               test.pylib.rest_client.HTTPError: HTTP error 404, uri: http://127.15.252.8:10000/gossiper/endpoint/live, params: None, json: None, body:
E               {"message": "Not found", "code": 404}

test/pylib/rest_client.py:77: HTTPError
```

Fixes: SCYLLADB-1367

Closes scylladb/scylladb#29323
2026-04-05 13:52:26 +03:00
Andrzej Jackowski
8c0920202b test: protect populate_range in row_cache_test from bad_alloc
When test_exception_safety_of_update_from_memtable was converted from
manual fail_after()/catch to with_allocation_failures() in 74db08165d,
the populate_range() call ended up inside the failure injection scope
without a scoped_critical_alloc_section guard. The other two tests
converted in the same commit (test_exception_safety_of_transitioning...
and test_exception_safety_of_partition_scan) were correctly guarded.

Without the guard, the allocation failure injector can sometimes
target an allocation point inside the cleanup path of populate_range().
In a rare corner case, this triggers a bad_alloc in a noexcept context
(reader_concurrency_semaphore::stop()), causing std::terminate.

Fixes SCYLLADB-1346

Closes scylladb/scylladb#29321
2026-04-04 21:13:26 +03:00
Botond Dénes
2c22d69793 Merge 'Pytest: fix variable handling in GSServer (mock) and ensure docker service logs go to test log as well' from Calle Wilund
Fixes: SCYLLADB-1106

* Small fix in scylla_cluster - remove debug print
* Fix GSServer::unpublish so it does not except if publish was not called beforehand
* Improve dockerized_server so mock server logs echo to the test log to help diagnose CI failures (because we don't collect log files from mocks etc, and in any case correlation will be much easier).

No backport needed.

Closes scylladb/scylladb#29112

* github.com:scylladb/scylladb:
  dockerized_service: Convert log reader to pipes and push to test log
  test::cluster::conftest::GSServer: Fix unpublish for when publish was not called
  scylla_cluster: Use thread safe future signalling
  scylla_cluster: Remove left-over debug printout
2026-04-03 06:38:05 +03:00
Raphael S. Carvalho
b6ebbbf036 test/cluster/test_tablets2: Fix test_split_stopped_on_shutdown race with stale log messages
The test was failing because the call to:

    await log.wait_for('Stopping.*ongoing compactions')

was missing the 'from_mark=log_mark' argument. The log mark was updated
(line: log_mark = await log.mark()) immediately after detecting
'splitting_mutation_writer_switch_wait: waiting', and just before
launching the shutdown task. However, the wait_for call on the following
line was scanning from the beginning of the log, not from that mark.

As a result, the search immediately matched old 'Stopping N tasks for N
ongoing compactions for table system.X due to table removal' messages
emitted during initial server bootstrap (for system.large_partitions,
system.large_rows, system.large_cells), rather than waiting for the
shutdown to actually stop the user-table split compaction.

This caused the test to prematurely send the message to the
'splitting_mutation_writer_switch_wait' injection. The split compaction
was unblocked before the shutdown had aborted it, so it completed
successfully. Since the split succeeded, 'Failed to complete splitting
of table' was never logged.

Meanwhile, 'storage_service_drain_wait' was blocking do_drain() waiting
for a message. With the split already done, the test was stuck waiting
for the expected failure log that would never come (600s timeout). At
the same time, after 60s the 'storage_service_drain_wait' injection
timed out internally, triggering on_internal_error() which -- with
--abort-on-internal-error=1 -- crashed the server (exit code -6).

Fix: pass from_mark=log_mark to the wait_for('Stopping.*ongoing
compactions') call so it only matches messages that appear after the
shutdown has started, ensuring the test correctly synchronizes with the
shutdown aborting the user-table split compaction before releasing the
injection.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1319.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29311
2026-04-03 06:28:51 +03:00
Andrei Chekun
6526a78334 test.py: fix nodetool mock server port collision
Replace the random port selection with an OS-assigned port. We open
a temporary TCP socket, bind it to (ip, 0) with SO_REUSEADDR, read back
the port number the OS selected, then close the socket before launching
rest_api_mock.py.
Add reuse_address=True and reuse_port=True to TCPSite in rest_api_mock.py
so the server itself can also reclaim a TIME_WAIT port if needed.

Fixes: SCYLLADB-1275

Closes scylladb/scylladb#29314
2026-04-02 16:24:07 +02:00
Botond Dénes
eb78498e07 test: fix flaky test_timeout_is_applied_on_lookup by using eventually_true
On slow/overloaded CI machines the lowres_clock timer may not have
fired after the fixed 2x sleep, causing the assertion on
get_abort_exception() to fail. Replace the fixed sleep with
sleep(1x) + eventually_true() which retries with exponential backoff,
matching the pattern already used in test_time_based_cache_eviction.

Fixes: SCYLLADB-1311

Closes scylladb/scylladb#29299
2026-04-01 18:20:11 +03:00
Robert Bindar
e7527392c4 test: close clients if cluster teardown throws
make sure the driver is stopped even though cluster
teardown throws and avoid potential stale driver
connections entering infinite reconnect loops which
exhaust cpu resources.

Fixes: SCYLLADB-1189

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>

Closes scylladb/scylladb#29230
2026-04-01 17:22:19 +03:00
Tomasz Grabiec
2ec47a8a21 tests: address_map_test: Fix flakiness in debug mode due to task reordering
Debug mode shuffles task position in the queue. So the following is possible:
 1) shard 1 calls manual_clock::advance(). This expires timers on shard 1 and queues a background smp call to shard 0 which will expire timers there
 2) the smp::submit_to(0, ...) from shard 1 called by the test submits the call
 3) shard 0 creates tasks for both calls, but (2) is run first, and preempts the reactor
 4) shard 1 sees the completion, completes m_svc.invoke_on(1, ..)
 5) shard 0 inserts the completion from (4) before task from (1)
 6) the check on shard 0: m.find(id1) fails because the timer is not expired yet

To fix that, wait for timer expiration on shard 0, so that the test
doesn't depend on task execution order.

Note: I was not able to reproduce the problem locally using test.py --mode
debug --repeat 1000.

It happens in jenkins very rarely. Which is expected as the scenario which
leads to this is quite unlikely.

Fixes SCYLLADB-1265

Closes scylladb/scylladb#29290
2026-04-01 17:17:35 +03:00
Aleksandra Martyniuk
4d4ce074bb test: node_ops_tasks_tree: reconnect driver after topology changes
The test exercises all five node operations (bootstrap, replace, rebuild,
removenode, decommission) and by the end only one node out of four
remains alive. The CQL driver session, however, still holds stale
references to the dead hosts in its connection pool and load-balancing
policy state.

When the new_test_keyspace context manager exits and attempts
DROP KEYSPACE, the driver routes the query to the dead hosts first,
gets ConnectionShutdown from each, and throws NoHostAvailable before
ever trying the single live node.

Fix by calling driver_connect() after the decommission step, which
closes the old session and creates a fresh one connected only to the
servers the test manager reports as running.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1313.

Closes scylladb/scylladb#29306
2026-04-01 17:13:11 +03:00
Botond Dénes
0351756b15 Merge 'test: fix fuzzy_test timeout in release mode' from Piotr Smaron
The multishard_query_test/fuzzy_test was timing out (SIGKILL after
15 minutes) in release mode CI.

In release mode the test generates up to 64 partitions with up to
1000 clustering rows and 1000 range tombstones each.  With deeply
nested randomly-generated types (e.g. frozen<map<varint,
frozen<map<frozen<tuple<...>>>>>>), this volume of data can exceed
the 15-minute CI timeout.

Reduce the release-mode clustering-row and range-tombstone
distributions from 0-1000 to 0-200.  This caps the worst case at
~12,800 rows -- still 2x the devel-mode maximum (0-100) and
sufficient to exercise multi-partition paged scanning with many
pages.

Fixes: SCYLLADB-1270

No need to backport for now, only appeared on master.

Closes scylladb/scylladb#29293

* github.com:scylladb/scylladb:
  test: clean up fuzzy_test_config and add comments
  test: fix fuzzy_test timeout in release mode
2026-04-01 11:50:15 +03:00
Andrei Chekun
18f41dcd71 test.py: introduce new scheduler for choosing job count
This commit improves how test.py chooses the default number of
parallel jobs.
This update keeps the logic of selecting the number of jobs from memory and CPU limits
but simplifies the heuristic so it is smoother and easier to reason about.
This avoids discontinuities such as neighboring machine sizes producing
unexpectedly different job counts, and behaves more predictably on asymmetric
machines where CPU and RAM do not scale together.

Compared to the current threshold-based version, this approach:
- avoids hard jumps around memory cutoffs
- avoids bucketed debug scaling based on CPU count
- keeps CPU and memory as separate constraints and combines them in one place
- avoids double-penalizing debug mode
- is easier to tune later by adjusting a few constants instead of rewriting branching logic

Closes scylladb/scylladb#28904
2026-04-01 11:11:15 +03:00
Avi Kivity
d438e35cdd test/cluster: fix race in test_insert_failure_standalone audit log query
get_audit_partitions_for_operation() returns None when no audit log
rows are found. In _test_insert_failure_doesnt_report_success_assign_nodes,
this None is passed to set(), causing TypeError: 'NoneType' object is
not iterable.

The audit log entry may not yet be visible immediately after executing
the INSERT, so use wait_for() from test.pylib.util with exponential
backoff to poll until the entry appears. Import it as wait_for_async
to avoid shadowing the existing wait_for from test.cluster.dtest.dtest_class,
which has a different signature (timeout vs deadline).

Fixes SCYLLADB-1330

Closes scylladb/scylladb#29289
2026-04-01 10:59:02 +03:00
Botond Dénes
2d2ff4fbda sstables: use chunked_managed_vector for promoted indexes in partition_index_page
Switch _promoted_indexes storage in partition_index_page from
managed_vector to chunked_managed_vector to avoid large contiguous
allocations.

Avoid allocation failure (or crashes with --abort-on-internal-error)
when large partitions have enough promoted index entries to trigger a
large allocation with managed_vector.

Fixes: SCYLLADB-1315

Closes scylladb/scylladb#29283
2026-03-31 18:43:57 +03:00
Piotr Smaron
2ce409dca0 test: clean up fuzzy_test_config and add comments
Remove the unused timeout field from fuzzy_test_config.  It was
declared, initialized per build mode, and logged, but never actually
enforced anywhere.

Document the intentionally small max_size (1024 bytes) passed to
read_partitions_with_paged_scan in run_fuzzy_test_scan: it forces
many pages per scan to stress the paging and result-merging logic.
2026-03-31 17:13:26 +02:00
Piotr Smaron
df2924b2a3 test: fix fuzzy_test timeout in release mode
The multishard_query_test/fuzzy_test was timing out (SIGKILL after
15 minutes) in release mode CI.

In release mode the test generates up to 64 partitions with up to
1000 clustering rows and 1000 range tombstones each.  With deeply
nested randomly-generated types (e.g. frozen<map<varint,
frozen<map<frozen<tuple<...>>>>>>), this volume of data can exceed
the 15-minute CI timeout.

Reduce the release-mode clustering-row and range-tombstone
distributions from 0-1000 to 0-200.  This caps the worst case at
~12,800 rows -- still 2x the devel-mode maximum (0-100) and
sufficient to exercise multi-partition paged scanning with many
pages.

Fixes: SCYLLADB-1270
2026-03-31 17:13:06 +02:00
Piotr Szymaniak
6d8ec8a0c0 alternator: fix flaky test_update_condition_unused_entries_short_circuit
The test was flaky because it stopped dc2_node immediately after an
LWT write, before cross-DC replication could complete. The LWT commit
uses LOCAL_QUORUM, which only guarantees persistence in the
coordinator's DC. Replication to the remote DC is async background
work, and CAS mutations don't store hints. Stopping dc2_node could
drop in-flight RPCs, leaving DC1 without the mutation.

Fix by polling both live DC1 nodes after the write to confirm
cross-DC replication completed before stopping dc2_node. Both nodes
must have the data so that the later ConsistentRead=True
(LOCAL_QUORUM) read on restarted node1 is guaranteed to succeed.

Fixes SCYLLADB-1267

Closes scylladb/scylladb#29287
2026-03-31 16:50:51 +03:00
Dawid Mędrek
f040f1b703 Merge 'raft: remake the read barrier optimization' from Patryk Jędrzejczak
The approach taken in 1ae2ae50a6 turned
out to be incorrect. The Raft member requesting a read barrier could
incorrectly advance its commit_idx and break linearizability. We revert that
commit in this PR.

We also remake the read barrier optimization with a completely new approach.
We make the leader replicate to the non-voting requester of a read barrier if
its `commit_idx` is behind.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-998

No backport: the issue is present only in master.

Closes scylladb/scylladb#29216

* github.com:scylladb/scylladb:
  raft: speed up read barrier requested by non-voters
  Revert "raft: read_barrier: update local commit_idx to read_idx when it's safe"
2026-03-31 15:11:56 +02:00
Avi Kivity
216d39883a Merge 'test: audit: fix audit test syslog race' from Dario Mirovic
Fix two independent race conditions in the syslog audit test that cause intermittent `assert 2 <= 1` failures in `assert_entries_were_added`.

**Datagram ordering race:**
`UnixSockerListener` used `ThreadingUnixDatagramServer`, where each datagram spawns a new thread. The notification barrier in `get_lines()` assumes FIFO handling, but the notification thread can win the lock before an audit entry thread, so `clear_audit_logs()` misses entries that arrive moments later. Fix: switch to sequential `UnixDatagramServer`.

**Config reload race:**
The live-update path used `wait_for_config` (REST API poll on shard 0) which can return before `broadcast_to_all_shards()` completes. Fix: wait for `"completed re-reading configuration file"` in the server log after each SIGHUP, which guarantees all shards have the new config.

Fixes SCYLLADB-1277

This is CI improvement for the latest code. No need for backport.

Closes scylladb/scylladb#29282

* github.com:scylladb/scylladb:
  test: cluster: wait for full config reload in audit live-update path
  test: cluster: fix syslog listener datagram ordering race
2026-03-31 13:53:01 +03:00
Tomasz Grabiec
b355bb70c2 dtest/alternator: stop concurrent-requests test when workers hit limit
`test_limit_concurrent_requests` could create far more tables than intended
because worker threads looped indefinitely and only the probe path terminated
the test. In practice, workers often hit `RequestLimitExceeded` first, but the
test kept running and creating tables, increasing memory pressure and causing
flakiness due to bad_alloc errors in logs.

Fix by replacing the old probe-driven termination with worker-driven
termination. Workers now run until any worker sees
`RequestLimitExceeded`.

Fixes SCYLLADB-1181

Closes scylladb/scylladb#29270
2026-03-31 13:35:50 +03:00
Patryk Jędrzejczak
b9f82f6f23 raft_group0: join_group0: fix join hang when node joins group 0 before post_server_start
A joining node hung forever if the topology coordinator added it to the
group 0 configuration before the node reached `post_server_start`. In
that case, `server->get_configuration().contains(my_id)` returned true
and the node broke out of the join loop early, skipping
`post_server_start`. `_join_node_group0_started` was therefore never set,
so the node's `join_node_response` RPC handler blocked indefinitely.
Meanwhile the topology coordinator's `respond_to_joining_node` call
(which has no timeout) hung forever waiting for the reply that never came.

Fix by only taking the early-break path when not starting as a follower
(i.e. when the node is the discovery leader or is restarting). A joining
node must always reach `post_server_start`.

We also provide a regression test. It takes 6s in dev mode.

Fixes SCYLLADB-959

Closes scylladb/scylladb#29266
2026-03-31 12:33:56 +02:00
Dario Mirovic
0cb63fb669 test: cluster: wait for full config reload in audit live-update path
_apply_config_to_running_servers used wait_for_config (REST API poll)
to confirm live config updates. The REST API reads from shard 0 only,
so it can return before broadcast_to_all_shards() completes — other
shards may still have stale audit config, generating unexpected entries.
Additionally, server_remove_config_option for absent keys sent separate
SIGHUPs before server_update_config, and the single wait_for_config at
the end could match a completion from an earlier SIGHUP.

Wait for "completed re-reading configuration file" in the server log
after each SIGHUP-producing operation. This message is logged only
after both read_config() and broadcast_to_all_shards() finish,
guaranteeing all shards have the new config. Each operation gets its
own mark+wait so no stale completion is matched.

Fixes SCYLLADB-1277
2026-03-31 02:27:11 +02:00
Dario Mirovic
1d623196eb test: cluster: fix syslog listener datagram ordering race
UnixSockerListener used ThreadingUnixDatagramServer, which spawns a
new thread per datagram. The notification barrier in get_lines() relies
on all prior datagrams being handled before the notification. With
threading, the notification handler can win the lock before an audit
entry handler, so get_lines() returns before the entry is appended.
clear_audit_logs() then clears an incomplete buffer, and the late
entry leaks into the next test's before/after diff.

Switch to sequential UnixDatagramServer. The server thread now handles
datagrams in kernel FIFO order, so the notification is always processed
after all preceding audit entries.

Refs SCYLLADB-1277
2026-03-31 02:27:11 +02:00
Patryk Jędrzejczak
ba54b2272b raft: speed up read barrier requested by non-voters
We achieve this by making the leader replicate to the non-voting requester
of a read barrier if its commit_idx is behind.

There are some corner cases where the new `replicate_to(*opt_progress, true);`
call will be a no-op, while the corresponding call in `tick_leader()` would
result in sending the AppendEntries RPC to the follower. These cases are:
- `progress.state == follower_progress::state::PROBE && progress.probe_sent`,
- `progress.state == follower_progress::state::PIPELINE
  && progress.in_flight == follower_progress::max_in_flight`.
We could try to improve the optimization by including some of the cases above,
but it would only complicate the code without noticeable benefits (at least
for group0).

Note: this is the second attempt for this optimization. The first approach
turned out to be incorrect and was reverted in the previous commit. The
performance improvement is the same as in the previous case.
2026-03-30 15:56:24 +02:00
Patryk Jędrzejczak
4913acd742 Revert "raft: read_barrier: update local commit_idx to read_idx when it's safe"
This reverts commit 1ae2ae50a6.

The reverted change turned out to be incorrect. The Raft member requesting
a read barrier could incorrectly advance its commit_idx and break
linearizability. More details in
https://scylladb.atlassian.net/browse/SCYLLADB-998?focusedCommentId=42935
2026-03-30 15:56:24 +02:00
Andrzej Jackowski
ab43420d30 test: use exclusive driver connection in test_limited_concurrency_of_writes
Use get_cql_exclusive(node1) so the driver only connects to node1 and
never attempts to contact the stopped node2. The test was flaky because
the driver received `Host has been marked down or removed` from node2.

Fixes: SCYLLADB-1227

Closes scylladb/scylladb#29268
2026-03-30 11:50:44 +02:00
Botond Dénes
068a7894aa test/cluster: fix flaky test_cleanup_stop by using asyncio.sleep
The test was using time.sleep(1) (a blocking call) to wait after
scheduling the stop_compaction task, intending to let it register on
the server before releasing the sstable_cleanup_wait injection point.

However, time.sleep() blocks the asyncio event loop entirely, so the
asyncio.create_task(stop_compaction) task never gets to run during the
sleep. After the sleep, the directly-awaited message_injection() runs
first, releasing the injection point before stop_compaction is even
sent. By the time stop_compaction reaches Scylla, the cleanup has
already completed successfully -- no exception is raised and the test
fails.

Fix by replacing time.sleep(1) with await asyncio.sleep(1), which
yields control to the event loop and allows the stop_compaction task
to actually send its HTTP request before message_injection is called.

Fixes: SCYLLADB-834

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29202
2026-03-30 11:40:47 +03:00
Nadav Har'El
d32fe72252 Merge 'alternator: check concurrency limit before memory acquisition' from Łukasz Paszkowski
Fix the ordering of the concurrency limit check in the Alternator HTTP server so it happens before memory acquisition, and reduce test pressure to avoid LSA exhaustion on the memory-constrained test node.

The patch moves the concurrency check to right after the content-length early-out, before any memory acquisition or I/O. The check was originally placed before memory acquisition but was inadvertently moved after it during a refactoring. This allowed unlimited requests to pile up consuming memory, reading bodies, verifying signatures, and decompressing — all before being rejected. Restores the original ordering and mirrors the CQL transport (`transport/server.cc`).

Lowers `concurrent_requests_limit` from 5 to 3 and the thread multiplier from 5 to 2 (6 threads instead of 25). This is still sufficient to reliably trigger RequestLimitExceeded, while keeping flush pressure within what 512MB per shard can sustain.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1248
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1181

The test started to fail quite recently. It affects master only. No backport is needed. We might want to consider backporting a commit moving the concurrency check earlier.

Closes scylladb/scylladb#29272

* github.com:scylladb/scylladb:
  test: reduce concurrent-request-limit test pressure to avoid LSA exhaustion
  alternator: check concurrency limit before memory acquisition
2026-03-29 11:08:28 +03:00
Łukasz Paszkowski
b8e3ef0c64 test: reduce concurrent-request-limit test pressure to avoid LSA exhaustion
The test_limit_concurrent_requests dtest uses concurrent CreateTable
requests to verify Alternator's concurrency limiting.  Each admitted
CreateTable triggers Raft consensus, schema mutations, and memtable
flushes—all of which consume LSA memory.  On the 1 GB test node
(2 SMP × 512 MB), the original settings (limit=5, 25 threads) created
enough flush pressure to exhaust the LSA emergency reserve, producing
logalloc::bad_alloc errors in the node log.  The test was always
marginal under these settings and became flaky as new system tables
increased baseline LSA usage over time.

Lower concurrent_requests_limit from 5 to 3 and the thread multiplier
from 5 to 2 (6 threads total).  This is still well above the limit and
sufficient to reliably trigger RequestLimitExceeded, while keeping flush
pressure within what 512 MB per shard can sustain.
2026-03-28 20:40:33 +01:00
Łukasz Paszkowski
a86928caa1 alternator: check concurrency limit before memory acquisition
The concurrency limit check in the Alternator server was positioned after
memory acquisition (get_units), request body reading (read_entire_stream),
signature verification, and decompression. This allowed unlimited requests
to pile up consuming memory before being rejected, exhausting LSA memory
and causing logalloc::bad_alloc errors that cascade into Raft applier
and topology coordinator failures, breaking subsequent operations.

Without this fix, test_limit_concurrent_requests on a 1GB node produces
50 logalloc::bad_alloc errors and cascading failures: reads from
system.scylla_local fail, the Raft applier fiber stops, the topology
coordinator stops, and all subsequent CreateTable operations fail with
InternalServerError (500). With this fix, the cascade is eliminated --
admitted requests may still cause LSA pressure on a memory-constrained
node, but the server remains functional.

Move the concurrency check to right after the content-length early-out,
before any memory acquisition or I/O. This mirrors the CQL transport
which correctly checks concurrency before memory acquisition
(transport/server.cc).

The concurrency check was originally added in 1b8c946ad7 (Sep 2020)
*before* memory acquisition, which at the time lived inside with_gate
(after the concurrency gate). The ordering was inverted by f41dac2a3a
(Mar 2021, "avoid large contiguous allocation for request body"), which
moved get_units() earlier in the function to reserve memory before
reading the newly-introduced content stream -- but inadvertently also
moved it before the concurrency check. c3593462a4 (Mar 2025) further
worsened the situation by adding a 16MB fallback reservation for
requests without Content-Length and ungzip/deflate decompression steps
-- all before the concurrency check -- greatly increasing the memory
consumed by requests that would ultimately be rejected.
2026-03-28 20:40:33 +01:00
Emil Maskovsky
9dad68e58d raft: abort stale snapshot transfers when term changes
**The Bug**

Assertion failure: `SCYLLA_ASSERT(res.second)` in `raft/server.cc`
when creating a snapshot transfer for a destination that already had a
stale in-flight transfer.

**Root Cause**

If a node loses leadership and later becomes leader again before the next
`io_fiber` iteration, the old transfer from the previous term can remain
in `_snapshot_transfers` while `become_leader()` resets progress state.
When the new term emits `install_snapshot(dst)`, `send_snapshot(dst)`
tries to create a new entry for the same destination and can hit the
assertion.

**The Fix**

Abort all in-flight snapshot transfers in `process_fsm_output()` when
`term_and_vote` is persisted. A term/vote change marks existing transfers
as stale, so we clean them up before dispatching messages from that batch
and before any new snapshot transfer is started.

With cross-term cleanup moved to the term-change path, `send_snapshot()`
now asserts the within-term invariant that there is at most one in-flight
transfer per destination.

Fixes: SCYLLADB-862

Backport: The issue is reproducible in master, but is present in all
active branches.

Closes scylladb/scylladb#29092
2026-03-27 10:00:15 +01:00
Andrzej Jackowski
181ad9f476 Revert "audit: disable DDL by default"
This reverts commit c30607d80b.

With the default configuration, enabling DDL has no effect because
no `audit_keyspaces` or `audit_tables` are specified. Including DDL
in the default categories can be misleading for some customers, and
ideally we would like to avoid it.

However, DDL has been one of the default audit categories for years,
and removing it risks silently breaking existing deployments that
depend on it. Therefore, the recent change to disable DDL by default
is reverted.

Fixes: SCYLLADB-1155

Closes scylladb/scylladb#29169
2026-03-27 09:55:11 +01:00
Botond Dénes
854c374ebf test/encryption: wait for topology convergence after abrupt restart
test_reboot uses a custom restart function that SIGKILLs and restarts
nodes sequentially. After all nodes are back up, the test proceeded
directly to reads after wait_for_cql_and_get_hosts(), which only
confirms CQL reachability.

While a node is restarted, other nodes might execute global token
metadata barriers, which advance the topology fence version. The
restarted node has to learn about the new version before it can send
reads/writes to the other nodes. The test issues reads as soon as the
CQL port is opened, which might happen before the last restarted node
learns of the latest topology version. If this node acts as a
coordinator for reads/write before this happens, these will fail as the
other nodes will reject the ops with the outdated topology fence
version.

Fix this by replacing wait_for_cql_and_get_hosts() on the abrupt-restart
path with the more robust get_ready_cql(), which makes sure servers see
each other before refreshing the cql connection. This should ensure that
nodes have exchanged gossip and converged on topology state before any
reads are executed. The rolling_restart() path is unaffected as it
handles this internally.

Fixes: SCYLLADB-557

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29211
2026-03-27 09:52:27 +01:00
Avi Kivity
b708e5d7c9 Merge 'test: fix race condition in test_crashed_node_substitution' from Sergey Zolotukhin
`test_crashed_node_substitution` intermittently failed:
```python
   assert len(gossiper_eps) == (len(server_eps) + 1)
```
The test crashed the node right after a single ACK2 handshake (`finished do_send_ack2_msg`), assuming the node state was visible to all peers. However, since gossip is eventually consistent, the update may not have propagated yet, so some nodes did not see the failed node.

This change: Wait until the gossiper state is visible on peers before continuing the test and asserting.

Fixes: [SCYLLADB-1256](https://scylladb.atlassian.net/browse/SCYLLADB-1256).

backport: this issue may affect CI for all branches, so should be backported to all versions.

[SCYLLADB-1256]: https://scylladb.atlassian.net/browse/SCYLLADB-1256?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#29254

* github.com:scylladb/scylladb:
  test: test_crashed_node_substitution: add docstring and fix whitespace
  test: fix race condition in test_crashed_node_substitution
2026-03-26 21:40:33 +02:00
Petr Gusev
c38e312321 test_lwt_fencing_upgrade: fix quorum failure due to gossip lag
If lwt_workload() sends an update immediately after a
rolling restart, the coordinator might still see a replica as
down due to gossip lagging behind. Concurrently restarting another
node leaves only one available replica, failing the
LOCAL_QUORUM requirement for learn or eventually consistent
sp::query() in sp::cas() and resulting in
a mutation_write_failure_exception.

We fix this problem by waiting for the restarted server
to see 2 other peers. The server_change_version
doesn't do that by default -- it passes
wait_others=0 to server_start().

Fixes SCYLLADB-1136

Closes scylladb/scylladb#29234
2026-03-26 21:25:53 +02:00
bitpathfinder
627a8294ed test: test_crashed_node_substitution: add docstring and fix whitespace
Add a description of the test's intent and scenario; remove extra blanks.
2026-03-26 18:40:17 +01:00
bitpathfinder
5a086ae9b7 test: fix race condition in test_crashed_node_substitution
`test_crashed_node_substitution` intermittently failed:
```
    assert len(gossiper_eps) == (len(server_eps) + 1)
```
The test crashed the node right after a single ACK2 handshake
("finished do_send_ack2_msg"), assuming the node state was
visible to all peers. However, since gossip is eventually
consistent, the update may not have propagated yet, so some
nodes did not see the failed node.

This change: Wait until the gossiper state is visible on
peers before continuing the test and asserting.

Fixes: SCYLLADB-1256.
2026-03-26 18:25:05 +01:00
Robert Bindar
c575bbf1e8 test_refresh_deletes_uploaded_sstables should wait for sstables to get deleted
SSTable unlinking is async, so in some cases it may happen that
the upload dir is not empty immediately after refresh is done.
This patch adjusts test_refresh_deletes_uploaded_sstables so
it waits with a timeout till the upload dir becomes empty
instead of just assuming the API will sync on sstables being
gone.

Fixes SCYLLADB-1190

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>

Closes scylladb/scylladb#29215
2026-03-26 08:43:14 +03:00
Marcin Maliszkiewicz
7fdd650009 Merge 'test: audit: clean up test helper class naming' from Dario Mirovic
Remove unused `pytest.mark.single_node` marker from `TestCQLAudit`.

Rename `TestCQLAudit` to `CQLAuditTester` to reflect that it is a test helper, not a test class. This avoids accidental pytest collection and subsequent warning about `__init__`.

Logs before the fixes:
```
test/cluster/test_audit.py:514: 14 warnings
  /home/dario/dev/scylladb/test/cluster/test_audit.py:514: PytestCollectionWarning: cannot collect test class 'TestCQLAudit' because it has a __init__ constructor (from: cluster/test_audit.py)
    @pytest.mark.single_node
```

Fixes SCYLLADB-1237

This is an addition to the latest master code. No backport needed.

Closes scylladb/scylladb#29237

* github.com:scylladb/scylladb:
  test: audit: rename TestCQLAudit to CQLAuditTester
  test: audit: remove unused pytest.mark.single_node
2026-03-25 15:30:16 +01:00
Dario Mirovic
552a2d0995 test: audit: rename TestCQLAudit to CQLAuditTester
pytest tries to collect tests for execution in several ways.
One is to pick all classes that start with 'Test'. Those classes
must not have custom '__init__' constructor. TestCQLAudit does.

TestCQLAudit after migration from test/cluster/dtest is not a test
class anymore, but rather a helper class. There are two ways to fix
this:
1. Add __test__ = False to the TestCQLAudit class
2. Rename it to not start with 'Test'

Option 2 feels better because the new name itself does not convey
the wrong message about its role.

Fixes SCYLLADB-1237
2026-03-25 13:21:08 +01:00
Dario Mirovic
73de865ca3 test: audit: remove unused pytest.mark.single_node
Remove unused pytest.mark.single_node in TestCQLAudit class.
This is a leftover from audit tests migration from
test/cluster/dtest to test/cluster.

Refs SCYLLADB-1237
2026-03-25 13:18:37 +01:00
Marcin Maliszkiewicz
f988ec18cb test/lib: fix port in-use detection in start_docker_service
Previously, the result of when_all was discarded. when_all stores
exceptions in the returned futures rather than throwing, so the outer
catch(in_use&) could never trigger. Now we capture the when_all result
and inspect each future individually to properly detect in_use from
either stream.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1216

Closes scylladb/scylladb#29219
2026-03-25 11:45:53 +02:00
Artsiom Mishuta
cd1679934c test/pylib: use exponential backoff in wait_for()
Change wait_for() defaults from period=1s/no backoff to period=0.1s
with 1.5x backoff capped at 1.0s. This catches fast conditions in
100ms instead of 1000ms, benefiting ~100 call sites automatically.

Add completion logging with elapsed time and iteration count.

Tested local with test/cluster/test_fencing.py::test_fence_hints (dev mode),
log output:

  wait_for(at_least_one_hint_failed) completed in 0.83s (4 iterations)
  wait_for(exactly_one_hint_sent) completed in 1.34s (5 iterations)

Fixes SCYLLADB-738

Closes scylladb/scylladb#29173
2026-03-24 23:49:49 +02:00
Botond Dénes
d52fbf7ada Merge 'test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces' from Dawid Mędrek
The test was flaky. The scenario looked like this:

1. Stop server 1.
2. Set its rf_rack_valid_keyspaces configuration option to true.
3. Create an RF-rack-invalid keyspace.
4. Start server 1 and expect a failure during start-up.

It was wrong. We cannot predict when the Raft mutation corresponding to
the newly created keyspace will arrive at the node or when it will be
processed. If the check of the RF-rack-valid keyspaces we perform at
start-up was done before that, it won't include the keyspace. This will
lead to a test failure.

Unfortunately, it's not feasible to perform a read barrier during
start-up. What's more, although it would help the test, it wouldn't be
useful otherwise. Because of that, we simply fix the test, at least for
now.

The new scenario looks like this:

1. Disable the rf_rack_valid_keyspaces configuration option on server 1.
2. Start the server.
3. Create an RF-rack-invalid keyspace.
4. Perform a read barrier on server 1. This will ensure that it has
   observed all Raft mutations, and we won't run into the same problem.
5. Stop the node.
6. Set its rf_rack_valid_keyspaces configuration option to true.
7. Try to start the node and observe a failure.

This will make the test perform consistently.

---

I ran the test (in dev mode, on my local machine) three times before
these changes, and three times with them. I include the time results
below.

Before:
```
real    0m47.570s
user    0m41.631s
sys     0m8.634s

real    0m50.495s
user    0m42.499s
sys     0m8.607s

real    0m50.375s
user    0m41.832s
sys     0m8.789s
```

After:
```
real    0m50.509s
user    0m43.535s
sys     0m9.715s

real    0m50.857s
user    0m44.185s
sys     0m9.811s

real    0m50.873s
user    0m44.289s
sys     0m9.737s
```

Fixes SCYLLADB-1137

Backport: The test is present on all supported branches, and so we
          should backport these changes to them.

Closes scylladb/scylladb#29218

* github.com:scylladb/scylladb:
  test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces
  test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py
2026-03-24 21:09:19 +02:00
Patryk Jędrzejczak
141aa2d696 Merge 'test/cluster/test_incremental_repair.py: fix typo + enable compaction DEBUG logs' from Botond Dénes
This PR contains two small improvements to `test_incremental_repair.py`
motivated by the sporadic failure of
`test_tablet_incremental_repair_and_scrubsstables_abort`.

The test fails with `assert 3 == 2` on `len(sst_add)` in the second
repair round. The extra SSTable has `repaired_at=0`, meaning scrub
unexpectedly produced more unrepaired SSTables than anticipated. Since
scrub (and compaction in general) logs at DEBUG level and the test did
not enable debug logging, the existing logs do not contain enough
information to determine the root cause.

**Commit 1** fixes a long-standing typo in the helper function name
(`preapre` -> `prepare`).

**Commit 2** enables `compaction=debug` for the Scylla nodes started by
`do_tablet_incremental_repair_and_ops`, which covers all
`test_tablet_incremental_repair_and_*` variants. This will capture full
compaction/scrub activity on the next reproduction, making the failure
diagnosable.

Refs: SCYLLADB-1086

Backport: test improvement, no backport

Closes scylladb/scylladb#29175

* https://github.com/scylladb/scylladb:
  test/cluster/test_incremental_repair.py: enable compaction DEBUG logs in do_tablet_incremental_repair_and_ops
  test/cluster/test_incremental_repair.py: fix typo preapre -> prepare
2026-03-24 16:27:01 +01:00
Ernest Zaslavsky
c670183be8 cmake: fix precompiled header (PCH) creation
Two issues prevented the precompiled header from compiling
successfully when using CMake directly (rather than the
configure.py + ninja build system):

a) Propagate build flags to Rust binding targets reusing the
   PCH. The wasmtime_bindings and inc targets reuse the PCH
   from scylla-precompiled-header, which is compiled with
   Seastar's flags (including sanitizer flags in
   Debug/Sanitize modes). Without matching compile options,
   the compiler rejects the PCH due to flag mismatch (e.g.,
   -fsanitize=address). Link these targets against
   Seastar::seastar to inherit the required compile options.

Closes scylladb/scylladb#28941
2026-03-24 15:53:40 +02:00
Dawid Mędrek
e639dcda0b test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces
The test was flaky. The scenario looked like this:

1. Stop server 1.
2. Set its rf_rack_valid_keyspaces configuration option to true.
3. Create an RF-rack-invalid keyspace.
4. Start server 1 and expect a failure during start-up.

It was wrong. We cannot predict when the Raft mutation corresponding to
the newly created keyspace will arrive at the node or when it will be
processed. If the check of the RF-rack-valid keyspaces we perform at
start-up was done before that, it won't include the keyspace. This will
lead to a test failure.

Unfortunately, it's not feasible to perform a read barrier during
start-up. What's more, although it would help the test, it wouldn't be
useful otherwise. Because of that, we simply fix the test, at least for
now.

The new scenario looks like this:

1. Disable the rf_rack_valid_keyspaces configuration option on server 1.
2. Start the server.
3. Create an RF-rack-invalid keyspace.
4. Perform a read barrier on server 1. This will ensure that it has
   observed all Raft mutations, and we won't run into the same problem.
5. Stop the node.
6. Set its rf_rack_valid_keyspaces configuration option to true.
7. Try to start the node and observe a failure.

This will make the test perform consistently.

---

I ran the test (in dev mode, on my local machine) three times before
these changes, and three times with them. I include the time results
below.

Before:
```
real    0m47.570s
user    0m41.631s
sys     0m8.634s

real    0m50.495s
user    0m42.499s
sys     0m8.607s

real    0m50.375s
user    0m41.832s
sys     0m8.789s
```

After:
```
real    0m50.509s
user    0m43.535s
sys     0m9.715s

real    0m50.857s
user    0m44.185s
sys     0m9.811s

real    0m50.873s
user    0m44.289s
sys     0m9.737s
```

Fixes SCYLLADB-1137
2026-03-24 14:27:36 +01:00
Patryk Jędrzejczak
503a6e2d7e locator: everywhere_replication_strategy: fix sanity_check_read_replicas when read_new is true
ERMs created in `calculate_vnode_effective_replication_map` have RF computed based
on the old token metadata during a topology change. The reading replicas, however,
are computed based on the new token metadata (`target_token_metadata`) when
`read_new` is true. That can create a mismatch for EverywhereStrategy during some
topology changes - RF can be equal to the number of reading replicas +-1. During
bootstrap, this can cause the
`everywhere_replication_strategy::sanity_check_read_replicas` check to fail in
debug mode.

We fix the check in this commit by allowing one more reading replica when
`read_new` is true.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1147

Closes scylladb/scylladb#29150
2026-03-24 13:43:39 +01:00
Jenkins Promoter
0f02c0d6fa Update pgo profiles - x86_64 2026-03-24 14:11:38 +02:00
Dawid Mędrek
4fead4baae test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py
One of the tests,
test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces,
didn't have the marker. Let's add it now.
2026-03-24 12:52:00 +01:00
Botond Dénes
ffd58ca1f0 Merge 'test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints' from Dawid Mędrek
Before these changes, we would send mutations to the node and
immediately query the metrics to see how many hints had been written.
However, that could lead to random failures of the test: even if the
mutations have finished executing, hints are stored asynchronously, so
we don't have a guarantee they have already been processed.

To prevent such failures, we rewrite the check: we will perform multiple
checks against the metrics until we have confirmed that the hints have
indeed been written or we hit the timeout.

We're generous with the timeout: we give the test 60 seconds. That
should be enough time to avoid flakiness even on super slow machines,
and if the test does fail, we will know something is really wrong.

As a bonus, we improve the test in general too. We explicitly express
the preconditions we rely on, as well as bump the log level. If the
test fails in the future, it might be very difficult to debug it
without this additional information.

Fixes SCYLLADB-1133

Backport: The test is present on all supported branches. To avoid
          running into more failures, we should backport these changes
          to them.

Closes scylladb/scylladb#29191

* github.com:scylladb/scylladb:
  test: cluster: Increase log level in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Await all mutations concurrently in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Specify min_tablet_count in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Use new_test_table in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Introduce auxiliary function keyspace_has_tablets
  test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints
2026-03-24 13:39:56 +02:00
Calle Wilund
f1b3bff4a5 dockerized_service: Convert log reader to pipes and push to test log
Refs: SCYLLADB-1106

Ensures any stderr logs from mock services will echo to the test log
regardless of the log file we write. To help debug failed CI.
2026-03-24 12:35:42 +01:00
Calle Wilund
38aaed1ed4 test::cluster::conftest::GSServer: Fix unpublish for when publish was not called
Use checked dict access to check the set vars.

Fixes: SCYLLADB-1106
2026-03-24 12:33:56 +01:00
Calle Wilund
b382f3593c scylla_cluster: Use thread safe future signalling 2026-03-24 12:33:56 +01:00
Andrei Chekun
f6fd3bbea0 test.py: reduce timeout for one test
Reduce the timeout for one test to 60 minutes. The longest test we had
so far was ~10-15 minutes. So reducing this timeout is pretty safe and
should help with hanging tests.

Closes scylladb/scylladb#29212
2026-03-24 12:50:10 +02:00
Dawid Mędrek
148217bed6 test: cluster: Increase log level in test_write_cl_any_to_dead_node_generates_hints
We increase the log level of `hints_manager` to TRACE in the test.
If it fails, it may be incredibly difficult to debug it without any
additional information.
2026-03-23 19:19:17 +01:00
Dawid Mędrek
2b472fe7fd test: cluster: Await all mutations concurrently in test_write_cl_any_to_dead_node_generates_hints 2026-03-23 19:19:17 +01:00
Dawid Mędrek
ae12c712ce test: cluster: Specify min_tablet_count in test_write_cl_any_to_dead_node_generates_hints
The test relies on the assumption that mutations will be distributed
more or less uniformly over the nodes. Although in practice this should
not be possible, theoretically it's possible that there's only one
tablet allocated for the table.

To clearly indicate this precondition, we explicitly set the property
`min_tablet_count` when creating the table. This way, we have a guarantee
that the table has multiple tablets. The load balancer should now take
care of distributing them over the nodes equally. Thanks to that,
`servers[1]` will have some tablets, and so it'll be the target for some
of the mutations we perform.
2026-03-23 19:19:14 +01:00
Dawid Mędrek
dd446aa442 test: cluster: Use new_test_table in test_write_cl_any_to_dead_node_generates_hints
The context manager is the de-facto standard in the test suite. It will
also allow us for a prettier way to conditionally enable per-table
tablet options in the following commit.
2026-03-23 19:07:01 +01:00
Dawid Mędrek
dea79b09a9 test: cluster: Introduce auxiliary function keyspace_has_tablets
The function is adapted from its counterpart in the cqlpy test suite:
cqlpy/util.py::keyspace_has_tablets. We will use it in a commit in this
series to conditionally set tablet properties when creating a table.
It might also be useful in general.
2026-03-23 19:07:01 +01:00
Dawid Mędrek
3d04fd1d13 test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints
Before these changes, we would send mutations to the node and
immediately query the metrics to see how many hints had been written.
However, that could lead to random failures of the test: even if the
mutations have finished executing, hints are stored asynchronously, so
we don't have a guarantee they have already been processed.

To prevent such failures, we rewrite the check: we will perform multiple
checks against the metrics until we have confirmed that the hints have
indeed been written or we hit the timeout.

We're generous with the timeout: we give the test 60 seconds. That
should be enough time to avoid flakiness even on super slow machines,
and if the test does fail, we will know something is really wrong.

Fixes SCYLLADB-1133
2026-03-23 19:06:57 +01:00
Botond Dénes
f5438e0587 test/cluster/test_incremental_repair.py: enable compaction DEBUG logs in do_tablet_incremental_repair_and_ops
The test sporadically fails because scrub produces an unexpected number
of SSTables. Compaction logs are needed to diagnose why, but were not
captured since scrub runs at DEBUG level. Enable compaction=debug for
the servers started by do_tablet_incremental_repair_and_ops so the next
reproduction provides enough information to root-cause the issue.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 15:48:26 +02:00
Botond Dénes
f6ab576ed9 test/cluster/test_incremental_repair.py: fix typo preapre -> prepare
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 15:48:12 +02:00
Calle Wilund
b36dc80835 scylla_cluster: Remove left-over debug printout 2026-03-23 11:07:59 +01:00
62 changed files with 4000 additions and 3698 deletions

View File

@@ -699,6 +699,17 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
// for such a size.
co_return api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", request_content_length_limit));
}
// Check the concurrency limit early, before acquiring memory and
// reading the request body, to avoid piling up memory from excess
// requests that will be rejected anyway. This mirrors the CQL
// transport which also checks concurrency before memory acquisition
// (transport/server.cc).
if (_pending_requests.get_count() >= _max_concurrent_requests) {
_executor._stats.requests_shed++;
co_return api_error::request_limit_exceeded(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _pending_requests.get_count()));
}
_pending_requests.enter();
auto leave = defer([this] () noexcept { _pending_requests.leave(); });
// JSON parsing can allocate up to roughly 2x the size of the raw
// document, + a couple of bytes for maintenance.
// If the Content-Length of the request is not available, we assume
@@ -760,12 +771,6 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
_executor._stats.unsupported_operations++;
co_return api_error::unknown_operation(fmt::format("Unsupported operation {}", op));
}
if (_pending_requests.get_count() >= _max_concurrent_requests) {
_executor._stats.requests_shed++;
co_return api_error::request_limit_exceeded(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _pending_requests.get_count()));
}
_pending_requests.enter();
auto leave = defer([this] () noexcept { _pending_requests.leave(); });
executor::client_state client_state(service::client_state::external_tag(),
_auth_service, &_sl_controller, _timeout_config.current_values(), req->get_client_address());
if (!username.empty()) {

View File

@@ -32,7 +32,7 @@ namespace {
logger mylog{"ldap_role_manager"}; // `log` is taken by math.
struct url_desc_deleter {
void operator()(LDAPURLDesc* p) {
void operator()(LDAPURLDesc *p) {
ldap_free_urldesc(p);
}
};
@@ -40,7 +40,7 @@ struct url_desc_deleter {
using url_desc_ptr = std::unique_ptr<LDAPURLDesc, url_desc_deleter>;
url_desc_ptr parse_url(std::string_view url) {
LDAPURLDesc* desc = nullptr;
LDAPURLDesc *desc = nullptr;
if (ldap_url_parse(url.data(), &desc)) {
mylog.error("error in ldap_url_parse({})", url);
}
@@ -53,12 +53,8 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
mylog.debug("Analyzing search results");
for (auto e = ldap_first_entry(ld, res); e; e = ldap_next_entry(ld, e)) {
struct deleter {
void operator()(berval** p) {
ldap_value_free_len(p);
}
void operator()(char* p) {
ldap_memfree(p);
}
void operator()(berval** p) { ldap_value_free_len(p); }
void operator()(char* p) { ldap_memfree(p); }
};
const std::unique_ptr<char, deleter> dname(ldap_get_dn(ld, e));
mylog.debug("Analyzing entry {}", dname.get());
@@ -79,29 +75,32 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
namespace auth {
ldap_role_manager::ldap_role_manager(std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
uint32_t permissions_update_interval_in_ms, utils::observer<uint32_t> permissions_update_interval_in_ms_observer, cql3::query_processor& qp,
::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
: _std_mgr(qp, rg0c, mm, cache)
, _group0_client(rg0c)
, _query_template(query_template)
, _target_attr(target_attr)
, _bind_name(bind_name)
, _bind_password(bind_password)
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
, _cache(cache)
, _cache_pruner(make_ready_future<>()) {
ldap_role_manager::ldap_role_manager(
std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
uint32_t permissions_update_interval_in_ms,
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
: _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
, _bind_password(bind_password)
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
, _cache(cache)
, _cache_pruner(make_ready_future<>()) {
}
ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
: ldap_role_manager(qp.db().get_config().ldap_url_template(), qp.db().get_config().ldap_attr_role(), qp.db().get_config().ldap_bind_dn(),
qp.db().get_config().ldap_bind_passwd(), qp.db().get_config().permissions_update_interval_in_ms(),
qp.db().get_config().permissions_update_interval_in_ms.observe([this](const uint32_t& v) {
_permissions_update_interval_in_ms = v;
}),
qp, rg0c, mm, cache) {
: ldap_role_manager(
qp.db().get_config().ldap_url_template(),
qp.db().get_config().ldap_attr_role(),
qp.db().get_config().ldap_bind_dn(),
qp.db().get_config().ldap_bind_passwd(),
qp.db().get_config().permissions_update_interval_in_ms(),
qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
qp,
rg0c,
mm,
cache) {
}
std::string_view ldap_role_manager::qualified_java_name() const noexcept {
@@ -114,16 +113,17 @@ const resource_set& ldap_role_manager::protected_resources() const {
future<> ldap_role_manager::start() {
if (!parse_url(get_url("dummy-user"))) { // Just need host and port -- any user should do.
return make_exception_future(std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
return make_exception_future(
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
}
_cache_pruner = futurize_invoke([this]() -> future<> {
_cache_pruner = futurize_invoke([this] () -> future<> {
while (true) {
try {
co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
} catch (const seastar::sleep_aborted&) {
co_return; // ignore
}
co_await _cache.container().invoke_on_all([](cache& c) -> future<> {
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
try {
co_await c.reload_all_permissions();
} catch (...) {
@@ -165,7 +165,7 @@ future<conn_ptr> ldap_role_manager::connect() {
future<conn_ptr> ldap_role_manager::reconnect() {
unsigned retries_left = 5;
using namespace std::literals::chrono_literals;
conn_ptr conn = co_await exponential_backoff_retry::do_until_value(1s, 32s, _as, [this, &retries_left]() -> future<std::optional<conn_ptr>> {
conn_ptr conn = co_await exponential_backoff_retry::do_until_value(1s, 32s, _as, [this, &retries_left] () -> future<std::optional<conn_ptr>> {
if (!retries_left) {
co_return conn_ptr{};
}
@@ -188,13 +188,11 @@ future<conn_ptr> ldap_role_manager::reconnect() {
future<> ldap_role_manager::stop() {
_as.request_abort();
return std::move(_cache_pruner)
.then([this] {
return _std_mgr.stop();
})
.then([this] {
return _connection_factory.stop();
});
return std::move(_cache_pruner).then([this] {
return _std_mgr.stop();
}).then([this] {
return _connection_factory.stop();
});
}
future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {
@@ -223,42 +221,43 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
if (!desc) {
return make_exception_future<role_set>(std::runtime_error(format("Error parsing URL {}", url)));
}
return _connection_factory.with_connection(
[this, desc = std::move(desc), grantee_name_ = sstring(grantee_name)](ldap_connection& conn) -> future<role_set> {
sstring grantee_name = std::move(grantee_name_);
ldap_msg_ptr res = co_await conn.search(desc->lud_dn, desc->lud_scope, desc->lud_filter, desc->lud_attrs,
/*attrsonly=*/0, /*serverctrls=*/nullptr, /*clientctrls=*/nullptr,
/*timeout=*/nullptr, /*sizelimit=*/0);
mylog.trace("query_granted: got search results");
const auto mtype = ldap_msgtype(res.get());
if (mtype != LDAP_RES_SEARCH_ENTRY && mtype != LDAP_RES_SEARCH_RESULT && mtype != LDAP_RES_SEARCH_REFERENCE) {
mylog.error("ldap search yielded result {} of type {}", static_cast<const void*>(res.get()), mtype);
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error("ldap_role_manager: search result has wrong type")));
return _connection_factory.with_connection([this, desc = std::move(desc), grantee_name_ = sstring(grantee_name)]
(ldap_connection& conn) -> future<role_set> {
sstring grantee_name = std::move(grantee_name_);
ldap_msg_ptr res = co_await conn.search(desc->lud_dn, desc->lud_scope, desc->lud_filter, desc->lud_attrs,
/*attrsonly=*/0, /*serverctrls=*/nullptr, /*clientctrls=*/nullptr,
/*timeout=*/nullptr, /*sizelimit=*/0);
mylog.trace("query_granted: got search results");
const auto mtype = ldap_msgtype(res.get());
if (mtype != LDAP_RES_SEARCH_ENTRY && mtype != LDAP_RES_SEARCH_RESULT && mtype != LDAP_RES_SEARCH_REFERENCE) {
mylog.error("ldap search yielded result {} of type {}", static_cast<const void*>(res.get()), mtype);
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error("ldap_role_manager: search result has wrong type")));
}
std::vector<sstring> values = get_attr_values(conn.get_ldap(), res.get(), _target_attr.c_str());
auth::role_set valid_roles{grantee_name};
// Each value is a role to be granted.
co_await parallel_for_each(values, [this, &valid_roles] (const sstring& ldap_role) {
return _std_mgr.exists(ldap_role).then([&valid_roles, &ldap_role] (bool exists) {
if (exists) {
valid_roles.insert(ldap_role);
} else {
mylog.error("unrecognized role received from LDAP: {}", ldap_role);
}
std::vector<sstring> values = get_attr_values(conn.get_ldap(), res.get(), _target_attr.c_str());
auth::role_set valid_roles{grantee_name};
// Each value is a role to be granted.
co_await parallel_for_each(values, [this, &valid_roles](const sstring& ldap_role) {
return _std_mgr.exists(ldap_role).then([&valid_roles, &ldap_role](bool exists) {
if (exists) {
valid_roles.insert(ldap_role);
} else {
mylog.error("unrecognized role received from LDAP: {}", ldap_role);
}
});
});
co_return std::move(valid_roles);
});
});
co_return std::move(valid_roles);
});
}
future<role_to_directly_granted_map> ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
future<role_to_directly_granted_map>
ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
role_to_directly_granted_map result;
auto roles = co_await query_all(qs);
for (auto& role : roles) {
for (auto& role: roles) {
auto granted_set = co_await query_granted(role, recursive_role_query::no);
for (auto& granted : granted_set) {
for (auto& granted: granted_set) {
if (granted != role) {
result.insert({role, granted});
}
@@ -272,7 +271,7 @@ future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
}
future<> ldap_role_manager::create_role(std::string_view role_name) {
return smp::submit_to(0, [this, role_name]() -> future<> {
return smp::submit_to(0, [this, role_name] () -> future<> {
int retries = 10;
while (true) {
auto guard = co_await _group0_client.start_operation(_as, ::service::raft_timeout{});
@@ -284,8 +283,8 @@ future<> ldap_role_manager::create_role(std::string_view role_name) {
} catch (const role_already_exists&) {
// ok
} catch (const ::service::group0_concurrent_modification& ex) {
mylog.warn("Failed to auto-create role \"{}\" due to guard conflict.{}.", role_name,
retries ? " Retrying" : " Number of retries exceeded, giving up");
mylog.warn("Failed to auto-create role \"{}\" due to guard conflict.{}.",
role_name, retries ? " Retrying" : " Number of retries exceeded, giving up");
if (retries--) {
continue;
}
@@ -330,7 +329,8 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
return _std_mgr.can_login(role_name);
}
future<std::optional<sstring>> ldap_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
future<std::optional<sstring>> ldap_role_manager::get_attribute(
std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
return _std_mgr.get_attribute(role_name, attribute_name, qs);
}

View File

@@ -76,14 +76,14 @@ struct partition_deletion {
using clustered_column_set = std::map<clustering_key, cdc::one_kind_column_set, clustering_key::less_compare>;
template <typename Container>
template<typename Container>
concept EntryContainer = requires(Container& container) {
// Parenthesized due to https://bugs.llvm.org/show_bug.cgi?id=45088
{ (container.atomic_entries) } -> std::same_as<std::vector<atomic_column_update>&>;
{ (container.nonatomic_entries) } -> std::same_as<std::vector<nonatomic_column_update>&>;
};
template <EntryContainer Container>
template<EntryContainer Container>
static void add_columns_affected_by_entries(cdc::one_kind_column_set& cset, const Container& cont) {
for (const auto& entry : cont.atomic_entries) {
cset.set(entry.id);
@@ -134,7 +134,7 @@ struct batch {
ret.emplace(clustering_key::make_empty(), all_columns);
}
auto process_change_type = [&](const auto& changes) {
auto process_change_type = [&] (const auto& changes) {
for (const auto& change : changes) {
auto& cset = ret[change.key];
cset.resize(s.regular_columns_count());
@@ -211,9 +211,7 @@ private:
public:
extract_collection_visitor(column_id id, std::map<change_key_t, row_update>& updates)
: _id(id)
, _updates(updates) {
}
: _id(id), _updates(updates) {}
void collection_tombstone(const tombstone& t) {
auto& entry = get_or_append_entry(t.timestamp + 1, gc_clock::duration(0));
@@ -228,9 +226,7 @@ public:
cell(key, c);
}
constexpr bool finished() const {
return false;
}
constexpr bool finished() const { return false; }
};
/* Visits all cells and tombstones in a row, putting the encountered changes into buckets
@@ -253,46 +249,41 @@ struct extract_row_visitor {
void collection_column(const column_definition& cdef, auto&& visit_collection) {
visit(*cdef.type, make_visitor(
[&](const collection_type_impl& ctype) {
struct collection_visitor : public extract_collection_visitor<collection_visitor> {
data_type _value_type;
[&] (const collection_type_impl& ctype) {
struct collection_visitor : public extract_collection_visitor<collection_visitor> {
data_type _value_type;
collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
: extract_collection_visitor<collection_visitor>(id, updates)
, _value_type(ctype.value_comparator()) {
}
collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
: extract_collection_visitor<collection_visitor>(id, updates), _value_type(ctype.value_comparator()) {}
data_type get_value_type(bytes_view) {
return _value_type;
}
} v(cdef.id, _updates, ctype);
data_type get_value_type(bytes_view) {
return _value_type;
}
} v(cdef.id, _updates, ctype);
visit_collection(v);
},
[&](const user_type_impl& utype) {
struct udt_visitor : public extract_collection_visitor<udt_visitor> {
const user_type_impl& _utype;
visit_collection(v);
},
[&] (const user_type_impl& utype) {
struct udt_visitor : public extract_collection_visitor<udt_visitor> {
const user_type_impl& _utype;
udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
: extract_collection_visitor<udt_visitor>(id, updates)
, _utype(utype) {
}
udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
: extract_collection_visitor<udt_visitor>(id, updates), _utype(utype) {}
data_type get_value_type(bytes_view key) {
return _utype.type(deserialize_field_index(key));
}
} v(cdef.id, _updates, utype);
data_type get_value_type(bytes_view key) {
return _utype.type(deserialize_field_index(key));
}
} v(cdef.id, _updates, utype);
visit_collection(v);
},
[&](const abstract_type& o) {
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
}));
visit_collection(v);
},
[&] (const abstract_type& o) {
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
}
));
}
constexpr bool finished() const {
return false;
}
constexpr bool finished() const { return false; }
};
struct extract_changes_visitor {
@@ -302,8 +293,12 @@ struct extract_changes_visitor {
extract_row_visitor v;
visit_row_cells(v);
for (auto& [ts_ttl, row_update] : v._updates) {
_result[ts_ttl.first].static_updates.push_back({ts_ttl.second, std::move(row_update.atomic_entries), std::move(row_update.nonatomic_entries)});
for (auto& [ts_ttl, row_update]: v._updates) {
_result[ts_ttl.first].static_updates.push_back({
ts_ttl.second,
std::move(row_update.atomic_entries),
std::move(row_update.nonatomic_entries)
});
}
}
@@ -324,18 +319,24 @@ struct extract_changes_visitor {
} v;
visit_row_cells(v);
for (auto& [ts_ttl, row_update] : v._updates) {
for (auto& [ts_ttl, row_update]: v._updates) {
// It is important that changes in the resulting `set_of_changes` are listed
// in increasing TTL order. The reason is explained in a comment in cdc/log.cc,
// search for "#6070".
auto [ts, ttl] = ts_ttl;
if (v._marker && ts == v._marker_ts && ttl == v._marker_ttl) {
_result[ts].clustered_inserts.push_back({ttl, ckey, *v._marker, std::move(row_update.atomic_entries), {}});
_result[ts].clustered_inserts.push_back({
ttl,
ckey,
*v._marker,
std::move(row_update.atomic_entries),
{}
});
auto& cr_insert = _result[ts].clustered_inserts.back();
bool clustered_update_exists = false;
for (auto& nonatomic_up : row_update.nonatomic_entries) {
for (auto& nonatomic_up: row_update.nonatomic_entries) {
// Updating a collection column with an INSERT statement implies inserting a tombstone.
//
// For example, suppose that we have:
@@ -361,7 +362,12 @@ struct extract_changes_visitor {
cr_insert.nonatomic_entries.push_back(std::move(nonatomic_up));
} else {
if (!clustered_update_exists) {
_result[ts].clustered_updates.push_back({ttl, ckey, {}, {}});
_result[ts].clustered_updates.push_back({
ttl,
ckey,
{},
{}
});
// Multiple iterations of this `for` loop (for different collection columns)
// might want to put their `nonatomic_up`s into an UPDATE change;
@@ -384,7 +390,12 @@ struct extract_changes_visitor {
}
}
} else {
_result[ts].clustered_updates.push_back({ttl, ckey, std::move(row_update.atomic_entries), std::move(row_update.nonatomic_entries)});
_result[ts].clustered_updates.push_back({
ttl,
ckey,
std::move(row_update.atomic_entries),
std::move(row_update.nonatomic_entries)
});
}
}
}
@@ -401,9 +412,7 @@ struct extract_changes_visitor {
_result[t.timestamp].partition_deletions = partition_deletion{t};
}
constexpr bool finished() const {
return false;
}
constexpr bool finished() const { return false; }
};
set_of_changes extract_changes(const mutation& m) {
@@ -417,23 +426,13 @@ namespace cdc {
struct find_timestamp_visitor {
api::timestamp_type _ts = api::missing_timestamp;
bool finished() const {
return _ts != api::missing_timestamp;
}
bool finished() const { return _ts != api::missing_timestamp; }
void visit(api::timestamp_type ts) {
_ts = ts;
}
void visit(const atomic_cell_view& cell) {
visit(cell.timestamp());
}
void visit(api::timestamp_type ts) { _ts = ts; }
void visit(const atomic_cell_view& cell) { visit(cell.timestamp()); }
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
visit(cell);
}
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
visit(cell);
}
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
void collection_tombstone(const tombstone& t) {
// A collection tombstone with timestamp T can be created with:
// UPDATE ks.t USING TIMESTAMP T + 1 SET X = null WHERE ...
@@ -442,33 +441,15 @@ struct find_timestamp_visitor {
// with cdc$time using timestamp T + 1 instead of T.
visit(t.timestamp + 1);
}
void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
visit(cell);
}
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) {
visit(cell);
}
void collection_column(const column_definition&, auto&& visit_collection) {
visit_collection(*this);
}
void marker(const row_marker& rm) {
visit(rm.timestamp());
}
void static_row_cells(auto&& visit_row_cells) {
visit_row_cells(*this);
}
void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) {
visit_row_cells(*this);
}
void clustered_row_delete(const clustering_key&, const tombstone& t) {
visit(t.timestamp);
}
void range_delete(const range_tombstone& t) {
visit(t.tomb.timestamp);
}
void partition_delete(const tombstone& t) {
visit(t.timestamp);
}
void live_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
void marker(const row_marker& rm) { visit(rm.timestamp()); }
void static_row_cells(auto&& visit_row_cells) { visit_row_cells(*this); }
void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) { visit_row_cells(*this); }
void clustered_row_delete(const clustering_key&, const tombstone& t) { visit(t.timestamp); }
void range_delete(const range_tombstone& t) { visit(t.tomb.timestamp); }
void partition_delete(const tombstone& t) { visit(t.timestamp); }
};
/* Find some timestamp inside the given mutation.
@@ -524,12 +505,8 @@ struct should_split_visitor {
virtual ~should_split_visitor() = default;
inline bool finished() const {
return _result;
}
inline void stop() {
_result = true;
}
inline bool finished() const { return _result; }
inline void stop() { _result = true; }
void visit(api::timestamp_type ts, gc_clock::duration ttl = gc_clock::duration(0)) {
if (_ts != api::missing_timestamp && _ts != ts) {
@@ -540,23 +517,15 @@ struct should_split_visitor {
if (_ttl && *_ttl != ttl) {
return stop();
}
_ttl = {ttl};
_ttl = { ttl };
}
void visit(const atomic_cell_view& cell) {
visit(cell.timestamp(), get_ttl(cell));
}
void visit(const atomic_cell_view& cell) { visit(cell.timestamp(), get_ttl(cell)); }
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
visit(cell);
}
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
visit(cell);
}
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
void collection_tombstone(const tombstone& t) {
visit(t.timestamp + 1);
}
void collection_tombstone(const tombstone& t) { visit(t.timestamp + 1); }
virtual void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
if (_had_row_marker) {
@@ -565,12 +534,8 @@ struct should_split_visitor {
}
visit(cell);
}
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) {
visit(cell);
}
void collection_column(const column_definition&, auto&& visit_collection) {
visit_collection(*this);
}
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
virtual void marker(const row_marker& rm) {
_had_row_marker = true;
@@ -641,8 +606,8 @@ bool should_split(const mutation& m, const per_request_options& options) {
cdc::inspect_mutation(m, v);
return v._result
// A mutation with no timestamp will be split into 0 mutations:
|| v._ts == api::missing_timestamp;
// A mutation with no timestamp will be split into 0 mutations:
|| v._ts == api::missing_timestamp;
}
// Returns true if the row state and the atomic and nonatomic entries represent
@@ -677,7 +642,7 @@ static bool entries_match_row_state(const schema_ptr& base_schema, const cell_ma
if (current_values.size() != update.cells.size()) {
return false;
}
std::unordered_map<sstring_view, bytes> current_values_map;
for (const auto& entry : current_values) {
const auto attr_name = std::string_view(value_cast<sstring>(entry.first));
@@ -746,8 +711,8 @@ bool should_skip(batch& changes, const mutation& base_mutation, change_processor
return true;
}
void process_changes_with_splitting(
const mutation& base_mutation, change_processor& processor, bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
const auto base_schema = base_mutation.schema();
auto changes = extract_changes(base_mutation);
auto pk = base_mutation.key();
@@ -859,8 +824,8 @@ void process_changes_with_splitting(
}
}
void process_changes_without_splitting(
const mutation& base_mutation, change_processor& processor, bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
if (alternator_strict_compatibility) {
auto changes = extract_changes(base_mutation);
if (should_skip(changes.begin()->second, base_mutation, processor)) {
@@ -877,7 +842,7 @@ void process_changes_without_splitting(
one_kind_column_set columns{base_schema->static_columns_count()};
if (!p.static_row().empty()) {
p.static_row().get().for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
columns.set(id);
});
processor.produce_preimage(nullptr, columns);
@@ -890,7 +855,7 @@ void process_changes_without_splitting(
// Row deleted - include all columns in preimage
columns.set(0, base_schema->regular_columns_count(), true);
} else {
cr.row().cells().for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
columns.set(id);
});
}

View File

@@ -946,7 +946,7 @@ sstables::shared_sstable sstables_task_executor::consume_sstable() {
auto sst = _sstables.back();
_sstables.pop_back();
--_cm._stats.pending_tasks; // from this point on, switch_state(pending|active) works the same way as any other task
cmlog.debug("consumed {}", sst->get_filename());
cmlog.debug("{}", format("consumed {}", sst->get_filename()));
return sst;
}
@@ -1110,7 +1110,7 @@ void compaction_manager::register_metrics() {
sm::make_gauge("normalized_backlog", [this] { return _last_backlog / available_memory(); },
sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
sm::make_counter("validation_errors", [this] { return _validation_errors; },
sm::description("Holds the number of encountered validation errors.")),
sm::description("Holds the number of encountered validation errors.")).set_skip_when_empty(),
});
}
@@ -1208,6 +1208,7 @@ future<> compaction_manager::await_tasks(std::vector<shared_ptr<compaction_task_
std::vector<shared_ptr<compaction_task_executor>>
compaction_manager::do_stop_ongoing_compactions(sstring reason, std::function<bool(const compaction_group_view*)> filter, std::optional<compaction_type> type_opt) noexcept {
auto ongoing_compactions = get_compactions(filter).size();
auto tasks = _tasks
| std::views::filter([&filter, type_opt] (const auto& task) {
return filter(task.compacting_table()) && (!type_opt || task.compaction_type() == *type_opt);
@@ -1216,7 +1217,6 @@ compaction_manager::do_stop_ongoing_compactions(sstring reason, std::function<bo
| std::ranges::to<std::vector<shared_ptr<compaction_task_executor>>>();
logging::log_level level = tasks.empty() ? log_level::debug : log_level::info;
if (cmlog.is_enabled(level)) {
auto ongoing_compactions = get_compactions(filter).size();
std::string scope = "";
if (!tasks.empty()) {
const compaction_group_view* t = tasks.front()->compacting_table();
@@ -1426,17 +1426,11 @@ protected:
compaction_strategy cs = t.get_compaction_strategy();
compaction_descriptor descriptor = co_await cs.get_sstables_for_compaction(t, _cm.get_strategy_control());
int weight = calculate_weight(descriptor);
bool debug_enabled = cmlog.is_enabled(log_level::debug);
if (debug_enabled) {
cmlog.debug("Started minor compaction sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
}
cmlog.debug("Started minor compaction sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
sstring old_sstables;
if (debug_enabled) {
old_sstables = ::format("{}", descriptor.sstables);
}
auto old_sstables = ::format("{}", descriptor.sstables);
if (descriptor.sstables.empty() || !can_proceed() || t.is_auto_compaction_disabled_by_user()) {
cmlog.debug("{}: sstables={} can_proceed={} auto_compaction={}", *this, descriptor.sstables.size(), can_proceed(), t.is_auto_compaction_disabled_by_user());
@@ -1466,10 +1460,8 @@ protected:
try {
bool should_update_history = this->should_update_history(descriptor.options.type());
compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, on_replace);
if (debug_enabled) {
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
}
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
finish_compaction();
if (should_update_history) {
// update_history can take a long time compared to

View File

@@ -33,10 +33,8 @@ future<compaction_descriptor> leveled_compaction_strategy::get_sstables_for_comp
auto candidate = manifest.get_compaction_candidates(*state->last_compacted_keys, state->compaction_counter);
if (!candidate.sstables.empty()) {
if (leveled_manifest::logger.is_enabled(logging::log_level::debug)) {
auto main_set = co_await table_s.main_sstable_set();
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
}
auto main_set = co_await table_s.main_sstable_set();
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
co_return candidate;
}

View File

@@ -15,7 +15,6 @@
#include "compaction_strategy_state.hh"
#include "utils/error_injection.hh"
#include <seastar/util/lazy.hh>
#include <ranges>
namespace compaction {
@@ -29,12 +28,12 @@ time_window_compaction_strategy_state_ptr time_window_compaction_strategy::get_s
}
const std::unordered_map<sstring, std::chrono::seconds> time_window_compaction_strategy_options::valid_window_units = {
{"MINUTES", 60s}, {"HOURS", 3600s}, {"DAYS", 86400s}};
{ "MINUTES", 60s }, { "HOURS", 3600s }, { "DAYS", 86400s }
};
const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions>
time_window_compaction_strategy_options::valid_timestamp_resolutions = {
{"MICROSECONDS", timestamp_resolutions::microsecond},
{"MILLISECONDS", timestamp_resolutions::millisecond},
const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions> time_window_compaction_strategy_options::valid_timestamp_resolutions = {
{ "MICROSECONDS", timestamp_resolutions::microsecond },
{ "MILLISECONDS", timestamp_resolutions::millisecond },
};
static std::chrono::seconds validate_compaction_window_unit(const std::map<sstring, sstring>& options) {
@@ -44,8 +43,7 @@ static std::chrono::seconds validate_compaction_window_unit(const std::map<sstri
if (tmp_value) {
auto valid_window_units_it = time_window_compaction_strategy_options::valid_window_units.find(tmp_value.value());
if (valid_window_units_it == time_window_compaction_strategy_options::valid_window_units.end()) {
throw exceptions::configuration_exception(
fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
throw exceptions::configuration_exception(fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
}
window_unit = valid_window_units_it->second;
}
@@ -61,12 +59,10 @@ static std::chrono::seconds validate_compaction_window_unit(const std::map<sstri
static int validate_compaction_window_size(const std::map<sstring, sstring>& options) {
auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY);
int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value,
time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value, time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
if (window_size <= 0) {
throw exceptions::configuration_exception(
fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
throw exceptions::configuration_exception(fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
}
return window_size;
@@ -86,30 +82,26 @@ static db_clock::duration validate_expired_sstable_check_frequency_seconds(const
try {
expired_sstable_check_frequency = std::chrono::seconds(std::stol(tmp_value.value()));
} catch (const std::exception& e) {
throw exceptions::syntax_exception(fmt::format(
"Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
throw exceptions::syntax_exception(fmt::format("Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
}
}
return expired_sstable_check_frequency;
}
static db_clock::duration validate_expired_sstable_check_frequency_seconds(
const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
static db_clock::duration validate_expired_sstable_check_frequency_seconds(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
db_clock::duration expired_sstable_check_frequency = validate_expired_sstable_check_frequency_seconds(options);
unchecked_options.erase(time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
return expired_sstable_check_frequency;
}
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options) {
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution =
time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
if (tmp_value) {
if (!time_window_compaction_strategy_options::valid_timestamp_resolutions.contains(tmp_value.value())) {
throw exceptions::configuration_exception(fmt::format(
"Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
throw exceptions::configuration_exception(fmt::format("Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
} else {
timestamp_resolution = time_window_compaction_strategy_options::valid_timestamp_resolutions.at(tmp_value.value());
}
@@ -118,8 +110,7 @@ static time_window_compaction_strategy_options::timestamp_resolutions validate_t
return timestamp_resolution;
}
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(
const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = validate_timestamp_resolution(options);
unchecked_options.erase(time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
return timestamp_resolution;
@@ -154,7 +145,7 @@ void time_window_compaction_strategy_options::validate(const std::map<sstring, s
compaction_strategy_impl::validate_min_max_threshold(options, unchecked_options);
auto it = options.find("enable_optimized_twcs_queries");
if (it != options.end() && it->second != "true" && it->second != "false") {
if (it != options.end() && it->second != "true" && it->second != "false") {
throw exceptions::configuration_exception(fmt::format("enable_optimized_twcs_queries value ({}) must be \"true\" or \"false\"", it->second));
}
unchecked_options.erase("enable_optimized_twcs_queries");
@@ -171,9 +162,7 @@ class classify_by_timestamp {
std::vector<int64_t> _known_windows;
public:
explicit classify_by_timestamp(time_window_compaction_strategy_options options)
: _options(std::move(options)) {
}
explicit classify_by_timestamp(time_window_compaction_strategy_options options) : _options(std::move(options)) { }
int64_t operator()(api::timestamp_type ts) {
const auto window = time_window_compaction_strategy::get_window_for(_options, ts);
if (const auto it = std::ranges::find(_known_windows, window); it != _known_windows.end()) {
@@ -201,7 +190,7 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
auto estimated_window_count = max_data_segregation_window_count;
auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live());
bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
auto estimate_window_count = [this](timestamp_type min_window, timestamp_type max_window) {
auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) {
const auto window_size = get_window_size(_options);
return (max_window + (window_size - 1) - min_window) / window_size;
};
@@ -221,19 +210,21 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
}
mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(
const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
if (ms_meta.min_timestamp && ms_meta.max_timestamp &&
get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
if (ms_meta.min_timestamp && ms_meta.max_timestamp
&& get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
return end_consumer;
}
return [options = _options, end_consumer = std::move(end_consumer)](mutation_reader rd) mutable -> future<> {
return mutation_writer::segregate_by_timestamp(std::move(rd), classify_by_timestamp(std::move(options)), end_consumer);
return [options = _options, end_consumer = std::move(end_consumer)] (mutation_reader rd) mutable -> future<> {
return mutation_writer::segregate_by_timestamp(
std::move(rd),
classify_by_timestamp(std::move(options)),
end_consumer);
};
}
compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
std::vector<sstables::shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
compaction_descriptor
time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
auto mode = cfg.mode;
std::vector<sstables::shared_sstable> single_window;
std::vector<sstables::shared_sstable> multi_window;
@@ -248,7 +239,7 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
// Sort input sstables by first_key order
// to allow efficient reshaping of disjoint sstables.
std::sort(input.begin(), input.end(), [&schema](const sstables::shared_sstable& a, const sstables::shared_sstable& b) {
std::sort(input.begin(), input.end(), [&schema] (const sstables::shared_sstable& a, const sstables::shared_sstable& b) {
return dht::ring_position(a->get_first_decorated_key()).less_compare(*schema, dht::ring_position(b->get_first_decorated_key()));
});
@@ -262,34 +253,31 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
}
}
auto is_disjoint = [&schema, mode, max_sstables](const std::vector<sstables::shared_sstable>& ssts) {
auto is_disjoint = [&schema, mode, max_sstables] (const std::vector<sstables::shared_sstable>& ssts) {
size_t tolerance = (mode == reshape_mode::relaxed) ? max_sstables : 0;
return sstable_set_overlapping_count(schema, ssts) <= tolerance;
};
clogger.debug("time_window_compaction_strategy::get_reshaping_job: offstrategy_threshold={} max_sstables={} multi_window={} disjoint={} "
"single_window={} disjoint={}",
offstrategy_threshold, max_sstables, multi_window.size(), seastar::value_of([&] {
return !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0;
}),
single_window.size(), seastar::value_of([&] {
return !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0;
}));
clogger.debug("time_window_compaction_strategy::get_reshaping_job: offstrategy_threshold={} max_sstables={} multi_window={} disjoint={} single_window={} disjoint={}",
offstrategy_threshold, max_sstables,
multi_window.size(), !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0,
single_window.size(), !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0);
auto get_job_size = [](const std::vector<sstables::shared_sstable>& ssts) {
auto get_job_size = [] (const std::vector<sstables::shared_sstable>& ssts) {
return std::ranges::fold_left(ssts | std::views::transform(std::mem_fn(&sstables::sstable::bytes_on_disk)), uint64_t(0), std::plus{});
};
// Targets a space overhead of 10%. All disjoint sstables can be compacted together as long as they won't
// cause an overhead above target. Otherwise, the job targets a maximum of #max_threshold sstables.
auto need_trimming = [&](const std::vector<sstables::shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
auto need_trimming = [&] (const std::vector<sstables::shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
const size_t min_sstables = 2;
auto is_above_target_size = job_size > target_job_size;
return (ssts.size() > max_sstables && !is_disjoint) || (ssts.size() > min_sstables && is_above_target_size);
return (ssts.size() > max_sstables && !is_disjoint) ||
(ssts.size() > min_sstables && is_above_target_size);
};
auto maybe_trim_job = [&need_trimming](std::vector<sstables::shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
auto maybe_trim_job = [&need_trimming] (std::vector<sstables::shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
while (need_trimming(ssts, job_size, is_disjoint)) {
auto sst = ssts.back();
ssts.pop_back();
@@ -306,7 +294,7 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
// For example, if there are N sstables spanning window W, where N <= 32, then we can produce all data for W
// in a single compaction round, removing the need to later compact W to reduce its number of files.
auto sort_size = std::min(max_sstables, multi_window.size());
std::ranges::partial_sort(multi_window, multi_window.begin() + sort_size, std::ranges::less(), [](const sstables::shared_sstable& a) {
std::ranges::partial_sort(multi_window, multi_window.begin() + sort_size, std::ranges::less(), [] (const sstables::shared_sstable &a) {
return a->get_stats_metadata().max_timestamp;
});
maybe_trim_job(multi_window, job_size, disjoint);
@@ -346,7 +334,8 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
return compaction_descriptor();
}
future<compaction_descriptor> time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
future<compaction_descriptor>
time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
auto state = get_state(table_s);
auto compaction_time = gc_clock::now();
auto candidates = co_await control.candidates(table_s);
@@ -380,8 +369,10 @@ future<compaction_descriptor> time_window_compaction_strategy::get_sstables_for_
co_return compaction_descriptor(std::move(compaction_candidates));
}
time_window_compaction_strategy::bucket_compaction_mode time_window_compaction_strategy::compaction_mode(
const time_window_compaction_strategy_state& state, const bucket_t& bucket, timestamp_type bucket_key, timestamp_type now, size_t min_threshold) const {
time_window_compaction_strategy::bucket_compaction_mode
time_window_compaction_strategy::compaction_mode(const time_window_compaction_strategy_state& state,
const bucket_t& bucket, timestamp_type bucket_key,
timestamp_type now, size_t min_threshold) const {
// STCS will also be performed on older window buckets, to avoid a bad write and
// space amplification when something like read repair cause small updates to
// those past windows.
@@ -394,7 +385,8 @@ time_window_compaction_strategy::bucket_compaction_mode time_window_compaction_s
return bucket_compaction_mode::none;
}
std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
std::vector<sstables::shared_sstable>
time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
std::vector<sstables::shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time, time_window_compaction_strategy_state& state) {
auto most_interesting = get_compaction_candidates(table_s, control, non_expiring_sstables, state);
@@ -408,29 +400,31 @@ std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_next_
// if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
// ratio is greater than threshold.
std::erase_if(non_expiring_sstables, [this, compaction_time, &table_s](const sstables::shared_sstable& sst) -> bool {
std::erase_if(non_expiring_sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
return !worth_dropping_tombstones(sst, compaction_time, table_s);
});
if (non_expiring_sstables.empty()) {
return {};
}
auto it = std::ranges::min_element(non_expiring_sstables, [](auto& i, auto& j) {
auto it = std::ranges::min_element(non_expiring_sstables, [] (auto& i, auto& j) {
return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
});
return {*it};
return { *it };
}
std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
std::vector<sstables::shared_sstable>
time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
// Update the highest window seen, if necessary
state.highest_window_seen = std::max(state.highest_window_seen, max_timestamp);
return newest_bucket(table_s, control, std::move(buckets), table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold(),
state.highest_window_seen, state);
state.highest_window_seen, state);
}
timestamp_type time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
timestamp_type
time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
using namespace std::chrono;
// mask out window size from timestamp to get lower bound of its window
auto num_windows = microseconds(timestamp) / sstable_window_size;
@@ -438,8 +432,8 @@ timestamp_type time_window_compaction_strategy::get_window_lower_bound(std::chro
return duration_cast<microseconds>(num_windows * sstable_window_size).count();
}
std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, timestamp_type> time_window_compaction_strategy::get_buckets(
std::vector<sstables::shared_sstable> files, const time_window_compaction_strategy_options& options) {
std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, timestamp_type>
time_window_compaction_strategy::get_buckets(std::vector<sstables::shared_sstable> files, const time_window_compaction_strategy_options& options) {
std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets;
timestamp_type max_timestamp = 0;
@@ -456,13 +450,11 @@ std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, times
return std::make_pair(std::move(buckets), max_timestamp);
}
} // namespace compaction
}
template <>
struct fmt::formatter<std::map<compaction::timestamp_type, std::vector<sstables::shared_sstable>>> {
constexpr auto parse(format_parse_context& ctx) {
return ctx.begin();
}
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
auto format(const std::map<compaction::timestamp_type, std::vector<sstables::shared_sstable>>& buckets, fmt::format_context& ctx) const {
auto out = fmt::format_to(ctx.out(), " buckets = {{\n");
for (auto& [timestamp, sstables] : buckets | std::views::reverse) {
@@ -474,9 +466,9 @@ struct fmt::formatter<std::map<compaction::timestamp_type, std::vector<sstables:
namespace compaction {
std::vector<sstables::shared_sstable> time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control,
std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets, int min_threshold, int max_threshold, timestamp_type now,
time_window_compaction_strategy_state& state) {
std::vector<sstables::shared_sstable>
time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets,
int min_threshold, int max_threshold, timestamp_type now, time_window_compaction_strategy_state& state) {
clogger.debug("time_window_compaction_strategy::newest_bucket:\n now {}\n{}", now, buckets);
for (auto&& [key, bucket] : buckets | std::views::reverse) {
@@ -517,7 +509,8 @@ std::vector<sstables::shared_sstable> time_window_compaction_strategy::newest_bu
return {};
}
std::vector<sstables::shared_sstable> time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold) {
std::vector<sstables::shared_sstable>
time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold) {
auto n = std::min(bucket.size(), size_t(max_threshold));
// Trim the largest sstables off the end to meet the maxThreshold
std::ranges::partial_sort(bucket, bucket.begin() + n, std::ranges::less(), std::mem_fn(&sstables::sstable::ondisk_data_size));
@@ -549,8 +542,8 @@ future<int64_t> time_window_compaction_strategy::estimated_pending_compactions(c
co_return n;
}
std::vector<compaction_descriptor> time_window_compaction_strategy::get_cleanup_compaction_jobs(
compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
std::vector<compaction_descriptor>
time_window_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
std::vector<compaction_descriptor> ret;
for (auto&& [_, sstables] : get_buckets(std::move(candidates), _options).first) {
auto per_window_jobs = size_tiered_compaction_strategy(_stcs_options).get_cleanup_compaction_jobs(table_s, std::move(sstables));
@@ -563,4 +556,4 @@ std::unique_ptr<sstables::sstable_set_impl> time_window_compaction_strategy::mak
return std::make_unique<sstables::time_series_sstable_set>(ts.schema(), _options.enable_optimized_twcs_queries);
}
} // namespace compaction
}

View File

@@ -583,8 +583,7 @@ sstable_format: ms
audit: "table"
#
# List of statement categories that should be audited.
# Possible categories are: QUERY, DML, DCL, DDL, AUTH, ADMIN
audit_categories: "DCL,AUTH,ADMIN"
audit_categories: "DCL,DDL,AUTH,ADMIN"
#
# List of tables that should be audited.
# audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"

View File

@@ -48,15 +48,13 @@ const sstring query_processor::CQL_VERSION = "3.3.1";
const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
struct query_processor::remote {
remote(service::migration_manager& mm, service::mapreduce_service& fwd, service::storage_service& ss, service::raft_group0_client& group0_client,
service::strong_consistency::coordinator& _sc_coordinator)
: mm(mm)
, mapreducer(fwd)
, ss(ss)
, group0_client(group0_client)
, sc_coordinator(_sc_coordinator)
, gate("query_processor::remote") {
}
remote(service::migration_manager& mm, service::mapreduce_service& fwd,
service::storage_service& ss, service::raft_group0_client& group0_client,
service::strong_consistency::coordinator& _sc_coordinator)
: mm(mm), mapreducer(fwd), ss(ss), group0_client(group0_client)
, sc_coordinator(_sc_coordinator)
, gate("query_processor::remote")
{}
service::migration_manager& mm;
service::mapreduce_service& mapreducer;
@@ -79,34 +77,24 @@ static service::query_state query_state_for_internal_call() {
return {service::client_state::for_internal_calls(), empty_service_permit()};
}
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn,
vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg,
lang::manager& langm)
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
, _proxy(proxy)
, _db(db)
, _mnotifier(mn)
, _vector_store_client(vsc)
, _mcfg(mcfg)
, _cql_config(cql_cfg)
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
, _auth_prepared_cache_cfg_cb([this](uint32_t) {
(void)_authorized_prepared_cache_config_action.trigger_later();
})
, _authorized_prepared_cache_config_action([this] {
update_authorized_prepared_cache_config();
return make_ready_future<>();
})
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
, _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
, _lang_manager(langm)
, _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) {
_write_consistency_levels_warned = to_consistency_level_set(v);
}))
, _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) {
_write_consistency_levels_disallowed = to_consistency_level_set(v);
})) {
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
, _proxy(proxy)
, _db(db)
, _mnotifier(mn)
, _vector_store_client(vsc)
, _mcfg(mcfg)
, _cql_config(cql_cfg)
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
, _auth_prepared_cache_cfg_cb([this] (uint32_t) { (void) _authorized_prepared_cache_config_action.trigger_later(); })
, _authorized_prepared_cache_config_action([this] { update_authorized_prepared_cache_config(); return make_ready_future<>(); })
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
, _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
, _lang_manager(langm)
, _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) { _write_consistency_levels_warned = to_consistency_level_set(v); }))
, _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) { _write_consistency_levels_disallowed = to_consistency_level_set(v); }))
{
_write_consistency_levels_warned = to_consistency_level_set(_db.get_config().write_consistency_levels_warned());
_write_consistency_levels_disallowed = to_consistency_level_set(_db.get_config().write_consistency_levels_disallowed());
namespace sm = seastar::metrics;
@@ -114,7 +102,7 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
using clevel = db::consistency_level;
sm::label cl_label("consistency_level");
sm::label who_label("who"); // Who queried system tables
sm::label who_label("who"); // Who queried system tables
const auto user_who_label_instance = who_label("user");
const auto internal_who_label_instance = who_label("internal");
@@ -122,11 +110,17 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
const auto system_ks_label_instance = ks_label("system");
std::vector<sm::metric_definition> qp_group;
qp_group.push_back(sm::make_counter("statements_prepared", _stats.prepare_invocations, sm::description("Counts the total number of parsed CQL requests.")));
qp_group.push_back(sm::make_counter(
"statements_prepared",
_stats.prepare_invocations,
sm::description("Counts the total number of parsed CQL requests.")));
for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
qp_group.push_back(sm::make_counter(
"queries", _stats.queries_by_cl[cl], sm::description("Counts queries by consistency level."), {cl_label(clevel(cl)), basic_level})
.set_skip_when_empty());
qp_group.push_back(
sm::make_counter(
"queries",
_stats.queries_by_cl[cl],
sm::description("Counts queries by consistency level."),
{cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
}
_metrics.add_group("query_processor", qp_group);
@@ -527,23 +521,29 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
std::vector<sm::metric_definition> cql_cl_group;
for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
cql_cl_group.push_back(sm::make_counter("writes_per_consistency_level", _cql_stats.writes_per_consistency_level[cl],
sm::description("Counts the number of writes for each consistency level."), {cl_label(clevel(cl)), basic_level})
.set_skip_when_empty());
cql_cl_group.push_back(
sm::make_counter(
"writes_per_consistency_level",
_cql_stats.writes_per_consistency_level[cl],
sm::description("Counts the number of writes for each consistency level."),
{cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
}
_metrics.add_group("cql", cql_cl_group);
_metrics.add_group(
"cql", {
sm::make_counter("write_consistency_levels_disallowed_violations", _cql_stats.write_consistency_levels_disallowed_violations,
sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
"i.e. attempts to write with a forbidden consistency level."),
{basic_level}),
sm::make_counter("write_consistency_levels_warned_violations", _cql_stats.write_consistency_levels_warned_violations,
sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
"i.e. attempts to write with a discouraged consistency level."),
{basic_level}),
});
_metrics.add_group("cql", {
sm::make_counter(
"write_consistency_levels_disallowed_violations",
_cql_stats.write_consistency_levels_disallowed_violations,
sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
"i.e. attempts to write with a forbidden consistency level."),
{basic_level}),
sm::make_counter(
"write_consistency_levels_warned_violations",
_cql_stats.write_consistency_levels_warned_violations,
sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
"i.e. attempts to write with a discouraged consistency level."),
{basic_level}),
});
_mnotifier.register_listener(_migration_subscriber.get());
}
@@ -554,13 +554,15 @@ query_processor::~query_processor() {
}
}
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder> query_processor::acquire_strongly_consistent_coordinator() {
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
query_processor::acquire_strongly_consistent_coordinator() {
auto [remote_, holder] = remote();
return {remote_.get().sc_coordinator, std::move(holder)};
}
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer, service::storage_service& ss,
service::raft_group0_client& group0_client, service::strong_consistency::coordinator& sc_coordinator) {
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer,
service::storage_service& ss, service::raft_group0_client& group0_client,
service::strong_consistency::coordinator& sc_coordinator) {
_remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client, sc_coordinator);
}
@@ -580,9 +582,7 @@ future<> query_processor::stop() {
}
future<::shared_ptr<cql_transport::messages::result_message>> query_processor::execute_with_guard(
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(
service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)>
fn,
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)> fn,
::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options) {
// execute all statements that need group0 guard on shard0
if (this_shard_id() != 0) {
@@ -591,13 +591,13 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
auto [remote_, holder] = remote();
size_t retries = remote_.get().mm.get_concurrent_ddl_retries();
while (true) {
while (true) {
try {
auto guard = co_await remote_.get().mm.start_group0_operation();
co_return co_await fn(query_state, statement, options, std::move(guard));
} catch (const service::group0_concurrent_modification& ex) {
log.warn("Failed to execute statement \"{}\" due to guard conflict.{}.", statement->raw_cql_statement,
retries ? " Retrying" : " Number of retries exceeded, giving up");
log.warn("Failed to execute statement \"{}\" due to guard conflict.{}.",
statement->raw_cql_statement, retries ? " Retrying" : " Number of retries exceeded, giving up");
if (retries--) {
continue;
}
@@ -606,30 +606,29 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
}
}
template <typename... Args>
future<::shared_ptr<result_message>> query_processor::execute_maybe_with_guard(service::query_state& query_state, ::shared_ptr<cql_statement> statement,
const query_options& options,
future<::shared_ptr<result_message>> (query_processor::*fn)(
service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>, Args...),
Args... args) {
template<typename... Args>
future<::shared_ptr<result_message>>
query_processor::execute_maybe_with_guard(service::query_state& query_state, ::shared_ptr<cql_statement> statement, const query_options& options,
future<::shared_ptr<result_message>>(query_processor::*fn)(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>, Args...), Args... args) {
if (!statement->needs_guard(*this, query_state)) {
return (this->*fn)(query_state, std::move(statement), options, std::nullopt, std::forward<Args>(args)...);
}
static auto exec = [fn](query_processor& qp, Args... args, service::query_state& query_state, ::shared_ptr<cql_statement> statement,
const query_options& options, std::optional<service::group0_guard> guard) {
static auto exec = [fn] (query_processor& qp, Args... args, service::query_state& query_state, ::shared_ptr<cql_statement> statement, const query_options& options, std::optional<service::group0_guard> guard) {
return (qp.*fn)(query_state, std::move(statement), options, std::move(guard), std::forward<Args>(args)...);
};
return execute_with_guard(std::bind_front(exec, std::ref(*this), std::forward<Args>(args)...), std::move(statement), query_state, options);
}
future<::shared_ptr<result_message>> query_processor::execute_direct_without_checking_exception_message(
const std::string_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
future<::shared_ptr<result_message>>
query_processor::execute_direct_without_checking_exception_message(const std::string_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
log.trace("execute_direct: \"{}\"", query_string);
tracing::trace(query_state.get_trace_state(), "Parsing a statement");
auto p = get_statement(query_string, query_state.get_client_state(), d);
auto statement = p->statement;
if (statement->get_bound_terms() != options.get_values_count()) {
const auto msg = format("Invalid amount of bind variables: expected {:d} received {:d}", statement->get_bound_terms(), options.get_values_count());
const auto msg = format("Invalid amount of bind variables: expected {:d} received {:d}",
statement->get_bound_terms(),
options.get_values_count());
throw exceptions::invalid_request_exception(msg);
}
options.prepare(p->bound_names);
@@ -640,13 +639,17 @@ future<::shared_ptr<result_message>> query_processor::execute_direct_without_che
metrics.regularStatementsExecuted.inc();
#endif
auto user = query_state.get_client_state().user();
tracing::trace(query_state.get_trace_state(), "Processing a statement for authenticated user: {}",
user ? (user->name ? *user->name : "anonymous") : "no user authenticated");
tracing::trace(query_state.get_trace_state(), "Processing a statement for authenticated user: {}", user ? (user->name ? *user->name : "anonymous") : "no user authenticated");
return execute_maybe_with_guard(query_state, std::move(statement), options, &query_processor::do_execute_direct, std::move(p->warnings));
}
future<::shared_ptr<result_message>> query_processor::do_execute_direct(service::query_state& query_state, shared_ptr<cql_statement> statement,
const query_options& options, std::optional<service::group0_guard> guard, cql3::cql_warnings_vec warnings) {
future<::shared_ptr<result_message>>
query_processor::do_execute_direct(
service::query_state& query_state,
shared_ptr<cql_statement> statement,
const query_options& options,
std::optional<service::group0_guard> guard,
cql3::cql_warnings_vec warnings) {
auto access_future = co_await coroutine::as_future(statement->check_access(*this, query_state.get_client_state()));
if (access_future.failed()) {
co_await audit::inspect(statement, query_state, options, true);
@@ -671,16 +674,26 @@ future<::shared_ptr<result_message>> query_processor::do_execute_direct(service:
co_return std::move(m);
}
future<::shared_ptr<result_message>> query_processor::execute_prepared_without_checking_exception_message(service::query_state& query_state,
shared_ptr<cql_statement> statement, const query_options& options, statements::prepared_statement::checked_weak_ptr prepared,
cql3::prepared_cache_key_type cache_key, bool needs_authorization) {
return execute_maybe_with_guard(
query_state, std::move(statement), options, &query_processor::do_execute_prepared, std::move(prepared), std::move(cache_key), needs_authorization);
future<::shared_ptr<result_message>>
query_processor::execute_prepared_without_checking_exception_message(
service::query_state& query_state,
shared_ptr<cql_statement> statement,
const query_options& options,
statements::prepared_statement::checked_weak_ptr prepared,
cql3::prepared_cache_key_type cache_key,
bool needs_authorization) {
return execute_maybe_with_guard(query_state, std::move(statement), options, &query_processor::do_execute_prepared, std::move(prepared), std::move(cache_key), needs_authorization);
}
future<::shared_ptr<result_message>> query_processor::do_execute_prepared(service::query_state& query_state, shared_ptr<cql_statement> statement,
const query_options& options, std::optional<service::group0_guard> guard, statements::prepared_statement::checked_weak_ptr prepared,
cql3::prepared_cache_key_type cache_key, bool needs_authorization) {
future<::shared_ptr<result_message>>
query_processor::do_execute_prepared(
service::query_state& query_state,
shared_ptr<cql_statement> statement,
const query_options& options,
std::optional<service::group0_guard> guard,
statements::prepared_statement::checked_weak_ptr prepared,
cql3::prepared_cache_key_type cache_key,
bool needs_authorization) {
if (needs_authorization) {
co_await statement->check_access(*this, query_state.get_client_state());
try {
@@ -694,8 +707,8 @@ future<::shared_ptr<result_message>> query_processor::do_execute_prepared(servic
co_return co_await process_authorized_statement(std::move(statement), query_state, options, std::move(guard));
}
future<::shared_ptr<result_message>> query_processor::process_authorized_statement(const ::shared_ptr<cql_statement> statement,
service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard) {
future<::shared_ptr<result_message>>
query_processor::process_authorized_statement(const ::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard) {
auto& client_state = query_state.get_client_state();
++_stats.queries_by_cl[size_t(options.get_consistency())];
@@ -705,39 +718,43 @@ future<::shared_ptr<result_message>> query_processor::process_authorized_stateme
auto msg = co_await statement->execute_without_checking_exception_message(*this, query_state, options, std::move(guard));
if (msg) {
co_return std::move(msg);
co_return std::move(msg);
}
co_return ::make_shared<result_message::void_message>();
}
future<::shared_ptr<cql_transport::messages::result_message::prepared>> query_processor::prepare(
sstring query_string, service::query_state& query_state, cql3::dialect d) {
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, service::query_state& query_state, cql3::dialect d) {
auto& client_state = query_state.get_client_state();
return prepare(std::move(query_string), client_state, d);
}
future<::shared_ptr<cql_transport::messages::result_message::prepared>> query_processor::prepare(
sstring query_string, const service::client_state& client_state, cql3::dialect d) {
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
try {
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
auto prepared = get_statement(query_string, client_state, d);
prepared->calculate_metadata_id();
auto bound_terms = prepared->statement->get_bound_terms();
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}", bound_terms, std::numeric_limits<uint16_t>::max()));
}
throwing_assert(bound_terms == prepared->bound_names.size());
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
});
co_await utils::get_local_injector().inject("query_processor_prepare_wait_after_cache_get", utils::wait_for_message(std::chrono::seconds(60)));
auto prepared = get_statement(query_string, client_state, d);
prepared->calculate_metadata_id();
auto bound_terms = prepared->statement->get_bound_terms();
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}",
bound_terms,
std::numeric_limits<uint16_t>::max()));
}
throwing_assert(bound_terms == prepared->bound_names.size());
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
});
co_await utils::get_local_injector().inject(
"query_processor_prepare_wait_after_cache_get",
utils::wait_for_message(std::chrono::seconds(60)));
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
co_return std::move(msg);
} catch (typename prepared_statements_cache::statement_is_too_big&) {
} catch(typename prepared_statements_cache::statement_is_too_big&) {
throw prepared_statement_is_too_big(query_string);
}
}
@@ -748,11 +765,15 @@ static std::string hash_target(std::string_view query_string, std::string_view k
return ret;
}
prepared_cache_key_type query_processor::compute_id(std::string_view query_string, std::string_view keyspace, dialect d) {
prepared_cache_key_type query_processor::compute_id(
std::string_view query_string,
std::string_view keyspace,
dialect d) {
return prepared_cache_key_type(md5_hasher::calculate(hash_target(query_string, keyspace)), d);
}
std::unique_ptr<prepared_statement> query_processor::get_statement(const std::string_view& query, const service::client_state& client_state, dialect d) {
std::unique_ptr<prepared_statement>
query_processor::get_statement(const std::string_view& query, const service::client_state& client_state, dialect d) {
// Measuring allocation cost requires that no yield points exist
// between bytes_before and bytes_after. It needs fixing if this
// function is ever futurized.
@@ -777,7 +798,8 @@ std::unique_ptr<prepared_statement> query_processor::get_statement(const std::st
return p;
}
std::unique_ptr<raw::parsed_statement> query_processor::parse_statement(const std::string_view& query, dialect d) {
std::unique_ptr<raw::parsed_statement>
query_processor::parse_statement(const std::string_view& query, dialect d) {
try {
{
const char* error_injection_key = "query_processor-parse_statement-test_failure";
@@ -802,7 +824,8 @@ std::unique_ptr<raw::parsed_statement> query_processor::parse_statement(const st
}
}
std::vector<std::unique_ptr<raw::parsed_statement>> query_processor::parse_statements(std::string_view queries, dialect d) {
std::vector<std::unique_ptr<raw::parsed_statement>>
query_processor::parse_statements(std::string_view queries, dialect d) {
try {
auto statements = util::do_with_parser(queries, d, std::mem_fn(&cql3_parser::CqlParser::queries));
if (statements.empty()) {
@@ -831,10 +854,15 @@ std::pair<std::reference_wrapper<struct query_processor::remote>, gate::holder>
on_internal_error(log, "attempted to perform distributed query when `query_processor::remote` is unavailable");
}
query_options query_processor::make_internal_options(const statements::prepared_statement::checked_weak_ptr& p, const std::vector<data_value_or_unset>& values,
db::consistency_level cl, int32_t page_size, service::node_local_only node_local_only) const {
query_options query_processor::make_internal_options(
const statements::prepared_statement::checked_weak_ptr& p,
const std::vector<data_value_or_unset>& values,
db::consistency_level cl,
int32_t page_size,
service::node_local_only node_local_only) const {
if (p->bound_names.size() != values.size()) {
throw std::invalid_argument(format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
throw std::invalid_argument(
format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
}
auto ni = p->bound_names.begin();
raw_value_vector_with_unset bound_values;
@@ -842,28 +870,32 @@ query_options query_processor::make_internal_options(const statements::prepared_
bound_values.unset.resize(values.size());
for (auto& var : values) {
auto& n = *ni;
std::visit(overloaded_functor{[&](const data_value& v) {
if (v.type() == bytes_type) {
bound_values.values.emplace_back(cql3::raw_value::make_value(value_cast<bytes>(v)));
} else if (v.is_null()) {
bound_values.values.emplace_back(cql3::raw_value::make_null());
} else {
bound_values.values.emplace_back(cql3::raw_value::make_value(n->type->decompose(v)));
}
},
[&](const unset_value&) {
bound_values.values.emplace_back(cql3::raw_value::make_null());
bound_values.unset[std::distance(p->bound_names.begin(), ni)] = true;
}},
var);
std::visit(overloaded_functor {
[&] (const data_value& v) {
if (v.type() == bytes_type) {
bound_values.values.emplace_back(cql3::raw_value::make_value(value_cast<bytes>(v)));
} else if (v.is_null()) {
bound_values.values.emplace_back(cql3::raw_value::make_null());
} else {
bound_values.values.emplace_back(cql3::raw_value::make_value(n->type->decompose(v)));
}
}, [&] (const unset_value&) {
bound_values.values.emplace_back(cql3::raw_value::make_null());
bound_values.unset[std::distance(p->bound_names.begin(), ni)] = true;
}
}, var);
++ni;
}
return query_options(cl, std::move(bound_values),
cql3::query_options::specific_options{.page_size = page_size,
.state = {},
.serial_consistency = db::consistency_level::SERIAL,
.timestamp = api::missing_timestamp,
.node_local_only = node_local_only});
return query_options(
cl,
std::move(bound_values),
cql3::query_options::specific_options {
.page_size = page_size,
.state = {},
.serial_consistency = db::consistency_level::SERIAL,
.timestamp = api::missing_timestamp,
.node_local_only = node_local_only
});
}
statements::prepared_statement::checked_weak_ptr query_processor::prepare_internal(const sstring& query_string) {
@@ -885,7 +917,11 @@ struct internal_query_state {
};
internal_query_state query_processor::create_paged_state(
const sstring& query_string, db::consistency_level cl, const data_value_list& values, int32_t page_size, std::optional<service::query_state> qs) {
const sstring& query_string,
db::consistency_level cl,
const data_value_list& values,
int32_t page_size,
std::optional<service::query_state> qs) {
auto p = prepare_internal(query_string);
auto opts = make_internal_options(p, values, cl, page_size);
if (!qs) {
@@ -899,7 +935,8 @@ bool query_processor::has_more_results(cql3::internal_query_state& state) const
}
future<> query_processor::for_each_cql_result(
cql3::internal_query_state& state, noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
cql3::internal_query_state& state,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
do {
auto msg = co_await execute_paged_internal(state);
for (auto& row : *msg) {
@@ -910,18 +947,17 @@ future<> query_processor::for_each_cql_result(
} while (has_more_results(state));
}
future<::shared_ptr<untyped_result_set>> query_processor::execute_paged_internal(internal_query_state& state) {
future<::shared_ptr<untyped_result_set>>
query_processor::execute_paged_internal(internal_query_state& state) {
state.p->statement->validate(*this, service::client_state::for_internal_calls());
::shared_ptr<cql_transport::messages::result_message> msg = co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
::shared_ptr<cql_transport::messages::result_message> msg =
co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
class visitor : public result_message::visitor_base {
internal_query_state& _state;
query_processor& _qp;
public:
visitor(internal_query_state& state, query_processor& qp)
: _state(state)
, _qp(qp) {
visitor(internal_query_state& state, query_processor& qp) : _state(state), _qp(qp) {
}
virtual ~visitor() = default;
void visit(const result_message::rows& rmrs) override {
@@ -950,14 +986,23 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_paged_internal
co_return ::make_shared<untyped_result_set>(msg);
}
future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
const sstring& query_string, db::consistency_level cl, const data_value_list& values, cache_internal cache) {
future<::shared_ptr<untyped_result_set>>
query_processor::execute_internal(
const sstring& query_string,
db::consistency_level cl,
const data_value_list& values,
cache_internal cache) {
auto qs = query_state_for_internal_call();
co_return co_await execute_internal(query_string, cl, qs, values, cache);
}
future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
const sstring& query_string, db::consistency_level cl, service::query_state& query_state, const data_value_list& values, cache_internal cache) {
future<::shared_ptr<untyped_result_set>>
query_processor::execute_internal(
const sstring& query_string,
db::consistency_level cl,
service::query_state& query_state,
const data_value_list& values,
cache_internal cache) {
if (log.is_enabled(logging::log_level::trace)) {
log.trace("execute_internal: {}\"{}\" ({})", cache ? "(cached) " : "", query_string, fmt::join(values, ", "));
@@ -975,7 +1020,10 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
}
future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
const sstring query_string, service::query_state& query_state, api::timestamp_type timestamp, std::vector<data_value_or_unset> values) {
const sstring query_string,
service::query_state& query_state,
api::timestamp_type timestamp,
std::vector<data_value_or_unset> values) {
log.debug("get_mutations_internal: \"{}\" ({})", query_string, fmt::join(values, ", "));
auto stmt = prepare_internal(query_string);
auto mod_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(stmt->statement);
@@ -993,8 +1041,12 @@ future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
co_return co_await mod_stmt->get_mutations(*this, opts, timeout, true, timestamp, query_state, json_cache, std::move(keys));
}
future<::shared_ptr<untyped_result_set>> query_processor::execute_with_params(
statements::prepared_statement::checked_weak_ptr p, db::consistency_level cl, service::query_state& query_state, const data_value_list& values) {
future<::shared_ptr<untyped_result_set>>
query_processor::execute_with_params(
statements::prepared_statement::checked_weak_ptr p,
db::consistency_level cl,
service::query_state& query_state,
const data_value_list& values) {
auto opts = make_internal_options(p, values, cl);
auto statement = p->statement;
@@ -1002,24 +1054,30 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_with_params(
co_return ::make_shared<untyped_result_set>(msg);
}
future<::shared_ptr<result_message>> query_processor::do_execute_with_params(
service::query_state& query_state, shared_ptr<cql_statement> statement, const query_options& options, std::optional<service::group0_guard> guard) {
future<::shared_ptr<result_message>>
query_processor::do_execute_with_params(
service::query_state& query_state,
shared_ptr<cql_statement> statement,
const query_options& options, std::optional<service::group0_guard> guard) {
statement->validate(*this, service::client_state::for_internal_calls());
co_return co_await coroutine::try_future(statement->execute(*this, query_state, options, std::move(guard)));
}
future<::shared_ptr<cql_transport::messages::result_message>> query_processor::execute_batch_without_checking_exception_message(
::shared_ptr<statements::batch_statement> batch, service::query_state& query_state, query_options& options,
future<::shared_ptr<cql_transport::messages::result_message>>
query_processor::execute_batch_without_checking_exception_message(
::shared_ptr<statements::batch_statement> batch,
service::query_state& query_state,
query_options& options,
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state](auto& e) -> future<> {
try {
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
} catch (...) {
log.error("failed to cache the entry: {}", std::current_exception());
}
});
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) -> future<> {
try {
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
} catch (...) {
log.error("failed to cache the entry: {}", std::current_exception());
}
});
bool failed = access_future.failed();
co_await audit::inspect(batch, query_state, options, failed);
if (access_future.failed()) {
@@ -1028,28 +1086,30 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
batch->validate();
batch->validate(*this, query_state.get_client_state());
_stats.queries_by_cl[size_t(options.get_consistency())] += batch->get_statements().size();
if (log.is_enabled(logging::log_level::trace)) {
if (log.is_enabled(logging::log_level::trace)) {
std::ostringstream oss;
for (const auto& s : batch->get_statements()) {
oss << std::endl << s.statement->raw_cql_statement;
for (const auto& s: batch->get_statements()) {
oss << std::endl << s.statement->raw_cql_statement;
}
log.trace("execute_batch({}): {}", batch->get_statements().size(), oss.str());
}
co_return co_await batch->execute(*this, query_state, options, std::nullopt);
}
future<service::broadcast_tables::query_result> query_processor::execute_broadcast_table_query(const service::broadcast_tables::query& query) {
future<service::broadcast_tables::query_result>
query_processor::execute_broadcast_table_query(const service::broadcast_tables::query& query) {
auto [remote_, holder] = remote();
co_return co_await service::broadcast_tables::execute(remote_.get().group0_client, query);
}
future<query::mapreduce_result> query_processor::mapreduce(query::mapreduce_request req, tracing::trace_state_ptr tr_state) {
future<query::mapreduce_result>
query_processor::mapreduce(query::mapreduce_request req, tracing::trace_state_ptr tr_state) {
auto [remote_, holder] = remote();
co_return co_await remote_.get().mapreducer.dispatch(std::move(req), std::move(tr_state));
}
future<::shared_ptr<messages::result_message>> query_processor::execute_schema_statement(
const statements::schema_altering_statement& stmt, service::query_state& state, const query_options& options, service::group0_batch& mc) {
future<::shared_ptr<messages::result_message>>
query_processor::execute_schema_statement(const statements::schema_altering_statement& stmt, service::query_state& state, const query_options& options, service::group0_batch& mc) {
if (this_shard_id() != 0) {
on_internal_error(log, "DDL must be executed on shard 0");
}
@@ -1103,8 +1163,7 @@ future<> query_processor::announce_schema_statement(const statements::schema_alt
co_await remote_.get().mm.announce(std::move(m), std::move(guard), description);
}
query_processor::migration_subscriber::migration_subscriber(query_processor* qp)
: _qp{qp} {
query_processor::migration_subscriber::migration_subscriber(query_processor* qp) : _qp{qp} {
}
void query_processor::migration_subscriber::on_create_keyspace(const sstring& ks_name) {
@@ -1130,7 +1189,10 @@ void query_processor::migration_subscriber::on_create_view(const sstring& ks_nam
void query_processor::migration_subscriber::on_update_keyspace(const sstring& ks_name) {
}
void query_processor::migration_subscriber::on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) {
void query_processor::migration_subscriber::on_update_column_family(
const sstring& ks_name,
const sstring& cf_name,
bool columns_changed) {
// #1255: Ignoring columns_changed deliberately.
log.info("Column definitions for {}.{} changed, invalidating related prepared statements", ks_name, cf_name);
remove_invalid_prepared_statements(ks_name, cf_name);
@@ -1145,7 +1207,9 @@ void query_processor::migration_subscriber::on_update_function(const sstring& ks
void query_processor::migration_subscriber::on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) {
}
void query_processor::migration_subscriber::on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) {
void query_processor::migration_subscriber::on_update_view(
const sstring& ks_name,
const sstring& view_name, bool columns_changed) {
// scylladb/scylladb#16392 - Materialized views are also tables so we need at least handle
// them as such when changed.
on_update_column_family(ks_name, view_name, columns_changed);
@@ -1174,28 +1238,39 @@ void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name,
remove_invalid_prepared_statements(ks_name, view_name);
}
void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::optional<sstring> cf_name) {
_qp->_prepared_cache.remove_if([&](::shared_ptr<cql_statement> stmt) {
void query_processor::migration_subscriber::remove_invalid_prepared_statements(
sstring ks_name,
std::optional<sstring> cf_name) {
_qp->_prepared_cache.remove_if([&] (::shared_ptr<cql_statement> stmt) {
return this->should_invalidate(ks_name, cf_name, stmt);
});
}
bool query_processor::migration_subscriber::should_invalidate(sstring ks_name, std::optional<sstring> cf_name, ::shared_ptr<cql_statement> statement) {
bool query_processor::migration_subscriber::should_invalidate(
sstring ks_name,
std::optional<sstring> cf_name,
::shared_ptr<cql_statement> statement) {
return statement->depends_on(ks_name, cf_name);
}
future<> query_processor::query_internal(const sstring& query_string, db::consistency_level cl, const data_value_list& values, int32_t page_size,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f, std::optional<service::query_state> qs) {
future<> query_processor::query_internal(
const sstring& query_string,
db::consistency_level cl,
const data_value_list& values,
int32_t page_size,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
std::optional<service::query_state> qs) {
auto query_state = create_paged_state(query_string, cl, values, page_size, std::move(qs));
co_return co_await for_each_cql_result(query_state, std::move(f));
}
future<> query_processor::query_internal(const sstring& query_string, noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
future<> query_processor::query_internal(
const sstring& query_string,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
return query_internal(query_string, db::consistency_level::ONE, {}, 1000, std::move(f));
}
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_shard(
unsigned shard, cql3::computed_function_values cached_fn_calls, bool track) {
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_shard(unsigned shard, cql3::computed_function_values cached_fn_calls, bool track) {
if (track) {
_proxy.get_stats().replica_cross_shard_ops++;
}
@@ -1203,8 +1278,7 @@ shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_s
return ::make_shared<cql_transport::messages::result_message::bounce>(my_host_id, shard, std::move(cached_fn_calls));
}
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_node(
locator::tablet_replica replica, cql3::computed_function_values cached_fn_calls, seastar::lowres_clock::time_point timeout, bool is_write) {
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_node(locator::tablet_replica replica, cql3::computed_function_values cached_fn_calls, seastar::lowres_clock::time_point timeout, bool is_write) {
get_cql_stats().forwarded_requests++;
return ::make_shared<cql_transport::messages::result_message::bounce>(replica.host, replica.shard, std::move(cached_fn_calls), timeout, is_write);
}
@@ -1221,7 +1295,7 @@ void query_processor::update_authorized_prepared_cache_config() {
utils::loading_cache_config cfg;
cfg.max_size = _mcfg.authorized_prepared_cache_size;
cfg.expiry = std::min(std::chrono::milliseconds(_db.get_config().permissions_validity_in_ms()),
std::chrono::duration_cast<std::chrono::milliseconds>(prepared_statements_cache::entry_expiry));
std::chrono::duration_cast<std::chrono::milliseconds>(prepared_statements_cache::entry_expiry));
cfg.refresh = std::chrono::milliseconds(_db.get_config().permissions_update_interval_in_ms());
if (!_authorized_prepared_cache.update_config(std::move(cfg))) {
@@ -1233,4 +1307,4 @@ void query_processor::reset_cache() {
_authorized_prepared_cache.reset();
}
} // namespace cql3
}

View File

@@ -1582,7 +1582,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"\tnone : No auditing enabled.\n"
"\tsyslog : Audit messages sent to Syslog.\n"
"\ttable : Audit messages written to column family named audit.audit_log.\n")
, audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,AUTH,ADMIN", "Comma separated list of operation categories that should be audited.")
, audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,DDL,AUTH,ADMIN", "Comma separated list of operation categories that should be audited.")
, audit_tables(this, "audit_tables", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of table names (<keyspace>.<table>) that will be audited.")
, audit_keyspaces(this, "audit_keyspaces", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of keyspaces that will be audited. All tables in those keyspaces will be audited")
, audit_unix_socket_path(this, "audit_unix_socket_path", value_status::Used, "/dev/log", "The path to the unix socket used for writing to syslog. Only applicable when audit is set to syslog.")

View File

@@ -63,14 +63,15 @@ namespace db {
namespace schema_tables {
static constexpr std::initializer_list<table_kind> all_table_kinds = {table_kind::table, table_kind::view};
static constexpr std::initializer_list<table_kind> all_table_kinds = {
table_kind::table,
table_kind::view
};
static schema_ptr get_table_holder(table_kind k) {
switch (k) {
case table_kind::table:
return tables();
case table_kind::view:
return views();
case table_kind::table: return tables();
case table_kind::view: return views();
}
abort();
}
@@ -93,18 +94,15 @@ void table_selector::add(sstring name) {
}
}
} // namespace schema_tables
}
} // namespace db
}
template <>
struct fmt::formatter<db::schema_tables::table_kind> {
constexpr auto parse(format_parse_context& ctx) {
return ctx.begin();
}
template <> struct fmt::formatter<db::schema_tables::table_kind> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
auto format(db::schema_tables::table_kind k, fmt::format_context& ctx) const {
switch (k) {
using enum db::schema_tables::table_kind;
using enum db::schema_tables::table_kind;
case table:
return fmt::format_to(ctx.out(), "table");
case view:
@@ -127,8 +125,11 @@ static std::optional<table_id> table_id_from_mutations(const schema_mutations& s
return table_id(table_row.get_nonnull<utils::UUID>("id"));
}
static future<std::map<table_id, schema_mutations>> read_tables_for_keyspaces(sharded<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names,
table_kind kind, const std::unordered_map<sstring, table_selector>& tables_per_keyspace) {
static
future<std::map<table_id, schema_mutations>>
read_tables_for_keyspaces(sharded<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names, table_kind kind,
const std::unordered_map<sstring, table_selector>& tables_per_keyspace)
{
std::map<table_id, schema_mutations> result;
for (auto&& [keyspace_name, sel] : tables_per_keyspace) {
if (!sel.tables.contains(kind)) {
@@ -148,30 +149,32 @@ static future<std::map<table_id, schema_mutations>> read_tables_for_keyspaces(sh
// Extracts the names of tables affected by a schema mutation.
// The mutation must target one of the tables in schema_tables_holding_schema_mutations().
static table_selector get_affected_tables(const sstring& keyspace_name, const mutation& m) {
static
table_selector get_affected_tables(const sstring& keyspace_name, const mutation& m) {
const schema& s = *m.schema();
auto get_table_name = [&](const clustering_key& ck) {
auto get_table_name = [&] (const clustering_key& ck) {
// The first component of the clustering key in each table listed in
// schema_tables_holding_schema_mutations contains the table name.
return value_cast<sstring>(utf8_type->deserialize(ck.get_component(s, 0)));
};
table_selector result;
if (m.partition().partition_tombstone()) {
slogger.trace("Mutation of {}.{} for keyspace {} contains a partition tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
slogger.trace("Mutation of {}.{} for keyspace {} contains a partition tombstone",
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
result.all_in_keyspace = true;
}
for (auto&& e : m.partition().row_tombstones()) {
const range_tombstone& rt = e.tombstone();
if (rt.start.size(s) == 0 || rt.end.size(s) == 0) {
slogger.trace(
"Mutation of {}.{} for keyspace {} contains a multi-table range tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
slogger.trace("Mutation of {}.{} for keyspace {} contains a multi-table range tombstone",
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
result.all_in_keyspace = true;
break;
}
auto table_name = get_table_name(rt.start);
if (table_name != get_table_name(rt.end)) {
slogger.trace(
"Mutation of {}.{} for keyspace {} contains a multi-table range tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
slogger.trace("Mutation of {}.{} for keyspace {} contains a multi-table range tombstone",
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
result.all_in_keyspace = true;
break;
}
@@ -180,17 +183,16 @@ static table_selector get_affected_tables(const sstring& keyspace_name, const mu
for (auto&& row : m.partition().clustered_rows()) {
result.add(get_table_name(row.key()));
}
slogger.trace("Mutation of {}.{} for keyspace {} affects tables: {}, all_in_keyspace: {}", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name,
result.tables, result.all_in_keyspace);
slogger.trace("Mutation of {}.{} for keyspace {} affects tables: {}, all_in_keyspace: {}",
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name, result.tables, result.all_in_keyspace);
return result;
}
future<schema_result> static read_schema_for_keyspaces(
sharded<service::storage_proxy>& proxy, const sstring& schema_table_name, const std::set<sstring>& keyspace_names) {
auto map = [&proxy, schema_table_name](const sstring& keyspace_name) {
return read_schema_partition_for_keyspace(proxy, schema_table_name, keyspace_name);
};
auto insert = [](schema_result&& result, auto&& schema_entity) {
future<schema_result>
static read_schema_for_keyspaces(sharded<service::storage_proxy>& proxy, const sstring& schema_table_name, const std::set<sstring>& keyspace_names)
{
auto map = [&proxy, schema_table_name] (const sstring& keyspace_name) { return read_schema_partition_for_keyspace(proxy, schema_table_name, keyspace_name); };
auto insert = [] (schema_result&& result, auto&& schema_entity) {
if (!schema_entity.second->empty()) {
result.insert(std::move(schema_entity));
}
@@ -200,11 +202,11 @@ future<schema_result> static read_schema_for_keyspaces(
}
// Returns names of live table definitions of given keyspace
future<std::vector<sstring>> static read_table_names_of_keyspace(
sharded<service::storage_proxy>& proxy, const sstring& keyspace_name, schema_ptr schema_table) {
future<std::vector<sstring>>
static read_table_names_of_keyspace(sharded<service::storage_proxy>& proxy, const sstring& keyspace_name, schema_ptr schema_table) {
auto pkey = dht::decorate_key(*schema_table, partition_key::from_singular(*schema_table, keyspace_name));
auto&& rs = co_await db::system_keyspace::query(proxy.local().get_db(), schema_table->ks_name(), schema_table->cf_name(), pkey);
co_return rs->rows() | std::views::transform([schema_table](const query::result_set_row& row) {
co_return rs->rows() | std::views::transform([schema_table] (const query::result_set_row& row) {
const sstring name = schema_table->clustering_key_columns().begin()->name_as_text();
return row.get_nonnull<sstring>(name);
}) | std::ranges::to<std::vector>();
@@ -240,7 +242,8 @@ static void maybe_delete_schema_version(mutation& m) {
}
}
future<> schema_applier::merge_keyspaces() {
future<> schema_applier::merge_keyspaces()
{
/*
* - we don't care about entriesOnlyOnLeft() or entriesInCommon(), because only the changes are of interest to us
* - of all entriesOnlyOnRight(), we only care about ones that have live columns; it's possible to have a ColumnFamily
@@ -277,16 +280,21 @@ future<> schema_applier::merge_keyspaces() {
for (auto& name : created) {
slogger.info("Creating keyspace {}", name);
auto sk_after_v = _after.scylla_keyspaces.contains(name) ? _after.scylla_keyspaces.at(name) : nullptr;
auto ksm = co_await create_keyspace_metadata(schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
auto ksm = co_await create_keyspace_metadata(
schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
_affected_keyspaces.created.push_back(
co_await replica::database::prepare_create_keyspace_on_all_shards(sharded_db, _proxy, *ksm, _pending_token_metadata));
co_await replica::database::prepare_create_keyspace_on_all_shards(
sharded_db, _proxy, *ksm, _pending_token_metadata));
_affected_keyspaces.names.created.insert(name);
}
for (auto& name : altered) {
slogger.info("Altering keyspace {}", name);
auto sk_after_v = _after.scylla_keyspaces.contains(name) ? _after.scylla_keyspaces.at(name) : nullptr;
auto tmp_ksm = co_await create_keyspace_metadata(schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
_affected_keyspaces.altered.push_back(co_await replica::database::prepare_update_keyspace_on_all_shards(sharded_db, *tmp_ksm, _pending_token_metadata));
auto tmp_ksm = co_await create_keyspace_metadata(
schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
_affected_keyspaces.altered.push_back(
co_await replica::database::prepare_update_keyspace_on_all_shards(
sharded_db, *tmp_ksm, _pending_token_metadata));
_affected_keyspaces.names.altered.insert(name);
}
for (auto& key : _affected_keyspaces.names.dropped) {
@@ -319,7 +327,7 @@ static std::vector<column_definition> get_primary_key_definition(const schema_pt
static std::vector<bytes> get_primary_key(const std::vector<column_definition>& primary_key, const query::result_set_row* row) {
std::vector<bytes> key;
for (const auto& column : primary_key) {
const data_value* val = row->get_data_value(column.name_as_text());
const data_value *val = row->get_data_value(column.name_as_text());
key.push_back(val->serialize_nonnull());
}
return key;
@@ -330,7 +338,7 @@ static std::map<std::vector<bytes>, const query::result_set_row*> build_row_map(
const std::vector<query::result_set_row>& rows = result.rows();
auto primary_key = get_primary_key_definition(result.schema());
std::map<std::vector<bytes>, const query::result_set_row*> ret;
for (const auto& row : rows) {
for (const auto& row: rows) {
auto key = get_primary_key(primary_key, &row);
ret.insert(std::pair(std::move(key), &row));
}
@@ -383,8 +391,8 @@ struct aggregate_diff {
std::vector<std::pair<const query::result_set_row*, const query::result_set_row*>> dropped;
};
static aggregate_diff diff_aggregates_rows(
const schema_result& aggr_before, const schema_result& aggr_after, const schema_result& scylla_aggr_before, const schema_result& scylla_aggr_after) {
static aggregate_diff diff_aggregates_rows(const schema_result& aggr_before, const schema_result& aggr_after,
const schema_result& scylla_aggr_before, const schema_result& scylla_aggr_after) {
using map = std::map<std::vector<bytes>, const query::result_set_row*>;
auto aggr_diff = difference(aggr_before, aggr_after, indirect_equal_to<lw_shared_ptr<query::result_set>>());
@@ -428,11 +436,15 @@ static aggregate_diff diff_aggregates_rows(
for (const auto& k : diff.entries_only_on_left) {
auto entry = scylla_aggr_rows_before.find(k);
dropped.push_back({aggr_before_rows.find(k)->second, (entry != scylla_aggr_rows_before.end()) ? entry->second : nullptr});
dropped.push_back({
aggr_before_rows.find(k)->second, (entry != scylla_aggr_rows_before.end()) ? entry->second : nullptr
});
}
for (const auto& k : diff.entries_only_on_right) {
auto entry = scylla_aggr_rows_after.find(k);
created.push_back({aggr_after_rows.find(k)->second, (entry != scylla_aggr_rows_after.end()) ? entry->second : nullptr});
created.push_back({
aggr_after_rows.find(k)->second, (entry != scylla_aggr_rows_after.end()) ? entry->second : nullptr
});
}
}
@@ -440,10 +452,11 @@ static aggregate_diff diff_aggregates_rows(
}
// see the comments for merge_keyspaces()
future<> schema_applier::merge_types() {
future<> schema_applier::merge_types()
{
auto diff = diff_rows(_before.types, _after.types);
co_await _affected_user_types.start();
co_await _affected_user_types.invoke_on_all([&](affected_user_types_per_shard& af) mutable -> future<> {
co_await _affected_user_types.invoke_on_all([&] (affected_user_types_per_shard& af) mutable -> future<> {
auto& db = _proxy.local().get_db().local();
std::map<sstring, std::reference_wrapper<replica::keyspace>> new_keyspaces_per_shard;
@@ -465,12 +478,16 @@ future<> schema_applier::merge_types() {
// version of view to "before" version of base table and "after" to "after"
// respectively.
enum class schema_diff_side {
left, // old, before
left, // old, before
right, // new, after
};
static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>& proxy, const std::map<table_id, schema_mutations>& before,
const std::map<table_id, schema_mutations>& after, bool reload, noncopyable_function<schema_ptr(schema_mutations sm, schema_diff_side)> create_schema) {
static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>& proxy,
const std::map<table_id, schema_mutations>& before,
const std::map<table_id, schema_mutations>& after,
bool reload,
noncopyable_function<schema_ptr (schema_mutations sm, schema_diff_side)> create_schema)
{
schema_diff_per_shard d;
auto diff = difference(before, after);
for (auto&& key : diff.entries_only_on_left) {
@@ -490,10 +507,10 @@ static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>&
d.altered.emplace_back(schema_diff_per_shard::altered_schema{s_before, s});
}
if (reload) {
for (auto&& key : diff.entries_in_common) {
for (auto&& key: diff.entries_in_common) {
auto s = create_schema(std::move(after.at(key)), schema_diff_side::right);
slogger.info("Reloading {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
d.altered.emplace_back(schema_diff_per_shard::altered_schema{s, s});
d.altered.emplace_back(schema_diff_per_shard::altered_schema {s, s});
}
}
return d;
@@ -507,9 +524,7 @@ static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>&
constexpr size_t max_concurrent = 8;
in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(
replica::database& db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types)
: _stored_user_types(db.as_user_types_storage()) {
in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(replica::database& db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) : _stored_user_types(db.as_user_types_storage()) {
// initialize metadata for new keyspaces
for (auto& ks_per_shard : affected_keyspaces.created) {
auto metadata = ks_per_shard[this_shard_id()]->metadata();
@@ -537,7 +552,7 @@ in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(
auto& ks_name = type->_keyspace;
_in_progress_types[ks_name].remove_type(type);
}
for (const auto& ks_name : affected_keyspaces.names.dropped) {
for (const auto &ks_name : affected_keyspaces.names.dropped) {
// can't reference a type when it's keyspace is being dropped
_in_progress_types[ks_name] = data_dictionary::user_types_metadata();
}
@@ -555,9 +570,8 @@ std::shared_ptr<data_dictionary::user_types_storage> in_progress_types_storage_p
return _stored_user_types;
}
future<> in_progress_types_storage::init(
sharded<replica::database>& sharded_db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) {
co_await sharded_db.invoke_on_all([&](replica::database& db) {
future<> in_progress_types_storage::init(sharded<replica::database>& sharded_db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) {
co_await sharded_db.invoke_on_all([&] (replica::database& db) {
shards[this_shard_id()] = make_foreign(seastar::make_shared<in_progress_types_storage_per_shard>(db, affected_keyspaces, affected_types));
});
}
@@ -571,7 +585,8 @@ in_progress_types_storage_per_shard& in_progress_types_storage::local() {
// that when a base schema and a subset of its views are modified together (i.e.,
// upon an alter table or alter type statement), then they are published together
// as well, without any deferring in-between.
future<> schema_applier::merge_tables_and_views() {
future<> schema_applier::merge_tables_and_views()
{
auto& user_types = _types_storage.local();
co_await _affected_tables_and_views.tables_and_views.start();
@@ -582,10 +597,10 @@ future<> schema_applier::merge_tables_and_views() {
// Create CDC tables before non-CDC base tables, because we want the base tables with CDC enabled
// to point to their CDC tables.
local_cdc = diff_table_or_view(_proxy, _before.cdc, _after.cdc, _reload, [&](schema_mutations sm, schema_diff_side) {
local_cdc = diff_table_or_view(_proxy, _before.cdc, _after.cdc, _reload, [&] (schema_mutations sm, schema_diff_side) {
return create_table_from_mutations(_proxy, std::move(sm), user_types, nullptr);
});
local_tables = diff_table_or_view(_proxy, _before.tables, _after.tables, _reload, [&](schema_mutations sm, schema_diff_side side) {
local_tables = diff_table_or_view(_proxy, _before.tables, _after.tables, _reload, [&] (schema_mutations sm, schema_diff_side side) {
// If the table has CDC enabled, find the CDC schema version and set it in the table schema.
// If the table is created or altered with CDC enabled, then the CDC
// table is also created or altered in the same operation, so we can
@@ -621,7 +636,7 @@ future<> schema_applier::merge_tables_and_views() {
return create_table_from_mutations(_proxy, std::move(sm), user_types, cdc_schema);
});
local_views = diff_table_or_view(_proxy, _before.views, _after.views, _reload, [&](schema_mutations sm, schema_diff_side side) {
local_views = diff_table_or_view(_proxy, _before.views, _after.views, _reload, [&] (schema_mutations sm, schema_diff_side side) {
// The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
// If we don't do it we are leaving a window where write commands to this schema are illegal.
// There are 3 possibilities:
@@ -668,26 +683,31 @@ future<> schema_applier::merge_tables_and_views() {
frozen_schema_diff tables_frozen = co_await local_tables.freeze();
frozen_schema_diff cdc_frozen = co_await local_cdc.freeze();
frozen_schema_diff views_frozen = co_await local_views.freeze();
co_await _affected_tables_and_views.tables_and_views.invoke_on_others(
[this, &tables_frozen, &cdc_frozen, &views_frozen](affected_tables_and_views_per_shard& tables_and_views) -> future<> {
auto& db = _proxy.local().get_db().local();
tables_and_views.tables = co_await schema_diff_per_shard::copy_from(db, _types_storage, tables_frozen);
tables_and_views.cdc = co_await schema_diff_per_shard::copy_from(db, _types_storage, cdc_frozen);
tables_and_views.views = co_await schema_diff_per_shard::copy_from(db, _types_storage, views_frozen);
});
co_await _affected_tables_and_views.tables_and_views.invoke_on_others([this, &tables_frozen, &cdc_frozen, &views_frozen] (affected_tables_and_views_per_shard& tables_and_views) -> future<> {
auto& db = _proxy.local().get_db().local();
tables_and_views.tables = co_await schema_diff_per_shard::copy_from(
db, _types_storage, tables_frozen);
tables_and_views.cdc = co_await schema_diff_per_shard::copy_from(
db, _types_storage, cdc_frozen);
tables_and_views.views = co_await schema_diff_per_shard::copy_from(
db, _types_storage, views_frozen);
});
auto& db = _proxy.local().get_db();
co_await max_concurrent_for_each(local_views.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
co_await max_concurrent_for_each(local_views.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
auto uuid = dt->id();
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
_affected_tables_and_views.table_shards.insert({uuid,
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
});
co_await max_concurrent_for_each(local_tables.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
co_await max_concurrent_for_each(local_tables.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
auto uuid = dt->id();
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
_affected_tables_and_views.table_shards.insert({uuid,
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
});
co_await max_concurrent_for_each(local_cdc.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
co_await max_concurrent_for_each(local_cdc.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
auto uuid = dt->id();
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
_affected_tables_and_views.table_shards.insert({uuid,
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
});
}
@@ -699,8 +719,8 @@ future<frozen_schema_diff> schema_diff_per_shard::freeze() const {
}
for (const auto& a : altered) {
result.altered.push_back(frozen_schema_diff::altered_schema{
.old_schema = extended_frozen_schema(a.old_schema),
.new_schema = extended_frozen_schema(a.new_schema),
.old_schema = extended_frozen_schema(a.old_schema),
.new_schema = extended_frozen_schema(a.new_schema),
});
co_await coroutine::maybe_yield();
}
@@ -723,8 +743,8 @@ future<schema_diff_per_shard> schema_diff_per_shard::copy_from(replica::database
}
for (const auto& a : oth.altered) {
result.altered.push_back(schema_diff_per_shard::altered_schema{
.old_schema = a.old_schema.unfreeze(commited_ctxt),
.new_schema = a.new_schema.unfreeze(ctxt),
.old_schema = a.old_schema.unfreeze(commited_ctxt),
.new_schema = a.new_schema.unfreeze(ctxt),
});
co_await coroutine::maybe_yield();
}
@@ -738,7 +758,7 @@ future<schema_diff_per_shard> schema_diff_per_shard::copy_from(replica::database
static future<> notify_tables_and_views(service::migration_notifier& notifier, const affected_tables_and_views& diff) {
auto it = diff.tables_and_views.local().columns_changed.cbegin();
auto notify = [&](auto& r, auto&& f) -> future<> {
auto notify = [&] (auto& r, auto&& f) -> future<> {
co_await max_concurrent_for_each(r, max_concurrent, std::move(f));
};
@@ -747,41 +767,24 @@ static future<> notify_tables_and_views(service::migration_notifier& notifier, c
const auto& views = diff.tables_and_views.local().views;
// View drops are notified first, because a table can only be dropped if its views are already deleted
co_await notify(views.dropped, [&](auto&& dt) {
return notifier.drop_view(view_ptr(dt));
});
co_await notify(tables.dropped, [&](auto&& dt) {
return notifier.drop_column_family(dt);
});
co_await notify(cdc.dropped, [&](auto&& dt) {
return notifier.drop_column_family(dt);
});
co_await notify(views.dropped, [&] (auto&& dt) { return notifier.drop_view(view_ptr(dt)); });
co_await notify(tables.dropped, [&] (auto&& dt) { return notifier.drop_column_family(dt); });
co_await notify(cdc.dropped, [&] (auto&& dt) { return notifier.drop_column_family(dt); });
// Table creations are notified first, in case a view is created right after the table
co_await notify(tables.created, [&](auto&& gs) {
return notifier.create_column_family(gs);
});
co_await notify(cdc.created, [&](auto&& gs) {
return notifier.create_column_family(gs);
});
co_await notify(views.created, [&](auto&& gs) {
return notifier.create_view(view_ptr(gs));
});
co_await notify(tables.created, [&] (auto&& gs) { return notifier.create_column_family(gs); });
co_await notify(cdc.created, [&] (auto&& gs) { return notifier.create_column_family(gs); });
co_await notify(views.created, [&] (auto&& gs) { return notifier.create_view(view_ptr(gs)); });
// Table altering is notified first, in case new base columns appear
co_await notify(tables.altered, [&](auto&& altered) {
return notifier.update_column_family(altered.new_schema, *it++);
});
co_await notify(cdc.altered, [&](auto&& altered) {
return notifier.update_column_family(altered.new_schema, *it++);
});
co_await notify(views.altered, [&](auto&& altered) {
return notifier.update_view(view_ptr(altered.new_schema), *it++);
});
co_await notify(tables.altered, [&] (auto&& altered) { return notifier.update_column_family(altered.new_schema, *it++); });
co_await notify(cdc.altered, [&] (auto&& altered) { return notifier.update_column_family(altered.new_schema, *it++); });
co_await notify(views.altered, [&] (auto&& altered) { return notifier.update_view(view_ptr(altered.new_schema), *it++); });
}
static void drop_cached_func(replica::database& db, const query::result_set_row& row) {
auto language = row.get_nonnull<sstring>("language");
if (language == "wasm") {
cql3::functions::function_name name{row.get_nonnull<sstring>("keyspace_name"), row.get_nonnull<sstring>("function_name")};
cql3::functions::function_name name{
row.get_nonnull<sstring>("keyspace_name"), row.get_nonnull<sstring>("function_name")};
auto arg_types = read_arg_types(row, name.keyspace, db.user_types());
db.lang().remove(name, arg_types);
}
@@ -790,13 +793,14 @@ static void drop_cached_func(replica::database& db, const query::result_set_row&
future<> schema_applier::merge_functions() {
auto diff = diff_rows(_before.functions, _after.functions);
co_await _functions_batch.start();
co_await _functions_batch.invoke_on_all(coroutine::lambda([&](cql3::functions::change_batch& batch) -> future<> {
co_await _functions_batch.invoke_on_all(coroutine::lambda([&] (cql3::functions::change_batch& batch) -> future<> {
auto& db = _proxy.local().get_db().local();
for (const auto& val : diff.created) {
batch.add_function(co_await create_func(db, *val, _types_storage.local()));
}
for (const auto& val : diff.dropped) {
cql3::functions::function_name name{val->get_nonnull<sstring>("keyspace_name"), val->get_nonnull<sstring>("function_name")};
cql3::functions::function_name name{
val->get_nonnull<sstring>("keyspace_name"), val->get_nonnull<sstring>("function_name")};
auto commited_storage = _types_storage.local().committed_storage();
auto arg_types = read_arg_types(*val, name.keyspace, *commited_storage);
// as we don't yield between dropping cache and committing batch
@@ -814,13 +818,14 @@ future<> schema_applier::merge_functions() {
future<> schema_applier::merge_aggregates() {
auto diff = diff_aggregates_rows(_before.aggregates, _after.aggregates, _before.scylla_aggregates, _after.scylla_aggregates);
co_await _functions_batch.invoke_on_all([&](cql3::functions::change_batch& batch) -> future<> {
co_await _functions_batch.invoke_on_all([&] (cql3::functions::change_batch& batch)-> future<> {
auto& db = _proxy.local().get_db().local();
for (const auto& val : diff.created) {
batch.add_function(create_aggregate(db, *val.first, val.second, batch, _types_storage.local()));
}
for (const auto& val : diff.dropped) {
cql3::functions::function_name name{val.first->get_nonnull<sstring>("keyspace_name"), val.first->get_nonnull<sstring>("aggregate_name")};
cql3::functions::function_name name{
val.first->get_nonnull<sstring>("keyspace_name"), val.first->get_nonnull<sstring>("aggregate_name")};
auto commited_storage = _types_storage.local().committed_storage();
auto arg_types = read_arg_types(*val.first, name.keyspace, *commited_storage);
batch.remove_aggregate(name, arg_types);
@@ -855,15 +860,15 @@ future<schema_persisted_state> schema_applier::get_schema_persisted_state() {
auto [tables, cdc] = extract_cdc(std::move(tables_and_cdc));
schema_persisted_state v{
.keyspaces = co_await read_schema_for_keyspaces(_proxy, KEYSPACES, _keyspaces),
.scylla_keyspaces = co_await read_schema_for_keyspaces(_proxy, SCYLLA_KEYSPACES, _keyspaces),
.tables = std::move(tables),
.types = co_await read_schema_for_keyspaces(_proxy, TYPES, _keyspaces),
.views = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::view, _affected_tables),
.cdc = std::move(cdc),
.functions = co_await read_schema_for_keyspaces(_proxy, FUNCTIONS, _keyspaces),
.aggregates = co_await read_schema_for_keyspaces(_proxy, AGGREGATES, _keyspaces),
.scylla_aggregates = co_await read_schema_for_keyspaces(_proxy, SCYLLA_AGGREGATES, _keyspaces),
.keyspaces = co_await read_schema_for_keyspaces(_proxy, KEYSPACES, _keyspaces),
.scylla_keyspaces = co_await read_schema_for_keyspaces(_proxy, SCYLLA_KEYSPACES, _keyspaces),
.tables = std::move(tables),
.types = co_await read_schema_for_keyspaces(_proxy, TYPES, _keyspaces),
.views = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::view, _affected_tables),
.cdc = std::move(cdc),
.functions = co_await read_schema_for_keyspaces(_proxy, FUNCTIONS, _keyspaces),
.aggregates = co_await read_schema_for_keyspaces(_proxy, AGGREGATES, _keyspaces),
.scylla_aggregates = co_await read_schema_for_keyspaces(_proxy, SCYLLA_AGGREGATES, _keyspaces),
};
co_return v;
}
@@ -919,11 +924,10 @@ class pending_schema_getter : public service::schema_getter {
private:
schema_applier& _sa;
sharded<replica::database>& _db;
public:
pending_schema_getter(schema_applier& sa)
: _sa(sa)
, _db(sa._proxy.local().get_db()) {};
pending_schema_getter(schema_applier& sa) :
_sa(sa), _db(sa._proxy.local().get_db()) {
};
virtual flat_hash_map<sstring, locator::replication_strategy_ptr> get_keyspaces_replication() const override {
flat_hash_map<sstring, locator::replication_strategy_ptr> out;
@@ -985,7 +989,8 @@ future<> schema_applier::update_tablets() {
if (_tablet_hint) {
slogger.info("Tablet metadata changed");
pending_schema_getter getter{*this};
_token_metadata_change = co_await _ss.local().prepare_token_metadata_change(_pending_token_metadata.local(), getter);
_token_metadata_change = co_await _ss.local().prepare_token_metadata_change(
_pending_token_metadata.local(), getter);
}
}
@@ -994,7 +999,8 @@ future<> schema_applier::update_tablets() {
future<> schema_applier::load_mutable_token_metadata() {
locator::mutable_token_metadata_ptr current_token_metadata = co_await _ss.local().get_mutable_token_metadata_ptr();
if (_tablet_hint) {
auto new_token_metadata = co_await _ss.local().prepare_tablet_metadata(_tablet_hint, current_token_metadata);
auto new_token_metadata = co_await _ss.local().prepare_tablet_metadata(
_tablet_hint, current_token_metadata);
co_return co_await _pending_token_metadata.assign(new_token_metadata);
}
co_await _pending_token_metadata.assign(current_token_metadata);
@@ -1109,13 +1115,14 @@ future<> schema_applier::commit() {
// However, we can only acquire the (write) lock after preparing all
// entities for the pending schema change that need to iterate over tables_metadata;
// otherwise, such iteration would deadlock.
_metadata_locks = std::make_unique<replica::tables_metadata_lock_on_all_shards>(co_await replica::database::lock_tables_metadata(sharded_db));
_metadata_locks = std::make_unique<replica::tables_metadata_lock_on_all_shards>(
co_await replica::database::lock_tables_metadata(sharded_db));
// Run func first on shard 0
// to allow "seeding" of the effective_replication_map
// with a new e_r_m instance.
SCYLLA_ASSERT(this_shard_id() == 0);
commit_on_shard(sharded_db.local());
co_await sharded_db.invoke_on_others([this](replica::database& db) {
co_await sharded_db.invoke_on_others([this] (replica::database& db) {
commit_on_shard(db);
});
// unlock as some functions in post_commit() may read data under those locks
@@ -1147,11 +1154,12 @@ future<> schema_applier::finalize_tables_and_views() {
if (_tablet_hint) {
auto& db = sharded_db.local();
co_await db.get_compaction_manager().get_shared_tombstone_gc_state().flush_pending_repair_time_update(db);
co_await db.get_compaction_manager().get_shared_tombstone_gc_state().
flush_pending_repair_time_update(db);
_ss.local().wake_up_topology_state_machine();
}
co_await sharded_db.invoke_on_all([&diff](replica::database& db) -> future<> {
co_await sharded_db.invoke_on_all([&diff] (replica::database& db) -> future<> {
const auto& tables = diff.tables_and_views.local().tables;
const auto& cdc = diff.tables_and_views.local().cdc;
const auto& views = diff.tables_and_views.local().views;
@@ -1176,14 +1184,15 @@ future<> schema_applier::finalize_tables_and_views() {
//
// Drop column mapping entries for dropped tables since these will not be TTLed automatically
// and will stay there forever if we don't clean them up manually
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.created, max_concurrent, [this](const schema_ptr& gs) -> future<> {
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.created, max_concurrent, [this] (const schema_ptr& gs) -> future<> {
co_await store_column_mapping(_proxy, gs, false);
});
co_await max_concurrent_for_each(
diff.tables_and_views.local().tables.altered, max_concurrent, [this](const schema_diff_per_shard::altered_schema& altered) -> future<> {
co_await when_all_succeed(store_column_mapping(_proxy, altered.old_schema, true), store_column_mapping(_proxy, altered.new_schema, false));
});
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.dropped, max_concurrent, [this](const schema_ptr& s) -> future<> {
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.altered, max_concurrent, [this] (const schema_diff_per_shard::altered_schema& altered) -> future<> {
co_await when_all_succeed(
store_column_mapping(_proxy, altered.old_schema, true),
store_column_mapping(_proxy, altered.new_schema, false));
});
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.dropped, max_concurrent, [this] (const schema_ptr& s) -> future<> {
co_await drop_column_mapping(_sys_ks.local(), s->id(), s->version());
});
}
@@ -1191,7 +1200,7 @@ future<> schema_applier::finalize_tables_and_views() {
future<> schema_applier::post_commit() {
co_await finalize_tables_and_views();
auto& sharded_db = _proxy.local().get_db();
co_await sharded_db.invoke_on_all([&](replica::database& db) -> future<> {
co_await sharded_db.invoke_on_all([&] (replica::database& db) -> future<> {
auto& notifier = db.get_notifier();
// notify about keyspaces
for (const auto& name : _affected_keyspaces.names.created) {
@@ -1251,8 +1260,8 @@ static future<> execute_do_merge_schema(sharded<service::storage_proxy>& proxy,
co_await ap.post_commit();
}
static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, sharded<db::system_keyspace>& sys_ks,
utils::chunked_vector<mutation> mutations, bool reload) {
static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, sharded<db::system_keyspace>& sys_ks, utils::chunked_vector<mutation> mutations, bool reload)
{
slogger.trace("do_merge_schema: {}", mutations);
schema_applier ap(proxy, ss, sys_ks, reload);
co_await execute_do_merge_schema(proxy, ap, std::move(mutations)).finally([&ap]() {
@@ -1269,22 +1278,22 @@ static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded<
* @throws ConfigurationException If one of metadata attributes has invalid value
* @throws IOException If data was corrupted during transportation or failed to apply fs operations
*/
future<> merge_schema(sharded<db::system_keyspace>& sys_ks, sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss,
utils::chunked_vector<mutation> mutations, bool reload) {
future<> merge_schema(sharded<db::system_keyspace>& sys_ks, sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, utils::chunked_vector<mutation> mutations, bool reload)
{
if (this_shard_id() != 0) {
// mutations must be applied on the owning shard (0).
co_await smp::submit_to(0, coroutine::lambda([&, fmuts = freeze(mutations)]() mutable -> future<> {
co_await smp::submit_to(0, coroutine::lambda([&, fmuts = freeze(mutations)] () mutable -> future<> {
co_await merge_schema(sys_ks, proxy, ss, co_await unfreeze_gently(fmuts), reload);
}));
co_return;
}
co_await with_merge_lock([&]() mutable -> future<> {
co_await with_merge_lock([&] () mutable -> future<> {
co_await do_merge_schema(proxy, ss, sys_ks, std::move(mutations), reload);
auto version = co_await get_group0_schema_version(sys_ks.local());
co_await update_schema_version_and_announce(sys_ks, proxy, version);
});
}
} // namespace schema_tables
}
} // namespace db
}

View File

@@ -29,8 +29,8 @@ static logging::logger blogger("boot_strapper");
namespace dht {
future<> boot_strapper::bootstrap(
streaming::stream_reason reason, gms::gossiper& gossiper, service::frozen_topology_guard topo_guard, locator::host_id replace_address) {
future<> boot_strapper::bootstrap(streaming::stream_reason reason, gms::gossiper& gossiper, service::frozen_topology_guard topo_guard,
locator::host_id replace_address) {
blogger.debug("Beginning bootstrap process: sorted_tokens={}", get_token_metadata().sorted_tokens());
sstring description;
if (reason == streaming::stream_reason::bootstrap) {
@@ -41,8 +41,7 @@ future<> boot_strapper::bootstrap(
throw std::runtime_error("Wrong stream_reason provided: it can only be replace or bootstrap");
}
try {
auto streamer = make_lw_shared<range_streamer>(
_db, _stream_manager, _token_metadata_ptr, _abort_source, _tokens, _address, _dr, description, reason, topo_guard);
auto streamer = make_lw_shared<range_streamer>(_db, _stream_manager, _token_metadata_ptr, _abort_source, _tokens, _address, _dr, description, reason, topo_guard);
auto nodes_to_filter = gossiper.get_unreachable_members();
if (reason == streaming::stream_reason::replace) {
nodes_to_filter.insert(std::move(replace_address));
@@ -72,8 +71,7 @@ std::unordered_set<token> boot_strapper::get_random_bootstrap_tokens(const token
}
if (num_tokens == 1) {
blogger.warn(
"Picking random token for a single vnode. You should probably add more vnodes; failing that, you should probably specify the token manually");
blogger.warn("Picking random token for a single vnode. You should probably add more vnodes; failing that, you should probably specify the token manually");
}
auto tokens = get_random_tokens(std::move(tmptr), num_tokens);
@@ -88,8 +86,7 @@ std::unordered_set<token> boot_strapper::get_bootstrap_tokens(token_metadata_ptr
return get_bootstrap_tokens(std::move(tmptr), cfg.initial_token(), cfg.num_tokens(), check);
}
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(
const token_metadata_ptr tmptr, sstring tokens_string, uint32_t num_tokens, check_token_endpoint check) {
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(const token_metadata_ptr tmptr, sstring tokens_string, uint32_t num_tokens, check_token_endpoint check) {
std::unordered_set<sstring> initial_tokens;
try {
boost::split(initial_tokens, tokens_string, boost::is_any_of(sstring(", ")));
@@ -105,8 +102,7 @@ std::unordered_set<token> boot_strapper::get_bootstrap_tokens(
for (auto& token_string : initial_tokens) {
auto token = dht::token::from_sstring(token_string);
if (check && tmptr->get_endpoint(token)) {
throw std::runtime_error(
format("Bootstrapping to existing token {} is not allowed (decommission/removenode the old node first).", token_string));
throw std::runtime_error(format("Bootstrapping to existing token {} is not allowed (decommission/removenode the old node first).", token_string));
}
tokens.insert(token);
}

View File

@@ -26,9 +26,10 @@ static logging::logger logger("range_streamer");
using inet_address = gms::inet_address;
std::unordered_map<locator::host_id, dht::token_range_vector> range_streamer::get_range_fetch_map(
const std::unordered_map<dht::token_range, std::vector<locator::host_id>>& ranges_with_sources,
const std::unordered_set<std::unique_ptr<i_source_filter>>& source_filters, const sstring& keyspace) {
std::unordered_map<locator::host_id, dht::token_range_vector>
range_streamer::get_range_fetch_map(const std::unordered_map<dht::token_range, std::vector<locator::host_id>>& ranges_with_sources,
const std::unordered_set<std::unique_ptr<i_source_filter>>& source_filters,
const sstring& keyspace) {
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map_map;
const auto& topo = _token_metadata_ptr->get_topology();
for (const auto& x : ranges_with_sources) {
@@ -78,8 +79,8 @@ std::unordered_map<locator::host_id, dht::token_range_vector> range_streamer::ge
}
// Must be called from a seastar thread
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_streamer::get_all_ranges_with_sources_for(
const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges) {
std::unordered_map<dht::token_range, std::vector<locator::host_id>>
range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges) {
logger.debug("{} ks={}", __func__, keyspace_name);
auto range_addresses = erm->get_range_host_ids().get();
@@ -113,24 +114,24 @@ std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_stream
}
// Must be called from a seastar thread
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_streamer::get_all_ranges_with_strict_sources_for(
const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges, gms::gossiper& gossiper) {
std::unordered_map<dht::token_range, std::vector<locator::host_id>>
range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges, gms::gossiper& gossiper) {
logger.debug("{} ks={}", __func__, keyspace_name);
SCYLLA_ASSERT(_tokens.empty() == false);
SCYLLA_ASSERT (_tokens.empty() == false);
auto& strat = erm->get_replication_strategy();
// Active ranges
//Active ranges
auto metadata_clone = get_token_metadata().clone_only_token_map().get();
auto range_addresses = strat.get_range_host_ids(metadata_clone).get();
// Pending ranges
//Pending ranges
metadata_clone.update_topology(_address, _dr);
metadata_clone.update_normal_tokens(_tokens, _address).get();
auto pending_range_addresses = strat.get_range_host_ids(metadata_clone).get();
auto pending_range_addresses = strat.get_range_host_ids(metadata_clone).get();
metadata_clone.clear_gently().get();
// Collects the source that will have its range moved to the new node
//Collects the source that will have its range moved to the new node
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_sources;
logger.debug("keyspace={}, desired_ranges.size={}, range_addresses.size={}", keyspace_name, desired_ranges.size(), range_addresses.size());
@@ -149,12 +150,11 @@ std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_stream
}
std::unordered_set<locator::host_id> new_endpoints(it->second.begin(), it->second.end());
// Due to CASSANDRA-5953 we can have a higher RF then we have endpoints.
// So we need to be careful to only be strict when endpoints == RF
//Due to CASSANDRA-5953 we can have a higher RF then we have endpoints.
//So we need to be careful to only be strict when endpoints == RF
if (old_endpoints.size() == erm->get_replication_factor()) {
std::erase_if(old_endpoints, [&new_endpoints](locator::host_id ep) {
return new_endpoints.contains(ep);
});
std::erase_if(old_endpoints,
[&new_endpoints] (locator::host_id ep) { return new_endpoints.contains(ep); });
if (old_endpoints.size() != 1) {
throw std::runtime_error(format("Expected 1 endpoint but found {:d}", old_endpoints.size()));
}
@@ -163,7 +163,7 @@ std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_stream
}
}
// Validate
//Validate
auto it = range_sources.find(desired_range);
if (it == range_sources.end()) {
throw std::runtime_error(format("No sources found for {}", desired_range));
@@ -176,9 +176,7 @@ std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_stream
locator::host_id source_id = it->second.front();
if (gossiper.is_enabled() && !gossiper.is_alive(source_id)) {
throw std::runtime_error(format("A node required to move the data consistently is down ({}). If you wish to move the data from a potentially "
"inconsistent replica, restart the node with consistent_rangemovement=false",
source_id));
throw std::runtime_error(format("A node required to move the data consistently is down ({}). If you wish to move the data from a potentially inconsistent replica, restart the node with consistent_rangemovement=false", source_id));
}
}
@@ -190,8 +188,12 @@ bool range_streamer::use_strict_sources_for_ranges(const sstring& keyspace_name,
auto nr_nodes_in_ring = get_token_metadata().get_normal_token_owners().size();
bool everywhere_topology = erm.get_replication_strategy().get_type() == locator::replication_strategy_type::everywhere_topology;
// Use strict when number of nodes in the ring is equal or more than RF
auto strict = _db.local().get_config().consistent_rangemovement() && !_tokens.empty() && !everywhere_topology && nr_nodes_in_ring >= rf;
logger.debug("use_strict_sources_for_ranges: ks={}, nr_nodes_in_ring={}, rf={}, strict={}", keyspace_name, nr_nodes_in_ring, rf, strict);
auto strict = _db.local().get_config().consistent_rangemovement()
&& !_tokens.empty()
&& !everywhere_topology
&& nr_nodes_in_ring >= rf;
logger.debug("use_strict_sources_for_ranges: ks={}, nr_nodes_in_ring={}, rf={}, strict={}",
keyspace_name, nr_nodes_in_ring, rf, strict);
return strict;
}
@@ -212,36 +214,34 @@ void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_
}
// TODO: This is the legacy range_streamer interface, it is add_rx_ranges which adds rx ranges.
future<> range_streamer::add_ranges(const sstring& keyspace_name, locator::static_effective_replication_map_ptr erm, dht::token_range_vector ranges,
gms::gossiper& gossiper, bool is_replacing) {
return seastar::async([this, keyspace_name, ermp = std::move(erm), ranges = std::move(ranges), &gossiper, is_replacing]() mutable {
if (_nr_tx_added) {
throw std::runtime_error("Mixed sending and receiving is not supported");
}
_nr_rx_added++;
auto erm = ermp->maybe_as_vnode_effective_replication_map();
SCYLLA_ASSERT(erm != nullptr);
auto ranges_for_keyspace = !is_replacing && use_strict_sources_for_ranges(keyspace_name, *erm)
? get_all_ranges_with_strict_sources_for(keyspace_name, erm, std::move(ranges), gossiper)
: get_all_ranges_with_sources_for(keyspace_name, erm, std::move(ranges));
future<> range_streamer::add_ranges(const sstring& keyspace_name, locator::static_effective_replication_map_ptr erm, dht::token_range_vector ranges, gms::gossiper& gossiper, bool is_replacing) {
return seastar::async([this, keyspace_name, ermp = std::move(erm), ranges= std::move(ranges), &gossiper, is_replacing] () mutable {
if (_nr_tx_added) {
throw std::runtime_error("Mixed sending and receiving is not supported");
}
_nr_rx_added++;
auto erm = ermp->maybe_as_vnode_effective_replication_map();
SCYLLA_ASSERT(erm != nullptr);
auto ranges_for_keyspace = !is_replacing && use_strict_sources_for_ranges(keyspace_name, *erm)
? get_all_ranges_with_strict_sources_for(keyspace_name, erm, std::move(ranges), gossiper)
: get_all_ranges_with_sources_for(keyspace_name, erm, std::move(ranges));
if (logger.is_enabled(logging::log_level::debug)) {
for (auto& x : ranges_for_keyspace) {
logger.debug("{} : keyspace {} range {} exists on {}", _description, keyspace_name, x.first, x.second);
}
if (logger.is_enabled(logging::log_level::debug)) {
for (auto& x : ranges_for_keyspace) {
logger.debug("{} : keyspace {} range {} exists on {}", _description, keyspace_name, x.first, x.second);
}
}
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map =
get_range_fetch_map(ranges_for_keyspace, _source_filters, keyspace_name);
utils::clear_gently(ranges_for_keyspace).get();
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map = get_range_fetch_map(ranges_for_keyspace, _source_filters, keyspace_name);
utils::clear_gently(ranges_for_keyspace).get();
if (logger.is_enabled(logging::log_level::debug)) {
for (auto& x : range_fetch_map) {
logger.debug("{} : keyspace={}, ranges={} from source={}, range_size={}", _description, keyspace_name, x.second, x.first, x.second.size());
}
if (logger.is_enabled(logging::log_level::debug)) {
for (auto& x : range_fetch_map) {
logger.debug("{} : keyspace={}, ranges={} from source={}, range_size={}", _description, keyspace_name, x.second, x.first, x.second.size());
}
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
});
}
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
});
}
future<> range_streamer::stream_async() {
@@ -250,73 +250,73 @@ future<> range_streamer::stream_async() {
_token_metadata_ptr = nullptr;
logger.info("{} starts, nr_ranges_remaining={}", _description, _nr_ranges_remaining);
auto start = lowres_clock::now();
return do_for_each(_to_stream, [this, description = _description](auto& stream) {
return do_for_each(_to_stream, [this, description = _description] (auto& stream) {
const auto& keyspace = stream.first;
auto& ip_range_vec = stream.second;
auto ips = ip_range_vec | std::views::keys | std::ranges::to<std::list>();
// Fetch from or send to peer node in parallel
logger.info("{} with {} for keyspace={} started, nodes_to_stream={}", description, ips, keyspace, ip_range_vec.size());
return parallel_for_each(ip_range_vec, [this, description, keyspace](auto& ip_range) {
auto& source = ip_range.first;
auto& range_vec = ip_range.second;
return seastar::with_semaphore(_limiter, 1, [this, description, keyspace, source, &range_vec]() mutable {
return seastar::async([this, description, keyspace, source, &range_vec]() mutable {
// TODO: It is better to use fiber instead of thread here because
// creating a thread per peer can be some memory in a large cluster.
auto start_time = lowres_clock::now();
unsigned sp_index = 0;
unsigned nr_ranges_streamed = 0;
size_t nr_ranges_total = range_vec.size();
auto do_streaming = [&](dht::token_range_vector&& ranges_to_stream) {
auto sp = stream_plan(_stream_manager.local(), format("{}-{}-index-{:d}", description, keyspace, sp_index++), _reason, _topo_guard);
auto abort_listener = _abort_source.subscribe([&]() noexcept {
sp.abort();
});
_abort_source.check();
logger.info("{} with {} for keyspace={}, streaming [{}, {}) out of {} ranges", description, source, keyspace, nr_ranges_streamed,
nr_ranges_streamed + ranges_to_stream.size(), nr_ranges_total);
auto ranges_streamed = ranges_to_stream.size();
if (_nr_rx_added) {
sp.request_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
} else if (_nr_tx_added) {
sp.transfer_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
}
sp.execute().discard_result().get();
// Update finished percentage
nr_ranges_streamed += ranges_streamed;
_nr_ranges_remaining -= ranges_streamed;
float percentage = _nr_total_ranges == 0 ? 1 : (_nr_total_ranges - _nr_ranges_remaining) / (float)_nr_total_ranges;
_stream_manager.local().update_finished_percentage(_reason, percentage);
logger.info("Finished {} out of {} ranges for {}, finished percentage={}", _nr_total_ranges - _nr_ranges_remaining, _nr_total_ranges,
_reason, percentage);
};
dht::token_range_vector ranges_to_stream;
try {
for (auto it = range_vec.begin(); it < range_vec.end();) {
ranges_to_stream.push_back(*it);
++it;
auto fraction = _db.local().get_config().stream_plan_ranges_fraction();
size_t nr_ranges_per_stream_plan = nr_ranges_total * fraction;
if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
continue;
} else {
do_streaming(std::exchange(ranges_to_stream, {}));
it = range_vec.erase(range_vec.begin(), it);
}
}
if (ranges_to_stream.size() > 0) {
do_streaming(std::exchange(ranges_to_stream, {}));
range_vec.clear();
}
} catch (...) {
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
throw;
return parallel_for_each(ip_range_vec, [this, description, keyspace] (auto& ip_range) {
auto& source = ip_range.first;
auto& range_vec = ip_range.second;
return seastar::with_semaphore(_limiter, 1, [this, description, keyspace, source, &range_vec] () mutable {
return seastar::async([this, description, keyspace, source, &range_vec] () mutable {
// TODO: It is better to use fiber instead of thread here because
// creating a thread per peer can be some memory in a large cluster.
auto start_time = lowres_clock::now();
unsigned sp_index = 0;
unsigned nr_ranges_streamed = 0;
size_t nr_ranges_total = range_vec.size();
auto do_streaming = [&] (dht::token_range_vector&& ranges_to_stream) {
auto sp = stream_plan(_stream_manager.local(), format("{}-{}-index-{:d}", description, keyspace, sp_index++),
_reason, _topo_guard);
auto abort_listener = _abort_source.subscribe([&] () noexcept { sp.abort(); });
_abort_source.check();
logger.info("{} with {} for keyspace={}, streaming [{}, {}) out of {} ranges",
description, source, keyspace,
nr_ranges_streamed, nr_ranges_streamed + ranges_to_stream.size(), nr_ranges_total);
auto ranges_streamed = ranges_to_stream.size();
if (_nr_rx_added) {
sp.request_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
} else if (_nr_tx_added) {
sp.transfer_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
}
sp.execute().discard_result().get();
// Update finished percentage
nr_ranges_streamed += ranges_streamed;
_nr_ranges_remaining -= ranges_streamed;
float percentage = _nr_total_ranges == 0 ? 1 : (_nr_total_ranges - _nr_ranges_remaining) / (float)_nr_total_ranges;
_stream_manager.local().update_finished_percentage(_reason, percentage);
logger.info("Finished {} out of {} ranges for {}, finished percentage={}",
_nr_total_ranges - _nr_ranges_remaining, _nr_total_ranges, _reason, percentage);
};
dht::token_range_vector ranges_to_stream;
try {
for (auto it = range_vec.begin(); it < range_vec.end();) {
ranges_to_stream.push_back(*it);
++it;
auto fraction = _db.local().get_config().stream_plan_ranges_fraction();
size_t nr_ranges_per_stream_plan = nr_ranges_total * fraction;
if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
continue;
} else {
do_streaming(std::exchange(ranges_to_stream, {}));
it = range_vec.erase(range_vec.begin(), it);
}
}
if (ranges_to_stream.size() > 0) {
do_streaming(std::exchange(ranges_to_stream, {}));
range_vec.clear();
}
} catch (...) {
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
});
});
logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
throw;
}
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
});
});
});
}).finally([this, start] {
auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start).count();
@@ -344,4 +344,4 @@ size_t range_streamer::nr_ranges_to_stream() {
return nr_ranges_remaining;
}
} // namespace dht
} // dht

View File

@@ -9,6 +9,7 @@
import os
import sys
import shlex
import argparse
import psutil
from pathlib import Path
@@ -103,16 +104,41 @@ if __name__ == '__main__':
run('dd if=/dev/zero of={} bs=1M count={}'.format(swapfile, swapsize_mb), shell=True, check=True)
swapfile.chmod(0o600)
run('mkswap -f {}'.format(swapfile), shell=True, check=True)
mount_point = find_mount_point(swap_directory)
mount_unit = out(f'systemd-escape -p --suffix=mount {shlex.quote(str(mount_point))}')
# Add DefaultDependencies=no to the swap unit to avoid getting the default
# Before=swap.target dependency. We apply this to all clouds, but the
# requirement came from Azure:
#
# On Azure, the swap directory is on the Azure ephemeral disk (mounted on /mnt).
# However, cloud-init makes this mount (i.e., the mnt.mount unit) depend on
# the network (After=network-online.target). By extension, this means that
# the swap unit depends on the network. If we didn't use DefaultDependencies=no,
# then the swap unit would be part of the swap.target which other services
# assume to be a local boot target, so we would end up with dependency cycles
# such as:
#
# swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target -> network.target -> systemd-resolved.service -> tmp.mount -> swap.target
#
# By removing the automatic Before=swap.target, the swap unit is no longer
# part of swap.target, avoiding such cycles. The swap will still be
# activated via WantedBy=multi-user.target.
unit_data = '''
[Unit]
Description=swapfile
DefaultDependencies=no
After={}
Conflicts=umount.target
Before=umount.target
[Swap]
What={}
[Install]
WantedBy=multi-user.target
'''[1:-1].format(swapfile)
'''[1:-1].format(mount_unit, swapfile)
with swapunit.open('w') as f:
f.write(unit_data)
systemd_unit.reload()

File diff suppressed because it is too large Load Diff

View File

@@ -42,7 +42,14 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic
sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
const auto replication_factor = erm.get_replication_factor();
if (read_replicas.size() > replication_factor) {
if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
if (read_replicas.size() > replication_factor + 1) {
return seastar::format(
"everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
"cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
read_replicas.size(), replication_factor);
}
} else if (read_replicas.size() > replication_factor) {
return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
}
return {};

View File

@@ -33,14 +33,15 @@ size_t hash<locator::endpoint_dc_rack>::operator()(const locator::endpoint_dc_ra
return utils::tuple_hash()(std::tie(v.dc, v.rack));
}
} // namespace std
}
namespace locator {
static logging::logger logger("network_topology_strategy");
network_topology_strategy::network_topology_strategy(replication_strategy_params params, const topology* topo)
: abstract_replication_strategy(params, replication_strategy_type::network_topology) {
network_topology_strategy::network_topology_strategy(replication_strategy_params params, const topology* topo) :
abstract_replication_strategy(params,
replication_strategy_type::network_topology) {
auto opts = _config_options;
logger.debug("options={}", opts);
@@ -64,7 +65,8 @@ network_topology_strategy::network_topology_strategy(replication_strategy_params
if (boost::equals(key, "replication_factor")) {
on_internal_error(rslogger, "replication_factor should have been replaced with a DC:RF mapping by now");
} else {
throw exceptions::configuration_exception(format("'{}' is not a valid option, did you mean (lowercase) 'replication_factor'?", key));
throw exceptions::configuration_exception(format(
"'{}' is not a valid option, did you mean (lowercase) 'replication_factor'?", key));
}
}
@@ -107,8 +109,8 @@ class natural_endpoints_tracker {
, _rf_left(std::min(rf, node_count))
// If there aren't enough racks in this DC to fill the RF, we'll still use at least one node from each rack,
// and the difference is to be filled by the first encountered nodes.
, _acceptable_rack_repeats(rf - rack_count) {
}
, _acceptable_rack_repeats(rf - rack_count)
{}
/**
* Attempts to add an endpoint to the replicas for this datacenter, adding to the endpoints set if successful.
@@ -199,7 +201,8 @@ public:
, _tp(_tm.get_topology())
, _dc_rep_factor(dc_rep_factor)
, _token_owners(_tm.get_datacenter_token_owners())
, _racks(_tm.get_datacenter_racks_token_owners()) {
, _racks(_tm.get_datacenter_racks_token_owners())
{
// not aware of any cluster members
SCYLLA_ASSERT(!_token_owners.empty() && !_racks.empty());
@@ -248,14 +251,16 @@ public:
for (const auto& [dc, rf_data] : dc_rf) {
auto rf = rf_data.count();
if (rf > endpoints_in(dc)) {
throw exceptions::configuration_exception(
seastar::format("Datacenter {} doesn't have enough token-owning nodes for replication_factor={}", dc, rf));
throw exceptions::configuration_exception(seastar::format(
"Datacenter {} doesn't have enough token-owning nodes for replication_factor={}", dc, rf));
}
}
}
};
future<host_id_set> network_topology_strategy::calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const {
future<host_id_set>
network_topology_strategy::calculate_natural_endpoints(
const token& search_token, const token_metadata& tm) const {
natural_endpoints_tracker tracker(tm, _dc_rep_factor);
@@ -280,14 +285,12 @@ void network_topology_strategy::validate_options(const gms::feature_service& fs,
for (auto& c : _config_options) {
if (c.first == sstring("replication_factor")) {
on_internal_error(rslogger, fmt::format("'replication_factor' tag should be unrolled into a list of DC:RF by now."
"_config_options:{}",
_config_options));
"_config_options:{}", _config_options));
}
auto dc = dcs.find(c.first);
if (dc == dcs.end()) {
throw exceptions::configuration_exception(format("Unrecognized strategy option {{{}}} "
"passed to NetworkTopologyStrategy",
this->to_qualified_class_name(c.first)));
"passed to NetworkTopologyStrategy", this->to_qualified_class_name(c.first)));
}
auto racks = dc->second | std::views::keys | std::ranges::to<std::unordered_set<sstring>>();
auto rf = parse_replication_factor(c.second);
@@ -308,8 +311,8 @@ future<tablet_map> network_topology_strategy::allocate_tablets_for_new_table(sch
rslogger.info("Rounding up tablet count from {} to {} for table {}.{}", tablet_count, aligned_tablet_count, s->ks_name(), s->cf_name());
tablet_count = aligned_tablet_count;
}
co_return co_await reallocate_tablets(
std::move(s), std::move(tm), tablet_map(tablet_count, get_consistency() != data_dictionary::consistency_config_option::eventual));
co_return co_await reallocate_tablets(std::move(s), std::move(tm),
tablet_map(tablet_count, get_consistency() != data_dictionary::consistency_config_option::eventual));
}
future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, tablet_map tablets) const {
@@ -318,15 +321,16 @@ future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, t
co_await load.populate_with_normalized_load();
co_await load.populate(std::nullopt, s->id());
tablet_logger.debug(
"Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());
tablet_logger.debug("Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());
for (tablet_id tb : tablets.tablet_ids()) {
auto tinfo = tablets.get_tablet_info(tb);
tinfo.replicas = co_await reallocate_tablets(s, tm, load, tablets, tb);
if (tablets.has_raft_info()) {
if (!tablets.get_tablet_raft_info(tb).group_id) {
tablets.set_tablet_raft_info(tb, tablet_raft_info{.group_id = raft::group_id{utils::UUID_gen::get_time_UUID()}});
tablets.set_tablet_raft_info(tb, tablet_raft_info {
.group_id = raft::group_id{utils::UUID_gen::get_time_UUID()}
});
}
}
tablets.set_tablet(tb, std::move(tinfo));
@@ -336,8 +340,7 @@ future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, t
co_return tablets;
}
future<tablet_replica_set> network_topology_strategy::reallocate_tablets(
schema_ptr s, token_metadata_ptr tm, load_sketch& load, const tablet_map& cur_tablets, tablet_id tb) const {
future<tablet_replica_set> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, load_sketch& load, const tablet_map& cur_tablets, tablet_id tb) const {
tablet_replica_set replicas;
// Current number of replicas per dc
std::unordered_map<sstring, size_t> nodes_per_dc;
@@ -361,8 +364,8 @@ future<tablet_replica_set> network_topology_strategy::reallocate_tablets(
if (new_rf && new_rf->is_rack_based()) {
auto diff = diff_racks(old_racks_per_dc[dc], new_rf->get_rack_list());
tablet_logger.debug("reallocate_tablets {}.{} tablet_id={} dc={} old_racks={} add_racks={} del_racks={}", s->ks_name(), s->cf_name(), tb, dc,
old_racks_per_dc[dc], diff.added, diff.removed);
tablet_logger.debug("reallocate_tablets {}.{} tablet_id={} dc={} old_racks={} add_racks={} del_racks={}",
s->ks_name(), s->cf_name(), tb, dc, old_racks_per_dc[dc], diff.added, diff.removed);
if (!diff) {
continue;
@@ -392,18 +395,23 @@ future<tablet_replica_set> network_topology_strategy::reallocate_tablets(
co_return replicas;
}
tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
const tablet_replica_set& cur_replicas, const sstring& dc, const rack_list& racks_to_drop) const {
tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s,
token_metadata_ptr tm,
load_sketch& load,
tablet_id tb,
const tablet_replica_set& cur_replicas,
const sstring& dc,
const rack_list& racks_to_drop) const {
auto& topo = tm->get_topology();
tablet_replica_set filtered;
auto is_rack_to_drop = [&racks_to_drop](const sstring& rack) {
auto is_rack_to_drop = [&racks_to_drop] (const sstring& rack) {
return std::ranges::contains(racks_to_drop, rack);
};
for (const auto& tr : cur_replicas) {
auto& node = topo.get_node(tr.host);
if (node.dc_rack().dc == dc && is_rack_to_drop(node.dc_rack().rack)) {
tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}", s->ks_name(), s->cf_name(), tb, node.dc_rack().dc,
node.dc_rack().rack, tr);
tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}",
s->ks_name(), s->cf_name(), tb, node.dc_rack().dc, node.dc_rack().rack, tr);
load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
} else {
filtered.emplace_back(tr);
@@ -412,17 +420,22 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s
return filtered;
}
tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
const tablet_replica_set& cur_replicas, const sstring& dc, const rack_list& racks_to_add) const {
tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
token_metadata_ptr tm,
load_sketch& load,
tablet_id tb,
const tablet_replica_set& cur_replicas,
const sstring& dc,
const rack_list& racks_to_add) const {
auto nodes = tm->get_datacenter_racks_token_owners_nodes();
auto& dc_nodes = nodes.at(dc);
auto new_replicas = cur_replicas;
for (auto&& rack : racks_to_add) {
for (auto&& rack: racks_to_add) {
host_id min_node;
double min_load = std::numeric_limits<double>::max();
for (auto&& node : dc_nodes.at(rack)) {
for (auto&& node: dc_nodes.at(rack)) {
if (!node.get().is_normal()) {
continue;
}
@@ -437,26 +450,29 @@ tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
}
if (!min_node) {
throw std::runtime_error(fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
throw std::runtime_error(
fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
}
auto new_replica = tablet_replica{min_node, load.next_shard(min_node, 1, service::default_target_tablet_size)};
new_replicas.push_back(new_replica);
tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}", s->ks_name(), s->cf_name(), tb.id, dc, rack, min_load,
new_replica);
tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}",
s->ks_name(), s->cf_name(), tb.id, dc, rack, min_load, new_replica);
}
return new_replicas;
}
future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
std::map<sstring, std::unordered_set<locator::host_id>>& replicas_per_rack, const tablet_replica_set& cur_replicas, sstring dc, size_t dc_node_count,
size_t dc_rf) const {
std::map<sstring, std::unordered_set<locator::host_id>>& replicas_per_rack,
const tablet_replica_set& cur_replicas,
sstring dc, size_t dc_node_count, size_t dc_rf) const {
static thread_local std::default_random_engine rnd_engine{std::random_device{}()};
auto replicas = cur_replicas;
// all_dc_racks is ordered lexicographically on purpose
auto all_dc_racks = tm->get_datacenter_racks_token_owners_nodes().at(dc) | std::ranges::to<std::map>();
auto all_dc_racks = tm->get_datacenter_racks_token_owners_nodes().at(dc)
| std::ranges::to<std::map>();
// Track all nodes with no replicas on them for this tablet, per rack.
struct node_load {
@@ -465,7 +481,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
};
// for sorting in descending load order
// (in terms of load)
auto node_load_cmp = [](const node_load& a, const node_load& b) {
auto node_load_cmp = [] (const node_load& a, const node_load& b) {
return a.load > b.load;
};
@@ -517,7 +533,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
// ensure fairness across racks (in particular if rf < number_of_racks)
// by rotating the racks order
auto append_candidate_racks = [&](candidates_list& racks) {
auto append_candidate_racks = [&] (candidates_list& racks) {
if (auto size = racks.size()) {
auto it = racks.begin() + tb.id % size;
std::move(it, racks.end(), std::back_inserter(candidate_racks));
@@ -529,19 +545,20 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
append_candidate_racks(existing_racks);
if (candidate_racks.empty()) {
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{}: no candidate racks found for dc={} allocated={} rf={}: existing={}",
s->ks_name(), s->cf_name(), dc, dc_node_count, dc_rf, replicas_per_rack));
on_internal_error(tablet_logger,
seastar::format("allocate_replica {}.{}: no candidate racks found for dc={} allocated={} rf={}: existing={}",
s->ks_name(), s->cf_name(), dc, dc_node_count, dc_rf, replicas_per_rack));
}
auto candidate_rack = candidate_racks.begin();
auto allocate_replica = [&](candidates_list::iterator& candidate) {
auto allocate_replica = [&] (candidates_list::iterator& candidate) {
const auto& rack = candidate->rack;
auto& nodes = candidate->nodes;
if (nodes.empty()) {
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{} tablet_id={}: candidates vector for rack={} is empty for allocating "
"tablet replicas in dc={} allocated={} rf={}",
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
on_internal_error(tablet_logger,
seastar::format("allocate_replica {}.{} tablet_id={}: candidates vector for rack={} is empty for allocating tablet replicas in dc={} allocated={} rf={}",
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
}
auto host_id = nodes.back().host;
auto replica = tablet_replica{host_id, load.next_shard(host_id, 1, service::default_target_tablet_size)};
@@ -549,13 +566,13 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
auto inserted = replicas_per_rack[node.dc_rack().rack].insert(host_id).second;
// Sanity check that a node is not used more than once
if (!inserted) {
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{} tablet_id={}: allocated replica={} node already used when allocating "
"tablet replicas in dc={} allocated={} rf={}: replicas={}",
s->ks_name(), s->cf_name(), tb.id, replica, dc, dc_node_count, dc_rf, replicas));
on_internal_error(tablet_logger,
seastar::format("allocate_replica {}.{} tablet_id={}: allocated replica={} node already used when allocating tablet replicas in dc={} allocated={} rf={}: replicas={}",
s->ks_name(), s->cf_name(), tb.id, replica, dc, dc_node_count, dc_rf, replicas));
}
nodes.pop_back();
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: allocated tablet replica={} dc={} rack={}: nodes remaining in rack={}", s->ks_name(),
s->cf_name(), tb.id, replica, node.dc_rack().dc, node.dc_rack().rack, nodes.size());
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: allocated tablet replica={} dc={} rack={}: nodes remaining in rack={}",
s->ks_name(), s->cf_name(), tb.id, replica, node.dc_rack().dc, node.dc_rack().rack, nodes.size());
if (nodes.empty()) {
candidate = candidate_racks.erase(candidate);
} else {
@@ -566,8 +583,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
}
if (tablet_logger.is_enabled(log_level::trace)) {
if (candidate != candidate_racks.end()) {
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: next rack={} nodes={}", s->ks_name(), s->cf_name(), tb.id, candidate->rack,
candidate->nodes.size());
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: next rack={} nodes={}", s->ks_name(), s->cf_name(), tb.id, candidate->rack, candidate->nodes.size());
} else {
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: no candidate racks left", s->ks_name(), s->cf_name(), tb.id);
}
@@ -575,15 +591,15 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
return replica;
};
tablet_logger.debug("allocate_replica {}.{} tablet_id={}: allocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id, dc,
dc_node_count, dc_rf);
tablet_logger.debug("allocate_replica {}.{} tablet_id={}: allocating tablet replicas in dc={} allocated={} rf={}",
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);
for (size_t remaining = dc_rf - dc_node_count; remaining; --remaining) {
co_await coroutine::maybe_yield();
if (candidate_rack == candidate_racks.end()) {
on_internal_error(tablet_logger, format("allocate_replica {}.{} tablet_id={}: ran out of candidates for allocating tablet replicas in dc={} "
"allocated={} rf={}: remaining={}",
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf, remaining));
on_internal_error(tablet_logger,
format("allocate_replica {}.{} tablet_id={}: ran out of candidates for allocating tablet replicas in dc={} allocated={} rf={}: remaining={}",
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf, remaining));
}
replicas.emplace_back(allocate_replica(candidate_rack));
}
@@ -592,9 +608,9 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
}
tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, const locator::topology& topo, load_sketch& load, tablet_id tb,
const tablet_replica_set& cur_replicas, sstring dc, size_t dc_node_count, size_t dc_rf) const {
tablet_logger.debug("drop_tablets_in_dc {}.{} tablet_id={}: deallocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id,
dc, dc_node_count, dc_rf);
const tablet_replica_set& cur_replicas,
sstring dc, size_t dc_node_count, size_t dc_rf) const {
tablet_logger.debug("drop_tablets_in_dc {}.{} tablet_id={}: deallocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);
// Leave dc_rf replicas in dc, effectively deallocating in reverse order,
// to maintain replica pairing between the base table and its materialized views.
@@ -613,7 +629,8 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, c
return filtered;
}
sstring network_topology_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
sstring network_topology_strategy::sanity_check_read_replicas(const effective_replication_map& erm,
const host_id_vector_replica_set& read_replicas) const {
const auto& topology = erm.get_topology();
struct rf_node_count {
@@ -646,4 +663,4 @@ sstring network_topology_strategy::sanity_check_read_replicas(const effective_re
using registry = class_registrator<abstract_replication_strategy, network_topology_strategy, replication_strategy_params, const topology*>;
static registry registrator("org.apache.cassandra.locator.NetworkTopologyStrategy");
static registry registrator_short_name("NetworkTopologyStrategy");
} // namespace locator
}

File diff suppressed because it is too large Load Diff

View File

@@ -26,16 +26,12 @@
struct node_printer {
const locator::node* v;
node_printer(const locator::node* n) noexcept
: v(n) {
}
node_printer(const locator::node* n) noexcept : v(n) {}
};
template <>
struct fmt::formatter<node_printer> {
constexpr auto parse(format_parse_context& ctx) {
return ctx.begin();
}
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
auto format(const node_printer& np, fmt::format_context& ctx) const {
const locator::node* node = np.v;
auto out = fmt::format_to(ctx.out(), "node={}", fmt::ptr(node));
@@ -47,9 +43,7 @@ struct fmt::formatter<node_printer> {
};
static auto lazy_backtrace() {
return seastar::value_of([] {
return current_backtrace();
});
return seastar::value_of([] { return current_backtrace(); });
}
namespace locator {
@@ -57,12 +51,11 @@ namespace locator {
static logging::logger tlogger("topology");
thread_local const endpoint_dc_rack endpoint_dc_rack::default_location = {
.dc = locator::production_snitch_base::default_dc,
.rack = locator::production_snitch_base::default_rack,
.dc = locator::production_snitch_base::default_dc,
.rack = locator::production_snitch_base::default_rack,
};
node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded,
this_node is_this_node, node::idx_type idx, bool draining)
node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded, this_node is_this_node, node::idx_type idx, bool draining)
: _topology(topology)
, _host_id(id)
, _dc_rack(std::move(dc_rack))
@@ -71,11 +64,10 @@ node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_r
, _excluded(excluded)
, _draining(draining)
, _is_this_node(is_this_node)
, _idx(idx) {
}
, _idx(idx)
{}
node_holder node::make(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded,
node::this_node is_this_node, node::idx_type idx, bool draining) {
node_holder node::make(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded, node::this_node is_this_node, node::idx_type idx, bool draining) {
return std::make_unique<node>(topology, std::move(id), std::move(dc_rack), std::move(state), shard_count, excluded, is_this_node, idx, draining);
}
@@ -85,22 +77,14 @@ node_holder node::clone() const {
std::string node::to_string(node::state s) {
switch (s) {
case state::none:
return "none";
case state::bootstrapping:
return "bootstrapping";
case state::replacing:
return "replacing";
case state::normal:
return "normal";
case state::being_decommissioned:
return "being_decommissioned";
case state::being_removed:
return "being_removed";
case state::being_replaced:
return "being_replaced";
case state::left:
return "left";
case state::none: return "none";
case state::bootstrapping: return "bootstrapping";
case state::replacing: return "replacing";
case state::normal: return "normal";
case state::being_decommissioned: return "being_decommissioned";
case state::being_removed: return "being_removed";
case state::being_replaced: return "being_replaced";
case state::left: return "left";
}
__builtin_unreachable();
}
@@ -117,19 +101,21 @@ future<> topology::clear_gently() noexcept {
}
topology::topology(shallow_copy, config cfg)
: _shard(this_shard_id())
, _cfg(cfg)
, _sort_by_proximity(true) {
: _shard(this_shard_id())
, _cfg(cfg)
, _sort_by_proximity(true)
{
// constructor for shallow copying of token_metadata_impl
}
topology::topology(config cfg)
: _shard(this_shard_id())
, _cfg(cfg)
, _sort_by_proximity(!cfg.disable_proximity_sorting)
, _random_engine(std::random_device{}()) {
tlogger.trace("topology[{}]: constructing using config: endpoint={} id={} dc={} rack={}", fmt::ptr(this), cfg.this_endpoint, cfg.this_host_id,
cfg.local_dc_rack.dc, cfg.local_dc_rack.rack);
: _shard(this_shard_id())
, _cfg(cfg)
, _sort_by_proximity(!cfg.disable_proximity_sorting)
, _random_engine(std::random_device{}())
{
tlogger.trace("topology[{}]: constructing using config: endpoint={} id={} dc={} rack={}", fmt::ptr(this),
cfg.this_endpoint, cfg.this_host_id, cfg.local_dc_rack.dc, cfg.local_dc_rack.rack);
add_node(cfg.this_host_id, cfg.local_dc_rack, node::state::none);
}
@@ -145,7 +131,8 @@ topology::topology(topology&& o) noexcept
, _dc_racks(std::move(o._dc_racks))
, _sort_by_proximity(o._sort_by_proximity)
, _datacenters(std::move(o._datacenters))
, _random_engine(std::move(o._random_engine)) {
, _random_engine(std::move(o._random_engine))
{
SCYLLA_ASSERT(_shard == this_shard_id());
tlogger.trace("topology[{}]: move from [{}]", fmt::ptr(this), fmt::ptr(&o));
@@ -166,18 +153,16 @@ topology& topology::operator=(topology&& o) noexcept {
void topology::set_host_id_cfg(host_id this_host_id) {
if (_cfg.this_host_id) {
on_internal_error(tlogger,
fmt::format("topology[{}] set_host_id_cfg can be caller only once current id {} new id {}", fmt::ptr(this), _cfg.this_host_id, this_host_id));
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg can be caller only once current id {} new id {}", fmt::ptr(this), _cfg.this_host_id, this_host_id));
}
if (_nodes.size() != 1) {
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while nodes size is greater than 1", fmt::ptr(this)));
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while nodes size is greater than 1", fmt::ptr(this)));
}
if (!_this_node) {
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes is null", fmt::ptr(this)));
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes is null", fmt::ptr(this)));
}
if (_this_node->host_id()) {
on_internal_error(
tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes has non null id {}", fmt::ptr(this), _this_node->host_id()));
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes has non null id {}", fmt::ptr(this), _this_node->host_id()));
}
remove_node(*_this_node);
@@ -218,8 +203,7 @@ const node& topology::add_node(node_holder nptr) {
if (nptr->topology() != this) {
if (nptr->topology()) {
on_fatal_internal_error(tlogger,
seastar::format("topology[{}]: {} belongs to different topology={}", fmt::ptr(this), node_printer(node), fmt::ptr(node->topology())));
on_fatal_internal_error(tlogger, seastar::format("topology[{}]: {} belongs to different topology={}", fmt::ptr(this), node_printer(node), fmt::ptr(node->topology())));
}
nptr->set_topology(this);
}
@@ -235,8 +219,7 @@ const node& topology::add_node(node_holder nptr) {
try {
if (is_configured_this_node(*node)) {
if (_this_node) {
on_internal_error(tlogger,
seastar::format("topology[{}]: {}: local node already mapped to {}", fmt::ptr(this), node_printer(node), node_printer(this_node())));
on_internal_error(tlogger, seastar::format("topology[{}]: {}: local node already mapped to {}", fmt::ptr(this), node_printer(node), node_printer(this_node())));
}
locator::node& n = *_nodes.back();
n._is_this_node = node::this_node::yes;
@@ -255,25 +238,14 @@ const node& topology::add_node(node_holder nptr) {
return *node;
}
void topology::update_node(node& node, std::optional<host_id> opt_id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st,
std::optional<shard_id> opt_shard_count) {
void topology::update_node(node& node, std::optional<host_id> opt_id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> opt_shard_count) {
tlogger.debug("topology[{}]: update_node: {}: to: host_id={} dc={} rack={} state={} shard_count={}, at {}", fmt::ptr(this), node_printer(&node),
seastar::value_of([&] {
return opt_id ? format("{}", *opt_id) : "unchanged";
}),
seastar::value_of([&] {
return opt_dr ? format("{}", opt_dr->dc) : "unchanged";
}),
seastar::value_of([&] {
return opt_dr ? format("{}", opt_dr->rack) : "unchanged";
}),
seastar::value_of([&] {
return opt_st ? format("{}", *opt_st) : "unchanged";
}),
seastar::value_of([&] {
return opt_shard_count ? format("{}", *opt_shard_count) : "unchanged";
}),
lazy_backtrace());
opt_id ? format("{}", *opt_id) : "unchanged",
opt_dr ? format("{}", opt_dr->dc) : "unchanged",
opt_dr ? format("{}", opt_dr->rack) : "unchanged",
opt_st ? format("{}", *opt_st) : "unchanged",
opt_shard_count ? format("{}", *opt_shard_count) : "unchanged",
lazy_backtrace());
bool changed = false;
if (opt_id) {
@@ -285,8 +257,7 @@ void topology::update_node(node& node, std::optional<host_id> opt_id, std::optio
on_internal_error(tlogger, seastar::format("This node host_id is already set: {}: new host_id={}", node_printer(&node), *opt_id));
}
if (_nodes_by_host_id.contains(*opt_id)) {
on_internal_error(tlogger, seastar::format("Cannot update node host_id: {}: new host_id already exists: {}", node_printer(&node),
node_printer(find_node(*opt_id))));
on_internal_error(tlogger, seastar::format("Cannot update node host_id: {}: new host_id already exists: {}", node_printer(&node), node_printer(find_node(*opt_id))));
}
changed = true;
} else {
@@ -471,11 +442,11 @@ const node* topology::find_node(node::idx_type idx) const noexcept {
return _nodes.at(idx).get();
}
const node& topology::add_or_update_endpoint(
host_id id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> shard_count) {
tlogger.trace("topology[{}]: add_or_update_endpoint: host_id={} dc={} rack={} state={} shards={}, at {}", fmt::ptr(this), id,
opt_dr.value_or(endpoint_dc_rack{}).dc, opt_dr.value_or(endpoint_dc_rack{}).rack, opt_st.value_or(node::state::none), shard_count,
lazy_backtrace());
const node& topology::add_or_update_endpoint(host_id id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> shard_count)
{
tlogger.trace("topology[{}]: add_or_update_endpoint: host_id={} dc={} rack={} state={} shards={}, at {}", fmt::ptr(this),
id, opt_dr.value_or(endpoint_dc_rack{}).dc, opt_dr.value_or(endpoint_dc_rack{}).rack, opt_st.value_or(node::state::none), shard_count,
lazy_backtrace());
auto* n = find_node(id);
if (n) {
@@ -483,10 +454,14 @@ const node& topology::add_or_update_endpoint(
return *n;
}
return add_node(id, opt_dr.value_or(endpoint_dc_rack::default_location), opt_st.value_or(node::state::none), shard_count.value_or(0));
return add_node(id,
opt_dr.value_or(endpoint_dc_rack::default_location),
opt_st.value_or(node::state::none),
shard_count.value_or(0));
}
bool topology::remove_endpoint(locator::host_id host_id) {
bool topology::remove_endpoint(locator::host_id host_id)
{
auto node = find_node(host_id);
tlogger.debug("topology[{}]: remove_endpoint: host_id={}: {}", fmt::ptr(this), host_id, node_printer(node));
// Do not allow removing yourself from the topology
@@ -527,7 +502,7 @@ void topology::do_sort_by_proximity(locator::host_id address, host_id_vector_rep
locator::host_id id;
int distance;
};
auto host_infos = addresses | std::views::transform([&](locator::host_id id) {
auto host_infos = addresses | std::views::transform([&] (locator::host_id id) {
const auto& loc1 = get_location(id);
return info{id, distance(address, loc, id, loc1)};
}) | std::ranges::to<utils::small_vector<info, host_id_vector_replica_set::internal_capacity()>>();
@@ -589,12 +564,11 @@ std::unordered_set<locator::host_id> topology::get_all_host_ids() const {
return ids;
}
std::unordered_map<sstring, std::unordered_set<host_id>> topology::get_datacenter_host_ids() const {
std::unordered_map<sstring, std::unordered_set<host_id>>
topology::get_datacenter_host_ids() const {
std::unordered_map<sstring, std::unordered_set<host_id>> ret;
for (auto& [dc, nodes] : _dc_nodes) {
ret[dc] = nodes | std::views::transform([](const node& n) {
return n.host_id();
}) | std::ranges::to<std::unordered_set>();
ret[dc] = nodes | std::views::transform([] (const node& n) { return n.host_id(); }) | std::ranges::to<std::unordered_set>();
}
return ret;
}

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e59fe56eac435fd03c2f0d7dfc11c6998d7c0750e1851535575497dd13d96015
size 6505524
oid sha256:54662978b9ce4a6e25790b1b0a5099e6063173ffa95a399a6287cf474376ed09
size 6595952

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:34a0955d2c5a88e18ddab0f1df085e10a17e14129c3e21de91e4f27ef949b6c4
size 6502668
oid sha256:0cf44ea1fb2ae20de45d26fe8095054e60cb8700cddcb2fd79ef79705484b18a
size 6603780

View File

@@ -1098,7 +1098,8 @@ std::optional<std::pair<read_id, index_t>> fsm::start_read_barrier(server_id req
// Make sure that only a leader or a node that is part of the config can request read barrier
// Nodes outside of the config may never get the data, so they will not be able to read it.
if (requester != _my_id && leader_state().tracker.find(requester) == nullptr) {
follower_progress* opt_progress = leader_state().tracker.find(requester);
if (requester != _my_id && opt_progress == nullptr) {
throw std::runtime_error(fmt::format("Read barrier requested by a node outside of the configuration {}", requester));
}
@@ -1109,19 +1110,23 @@ std::optional<std::pair<read_id, index_t>> fsm::start_read_barrier(server_id req
return {};
}
// Optimization for read barriers requested on non-voters. A non-voter doesn't receive the read_quorum message, so
// it might update its commit index only after another leader tick, which would slow down wait_for_apply() at the
// end of the read barrier. Prevent that by replicating to the non-voting requester here.
if (requester != _my_id && opt_progress->commit_idx < _commit_idx && opt_progress->match_idx == _log.last_idx()
&& !opt_progress->can_vote) {
logger.trace("start_read_barrier[{}]: replicate to {} because follower commit_idx={} < commit_idx={}, "
"follower match_idx={} == last_idx={}, and follower can_vote={}",
_my_id, requester, opt_progress->commit_idx, _commit_idx, opt_progress->match_idx,
_log.last_idx(), opt_progress->can_vote);
replicate_to(*opt_progress, true);
}
read_id id = next_read_id();
logger.trace("start_read_barrier[{}] starting read barrier with id {}", _my_id, id);
return std::make_pair(id, _commit_idx);
}
void fsm::maybe_update_commit_idx_for_read(index_t read_idx) {
// read_idx from the leader might not be replicated to the local node yet.
const bool in_local_log = read_idx <= _log.last_idx();
if (in_local_log && log_term_for(read_idx) == get_current_term()) {
advance_commit_idx(read_idx);
}
}
void fsm::stop() {
if (is_leader()) {
// Become follower to stop accepting requests

View File

@@ -480,15 +480,6 @@ public:
std::optional<std::pair<read_id, index_t>> start_read_barrier(server_id requester);
// Update the commit index to the read index (a read barrier result from the leader) if the local entry with the
// read index belongs to the current term.
//
// Satisfying the condition above guarantees that the local log matches the current leader's log up to the read
// index (the Log Matching Property), so the current leader won't drop the local entry with the read index.
// Moreover, this entry has been committed by the leader, so future leaders also won't drop it (the Leader
// Completeness Property). Hence, updating the commit index is safe.
void maybe_update_commit_idx_for_read(index_t read_idx);
size_t in_memory_log_size() const {
return _log.in_memory_size();
}

View File

@@ -1109,6 +1109,18 @@ future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batc
// case.
co_await _persistence->store_term_and_vote(batch.term_and_vote->first, batch.term_and_vote->second);
_stats.store_term_and_vote++;
// When the term advances, any in-flight snapshot transfers
// belong to an outdated term: the progress tracker has been
// reset in become_leader() or we are now a follower.
// Abort them before we dispatch this batch's messages, which
// may start fresh transfers for the new term.
//
// A vote may also change independently of the term (e.g. a
// follower voting for a candidate at the same term), but in
// that case there are no in-flight transfers and the abort
// is a no-op.
abort_snapshot_transfers();
}
if (batch.snp) {
@@ -1218,8 +1230,6 @@ future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batc
// quickly) stop happening (we're outside the config after all).
co_await _apply_entries.push_eventually(removed_from_config{});
}
// request aborts of snapshot transfers
abort_snapshot_transfers();
// abort all read barriers
for (auto& r : _reads) {
r.promise.set_value(not_a_leader{_fsm->current_leader()});
@@ -1561,7 +1571,6 @@ future<> server_impl::read_barrier(seastar::abort_source* as) {
co_return stop_iteration::no;
}
read_idx = std::get<index_t>(res);
_fsm->maybe_update_commit_idx_for_read(read_idx);
co_return stop_iteration::yes;
});

File diff suppressed because it is too large Load Diff

View File

@@ -3253,13 +3253,10 @@ private:
// sequentially because the rows from repair follower 1 to
// repair master might reduce the amount of missing data
// between repair master and repair follower 2.
auto working_hashes = master.working_row_hashes().get();
repair_hash_set set_diff = get_set_diff(master.peer_row_hash_sets(node_idx), working_hashes);
repair_hash_set set_diff = get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get());
// Request missing sets from peer node
if (rlogger.is_enabled(logging::log_level::debug)) {
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
node, working_hashes.size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
}
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
node, master.working_row_hashes().get().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
// If we need to pull all rows from the peer. We can avoid
// sending the row hashes on wire by setting needs_all_rows flag.
auto needs_all_rows = repair_meta::needs_all_rows_t(set_diff.size() == master.peer_row_hash_sets(node_idx).size());
@@ -3272,9 +3269,7 @@ private:
master.get_row_diff(std::move(set_diff), needs_all_rows, node, node_idx, dst_cpu_id);
ns.state = repair_state::get_row_diff_finished;
}
if (rlogger.is_enabled(logging::log_level::debug)) {
rlogger.debug("After get_row_diff node {}, hash_sets={}", master.myhostid(), master.working_row_hashes().get().size());
}
rlogger.debug("After get_row_diff node {}, hash_sets={}", master.myhostid(), master.working_row_hashes().get().size());
} catch (...) {
rlogger.warn("repair[{}]: get_row_diff: got error from node={}, keyspace={}, table={}, range={}, error={}",
_shard_task.global_repair_id.uuid(), node, _shard_task.get_keyspace(), _cf_name, _range, std::current_exception());

File diff suppressed because it is too large Load Diff

View File

@@ -87,6 +87,11 @@ target_include_directories(wasmtime_bindings
target_link_libraries(wasmtime_bindings
INTERFACE Rust::rust_combined)
if (Scylla_USE_PRECOMPILED_HEADER_USE)
# The PCH from scylla-precompiled-header is compiled with Seastar's compile
# flags, including sanitizer flags in Debug/Sanitize modes. Any target reusing
# this PCH must have matching compile options, otherwise the compiler rejects
# the PCH due to flag mismatch (e.g., -fsanitize=address).
target_link_libraries(wasmtime_bindings PRIVATE Seastar::seastar)
target_precompile_headers(wasmtime_bindings REUSE_FROM scylla-precompiled-header)
endif()
@@ -108,5 +113,6 @@ target_include_directories(inc
target_link_libraries(inc
INTERFACE Rust::rust_combined)
if (Scylla_USE_PRECOMPILED_HEADER_USE)
target_link_libraries(inc PRIVATE Seastar::seastar)
target_precompile_headers(inc REUSE_FROM scylla-precompiled-header)
endif()

View File

@@ -22,12 +22,12 @@ static logging::logger slogger("schema_registry");
static thread_local schema_registry registry;
schema_version_not_found::schema_version_not_found(table_schema_version v)
: std::runtime_error{format("Schema version {} not found", v)} {
}
: std::runtime_error{format("Schema version {} not found", v)}
{ }
schema_version_loading_failed::schema_version_loading_failed(table_schema_version v)
: std::runtime_error{format("Failed to load schema version {}", v)} {
}
: std::runtime_error{format("Failed to load schema version {}", v)}
{ }
schema_registry_entry::~schema_registry_entry() {
if (_schema) {
@@ -39,7 +39,8 @@ schema_registry_entry::schema_registry_entry(table_schema_version v, schema_regi
: _state(state::INITIAL)
, _version(v)
, _registry(r)
, _sync_state(sync_state::NOT_SYNCED) {
, _sync_state(sync_state::NOT_SYNCED)
{
_erase_timer.set_callback([this] {
slogger.debug("Dropping {}", _version);
SCYLLA_ASSERT(!_schema);
@@ -70,8 +71,8 @@ void schema_registry::attach_table(schema_registry_entry& e) noexcept {
e.set_table(table.weak_from_this());
} catch (const replica::no_such_column_family&) {
if (slogger.is_enabled(seastar::log_level::debug)) {
slogger.debug("No table for schema version {} of {}.{}: {}", e._version, e.get_schema()->ks_name(), e.get_schema()->cf_name(),
seastar::current_backtrace());
slogger.debug("No table for schema version {} of {}.{}: {}", e._version,
e.get_schema()->ks_name(), e.get_schema()->cf_name(), seastar::current_backtrace());
}
// ignore
}
@@ -220,7 +221,7 @@ future<schema_ptr> schema_registry_entry::start_loading(async_schema_loader load
_state = state::LOADING;
slogger.trace("Loading {}", _version);
// Move to background.
(void)f.then_wrapped([self = shared_from_this(), this](future<extended_frozen_schema>&& f) {
(void)f.then_wrapped([self = shared_from_this(), this] (future<extended_frozen_schema>&& f) {
_loader = {};
if (_state != state::LOADING) {
slogger.trace("Loading of {} aborted", _version);
@@ -293,8 +294,8 @@ schema_registry& local_schema_registry() {
}
global_schema_ptr::global_schema_ptr(const global_schema_ptr& o)
: global_schema_ptr(o.get()) {
}
: global_schema_ptr(o.get())
{ }
global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
auto current = this_shard_id();
@@ -331,15 +332,15 @@ schema_ptr global_schema_ptr::get() const {
}
global_schema_ptr::global_schema_ptr(const schema_ptr& ptr)
: _cpu_of_origin(this_shard_id()) {
: _cpu_of_origin(this_shard_id()) {
// _ptr must always have an associated registry entry,
// if ptr doesn't, we need to load it into the registry.
auto ensure_registry_entry = [](const schema_ptr& s) {
auto ensure_registry_entry = [] (const schema_ptr& s) {
schema_registry_entry* e = s->registry_entry();
if (e) {
return s;
} else {
return local_schema_registry().get_or_load(s->version(), [&s](table_schema_version) -> extended_frozen_schema {
return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) -> extended_frozen_schema {
return extended_frozen_schema(s);
});
}

View File

@@ -538,6 +538,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
group0_id = g0_info.group0_id;
raft::server_address my_addr{my_id, {}};
bool starting_server_as_follower = false;
if (server == nullptr) {
// This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
raft::configuration initial_configuration;
@@ -565,6 +566,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
// trigger an empty snapshot transfer.
nontrivial_snapshot = true;
} else {
starting_server_as_follower = true;
co_await handshaker->pre_server_start(g0_info);
}
@@ -591,7 +593,9 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
}
SCYLLA_ASSERT(server);
if (server->get_configuration().contains(my_id)) {
co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
utils::wait_for_message(std::chrono::minutes{5}));
if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
// True if we started a new group or completed a configuration change initiated earlier.
group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
server->get_configuration().can_vote(my_id)? "voter" : "non-voter");

View File

@@ -239,11 +239,9 @@ public:
// The i-th element corresponds to the i-th entry in _entries.
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
// that entry doesn't have a promoted index.
// It's not chunked, because promoted index is present only when there are large partitions in the page,
// which also means the page will have typically only 1 entry due to summary:data_file size ratio.
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
// which is typical in workloads with small partitions.
managed_vector<promoted_index> _promoted_indexes;
lsa::chunked_managed_vector<promoted_index> _promoted_indexes;
public:
partition_index_page() = default;
partition_index_page(partition_index_page&&) noexcept = default;

View File

@@ -65,8 +65,9 @@ struct send_info {
mutation_fragment_v1_stream reader;
noncopyable_function<void(size_t)> update;
send_info(netw::messaging_service& ms_, streaming::plan_id plan_id_, lw_shared_ptr<replica::table> tbl_, reader_permit permit_,
dht::token_range_vector ranges_, locator::host_id id_, uint32_t dst_cpu_id_, stream_reason reason_, service::frozen_topology_guard topo_guard_,
noncopyable_function<void(size_t)> update_fn)
dht::token_range_vector ranges_, locator::host_id id_,
uint32_t dst_cpu_id_, stream_reason reason_, service::frozen_topology_guard topo_guard_,
noncopyable_function<void(size_t)> update_fn)
: ms(ms_)
, plan_id(plan_id_)
, cf_id(tbl_->schema()->id())
@@ -78,13 +79,12 @@ struct send_info {
, ranges(std::move(ranges_))
, prs(dht::to_partition_ranges(ranges))
, reader(cf->make_streaming_reader(cf->schema(), std::move(permit_), prs, gc_clock::now()))
, update(std::move(update_fn)) {
, update(std::move(update_fn))
{
}
future<bool> has_relevant_range_on_this_shard() {
return do_with(false, ranges.begin(), [this](bool& found_relevant_range, dht::token_range_vector::iterator& ranges_it) {
auto stop_cond = [this, &found_relevant_range, &ranges_it] {
return ranges_it == ranges.end() || found_relevant_range;
};
return do_with(false, ranges.begin(), [this] (bool& found_relevant_range, dht::token_range_vector::iterator& ranges_it) {
auto stop_cond = [this, &found_relevant_range, &ranges_it] { return ranges_it == ranges.end() || found_relevant_range; };
return do_until(std::move(stop_cond), [this, &found_relevant_range, &ranges_it] {
dht::token_range range = *ranges_it++;
if (!found_relevant_range) {
@@ -113,112 +113,93 @@ struct send_info {
};
future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
return si->reader.has_more_fragments().then([si](bool there_is_more) {
if (!there_is_more) {
// The reader contains no data
sslog.info("[Stream #{}] Skip sending ks={}, cf={}, reader contains no data, with new rpc streaming", si->plan_id, si->cf->schema()->ks_name(),
si->cf->schema()->cf_name());
return make_ready_future<>();
}
return si->estimate_partitions().then([si](size_t estimated_partitions) {
sslog.info("[Stream #{}] Start sending ks={}, cf={}, estimated_partitions={}, with new rpc streaming", si->plan_id, si->cf->schema()->ks_name(),
si->cf->schema()->cf_name(), estimated_partitions);
return si->ms
.make_sink_and_source_for_stream_mutation_fragments(
si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->topo_guard, si->id)
.then_unpack([si](rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd> sink, rpc::source<int32_t> source) mutable {
auto got_error_from_peer = make_lw_shared<bool>(false);
auto table_is_dropped = make_lw_shared<bool>(false);
return si->reader.has_more_fragments().then([si] (bool there_is_more) {
if (!there_is_more) {
// The reader contains no data
sslog.info("[Stream #{}] Skip sending ks={}, cf={}, reader contains no data, with new rpc streaming",
si->plan_id, si->cf->schema()->ks_name(), si->cf->schema()->cf_name());
return make_ready_future<>();
}
return si->estimate_partitions().then([si] (size_t estimated_partitions) {
sslog.info("[Stream #{}] Start sending ks={}, cf={}, estimated_partitions={}, with new rpc streaming", si->plan_id, si->cf->schema()->ks_name(), si->cf->schema()->cf_name(), estimated_partitions);
return si->ms.make_sink_and_source_for_stream_mutation_fragments(si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->topo_guard, si->id).then_unpack([si] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd> sink, rpc::source<int32_t> source) mutable {
auto got_error_from_peer = make_lw_shared<bool>(false);
auto table_is_dropped = make_lw_shared<bool>(false);
auto source_op = [source, got_error_from_peer, table_is_dropped, si]() mutable -> future<> {
return repeat([source, got_error_from_peer, table_is_dropped, si]() mutable {
return source().then([source, got_error_from_peer, table_is_dropped, si](
std::optional<std::tuple<int32_t>> status_opt) mutable {
if (status_opt) {
auto status = std::get<0>(*status_opt);
if (status == -1) {
*got_error_from_peer = true;
} else if (status == -2) {
*got_error_from_peer = true;
*table_is_dropped = true;
}
sslog.debug("Got status code from peer={}, plan_id={}, cf_id={}, status={}", si->id, si->plan_id, si->cf_id, status);
// we've got an error from the other side, but we cannot just abandon rpc::source we
// need to continue reading until EOS since this will signal that no more work
// is left and rpc::source can be destroyed. The sender closes connection immediately
// after sending the status, so EOS should arrive shortly.
return stop_iteration::no;
} else {
return stop_iteration::yes;
}
});
});
}();
auto source_op = [source, got_error_from_peer, table_is_dropped, si] () mutable -> future<> {
return repeat([source, got_error_from_peer, table_is_dropped, si] () mutable {
return source().then([source, got_error_from_peer, table_is_dropped, si] (std::optional<std::tuple<int32_t>> status_opt) mutable {
if (status_opt) {
auto status = std::get<0>(*status_opt);
if (status == -1) {
*got_error_from_peer = true;
} else if (status == -2) {
*got_error_from_peer = true;
*table_is_dropped = true;
}
sslog.debug("Got status code from peer={}, plan_id={}, cf_id={}, status={}", si->id, si->plan_id, si->cf_id, status);
// we've got an error from the other side, but we cannot just abandon rpc::source we
// need to continue reading until EOS since this will signal that no more work
// is left and rpc::source can be destroyed. The sender closes connection immediately
// after sending the status, so EOS should arrive shortly.
return stop_iteration::no;
} else {
return stop_iteration::yes;
}
});
});
}();
auto sink_op = [sink, si, got_error_from_peer]() mutable -> future<> {
mutation_fragment_stream_validator validator(*(si->reader.schema()));
return do_with(std::move(sink), std::move(validator),
[si, got_error_from_peer](rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd>& sink,
mutation_fragment_stream_validator& validator) {
return repeat([&sink, &validator, si, got_error_from_peer]() mutable {
return si->reader().then(
[&sink, &validator, si, s = si->reader.schema(), got_error_from_peer](mutation_fragment_opt mf) mutable {
if (*got_error_from_peer) {
return make_exception_future<stop_iteration>(std::runtime_error("Got status error code from peer"));
}
if (mf) {
if (!validator(mf->mutation_fragment_kind())) {
return make_exception_future<stop_iteration>(std::runtime_error(
format("Stream reader mutation_fragment validator failed, previous={}, current={}",
validator.previous_mutation_fragment_kind(), mf->mutation_fragment_kind())));
}
frozen_mutation_fragment fmf = freeze(*s, *mf);
auto size = fmf.representation().size();
si->update(size);
return sink(fmf, stream_mutation_fragments_cmd::mutation_fragment_data).then([] {
return stop_iteration::no;
});
} else {
if (!validator.on_end_of_stream()) {
return make_exception_future<stop_iteration>(
std::runtime_error(format("Stream reader mutation_fragment validator failed on "
"end_of_stream, previous={}, current=end_of_stream",
validator.previous_mutation_fragment_kind())));
}
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
})
.then([&sink]() mutable {
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::end_of_stream);
})
.handle_exception([&sink](std::exception_ptr ep) mutable {
// Notify the receiver the sender has failed
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::error)
.then([ep = std::move(ep)]() mutable {
return make_exception_future<>(std::move(ep));
});
})
.finally([&sink]() mutable {
return sink.close();
});
});
}();
return when_all_succeed(std::move(source_op), std::move(sink_op)).then_unpack([got_error_from_peer, table_is_dropped, si] {
if (*got_error_from_peer) {
if (*table_is_dropped) {
sslog.info("[Stream #{}] Skipped streaming the dropped table {}.{}", si->plan_id, si->cf->schema()->ks_name(),
si->cf->schema()->cf_name());
} else {
throw std::runtime_error(
format("Peer failed to process mutation_fragment peer={}, plan_id={}, cf_id={}", si->id, si->plan_id, si->cf_id));
}
auto sink_op = [sink, si, got_error_from_peer] () mutable -> future<> {
mutation_fragment_stream_validator validator(*(si->reader.schema()));
return do_with(std::move(sink), std::move(validator), [si, got_error_from_peer] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd>& sink, mutation_fragment_stream_validator& validator) {
return repeat([&sink, &validator, si, got_error_from_peer] () mutable {
return si->reader().then([&sink, &validator, si, s = si->reader.schema(), got_error_from_peer] (mutation_fragment_opt mf) mutable {
if (*got_error_from_peer) {
return make_exception_future<stop_iteration>(std::runtime_error("Got status error code from peer"));
}
if (mf) {
if (!validator(mf->mutation_fragment_kind())) {
return make_exception_future<stop_iteration>(std::runtime_error(format("Stream reader mutation_fragment validator failed, previous={}, current={}",
validator.previous_mutation_fragment_kind(), mf->mutation_fragment_kind())));
}
});
frozen_mutation_fragment fmf = freeze(*s, *mf);
auto size = fmf.representation().size();
si->update(size);
return sink(fmf, stream_mutation_fragments_cmd::mutation_fragment_data).then([] { return stop_iteration::no; });
} else {
if (!validator.on_end_of_stream()) {
return make_exception_future<stop_iteration>(std::runtime_error(format("Stream reader mutation_fragment validator failed on end_of_stream, previous={}, current=end_of_stream",
validator.previous_mutation_fragment_kind())));
}
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
}).then([&sink] () mutable {
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::end_of_stream);
}).handle_exception([&sink] (std::exception_ptr ep) mutable {
// Notify the receiver the sender has failed
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::error).then([ep = std::move(ep)] () mutable {
return make_exception_future<>(std::move(ep));
});
}).finally([&sink] () mutable {
return sink.close();
});
});
}();
return when_all_succeed(std::move(source_op), std::move(sink_op)).then_unpack([got_error_from_peer, table_is_dropped, si] {
if (*got_error_from_peer) {
if (*table_is_dropped) {
sslog.info("[Stream #{}] Skipped streaming the dropped table {}.{}", si->plan_id, si->cf->schema()->ks_name(), si->cf->schema()->cf_name());
} else {
throw std::runtime_error(format("Peer failed to process mutation_fragment peer={}, plan_id={}, cf_id={}", si->id, si->plan_id, si->cf_id));
}
}
});
});
});
});
}
future<> stream_transfer_task::execute() {
@@ -226,55 +207,46 @@ future<> stream_transfer_task::execute() {
auto cf_id = this->cf_id;
auto id = session->peer;
auto& sm = session->manager();
auto table_dropped = co_await streaming::with_table_drop_silenced(sm.db(), sm.mm(), cf_id, [this, &sm, cf_id, plan_id, id](const table_id&) {
auto table_dropped = co_await streaming::with_table_drop_silenced(sm.db(), sm.mm(), cf_id, [this, &sm, cf_id, plan_id, id] (const table_id &) {
auto dst_cpu_id = session->dst_cpu_id;
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}", plan_id, cf_id);
sort_and_merge_ranges();
auto reason = session->get_reason();
auto topo_guard = session->topo_guard();
return sm.container()
.invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges = this->_ranges, reason, topo_guard](stream_manager& sm) mutable {
auto tbl = sm.db().find_column_family(cf_id).shared_from_this();
return sm.db()
.obtain_reader_permit(*tbl, "stream-transfer-task", db::no_timeout, {})
.then([&sm, tbl, plan_id, cf_id, id, dst_cpu_id, ranges = std::move(ranges), reason, topo_guard](reader_permit permit) mutable {
auto si = make_lw_shared<send_info>(sm.ms(), plan_id, tbl, std::move(permit), std::move(ranges), id, dst_cpu_id, reason,
topo_guard, [&sm, plan_id, id](size_t sz) {
sm.update_progress(plan_id, id, streaming::progress_info::direction::OUT, sz);
});
return si->has_relevant_range_on_this_shard()
.then([si, plan_id, cf_id](bool has_relevant_range_on_this_shard) {
if (!has_relevant_range_on_this_shard) {
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}", plan_id, cf_id,
this_shard_id());
return make_ready_future<>();
}
return send_mutation_fragments(std::move(si));
})
.finally([si] {
return si->reader.close();
});
});
})
.then([this, plan_id, cf_id, id, &sm] {
sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
return ser::streaming_rpc_verbs::send_stream_mutation_done(&sm.ms(), id, plan_id, _ranges, cf_id, session->dst_cpu_id)
.handle_exception([plan_id, id](auto ep) {
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION_DONE to {}: {}", plan_id, id, ep);
std::rethrow_exception(ep);
});
})
.then([this, id, plan_id] {
_mutation_done_sent = true;
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id);
})
.handle_exception([plan_id, id, &sm](std::exception_ptr ep) {
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
utils::get_local_injector().inject("stream_mutation_fragments_table_dropped", [&sm]() {
sm.db().find_column_family(table_id::create_null_id());
});
std::rethrow_exception(ep);
return sm.container().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, reason, topo_guard] (stream_manager& sm) mutable {
auto tbl = sm.db().find_column_family(cf_id).shared_from_this();
return sm.db().obtain_reader_permit(*tbl, "stream-transfer-task", db::no_timeout, {}).then([&sm, tbl, plan_id, cf_id, id, dst_cpu_id, ranges=std::move(ranges), reason, topo_guard] (reader_permit permit) mutable {
auto si = make_lw_shared<send_info>(sm.ms(), plan_id, tbl, std::move(permit), std::move(ranges), id, dst_cpu_id, reason, topo_guard, [&sm, plan_id, id] (size_t sz) {
sm.update_progress(plan_id, id, streaming::progress_info::direction::OUT, sz);
});
return si->has_relevant_range_on_this_shard().then([si, plan_id, cf_id] (bool has_relevant_range_on_this_shard) {
if (!has_relevant_range_on_this_shard) {
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}",
plan_id, cf_id, this_shard_id());
return make_ready_future<>();
}
return send_mutation_fragments(std::move(si));
}).finally([si] {
return si->reader.close();
});
});
}).then([this, plan_id, cf_id, id, &sm] {
sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
return ser::streaming_rpc_verbs::send_stream_mutation_done(&sm.ms(), id, plan_id, _ranges,
cf_id, session->dst_cpu_id).handle_exception([plan_id, id] (auto ep) {
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION_DONE to {}: {}", plan_id, id, ep);
std::rethrow_exception(ep);
});
}).then([this, id, plan_id] {
_mutation_done_sent = true;
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id);
}).handle_exception([plan_id, id, &sm] (std::exception_ptr ep) {
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
utils::get_local_injector().inject("stream_mutation_fragments_table_dropped", [&sm] () {
sm.db().find_column_family(table_id::create_null_id());
});
std::rethrow_exception(ep);
});
});
// If the table is dropped during streaming, we can ignore the
// errors and make the stream successful. This allows user to

75
test.py
View File

@@ -11,6 +11,7 @@ from __future__ import annotations
import argparse
import asyncio
import math
import shlex
import textwrap
from random import randint
@@ -73,6 +74,51 @@ PYTEST_RUNNER_DIRECTORIES = [
launch_time = time.monotonic()
class ThreadsCalculator:
"""
The ThreadsCalculator class calculates the number of jobs that can be run concurrently based on system
memory and CPU constraints. It allows resource reservation and configurable parameters for
flexible job scheduling in various modes, such as `debug`.
"""
def __init__(self,
modes: list[str],
min_system_memory_reserve: float = 5e9,
max_system_memory_reserve: float = 8e9,
system_memory_reserve_fraction = 16,
max_test_memory: float = 5e9,
test_memory_fraction: float = 8.0,
debug_test_memory_multiplier: float = 1.5,
debug_cpus_per_test_job=1.5,
non_debug_cpus_per_test_job: float =1.0,
non_debug_max_test_memory: float = 4e9
):
sys_mem = int(os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES"))
test_mem = min(sys_mem / test_memory_fraction, max_test_memory)
if "debug" in modes:
test_mem *= debug_test_memory_multiplier
system_memory_reserve = int(min(
max(sys_mem / system_memory_reserve_fraction, min_system_memory_reserve),
max_system_memory_reserve,
))
available_mem = max(0, sys_mem - system_memory_reserve)
is_debug = "debug" in modes
test_mem = min(
sys_mem / test_memory_fraction,
max_test_memory if is_debug else non_debug_max_test_memory,
)
if is_debug:
test_mem *= debug_test_memory_multiplier
self.cpus_per_test_job = (
debug_cpus_per_test_job if is_debug else non_debug_cpus_per_test_job
)
self.default_num_jobs_mem = max(1, int(available_mem // test_mem))
def get_number_of_threads(self, nr_cpus: int) -> int:
default_num_jobs_cpu = max(1, math.ceil(nr_cpus / self.cpus_per_test_job))
return min(self.default_num_jobs_mem, default_num_jobs_cpu)
class TabularConsoleOutput:
"""Print test progress to the console"""
@@ -181,7 +227,7 @@ def parse_cmd_line() -> argparse.Namespace:
help="Run only tests for given build mode(s)")
parser.add_argument('--repeat', action="store", default="1", type=int,
help="number of times to repeat test execution")
parser.add_argument('--timeout', action="store", default="24000", type=int,
parser.add_argument('--timeout', action="store", default="3600", type=int,
help="timeout value for single test execution")
parser.add_argument('--session-timeout', action="store", default="24000", type=int,
help="timeout value for test.py/pytest session execution")
@@ -273,6 +319,13 @@ def parse_cmd_line() -> argparse.Namespace:
if args.skip_patterns and args.k:
parser.error(palette.fail('arguments --skip and -k are mutually exclusive, please use only one of them'))
if not args.modes:
try:
args.modes = get_configured_modes()
except Exception:
print(palette.fail("Failed to read output of `ninja mode_list`: please run ./configure.py first"))
raise
if not args.jobs:
if not args.cpus:
nr_cpus = multiprocessing.cpu_count()
@@ -280,19 +333,7 @@ def parse_cmd_line() -> argparse.Namespace:
nr_cpus = int(subprocess.check_output(
['taskset', '-c', args.cpus, 'python3', '-c',
'import os; print(len(os.sched_getaffinity(0)))']))
cpus_per_test_job = 1
sysmem = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
testmem = 6e9 if os.sysconf('SC_PAGE_SIZE') > 4096 else 2e9
default_num_jobs_mem = ((sysmem - 4e9) // testmem)
args.jobs = min(default_num_jobs_mem, nr_cpus // cpus_per_test_job)
if not args.modes:
try:
args.modes = get_configured_modes()
except Exception:
print(palette.fail("Failed to read output of `ninja mode_list`: please run ./configure.py first"))
raise
args.jobs = ThreadsCalculator(args.modes).get_number_of_threads(nr_cpus)
if not args.coverage_modes and args.coverage:
args.coverage_modes = list(args.modes)
@@ -350,16 +391,12 @@ def run_pytest(options: argparse.Namespace) -> tuple[int, list[SimpleNamespace]]
if options.list_tests:
args.extend(['--collect-only', '--quiet', '--no-header'])
else:
threads = int(options.jobs)
# debug mode is very CPU and memory hungry, so we need to lower the number of threads to be able to finish tests
if 'debug' in options.modes:
threads = int(threads * 0.5)
args.extend([
"--log-level=DEBUG", # Capture logs
f'--junit-xml={junit_output_file}',
"-rf",
'--test-py-init',
f'-n{threads}',
f'-n{options.jobs}',
f'--tmpdir={temp_dir}',
f'--maxfail={options.max_failures}',
f'--alluredir={report_dir / f"allure_{HOST_ID}"}',

View File

@@ -18,7 +18,7 @@
#include <seastar/core/coroutine.hh>
#include <seastar/core/manual_clock.hh>
#include <seastar/util/later.hh>
#include <seastar/core/timer.hh>
#include <seastar/util/defer.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/util/alloc_failure_injector.hh>
@@ -290,12 +290,17 @@ SEASTAR_THREAD_TEST_CASE(test_address_map_replication) {
m.set_expiring(id1);
BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
m.barrier().get();
promise<> shard0_timer_expired;
timer<manual_clock> shard0_timer([&shard0_timer_expired] {
shard0_timer_expired.set_value();
});
shard0_timer.arm(manual_clock::now() + expiration_time);
m_svc.invoke_on(1, [] (address_map_t<manual_clock>& m) {
BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
manual_clock::advance(expiration_time);
BOOST_CHECK(!m.find(id1));
return smp::submit_to(0, []{}); // Ensure shard 0 notices timer is expired.
}).get();
shard0_timer_expired.get_future().get();
BOOST_CHECK(!m.find(id1));
// Expiring entries are replicated

View File

@@ -1045,7 +1045,6 @@ validate_result_size(size_t i, schema_ptr schema, const utils::chunked_vector<mu
struct fuzzy_test_config {
uint32_t seed;
std::chrono::seconds timeout;
unsigned concurrency;
unsigned scans;
};
@@ -1077,6 +1076,9 @@ run_fuzzy_test_scan(size_t i, fuzzy_test_config cfg, sharded<replica::database>&
testlog.debug("[scan#{}]: seed={}, is_stateful={}, prange={}, ckranges={}", i, seed, is_stateful, partition_range,
partition_slice.default_row_ranges());
// Use a small max_size to force many pages per scan, stressing the
// paging and result-merging logic. With the large row limit here,
// the byte limit is typically the tighter bound.
const auto [results, npages] = read_partitions_with_paged_scan(db, schema, 1000, 1024, is_stateful, partition_range, partition_slice);
const auto expected_partitions = slice_partitions(*schema, mutations, partition_index_range, partition_slice);
@@ -1160,21 +1162,27 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) {
std::uniform_int_distribution<size_t>(0, 100), // clustering-rows
std::uniform_int_distribution<size_t>(0, 100), // range-tombstones
#else
// Keep these values moderate: with complex randomly-generated
// schemas (deeply nested frozen collections/UDTs), large row
// counts cause data generation and paged scanning to be very
// slow, leading to CI timeouts. The test's value comes from
// schema variety and paging correctness, not from sheer data
// volume.
std::uniform_int_distribution<size_t>(32, 64), // partitions
std::uniform_int_distribution<size_t>(0, 1000), // clustering-rows
std::uniform_int_distribution<size_t>(0, 1000), // range-tombstones
std::uniform_int_distribution<size_t>(0, 200), // clustering-rows
std::uniform_int_distribution<size_t>(0, 200), // range-tombstones
#endif
tests::default_timestamp_generator());
#if defined DEBUG
auto cfg = fuzzy_test_config{seed, std::chrono::seconds{8}, 1, 1};
auto cfg = fuzzy_test_config{seed, 1, 1};
#elif defined DEVEL
auto cfg = fuzzy_test_config{seed, std::chrono::seconds{2}, 2, 4};
auto cfg = fuzzy_test_config{seed, 2, 4};
#else
auto cfg = fuzzy_test_config{seed, std::chrono::seconds{2}, 4, 8};
auto cfg = fuzzy_test_config{seed, 4, 8};
#endif
testlog.info("Running test workload with configuration: seed={}, timeout={}s, concurrency={}, scans={}", cfg.seed, cfg.timeout.count(),
testlog.info("Running test workload with configuration: seed={}, concurrency={}, scans={}", cfg.seed,
cfg.concurrency, cfg.scans);
smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(tbl.schema), &compacted_frozen_mutations = tbl.compacted_frozen_mutations] {

View File

@@ -906,9 +906,13 @@ SEASTAR_THREAD_TEST_CASE(test_timeout_is_applied_on_lookup) {
BOOST_REQUIRE(entry.permit.timeout() == new_timeout);
BOOST_REQUIRE(!entry.permit.get_abort_exception());
sleep(ttl_timeout_test_timeout * 2).get();
// Don't waste time retrying before the timeout is up
sleep(ttl_timeout_test_timeout).get();
eventually_true([&entry] {
return bool(entry.permit.get_abort_exception());
});
BOOST_REQUIRE(entry.permit.get_abort_exception());
BOOST_REQUIRE_THROW(std::rethrow_exception(entry.permit.get_abort_exception()), seastar::named_semaphore_timed_out);
}

View File

@@ -2644,7 +2644,10 @@ SEASTAR_TEST_CASE(test_exception_safety_of_update_from_memtable) {
return rd;
};
populate_range(cache, population_range);
{
memory::scoped_critical_alloc_section dfg;
populate_range(cache, population_range);
}
auto rd1_v1 = assert_that(make_reader(population_range));
mutation_reader_opt snap;
auto close_snap = defer([&snap] {

View File

@@ -257,39 +257,44 @@ async def manager(request: pytest.FixtureRequest,
yield manager_client
# `request.node.stash` contains a report stored in `pytest_runtest_makereport` from where we can retrieve
# test failure.
report = request.node.stash[PHASE_REPORT_KEY]
failed = report.when == "call" and report.failed
# Check if the test has the check_nodes_for_errors marker
found_errors = await manager_client.check_all_errors(check_all_errors=(request.node.get_closest_marker("check_nodes_for_errors") is not None))
cluster_status = None
found_errors = {}
failed = False
failed_test_dir_path = None
if failed or found_errors:
# Save scylladb logs for failed tests in a separate directory and copy XML report to the same directory to have
# all related logs in one dir.
# Then add property to the XML report with the path to the directory, so it can be visible in Jenkins
failed_test_dir_path = testpy_test.suite.log_dir / "failed_test" / test_case_name.translate(
str.maketrans('[]', '()'))
failed_test_dir_path.mkdir(parents=True, exist_ok=True)
try:
report = request.node.stash[PHASE_REPORT_KEY]
failed = report.when == "call" and report.failed
if failed:
await manager_client.gather_related_logs(
failed_test_dir_path,
{'pytest.log': test_log, 'test_py.log': test_py_log_test}
)
with open(failed_test_dir_path / "stacktrace.txt", "w") as f:
f.write(report.longreprtext)
if request.config.getoption('artifacts_dir_url') is not None:
# get the relative path to the tmpdir for the failed directory
dir_path_relative = f"{failed_test_dir_path.as_posix()[failed_test_dir_path.as_posix().find('testlog'):]}"
full_url = urllib.parse.urljoin(request.config.getoption('artifacts_dir_url') + '/',
urllib.parse.quote(dir_path_relative))
record_property("TEST_LOGS", full_url)
# Check if the test has the check_nodes_for_errors marker
found_errors = await manager_client.check_all_errors(check_all_errors=(request.node.get_closest_marker("check_nodes_for_errors") is not None))
cluster_status = await manager_client.after_test(test_case_name, not failed)
await manager_client.stop() # Stop client session and close driver after each test
if failed or found_errors:
# Save scylladb logs for failed tests in a separate directory and copy XML report to the same directory to have
# all related logs in one dir.
# Then add property to the XML report with the path to the directory, so it can be visible in Jenkins
failed_test_dir_path = testpy_test.suite.log_dir / "failed_test" / test_case_name.translate(
str.maketrans('[]', '()'))
failed_test_dir_path.mkdir(parents=True, exist_ok=True)
if cluster_status["server_broken"] and not failed:
if failed:
await manager_client.gather_related_logs(
failed_test_dir_path,
{'pytest.log': test_log, 'test_py.log': test_py_log_test}
)
with open(failed_test_dir_path / "stacktrace.txt", "w") as f:
f.write(report.longreprtext)
if request.config.getoption('artifacts_dir_url') is not None:
# get the relative path to the tmpdir for the failed directory
dir_path_relative = f"{failed_test_dir_path.as_posix()[failed_test_dir_path.as_posix().find('testlog'):]}"
full_url = urllib.parse.urljoin(request.config.getoption('artifacts_dir_url') + '/',
urllib.parse.quote(dir_path_relative))
record_property("TEST_LOGS", full_url)
cluster_status = await manager_client.after_test(test_case_name, not failed)
finally:
await manager_client.stop() # Stop client session and close driver after each test
if cluster_status is not None and cluster_status["server_broken"] and not failed:
failed = True
pytest.fail(
f"test case {test_case_name} left unfinished tasks on Scylla server. Server marked as broken,"

View File

@@ -10,6 +10,7 @@ import random
import string
import tempfile
import time
import threading
from concurrent.futures.thread import ThreadPoolExecutor
from pprint import pformat
@@ -273,6 +274,30 @@ class TesterAlternator(BaseAlternator):
logger.info("Testing and validating an update query using key condition expression")
logger.info(f"ConditionExpression update of short circuit is: {conditional_update_short_circuit}")
dc2_table.update_item(**conditional_update_short_circuit)
# Wait for cross-DC replication to reach both live DC1 nodes
# before stopping dc2_node. The LWT commit uses LOCAL_QUORUM,
# which only guarantees DC2 persistence; replication to DC1 is
# async background work. Without this wait, stopping dc2_node
# can drop in-flight RPCs to DC1 while CAS mutations don't
# store hints. We must confirm both live DC1 replicas have the
# data so that the later ConsistentRead=True (LOCAL_QUORUM)
# read on restarted node1 is guaranteed to succeed.
# See https://scylladb.atlassian.net/browse/SCYLLADB-1267
dc1_live_nodes = [
node for node in self.cluster.nodelist()
if node.data_center == node1.data_center and node.server_id != node1.server_id
]
dc1_live_tables = [self.get_table(table_name=TABLE_NAME, node=node) for node in dc1_live_nodes]
wait_for(
lambda: all(
t.get_item(
Key={self._table_primary_key: new_pk_val}, ConsistentRead=False
).get("Item", {}).get("c") == 3
for t in dc1_live_tables
),
timeout=60,
text="Waiting for cross-DC replication of conditional update to both live DC1 nodes",
)
dc2_node.stop()
node1.start()
@@ -481,28 +506,33 @@ class TesterAlternator(BaseAlternator):
2) Issue Alternator 'heavy' requests concurrently (create-table)
3) wait for RequestLimitExceeded error response.
"""
concurrent_requests_limit = 5
# Keep the limit low to avoid exhausting LSA memory on the 1GB test node
# when multiple CreateTable requests (Raft + schema + flush) run concurrently.
concurrent_requests_limit = 3
extra_config = {"max_concurrent_requests_per_shard": concurrent_requests_limit, "num_tokens": 1}
self.prepare_dynamodb_cluster(num_of_nodes=1, extra_config=extra_config)
node1 = self.cluster.nodelist()[0]
create_tables_threads = []
for tables_num in range(concurrent_requests_limit * 5):
create_tables_threads.append(self.run_create_table_thread())
stop_workers = threading.Event()
@retrying(num_attempts=150, sleep_time=0.2, allowed_exceptions=ConcurrencyLimitNotExceededError, message="Running create-table request")
def wait_for_create_table_request_failure():
try:
self.create_table(table_name=random_string(length=10), node=node1, wait_until_table_exists=False)
except Exception as error:
if "RequestLimitExceeded" in error.args[0]:
return
raise
raise ConcurrencyLimitNotExceededError
def run_create_table_until_limited() -> None:
while not stop_workers.is_set():
try:
self.create_table(table_name=random_string(length=10), node=node1, wait_until_table_exists=False)
except Exception as error: # noqa: BLE001
if "RequestLimitExceeded" in str(error):
stop_workers.set()
return
raise
wait_for_create_table_request_failure()
with ThreadPoolExecutor(max_workers=concurrent_requests_limit * 5) as executor:
create_table_futures = [executor.submit(run_create_table_until_limited) for _ in range(concurrent_requests_limit * 5)]
for thread in create_tables_threads:
thread.join()
if not stop_workers.wait(timeout=30):
raise ConcurrencyLimitNotExceededError
stop_workers.set()
for future in create_table_futures:
future.result(timeout=60)
@staticmethod
def _set_slow_query_logging_api(run_on_node: ScyllaNode, is_enable: bool = True, threshold: int | None = None):

View File

@@ -1182,9 +1182,9 @@ class TestAuth(Tester):
def get_session(self, node_idx=0, user=None, password=None, exclusive=True):
node = self.cluster.nodelist()[node_idx]
if exclusive:
conn = self.patient_exclusive_cql_connection(node, user=user, password=password, timeout=0.1)
conn = self.patient_exclusive_cql_connection(node, user=user, password=password)
else:
conn = self.patient_cql_connection(node, user=user, password=password, timeout=0.1)
conn = self.patient_cql_connection(node, user=user, password=password)
return conn
def assert_permissions_listed(self, expected, session, query, include_superuser=False):

View File

@@ -199,7 +199,7 @@ class GSServer(GSFront):
def unpublish(self):
for k in self.vars:
v = self.oldvars[k]
v = self.oldvars.get(k)
if v:
os.environ[k] = v
elif os.environ.get(k):

View File

@@ -215,6 +215,11 @@ async def test_node_ops_tasks_tree(manager: ManagerClient):
servers, vt_ids = await check_remove_node_tasks_tree(manager, tm, module_name, servers, vt_ids)
servers, vt_ids = await check_decommission_tasks_tree(manager, tm, module_name, servers, vt_ids)
# Reconnect the driver after topology changes (replace, removenode,
# decommission) so that the new_test_keyspace cleanup can reach a
# live node for DROP KEYSPACE.
await manager.driver_connect()
@pytest.mark.asyncio
async def test_node_ops_tasks_ttl(manager: ManagerClient):
"""Test node ops virtual tasks' ttl."""

View File

@@ -17,6 +17,7 @@ import socket
import socketserver
import tempfile
import threading
import time
import uuid
from collections import namedtuple
from contextlib import contextmanager
@@ -33,9 +34,9 @@ from test.cluster.dtest.dtest_class import create_ks, wait_for
from test.cluster.dtest.tools.assertions import assert_invalid
from test.cluster.dtest.tools.data import rows_to_list, run_in_parallel
from test.cluster.test_config import wait_for_config
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import read_barrier
from test.pylib.util import wait_for as wait_for_async
logger = logging.getLogger(__name__)
@@ -113,11 +114,10 @@ class AuditTester:
for k in AUTH_CONFIG:
await self.manager.server_remove_config_option(srv.server_id, k)
# Remove absent keys so the server reverts to compiled-in defaults.
for k in absent_keys:
await self.manager.server_remove_config_option(srv.server_id, k)
if needs_restart:
# Remove absent keys so the server reverts to compiled-in defaults.
for k in absent_keys:
await self.manager.server_remove_config_option(srv.server_id, k)
await self.manager.server_stop_gracefully(srv.server_id)
full_cfg = self._build_server_config(needed, enable_compact_storage, user)
await self.manager.server_update_config(srv.server_id, config_options=full_cfg)
@@ -127,10 +127,17 @@ class AuditTester:
# Server stays up — only push live-updatable keys.
live_cfg = {k: v for k, v in needed.items() if k in LIVE_AUDIT_KEYS}
live_cfg["enable_create_table_with_compact_storage"] = enable_compact_storage
log_file = await self.manager.server_open_log(srv.server_id)
# Each remove/update sends a SIGHUP. Wait for each one's
# "completed re-reading configuration file" before the next
# so we never match a stale message.
for k in absent_keys:
from_mark = await log_file.mark()
await self.manager.server_remove_config_option(srv.server_id, k)
await log_file.wait_for(r"completed re-reading configuration file", from_mark=from_mark, timeout=60)
from_mark = await log_file.mark()
await self.manager.server_update_config(srv.server_id, config_options=live_cfg)
for key in LIVE_AUDIT_KEYS:
if key in live_cfg:
await wait_for_config(self.manager, srv, key, live_cfg[key])
await log_file.wait_for(r"completed re-reading configuration file", from_mark=from_mark, timeout=60)
async def _start_fresh_servers(self, needed: dict[str, str],
enable_compact_storage: bool,
@@ -345,7 +352,7 @@ class UnixSockerListener:
elif data != "Initializing syslog audit backend.":
self.server.parent_instance.lines.append(data)
class UnixDatagramServer(socketserver.ThreadingUnixDatagramServer):
class UnixDatagramServer(socketserver.UnixDatagramServer):
def __init__(self, socket_path, handler, parent_instance, lock):
self.parent_instance = parent_instance
self.mutex = lock
@@ -511,8 +518,7 @@ class AuditBackendComposite(AuditBackend):
return rows_dict
@pytest.mark.single_node
class TestCQLAudit(AuditTester):
class CQLAuditTester(AuditTester):
"""
Make sure CQL statements are audited
"""
@@ -1343,7 +1349,13 @@ class TestCQLAudit(AuditTester):
conn = await self.manager.get_cql_exclusive(srv)
stmt = SimpleStatement("INSERT INTO ks.test1 (k, v1) VALUES (1000, 1000)", consistency_level=ConsistencyLevel.THREE)
conn.execute(stmt)
audit_node_ips = await self.get_audit_partitions_for_operation(session, stmt.query_string)
# The audit log entry may not be visible immediately after the
# insert, so retry with exponential backoff until it appears.
audit_node_ips = await wait_for_async(
lambda: self.get_audit_partitions_for_operation(session, stmt.query_string),
deadline=time.time() + 10,
period=0.05,
label=f"audit entry for node {index}")
node_to_audit_nodes[index] = set(audit_node_ips)
all_addresses = set(srv.ip_addr for srv in servers)
@@ -1763,7 +1775,7 @@ class TestCQLAudit(AuditTester):
async def test_audit_table_noauth(manager: ManagerClient):
"""Table backend, no auth, single node — groups all tests that share this config."""
t = TestCQLAudit(manager)
t = CQLAuditTester(manager)
await t.test_using_non_existent_keyspace(AuditBackendTable)
await t.test_audit_keyspace(AuditBackendTable)
await t.test_audit_keyspace_extra_parameter(AuditBackendTable)
@@ -1787,7 +1799,7 @@ async def test_audit_table_noauth(manager: ManagerClient):
async def test_audit_table_auth(manager: ManagerClient):
"""Table backend, auth enabled, single node."""
t = TestCQLAudit(manager)
t = CQLAuditTester(manager)
await t.test_user_password_masking(AuditBackendTable)
await t.test_negative_audit_records_auth()
await t.test_negative_audit_records_admin()
@@ -1803,7 +1815,7 @@ async def test_audit_table_auth(manager: ManagerClient):
async def test_audit_table_auth_multinode(manager: ManagerClient):
"""Table backend, auth enabled, multi-node (rf=3)."""
t = TestCQLAudit(manager)
t = CQLAuditTester(manager)
await t.test_negative_audit_records_ddl()
@@ -1811,49 +1823,49 @@ async def test_audit_table_auth_multinode(manager: ManagerClient):
async def test_audit_type_none_standalone(manager: ManagerClient):
"""audit=None — verify no auditing occurs."""
await TestCQLAudit(manager).test_audit_type_none()
await CQLAuditTester(manager).test_audit_type_none()
async def test_audit_type_invalid_standalone(manager: ManagerClient):
"""audit=invalid — server should fail to start."""
await TestCQLAudit(manager).test_audit_type_invalid()
await CQLAuditTester(manager).test_audit_type_invalid()
async def test_composite_audit_type_invalid_standalone(manager: ManagerClient):
"""audit=table,syslog,invalid — server should fail to start."""
await TestCQLAudit(manager).test_composite_audit_type_invalid()
await CQLAuditTester(manager).test_composite_audit_type_invalid()
async def test_audit_empty_settings_standalone(manager: ManagerClient):
"""audit=none — verify no auditing occurs."""
await TestCQLAudit(manager).test_audit_empty_settings()
await CQLAuditTester(manager).test_audit_empty_settings()
async def test_composite_audit_empty_settings_standalone(manager: ManagerClient):
"""audit=table,syslog,none — verify no auditing occurs."""
await TestCQLAudit(manager).test_composite_audit_empty_settings()
await CQLAuditTester(manager).test_composite_audit_empty_settings()
async def test_audit_categories_invalid_standalone(manager: ManagerClient):
"""Invalid audit_categories — server should fail to start."""
await TestCQLAudit(manager).test_audit_categories_invalid()
await CQLAuditTester(manager).test_audit_categories_invalid()
async def test_insert_failure_standalone(manager: ManagerClient):
"""7-node topology, audit=table, no auth — standalone due to unique topology."""
await TestCQLAudit(manager).test_insert_failure_doesnt_report_success()
await CQLAuditTester(manager).test_insert_failure_doesnt_report_success()
async def test_service_level_statements_standalone(manager: ManagerClient):
"""audit=table, auth, cmdline=--smp 1 — standalone due to special cmdline."""
await TestCQLAudit(manager).test_service_level_statements()
await CQLAuditTester(manager).test_service_level_statements()
# AuditBackendSyslog, no auth, rf=1
async def test_audit_syslog_noauth(manager: ManagerClient):
"""Syslog backend, no auth, single node."""
t = TestCQLAudit(manager)
t = CQLAuditTester(manager)
Syslog = functools.partial(AuditBackendSyslog, socket_path=syslog_socket_path)
await t.test_using_non_existent_keyspace(Syslog)
await t.test_audit_keyspace(Syslog)
@@ -1870,7 +1882,7 @@ async def test_audit_syslog_noauth(manager: ManagerClient):
async def test_audit_syslog_auth(manager: ManagerClient):
"""Syslog backend, auth enabled, single node."""
t = TestCQLAudit(manager)
t = CQLAuditTester(manager)
Syslog = functools.partial(AuditBackendSyslog, socket_path=syslog_socket_path)
await t.test_user_password_masking(Syslog)
await t.test_role_password_masking(Syslog)
@@ -1881,7 +1893,7 @@ async def test_audit_syslog_auth(manager: ManagerClient):
async def test_audit_composite_noauth(manager: ManagerClient):
"""Composite backend (table+syslog), no auth, single node."""
t = TestCQLAudit(manager)
t = CQLAuditTester(manager)
Composite = functools.partial(AuditBackendComposite, socket_path=syslog_socket_path)
await t.test_using_non_existent_keyspace(Composite)
await t.test_audit_keyspace(Composite)
@@ -1898,7 +1910,7 @@ async def test_audit_composite_noauth(manager: ManagerClient):
async def test_audit_composite_auth(manager: ManagerClient):
"""Composite backend (table+syslog), auth enabled, single node."""
t = TestCQLAudit(manager)
t = CQLAuditTester(manager)
Composite = functools.partial(AuditBackendComposite, socket_path=syslog_socket_path)
await t.test_user_password_masking(Composite)
await t.test_role_password_masking(Composite)
@@ -1910,29 +1922,29 @@ _composite = functools.partial(AuditBackendComposite, socket_path=syslog_socket_
@pytest.mark.parametrize("helper_class,config_changer", [
pytest.param(AuditBackendTable, TestCQLAudit.AuditSighupConfigChanger, id="table-sighup"),
pytest.param(AuditBackendTable, TestCQLAudit.AuditCqlConfigChanger, id="table-cql"),
pytest.param(_syslog, TestCQLAudit.AuditSighupConfigChanger, id="syslog-sighup"),
pytest.param(_syslog, TestCQLAudit.AuditCqlConfigChanger, id="syslog-cql"),
pytest.param(_composite, TestCQLAudit.AuditSighupConfigChanger, id="composite-sighup"),
pytest.param(_composite, TestCQLAudit.AuditCqlConfigChanger, id="composite-cql"),
pytest.param(AuditBackendTable, CQLAuditTester.AuditSighupConfigChanger, id="table-sighup"),
pytest.param(AuditBackendTable, CQLAuditTester.AuditCqlConfigChanger, id="table-cql"),
pytest.param(_syslog, CQLAuditTester.AuditSighupConfigChanger, id="syslog-sighup"),
pytest.param(_syslog, CQLAuditTester.AuditCqlConfigChanger, id="syslog-cql"),
pytest.param(_composite, CQLAuditTester.AuditSighupConfigChanger, id="composite-sighup"),
pytest.param(_composite, CQLAuditTester.AuditCqlConfigChanger, id="composite-cql"),
])
async def test_config_no_liveupdate(manager: ManagerClient, helper_class, config_changer):
"""Non-live audit config params (audit, audit_unix_socket_path, audit_syslog_write_buffer_size) must be unmodifiable."""
await TestCQLAudit(manager).test_config_no_liveupdate(helper_class, config_changer)
await CQLAuditTester(manager).test_config_no_liveupdate(helper_class, config_changer)
@pytest.mark.parametrize("helper_class,config_changer", [
pytest.param(AuditBackendTable, TestCQLAudit.AuditSighupConfigChanger, id="table-sighup"),
pytest.param(AuditBackendTable, TestCQLAudit.AuditCqlConfigChanger, id="table-cql"),
pytest.param(_syslog, TestCQLAudit.AuditSighupConfigChanger, id="syslog-sighup"),
pytest.param(_syslog, TestCQLAudit.AuditCqlConfigChanger, id="syslog-cql"),
pytest.param(_composite, TestCQLAudit.AuditSighupConfigChanger, id="composite-sighup"),
pytest.param(_composite, TestCQLAudit.AuditCqlConfigChanger, id="composite-cql"),
pytest.param(AuditBackendTable, CQLAuditTester.AuditSighupConfigChanger, id="table-sighup"),
pytest.param(AuditBackendTable, CQLAuditTester.AuditCqlConfigChanger, id="table-cql"),
pytest.param(_syslog, CQLAuditTester.AuditSighupConfigChanger, id="syslog-sighup"),
pytest.param(_syslog, CQLAuditTester.AuditCqlConfigChanger, id="syslog-cql"),
pytest.param(_composite, CQLAuditTester.AuditSighupConfigChanger, id="composite-sighup"),
pytest.param(_composite, CQLAuditTester.AuditCqlConfigChanger, id="composite-cql"),
])
async def test_config_liveupdate(manager: ManagerClient, helper_class, config_changer):
"""Live-updatable audit config params (categories, keyspaces, tables) must be modifiable at runtime."""
await TestCQLAudit(manager).test_config_liveupdate(helper_class, config_changer)
await CQLAuditTester(manager).test_config_liveupdate(helper_class, config_changer)
@pytest.mark.parametrize("helper_class", [
@@ -1942,4 +1954,4 @@ async def test_config_liveupdate(manager: ManagerClient, helper_class, config_ch
])
async def test_parallel_syslog_audit(manager: ManagerClient, helper_class):
"""Cluster must not fail when multiple queries are audited in parallel."""
await TestCQLAudit(manager).test_parallel_syslog_audit(helper_class)
await CQLAuditTester(manager).test_parallel_syslog_audit(helper_class)

View File

@@ -0,0 +1,70 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import asyncio
import time
import pytest
from test.cluster.util import get_current_group0_config
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import read_barrier
from test.pylib.util import wait_for
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_bootstrap_with_quick_group0_join(manager: ManagerClient):
"""Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959.
The bug was that when the bootstrapping node joined group0 before reaching
post_server_start, it skipped post_server_start and thus hung forever.
The test simulates the scenario by starting the second node with the
join_group0_pause_before_config_check injection. Without the fix, the
startup times out.
"""
logger.info("Adding first server")
s1 = await manager.server_add()
logger.info("Adding second server with join_group0_pause_before_config_check enabled")
s2 = await manager.server_add(start=False, config={
'error_injections_at_startup': ['join_group0_pause_before_config_check']
})
logger.info(f"Starting {s2}")
start_task = asyncio.create_task(manager.server_start(s2.server_id))
s2_log = await manager.server_open_log(s2.server_id)
await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60)
s1_host_id = await manager.get_host_id(s1.server_id)
s2_host_id = await manager.get_host_id(s2.server_id)
async def s2_in_group0_config_on_s1():
config = await get_current_group0_config(manager, s1)
ids = {m[0] for m in config}
assert s1_host_id in ids # sanity check
return True if s2_host_id in ids else None
# Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute
# get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1
# to see s2 and then perform a read barrier on s2.
logger.info(f"Waiting for {s1} to see {s2} in the group0 config")
await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1)
logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config")
await read_barrier(manager.api, s2.ip_addr)
logger.info(f"Unblocking {s2}")
await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check')
logger.info(f"Waiting for {s2} to complete bootstrap")
await asyncio.wait_for(start_task, timeout=60)

View File

@@ -177,7 +177,7 @@ async def _smoke_test(manager: ManagerClient, key_provider: KeyProviderFactory,
# restart the cluster
if restart:
await restart(manager, servers, cfs)
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
cql, _ = await manager.get_ready_cql(servers)
else:
await manager.rolling_restart(servers)
for table_name in cfs:

View File

@@ -438,6 +438,7 @@ async def test_lwt_fencing_upgrade(manager: ManagerClient, scylla_2025_1: Scylla
await wait_for(all_hosts_are_alive, deadline=time.time() + 60, period=0.1)
logger.info(f"Upgrading {s.server_id}")
await manager.server_change_version(s.server_id, scylla_binary)
await manager.server_sees_others(s.server_id, 2, interval=60.0)
logger.info("Done upgrading servers")

View File

@@ -8,7 +8,10 @@ import asyncio
import time
import pytest
import logging
from functools import partial
from test.pylib.manager_client import ManagerClient
from test.pylib.util import wait_for
from test.pylib.internal_types import ServerInfo
logger = logging.getLogger(__name__)
@@ -16,6 +19,26 @@ logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_crashed_node_substitution(manager: ManagerClient):
"""Test that a node which crashed after starting gossip but before joining group0
(an 'orphan' node) is eventually removed from gossip by the gossiper_orphan_remover_fiber.
The scenario:
1. Start 3 nodes with the 'fast_orphan_removal_fiber' injection enabled. This freezes
the gossiper_orphan_remover_fiber on each node before it enters its polling loop,
so it cannot remove any orphan until explicitly unblocked.
2. Start a 4th node with the 'crash_before_group0_join' injection enabled. This node
starts gossip normally but blocks inside pre_server_start(), just before sending
the join RPC to the topology coordinator. It never joins group0.
3. Wait until the 4th node's gossip state has fully propagated to all 3 running peers,
then trigger its crash via the injection. At this point all peers see it as an orphan:
present in gossip but absent from the group0 topology.
4. Assert the orphan is visible in gossip (live or down) on the surviving nodes.
5. Unblock the gossiper_orphan_remover_fiber on all 3 nodes (via message_injection) and
enable the 'speedup_orphan_removal' injection so the fiber removes the orphan immediately
without waiting for the normal 60-second age threshold.
6. Wait for the 'Finished to force remove node' log line confirming removal, then assert
the orphan is no longer present in gossip.
"""
servers = await manager.servers_add(3, config={
'error_injections_at_startup': ['fast_orphan_removal_fiber']
})
@@ -30,10 +53,24 @@ async def test_crashed_node_substitution(manager: ManagerClient):
log = await manager.server_open_log(failed_server.server_id)
await log.wait_for("finished do_send_ack2_msg")
failed_id = await manager.get_host_id(failed_server.server_id)
# Wait until the failed server's gossip state has propagated to all running peers.
# "finished do_send_ack2_msg" only guarantees that one peer completed a gossip round
# with the failed server; other nodes learn about it only in subsequent gossip rounds.
# Querying gossip before propagation completes would cause the assertion below to fail
# because the orphan node would not yet appear as live or down on every peer.
async def gossip_has_node(server: ServerInfo):
live = await manager.api.client.get_json("/gossiper/endpoint/live", host=server.ip_addr)
down = await manager.api.client.get_json("/gossiper/endpoint/down", host=server.ip_addr)
return True if failed_server.ip_addr in live + down else None
for s in servers:
await wait_for(partial(gossip_has_node, s), deadline=time.time() + 30)
await manager.api.message_injection(failed_server.ip_addr, 'crash_before_group0_join')
await task
live_eps = await manager.api.client.get_json("/gossiper/endpoint/live", host=servers[0].ip_addr)
down_eps = await manager.api.client.get_json("/gossiper/endpoint/down", host=servers[0].ip_addr)

View File

@@ -17,9 +17,9 @@ from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import ScyllaMetricsClient, TCPRESTClient, inject_error
from test.pylib.tablets import get_tablet_replicas
from test.pylib.scylla_cluster import ReplaceConfig
from test.pylib.util import wait_for
from test.pylib.util import gather_safely, wait_for
from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace
from test.cluster.util import get_topology_coordinator, find_server_by_host_id, keyspace_has_tablets, new_test_keyspace, new_test_table
logger = logging.getLogger(__name__)
@@ -51,28 +51,42 @@ async def await_sync_point(client: TCPRESTClient, server_ip: IPAddress, sync_poi
@pytest.mark.asyncio
async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient):
node_count = 2
servers = await manager.servers_add(node_count)
cmdline = ["--logger-log-level", "hints_manager=trace"]
servers = await manager.servers_add(node_count, cmdline=cmdline)
async def wait_for_hints_written(min_hint_count: int, timeout: int):
async def aux():
hints_written = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
if hints_written >= min_hint_count:
return True
return None
assert await wait_for(aux, time.time() + timeout)
cql = manager.get_cql()
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks:
table = f"{ks}.t"
await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)")
uses_tablets = await keyspace_has_tablets(manager, ks)
# If the keyspace uses tablets, let's explicitly require the table to use multiple tablets.
# Otherwise, it could happen that all mutations would target servers[0] only, which would
# ultimately lead to a test failure here. We rely on the assumption that mutations will be
# distributed more or less uniformly!
extra_opts = "WITH tablets = {'min_tablet_count': 16}" if uses_tablets else ""
async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int", extra_opts) as table:
await manager.server_stop_gracefully(servers[1].server_id)
await manager.server_stop_gracefully(servers[1].server_id)
hints_before = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
hints_before = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
stmt = cql.prepare(f"INSERT INTO {table} (pk, v) VALUES (?, ?)")
stmt.consistency_level = ConsistencyLevel.ANY
# Some of the inserts will be targeted to the dead node.
# The coordinator doesn't have live targets to send the write to, but it should write a hint.
for i in range(100):
await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i+1})", consistency_level=ConsistencyLevel.ANY))
# Some of the inserts will be targeted to the dead node.
# The coordinator doesn't have live targets to send the write to, but it should write a hint.
await gather_safely(*[cql.run_async(stmt, (i, i + 1)) for i in range(100)])
# Verify hints are written
hints_after = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
assert hints_after > hints_before
# Verify hints are written
await wait_for_hints_written(hints_before + 1, timeout=60)
# For dropping the keyspace
await manager.server_start(servers[1].server_id)
# For dropping the keyspace
await manager.server_start(servers[1].server_id)
@pytest.mark.asyncio
async def test_limited_concurrency_of_writes(manager: ManagerClient):
@@ -86,7 +100,7 @@ async def test_limited_concurrency_of_writes(manager: ManagerClient):
})
node2 = await manager.server_add()
cql = manager.get_cql()
cql = await manager.get_cql_exclusive(node1)
async with new_test_keyspace(manager, "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") as ks:
table = f"{ks}.t"
await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)")

View File

@@ -151,7 +151,7 @@ async def trigger_tablet_merge(manager, servers, logs):
await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark)
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
async def preapre_cluster_for_incremental_repair(manager, nr_keys = 100 , cmdline = []):
async def prepare_cluster_for_incremental_repair(manager, nr_keys = 100 , cmdline = []):
servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, nr_keys=nr_keys, cmdline=cmdline)
repaired_keys = set(range(0, nr_keys))
unrepaired_keys = set()
@@ -164,7 +164,7 @@ async def preapre_cluster_for_incremental_repair(manager, nr_keys = 100 , cmdlin
@pytest.mark.asyncio
async def test_tablet_repair_sstable_skipped_read_metrics(manager: ManagerClient):
servers, cql, hosts, ks, table_id, logs, _, _, _, token = await preapre_cluster_for_incremental_repair(manager)
servers, cql, hosts, ks, table_id, logs, _, _, _, token = await prepare_cluster_for_incremental_repair(manager)
await insert_keys(cql, ks, 0, 100)
@@ -274,7 +274,7 @@ async def test_tablet_incremental_repair_error(manager: ManagerClient):
async def do_tablet_incremental_repair_and_ops(manager: ManagerClient, ops: str):
nr_keys = 100
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys, cmdline=['--logger-log-level', 'compaction=debug'])
token = -1
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental')
@@ -335,7 +335,7 @@ async def test_tablet_incremental_repair_and_major(manager: ManagerClient):
@pytest.mark.asyncio
async def test_tablet_incremental_repair_and_minor(manager: ManagerClient):
nr_keys = 100
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys)
# Disable autocompaction
for server in servers:
@@ -381,7 +381,7 @@ async def test_tablet_incremental_repair_and_minor(manager: ManagerClient):
async def do_test_tablet_incremental_repair_with_split_and_merge(manager, do_split, do_merge):
nr_keys = 100
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys)
# First repair
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 1
@@ -442,7 +442,7 @@ async def test_tablet_incremental_repair_with_merge(manager: ManagerClient):
async def test_tablet_incremental_repair_existing_and_repair_produced_sstable(manager: ManagerClient):
nr_keys = 100
cmdline = ["--hinted-handoff-enabled", "0"]
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys, cmdline)
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys, cmdline)
await manager.server_stop_gracefully(servers[1].server_id)
@@ -466,7 +466,7 @@ async def test_tablet_incremental_repair_existing_and_repair_produced_sstable(ma
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_incremental_repair_merge_higher_repaired_at_number(manager):
nr_keys = 100
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys)
# First repair
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 1
@@ -507,7 +507,7 @@ async def test_tablet_incremental_repair_merge_higher_repaired_at_number(manager
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_incremental_repair_merge_correct_repaired_at_number_after_merge(manager):
nr_keys = 100
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys)
# First repair
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 1
@@ -541,7 +541,7 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):
nr_keys = 100
# Make sure no data commit log replay after force server stop
cmdline = ['--enable-commitlog', '0']
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys, cmdline)
servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys, cmdline)
# First repair
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 1
@@ -587,7 +587,7 @@ async def test_tablet_incremental_repair_merge_error_in_merge_completion_fiber(m
@pytest.mark.asyncio
async def test_tablet_repair_with_incremental_option(manager: ManagerClient):
servers, cql, hosts, ks, table_id, logs, _, _, _, token = await preapre_cluster_for_incremental_repair(manager)
servers, cql, hosts, ks, table_id, logs, _, _, _, token = await prepare_cluster_for_incremental_repair(manager)
token = -1
sstables_repaired_at = 0
@@ -632,7 +632,7 @@ async def test_tablet_repair_with_incremental_option(manager: ManagerClient):
@pytest.mark.asyncio
async def test_incremental_repair_tablet_time_metrics(manager: ManagerClient):
servers, _, _, ks, _, _, _, _, _, token = await preapre_cluster_for_incremental_repair(manager)
servers, _, _, ks, _, _, _, _, _, token = await prepare_cluster_for_incremental_repair(manager)
time1 = 0
time2 = 0
@@ -820,7 +820,7 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_incremental_repair_table_drop_compaction_group_gone(manager: ManagerClient):
cmdline = ['--logger-log-level', 'repair=debug']
servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await prepare_cluster_for_incremental_repair(manager, cmdline=cmdline)
coord = await get_topology_coordinator(manager)
coord_serv = await find_server_by_host_id(manager, servers, coord)

View File

@@ -20,6 +20,7 @@ from cassandra.query import SimpleStatement
from test.pylib.async_cql import _wrap_future
from test.pylib.manager_client import ManagerClient
from test.pylib.random_tables import RandomTables, TextType, Column
from test.pylib.rest_client import read_barrier
from test.pylib.util import unique_name
from test.cluster.conftest import cluster_con
@@ -403,6 +404,7 @@ async def test_arbiter_dc_rf_rack_valid_keyspaces(manager: ManagerClient):
for task in [*valid_keyspaces, *invalid_keyspaces]:
_ = tg.create_task(task)
@pytest.mark.asyncio
async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager: ManagerClient):
"""
This test verifies that starting a Scylla node fails when there's an RF-rack-invalid keyspace.
@@ -464,22 +466,50 @@ async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager:
for rfs, tablets in valid_keyspaces:
_ = tg.create_task(create_keyspace(rfs, tablets))
await manager.server_stop_gracefully(s1.server_id)
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
# Precondition: s1 has rf_rack_valid_keyspaces set to false.
# Postcondition: s1 still has rf_rack_valid_keyspaces set to false.
async def try_fail(rfs: List[int], dc: str, rf: int, rack_count: int):
running_servers = await manager.running_servers()
should_start = s1.server_id not in [server.server_id for server in running_servers]
if should_start:
await manager.server_start(s1.server_id)
ks = await create_keyspace(rfs, True)
# We need to wait for the new schema to propagate.
# Otherwise, it's not clear when the mutation
# corresponding to the created keyspace will
# arrive at server 1.
# It could happen only after the node performs
# the check upon start-up, effectively leading
# to a successful start-up, which we don't want.
# For more context, see issue: SCYLLADB-1137.
await read_barrier(manager.api, s1.ip_addr)
await manager.server_stop_gracefully(s1.server_id)
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
err = f"The keyspace '{ks}' is required to be RF-rack-valid. " \
f"That condition is violated for DC '{dc}': RF={rf} vs. rack count={rack_count}."
_ = await manager.server_start(s1.server_id, expected_error=err)
await manager.server_start(s1.server_id, expected_error=err)
await cql.run_async(f"DROP KEYSPACE {ks}")
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "false")
# Test RF-rack-invalid keyspaces.
await try_fail([2, 0], "dc1", 2, 3)
await try_fail([3, 2], "dc2", 2, 1)
await try_fail([4, 1], "dc1", 4, 3)
_ = await manager.server_start(s1.server_id)
# We need to perform a read barrier on the node to make
# sure that it processes the last DROP KEYSPACE.
# Otherwise, the node could think the RF-rack-invalid
# keyspace still exists.
await manager.server_start(s1.server_id)
await read_barrier(manager.api, s1.ip_addr)
await manager.server_stop_gracefully(s1.server_id)
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
await manager.server_start(s1.server_id)
@pytest.mark.asyncio
async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces_but_not_enforced(manager: ManagerClient):

View File

@@ -23,10 +23,25 @@ from test.cluster.object_store.conftest import format_tuples
from test.cluster.object_store.test_backup import topo, take_snapshot, do_test_streaming_scopes
from test.cluster.util import new_test_keyspace
from test.pylib.rest_client import read_barrier
from test.pylib.util import unique_name
from test.pylib.util import unique_name, wait_for
logger = logging.getLogger(__name__)
async def wait_for_upload_dir_empty(upload_dir, timeout=30):
'''
Wait until the upload directory is empty with a timeout.
SSTable unlinking is asynchronous and in rare situations, it can happen
that not all sstables are deleted from the upload dir immediately after refresh is done.
'''
deadline = time.time() + timeout
async def check_empty():
files = os.listdir(upload_dir)
if not files:
return True
return None
await wait_for(check_empty, deadline, period=0.5)
class SSTablesOnLocalStorage:
def __init__(self):
self.tmpdir = f'tmpbackup-{str(uuid.uuid4())}'
@@ -153,7 +168,8 @@ async def test_refresh_deletes_uploaded_sstables(manager: ManagerClient):
for s in servers:
cf_dir = dirs[s.server_id]["cf_dir"]
files = os.listdir(os.path.join(cf_dir, 'upload'))
assert files == [], f'Upload dir not empty on server {s.server_id}: {files}'
upload_dir = os.path.join(cf_dir, 'upload')
assert os.path.exists(upload_dir)
await wait_for_upload_dir_empty(upload_dir)
shutil.rmtree(tmpbackup)

View File

@@ -11,7 +11,6 @@ from test.cluster.util import check_token_ring_and_group0_consistency, new_test_
import pytest
import asyncio
import logging
import time
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@@ -53,7 +52,7 @@ async def test_cleanup_stop(manager: ManagerClient):
await s0_log.wait_for('sstable_cleanup_wait: waiting', from_mark=s0_mark)
stop_cleanup = asyncio.create_task(manager.api.stop_compaction(servers[0].ip_addr, "CLEANUP"))
time.sleep(1)
await asyncio.sleep(1)
await manager.api.message_injection(servers[0].ip_addr, "sstable_cleanup_wait")
await stop_cleanup

View File

@@ -2279,7 +2279,7 @@ async def test_split_stopped_on_shutdown(manager: ManagerClient):
shutdown_task = asyncio.create_task(manager.server_stop_gracefully(server.server_id))
await log.wait_for('Stopping.*ongoing compactions')
await log.wait_for('Stopping.*ongoing compactions', from_mark=log_mark)
await manager.api.message_injection(server.ip_addr, "splitting_mutation_writer_switch_wait")
await log.wait_for('storage_service_drain_wait: waiting', from_mark=log_mark)

View File

@@ -196,7 +196,7 @@ async def test_group0_tombstone_gc(manager: ManagerClient):
tombstone_mark = datetime.now(timezone.utc)
# test #2: the tombstones are not cleaned up when one node is down
with pytest.raises(AssertionError, match="Deadline exceeded"):
with pytest.raises(AssertionError, match="timed out"):
# waiting for shorter time (5s normally enough for a successful case, we expect the timeout here)
await verify_tombstone_gc(tombstone_mark, timeout=5)
@@ -249,7 +249,7 @@ async def test_group0_tombstone_gc(manager: ManagerClient):
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
# test #4a: the tombstones are not cleaned up after both live nodes join the new group0
with pytest.raises(AssertionError, match="Deadline exceeded"):
with pytest.raises(AssertionError, match="timed out"):
await verify_tombstone_gc(tombstone_mark, timeout=5)
await manager.remove_node(servers[0].server_id, down_server.server_id)

View File

@@ -165,7 +165,7 @@ async def wait_for_cdc_generations_publishing(cql: Session, hosts: list[Host], d
unpublished_generations = topo_res[0].unpublished_cdc_generations
return unpublished_generations is None or len(unpublished_generations) == 0 or None
await wait_for(all_generations_published, deadline=deadline, period=1.0)
await wait_for(all_generations_published, deadline=deadline)
async def check_system_topology_and_cdc_generations_v3_consistency(manager: ManagerClient, live_hosts: list[Host], cqls: Optional[list[Session]] = None, ignored_hosts: list[Host] = []):
@@ -470,6 +470,17 @@ async def new_materialized_view(manager: ManagerClient, table, select, pk, where
await manager.get_cql().run_async(f"DROP MATERIALIZED VIEW {mv}")
async def keyspace_has_tablets(manager: ManagerClient, keyspace: str) -> bool:
"""
Checks whether the given keyspace uses tablets.
Adapted from its counterpart in the cqlpy test: cqlpy/util.py::keyspace_has_tablets.
"""
cql = manager.get_cql()
rows_iter = await cql.run_async(f"SELECT * FROM system_schema.scylla_keyspaces WHERE keyspace_name='{keyspace}'")
rows = list(rows_iter)
return len(rows) > 0 and getattr(rows[0], "initial_tablets", None) is not None
async def get_raft_log_size(cql, host) -> int:
query = "select count(\"index\") from system.raft"
return (await cql.run_async(query, host=host))[0][0]

View File

@@ -271,10 +271,21 @@ future<std::tuple<tests::proc::process_fixture, int>> tests::proc::start_docker_
// arbitrary timeout of 120s for the server to make some output. Very generous.
// but since we (maybe) run docker, and might need to pull image, this can take
// some time if we're unlucky.
co_await with_timeout(std::chrono::steady_clock::now() + 120s, when_all(std::move(out_fut), std::move(err_fut)));
} catch (in_use&) {
retry = true;
p = std::current_exception();
auto [f1, f2] = co_await with_timeout(std::chrono::steady_clock::now() + 120s, when_all(std::move(out_fut), std::move(err_fut)));
for (auto* f : {&f1, &f2}) {
if (f->failed()) {
try {
f->get();
} catch (in_use&) {
retry = true;
p = std::current_exception();
} catch (...) {
if (!p) {
p = std::current_exception();
}
}
}
}
} catch (...) {
p = std::current_exception();
}

View File

@@ -6,6 +6,7 @@
import os
import random
import socket
import subprocess
import sys
import time
@@ -59,7 +60,14 @@ async def server_address(request, testpy_test: None|Test):
ip = await testpy_test.suite.hosts.lease_host()
else:
ip = f"127.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}"
port = random.randint(10000, 65535)
# Ask the OS to pick a free port by binding to port 0. This avoids
# collisions with ports still in TIME_WAIT from a previous test module
# that used the same IP. SO_REUSEADDR is set on the probe socket so it
# can reclaim a TIME_WAIT port itself
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind((ip, 0))
port = s.getsockname()[1]
yield ServerAddress(ip, port)
if testpy_test is not None:
await testpy_test.suite.hosts.release_host(ip)

View File

@@ -257,7 +257,7 @@ async def run_server(ip, port):
runner = aiohttp.web.AppRunner(app)
await runner.setup()
site = aiohttp.web.TCPSite(runner, ip, port)
site = aiohttp.web.TCPSite(runner, ip, port, reuse_address=True, reuse_port=True)
await site.start()
try:

View File

@@ -4,14 +4,18 @@
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import shutil
import itertools
import asyncio
import pathlib
import re
import os
import subprocess
from typing import Callable
logger = logging.getLogger("DockerizedServer")
class DockerizedServer:
"""class for running an external dockerized service image, typically mock server"""
# pylint: disable=too-many-instance-attributes
@@ -37,6 +41,7 @@ class DockerizedServer:
self.port = None
self.proc = None
self.service_port = port
self.echo_thread = None
async def start(self):
"""Starts docker image on a random port"""
@@ -45,77 +50,107 @@ class DockerizedServer:
if exe is not None)).resolve()
sid = f"{os.getpid()}-{DockerizedServer.newid()}"
name = f'{self.logfilenamebase}-{sid}'
logfilename = (pathlib.Path(self.tmpdir) / name).with_suffix(".log")
self.logfile = logfilename.open("wb")
docker_args = self.docker_args(self.host, self.service_port)
image_args = self.image_args(self.host, self.service_port)
args = [exe, "run", "--name", name, "--rm" ]
if self.service_port is None:
args = args + ["-P"]
else:
args = args + ["-p", str(self.service_port)]
args = args + docker_args + [self.image] + image_args
# This seems weird, using the blocking IO subprocess.
# However, we want to use a pipe reader so we can push the
# output into the test log (because we are bad at propagating
# log files etc from CI)
# But the pipe reader needs to read until EOF, otherwise the
# docker process will eventually hang. So we can't await a
# coroutine.
# We _can_, sort of, use pool.create_task(...) to send a coro
# to the background, and use a signal for waiting, like here,
# thus ensuring the coro runs forever, sort of... However,
# this currently breaks, probably due to some part of the
# machinery/tests that don't async fully, causing us to not
# process the log, and thus hand/fail, bla bla.
# The solution is to make the process synced, and use a
# background thread (execution pool) for the processing.
# This way we know the pipe reader will not suddenly get
# blocked at inconvinient times.
proc = subprocess.Popen(args, stderr=subprocess.PIPE)
loop = asyncio.get_running_loop()
ready_fut = loop.create_future()
def process_io():
f = ready_fut
try:
while True:
data = proc.stderr.readline()
if not data:
if f:
loop.call_soon_threadsafe(f.set_exception, RuntimeError("Log EOF"))
logger.debug("EOF received")
break
line = data.decode()
self.logfile.write(data)
logger.debug(line)
if f and self.is_success_line(line, self.service_port):
logger.info('Got start message: %s', line)
loop.call_soon_threadsafe(f.set_result, True)
f = None
if f and self.is_failure_line(line, self.service_port):
logger.info('Got fail message: %s', line)
loop.call_soon_threadsafe(f.set_result, False)
f = None
except Exception as e:
logger.error("Exception in log processing: %s", e)
if f:
loop.call_soon_threadsafe(f.set_exception, e)
self.echo_thread = loop.run_in_executor(None, process_io)
ok = await ready_fut
if not ok:
self.logfile.close()
proc.kill()
proc.wait()
raise RuntimeError("Could not parse expected launch message from container")
check_proc = await asyncio.create_subprocess_exec(exe
, *["container", "port", name]
, stdout=asyncio.subprocess.PIPE
)
while True:
logfilename = (pathlib.Path(self.tmpdir) / name).with_suffix(".log")
self.logfile = logfilename.open("wb")
data = await check_proc.stdout.readline()
if not data:
break
s = data.decode()
m = re.search(r"\d+\/\w+ -> [\w+\.\[\]\:]+:(\d+)", s)
if m:
self.port = int(m.group(1))
docker_args = self.docker_args(self.host, self.service_port)
image_args = self.image_args(self.host, self.service_port)
args = ["run", "--name", name, "--rm" ]
if self.service_port is None:
args = args + ["-P"]
else:
args = args + ["-p", str(self.service_port)]
args = args + docker_args + [self.image] + image_args
proc = await asyncio.create_subprocess_exec(exe, *args, stderr=self.logfile)
failed = False
# In any sane world we would just pipe stderr to a pipe and launch a background
# task to just readline from there to both check the start message as well as
# add it to the log (preferrably via logger).
# This works fine when doing this in a standalone python script.
# However, for some reason, when run in a pytest fixture, the pipe will fill up,
# without or reader waking up and doing anyhing, and for any test longer than very
# short, we will fill the stderr buffer and hang.
# I cannot figure out how to get around this, so we workaround it
# instead by directing stderr to a log file, and simply repeatedly
# try to read the info from this file until we are happy.
async with asyncio.timeout(120):
done = False
while not done and not failed:
with logfilename.open("r") as f:
for line in f:
if self.is_success_line(line, self.service_port):
print(f'Got start message: {line}')
done = True
break
if self.is_failure_line(line, self.service_port):
print(f'Got fail message: {line}')
failed = True
break
if failed:
self.logfile.close()
await proc.wait()
continue
check_proc = await asyncio.create_subprocess_exec(exe
, *["container", "port", name]
, stdout=asyncio.subprocess.PIPE
)
while True:
data = await check_proc.stdout.readline()
if not data:
break
s = data.decode()
m = re.search(r"\d+\/\w+ -> [\w+\.\[\]\:]+:(\d+)", s)
if m:
self.port = int(m.group(1))
await check_proc.wait()
if not self.port:
proc.kill()
raise RuntimeError("Could not query port from container")
self.proc = proc
break
await check_proc.wait()
if not self.port:
proc.kill()
proc.wait()
raise RuntimeError("Could not query port from container")
self.proc = proc
async def stop(self):
"""Stops docker image"""
if self.proc:
logger.debug("Stopping docker process")
self.proc.terminate()
await self.proc.wait()
self.proc.wait()
self.proc = None
if self.echo_thread:
logger.debug("Waiting for IO thread")
await self.echo_thread
self.echo_thread = None
if self.logfile:
logger.debug("Closing log file")
self.logfile.close()
self.logfile = None

View File

@@ -747,6 +747,8 @@ class ScyllaServer:
self.notify_socket = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC)
self.notify_socket.bind(str(self.notify_socket_path))
self._received_serving = False
loop = asyncio.get_running_loop()
def poll_status(s: socket.socket, f: asyncio.Future, logger: Union[logging.Logger, logging.LoggerAdapter]):
# Try to read all available messages from the socket
while True:
@@ -756,7 +758,7 @@ class ScyllaServer:
message = data.decode('utf-8', errors='replace')
if 'STATUS=serving' in message:
logger.debug("Received sd_notify 'serving' message")
f.set_result(True)
loop.call_soon_threadsafe(f.set_result, True)
return
if 'STATUS=entering maintenance mode' in message:
logger.debug("Receive sd_notify 'entering maintenance mode'")
@@ -766,9 +768,9 @@ class ScyllaServer:
except Exception as e:
logger.debug("Error reading from notify socket: %s", e)
break
f.set_result(False)
loop.call_soon_threadsafe(f.set_result, False)
self.serving_signal = asyncio.get_running_loop().create_future()
self.serving_signal = loop.create_future()
t = threading.Thread(target=poll_status, args=[self.notify_socket, self.serving_signal, self.logger], daemon=True)
t.start()
@@ -892,7 +894,6 @@ class ScyllaServer:
return
await report_error("the node startup failed, but the log file doesn't contain the expected error")
await report_error("failed to start the node")
self.logger.info("Wait me %s expect %s is %s", self.server_id, expected_server_up_state, server_up_state)
if await self.try_get_host_id(api):
if server_up_state == ServerUpState.PROCESS_STARTED:
server_up_state = ServerUpState.HOST_ID_QUERIED

View File

@@ -56,15 +56,39 @@ def unique_name(unique_name_prefix = 'test_'):
async def wait_for(
pred: Callable[[], Awaitable[Optional[T]]],
deadline: float,
period: float = 1,
period: float = 0.1,
before_retry: Optional[Callable[[], Any]] = None,
backoff_factor: float = 1,
max_period: float = None) -> T:
backoff_factor: float = 1.5,
max_period: float = 1.0,
label: Optional[str] = None) -> T:
tag = label or getattr(pred, '__name__', 'unlabeled')
start = time.time()
retries = 0
last_exception: Exception | None = None
while True:
assert(time.time() < deadline), "Deadline exceeded, failing test."
res = await pred()
elapsed = time.time() - start
if time.time() >= deadline:
timeout_msg = f"wait_for({tag}) timed out after {elapsed:.2f}s ({retries} retries)"
if last_exception is not None:
timeout_msg += (
f"; last exception: {type(last_exception).__name__}: {last_exception}"
)
raise AssertionError(timeout_msg) from last_exception
raise AssertionError(timeout_msg)
try:
res = await pred()
last_exception = None
except Exception as exc:
res = None
last_exception = exc
if res is not None:
if retries > 0:
logger.debug(f"wait_for({tag}) completed "
f"in {elapsed:.2f}s ({retries} retries)")
return res
retries += 1
await asyncio.sleep(period)
period *= backoff_factor
if max_period is not None:
@@ -273,14 +297,14 @@ async def wait_for_view_v1(cql: Session, name: str, node_count: int, timeout: in
done = await cql.run_async(f"SELECT COUNT(*) FROM system_distributed.view_build_status WHERE status = 'SUCCESS' AND view_name = '{name}' ALLOW FILTERING")
return done[0][0] == node_count or None
deadline = time.time() + timeout
await wait_for(view_is_built, deadline)
await wait_for(view_is_built, deadline, label=f"view_v1_{name}")
async def wait_for_view(cql: Session, name: str, node_count: int, timeout: int = 120):
async def view_is_built():
done = await cql.run_async(f"SELECT COUNT(*) FROM system.view_build_status_v2 WHERE status = 'SUCCESS' AND view_name = '{name}' ALLOW FILTERING")
return done[0][0] == node_count or None
deadline = time.time() + timeout
await wait_for(view_is_built, deadline)
await wait_for(view_is_built, deadline, label=f"view_{name}")
async def wait_for_first_completed(coros: list[Coroutine], timeout: int|None = None):

View File

@@ -350,6 +350,7 @@ utils::gcp::storage::client::impl::send_with_retry(const std::string& path, cons
co_await authorize(req, scope);
}
auto content = co_await util::read_entire_stream_contiguous(_in);
auto error_msg = get_gcp_error_message(std::string_view(content));
gcp_storage.debug("Got unexpected response status: {}, content: {}", rep._status, content);
co_await coroutine::return_exception_ptr(std::make_exception_ptr(httpd::unexpected_status_error(rep._status)));
}
@@ -628,7 +629,7 @@ future<> utils::gcp::storage::client::object_data_sink::remove_upload() {
co_return;
}
gcp_storage.debug("Removing incomplete upload {}:{} ({})", _bucket, _object_name, _session_path);
gcp_storage.debug("Removing incomplete upload {}:{} ()", _bucket, _object_name, _session_path);
auto res = co_await _impl->send_with_retry(_session_path
, GCP_OBJECT_SCOPE_READ_WRITE

View File

@@ -1583,7 +1583,7 @@ void reclaim_timer::report() const noexcept {
if (_memory_released > 0) {
auto bytes_per_second =
static_cast<float>(_memory_released) / std::chrono::duration_cast<std::chrono::duration<float>>(_duration).count();
timing_logger.log(info_level, "- reclamation rate = {:.3f} MiB/s", bytes_per_second / MiB);
timing_logger.log(info_level, "- reclamation rate = {} MiB/s", format("{:.3f}", bytes_per_second / MiB));
}
if (_debug_enabled) {