Compare commits

..

2 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
9e806cb3f7 Fix critical bugs and issues found in alternator code review
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2026-01-29 22:54:57 +00:00
copilot-swe-agent[bot]
f267af38bd Initial plan 2026-01-29 22:49:31 +00:00
195 changed files with 3257 additions and 3565 deletions

View File

@@ -1,22 +0,0 @@
# Mirrors pull-request milestone changes into Jira by delegating to the
# reusable workflows hosted in scylladb/github-automation.
name: Sync Jira Based on PR Milestone Events

on:
  pull_request_target:
    types:
      - milestoned
      - demilestoned

# Read-only token: this workflow only inspects the pull request.
permissions:
  contents: read
  pull-requests: read

jobs:
  # Runs when a milestone is assigned to the pull request.
  jira-sync-milestone-set:
    if: github.event.action == 'milestoned'
    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_set.yml@main
    secrets:
      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

  # Runs when a milestone is removed from the pull request.
  jira-sync-milestone-removed:
    if: github.event.action == 'demilestoned'
    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_removed.yml@main
    secrets:
      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,4 +1,4 @@
name: Call Jira release creation for new milestone
name: Call Jira release creation for new milestone
on:
milestone:
@@ -9,6 +9,6 @@ jobs:
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
with:
# Comma-separated list of Jira project keys
jira_project_keys: "SCYLLADB,CUSTOMER,SMI"
jira_project_keys: "SCYLLADB,CUSTOMER"
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,62 +0,0 @@
# Comments on and closes issues whose author exposes a public
# @scylladb.com email address, directing them to Jira instead.
name: Close issues created by Scylla associates

on:
  issues:
    types: [opened, reopened]

# The job needs to comment on and close issues.
permissions:
  issues: write

jobs:
  comment-and-close:
    runs-on: ubuntu-latest
    steps:
      - name: Comment and close if author email is scylladb.com
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const issue = context.payload.issue;

            // Look up the issue's author rather than context.actor: on a
            // "reopened" event the actor is whoever reopened the issue, which
            // may differ from the person who created it.
            const author = issue.user.login;

            // Get user data (only public email is available)
            const { data: user } = await github.rest.users.getByUsername({
              username: author,
            });
            const email = user.email || "";
            console.log(`Issue author: ${author}, public email: ${email || "<none>"}`);

            // Only continue if email exists and ends with @scylladb.com
            if (!email || !email.toLowerCase().endsWith("@scylladb.com")) {
              console.log("User is not a scylladb.com email (or email not public); skipping.");
              return;
            }

            const owner = context.repo.owner;
            const repo = context.repo.repo;
            const issue_number = issue.number;
            const body = "Issues in this repository are closed automatically. Scylla associates should use Jira to manage issues.\nPlease move this issue to Jira https://scylladb.atlassian.net/jira/software/c/projects/SCYLLADB/list";

            // Add the comment
            await github.rest.issues.createComment({
              owner,
              repo,
              issue_number,
              body,
            });
            console.log(`Comment added to #${issue_number}`);

            // Close the issue
            await github.rest.issues.update({
              owner,
              repo,
              issue_number,
              state: "closed",
              state_reason: "not_planned"
            });
            console.log(`Issue #${issue_number} closed.`);

View File

@@ -9,34 +9,16 @@ on:
jobs:
trigger-jenkins:
if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
runs-on: ubuntu-latest
steps:
- name: Validate Comment Trigger
  if: github.event_name == 'issue_comment'
  id: verify_comment
  shell: bash
  env:
    # The comment text is untrusted. Hand it to the script through the
    # environment instead of expanding it into the script source, where a
    # comment containing a line "EOF" could end the heredoc and run
    # arbitrary commands on the runner.
    COMMENT_BODY: ${{ github.event.comment.body }}
  run: |
    # Drop quoted lines (optional whitespace followed by '>') so that text
    # inside a quoted reply is not matched below. grep exits 1 when every
    # line is dropped; "|| true" keeps that from aborting the step, since
    # "shell: bash" runs with -e and pipefail.
    CLEAN_BODY=$(printf '%s\n' "$COMMENT_BODY" | grep -v '^[[:space:]]*>' || true)
    # Here-strings avoid a pipeline, so an early exit of "grep -q" cannot
    # surface as a pipefail error.
    if grep -qi '@scylladbbot' <<< "$CLEAN_BODY" && grep -qi 'trigger-ci' <<< "$CLEAN_BODY"; then
      echo "trigger=true" >> "$GITHUB_OUTPUT"
    else
      echo "trigger=false" >> "$GITHUB_OUTPUT"
    fi
- name: Trigger Scylla-CI-Route Jenkins Job
if: github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true'
env:
JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
JENKINS_URL: "https://jenkins.scylladb.com"
run: |
PR_NUMBER=${{ github.event.issue.number || github.event.pull_request.number }}
PR_NUMBER=${{ github.event.issue.number }}
PR_REPO_NAME=${{ github.event.repository.full_name }}
curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v

View File

@@ -1,197 +0,0 @@
# Implementation Summary: Error Injection Event Stream
## Problem Statement
Tests using error injections had to rely on log parsing to detect when injection points were hit:
```python
mark, _ = await log.wait_for('topology_coordinator_pause_before_processing_backlog: waiting', from_mark=mark)
```
This approach was:
- **Slow**: Required waiting for log flushes and buffer processing
- **Unreliable**: Regex matching could fail or match wrong lines
- **Fragile**: Changes to log messages broke tests
## Solution
Implemented a Server-Sent Events (SSE) API that sends real-time notifications when error injection points are triggered.
## Implementation
### 1. Backend Event System (`utils/error_injection.hh`)
**Added**:
- `error_injection_event_callback` type for event notifications
- `_event_callbacks` vector to store registered callbacks
- `notify_event()` method called by all `inject()` methods
- `register_event_callback()` / `clear_event_callbacks()` methods
- Cross-shard registration via `register_event_callback_on_all()`
**Modified**:
- All `inject()` methods now call `notify_event()` after logging
- Changed log level from DEBUG to INFO for better visibility
- Both enabled/disabled template specializations updated
### 2. SSE API Endpoint (`api/error_injection.cc`)
**Added**:
- `GET /v2/error_injection/events` endpoint
- Streams events in SSE format: `data: {"injection":"name","type":"handler","shard":0}\n\n`
- Cross-shard event collection using `foreign_ptr` and `smp::submit_to()`
- Automatic cleanup on client disconnect
**Architecture**:
1. Client connects → queue created on handler shard
2. Callbacks registered on ALL shards
3. When injection fires → event sent via `smp::submit_to()` to queue
4. Queue → SSE stream → client
5. Client disconnect → callbacks cleared on all shards
### 3. Python Client (`test/pylib/rest_client.py`)
**Added**:
- `InjectionEventStream` class:
- `wait_for_injection(name, timeout)` - wait for specific injection
- Background task reads SSE stream
- Queue-based event delivery
- `injection_event_stream()` context manager for lifecycle
- Full async/await support
**Usage**:
```python
async with injection_event_stream(server_ip) as stream:
await api.enable_injection(server_ip, "my_injection", one_shot=True)
# ... trigger operation ...
event = await stream.wait_for_injection("my_injection", timeout=30)
```
### 4. Tests (`test/cluster/test_error_injection_events.py`)
**Added**:
- `test_injection_event_stream_basic` - basic functionality
- `test_injection_event_stream_multiple_injections` - multiple tracking
- `test_injection_event_vs_log_parsing_comparison` - old vs new
### 5. Documentation (`docs/dev/error_injection_events.md`)
Complete documentation covering:
- Architecture and design
- Usage examples
- Migration guide from log parsing
- Thread safety and cleanup
## Key Design Decisions
### Why SSE instead of WebSocket?
- **Unidirectional**: We only need server → client events
- **Simpler**: Built on HTTP, easier to implement
- **Standard**: Well-supported in Python (aiohttp)
- **Sufficient**: No need for bidirectional communication
### Why Thread-Local Callbacks?
- **Performance**: No cross-shard synchronization overhead
- **Simplicity**: Each shard independent
- **Safety**: No shared mutable state
- Event delivery handled by `smp::submit_to()`
### Why Info Level Logging?
- **Visibility**: Events should be visible in logs AND via SSE
- **Debugging**: Easier to correlate events with log context
- **Consistency**: Matches importance of injection triggers
## Benefits
### Performance
- **Instant notification**: No waiting for log flushes
- **No regex matching**: Direct event delivery
- **Parallel processing**: Events from all shards
### Reliability
- **Type-safe**: Structured JSON events
- **No missed events**: Queue-based delivery
- **Automatic cleanup**: RAII ensures no leaks
### Developer Experience
- **Clean API**: Simple async/await pattern
- **Better errors**: Timeout on specific injection name
- **Metadata**: Event includes type and shard ID
- **Backward compatible**: Existing tests unchanged
## Testing
### Security
✅ CodeQL scan: **0 alerts** (Python)
### Validation Needed
Due to build environment limitations, the following validations are recommended:
- [ ] Build C++ code in dev mode
- [ ] Run example tests: `./test.py --mode=dev test/cluster/test_error_injection_events.py`
- [ ] Verify SSE connection lifecycle (connect, disconnect, reconnect)
- [ ] Test with multiple concurrent clients
- [ ] Verify cross-shard event delivery
- [ ] Performance comparison with log parsing
## Files Changed
```
api/api-doc/error_injection.json | 15 +++
api/error_injection.cc | 82 ++++++++++++++
docs/dev/error_injection_events.md | 132 +++++++++++++++++++++
test/cluster/test_error_injection_events.py | 140 ++++++++++++++++++++++
test/pylib/rest_client.py | 144 ++++++++++++++++++++++
utils/error_injection.hh | 81 +++++++++++++
6 files changed, 587 insertions(+), 7 deletions(-)
```
## Migration Guide
### Old Approach
```python
log = await manager.server_open_log(server.server_id)
mark = await log.mark()
await manager.api.enable_injection(server.ip_addr, "my_injection", one_shot=True)
# ... trigger operation ...
mark, _ = await log.wait_for('my_injection: waiting', from_mark=mark)
```
### New Approach
```python
async with injection_event_stream(server.ip_addr) as stream:
await manager.api.enable_injection(server.ip_addr, "my_injection", one_shot=True)
# ... trigger operation ...
event = await stream.wait_for_injection("my_injection", timeout=30)
```
### Backward Compatibility
- ✅ All existing log-based tests continue to work
- ✅ Logging still happens (now at INFO level)
- ✅ No breaking changes to existing APIs
- ✅ SSE is opt-in for new tests
## Future Enhancements
Possible improvements:
1. Server-side filtering by injection name (query parameter)
2. Include injection parameters in events
3. Add event timestamps
4. Event history/replay support
5. Multiple concurrent SSE clients per server
6. WebSocket support if bidirectional communication needed
## Conclusion
This implementation successfully addresses the problem statement:
- ✅ Eliminates log parsing
- ✅ Faster tests
- ✅ More reliable detection
- ✅ Clean API
- ✅ Backward compatible
- ✅ Well documented
- ✅ Security validated
The solution follows ScyllaDB best practices:
- RAII for resource management
- Seastar async patterns (coroutines, futures)
- Cross-shard communication via `smp::submit_to()`
- Thread-local state, no locks
- Comprehensive error handling

View File

@@ -244,7 +244,10 @@ static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {
// Check if two JSON-encoded values match with the CONTAINS relation
bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query) {
if (!v1) {
if (!v1 || !v1->IsObject() || v1->MemberCount() == 0) {
return false;
}
if (!v2.IsObject() || v2.MemberCount() == 0) {
return false;
}
const auto& kv1 = *v1->MemberBegin();
@@ -618,7 +621,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
// Check if the existing values of the item (previous_item) match the
// conditions given by the Expected and ConditionalOperator parameters
// (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
// This function can throw a ValidationException API error if there
// This function can throw an ValidationException API error if there
// are errors in the format of the condition itself.
bool verify_expected(const rjson::value& req, const rjson::value* previous_item) {
const rjson::value* expected = rjson::find(req, "Expected");

View File

@@ -53,7 +53,9 @@ void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjso
}
static uint64_t calculate_half_units(uint64_t unit_block_size, uint64_t total_bytes, bool is_quorum) {
uint64_t half_units = (total_bytes + unit_block_size -1) / unit_block_size; //divide by unit_block_size and round up
// Avoid potential integer overflow when total_bytes is close to UINT64_MAX
// by using division with modulo instead of addition before division
uint64_t half_units = total_bytes / unit_block_size + (total_bytes % unit_block_size != 0 ? 1 : 0);
if (is_quorum) {
half_units *= 2;

View File

@@ -237,7 +237,7 @@ static void validate_is_object(const rjson::value& value, const char* caller) {
}
// This function assumes the given value is an object and returns requested member value.
// If it is not possible, an api_error::validation is thrown.
// If it is not possible an api_error::validation is thrown.
static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
validate_is_object(obj, caller);
const rjson::value* ret = rjson::find(obj, member_name);
@@ -249,7 +249,7 @@ static const rjson::value& get_member(const rjson::value& obj, const char* membe
// This function assumes the given value is an object with a single member, and returns this member.
// In case the requirements are not met, an api_error::validation is thrown.
// In case the requirements are not met an api_error::validation is thrown.
static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
if (!v.IsObject() || v.MemberCount() != 1) {
throw api_error::validation(format("{}: expected an object with a single member.", caller));
@@ -682,7 +682,7 @@ static std::optional<int> get_int_attribute(const rjson::value& value, std::stri
}
// Sets a KeySchema object inside the given JSON parent describing the key
// attributes of the given schema as being either HASH or RANGE keys.
// attributes of the the given schema as being either HASH or RANGE keys.
// Additionally, adds to a given map mappings between the key attribute
// names and their type (as a DynamoDB type string).
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
@@ -834,11 +834,13 @@ future<> executor::fill_table_size(rjson::value &table_description, schema_ptr s
total_size = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
const auto expiry = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
// Note: we don't care when the notification of other shards will finish, as long as it will be done
// it's possible to get into race condition (next DescribeTable comes to other shard, that new shard doesn't have
// the size yet, so it will calculate it again) - this is not a problem, because it will call cache_newly_calculated_size_on_all_shards
// with expiry, which is extremely unlikely to be exactly the same as the previous one, all shards will keep the size coming with expiry that is further into the future.
// In case of the same expiry, some shards will have different size, which means DescribeTable will return different values depending on the shard
// which is also fine, as the specification doesn't give precision guarantees of any kind.
// A race condition is possible: if a DescribeTable request arrives on a different shard before
// that shard receives the cached size, it will recalculate independently. This is acceptable because:
// 1. Both calculations will cache their results with an expiry time
// 2. Expiry times are unlikely to be identical, so eventually all shards converge to the most recent value
// 3. Even if expiry times match, different shards may briefly return different table sizes
// 4. This temporary inconsistency is acceptable per DynamoDB specification, which doesn't guarantee
// exact precision for DescribeTable size information
co_await cache_newly_calculated_size_on_all_shards(schema, total_size, expiry);
}
}
@@ -916,7 +918,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
sstring index_name = cf_name.substr(delim_it + 1);
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
// Add index's KeySchema and collect types for AttributeDefinitions:
// Add indexes's KeySchema and collect types for AttributeDefinitions:
executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
// Add projection type
rjson::value projection = rjson::empty_object();
@@ -2435,7 +2437,7 @@ std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table
// case, this function simply won't be called for this attribute.)
//
// This function checks if the given attribute update is an update to some
// GSI's key, and if the value is unsuitable, an api_error::validation is
// GSI's key, and if the value is unsuitable, a api_error::validation is
// thrown. The checking here is similar to the checking done in
// get_key_from_typed_value() for the base table's key columns.
//
@@ -3548,7 +3550,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
return true;
}
// Add a path to an attribute_path_map. Throws a validation error if the path
// Add a path to a attribute_path_map. Throws a validation error if the path
// "overlaps" with one already in the filter (one is a sub-path of the other)
// or "conflicts" with it (both a member and index is requested).
template<typename T>

View File

@@ -50,7 +50,7 @@ public:
_operators.emplace_back(i);
check_depth_limit();
}
void add_dot(std::string name) {
void add_dot(std::string(name)) {
_operators.emplace_back(std::move(name));
check_depth_limit();
}
@@ -85,7 +85,7 @@ struct constant {
}
};
// "value" is a value used in the right hand side of an assignment
// "value" is is a value used in the right hand side of an assignment
// expression, "SET a = ...". It can be a constant (a reference to a value
// included in the request, e.g., ":val"), a path to an attribute from the
// existing item (e.g., "a.b[3].c"), or a function of other such values.
@@ -205,7 +205,7 @@ public:
// The supported primitive conditions are:
// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
// v1 and v2 are values - from the item (an attribute path), the query
// (a ":val" reference), or a function of the above (only the size()
// (a ":val" reference), or a function of the the above (only the size()
// function is supported).
// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
// 3. N-ary operator - v1 IN ( v2, v3, ... )

View File

@@ -55,7 +55,7 @@ partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema);
// If v encodes a number (i.e., it is a {"N": [...]}), returns an object representing it. Otherwise,
// If v encodes a number (i.e., it is a {"N": [...]}, returns an object representing it. Otherwise,
// raises ValidationException with diagnostic.
big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);

View File

@@ -141,7 +141,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta
// expiration_service is a sharded service responsible for cleaning up expired
// items in all tables with per-item expiration enabled. Currently, this means
// Alternator tables with TTL configured via an UpdateTimeToLive request.
// Alternator tables with TTL configured via a UpdateTimeToLive request.
//
// Here is a brief overview of how the expiration service works:
//
@@ -593,7 +593,7 @@ static future<> scan_table_ranges(
if (retries >= 10) {
// Don't get stuck forever asking the same page, maybe there's
// a bug or a real problem in several replicas. Give up on
// this scan and retry the scan from a random position later,
// this scan an retry the scan from a random position later,
// in the next scan period.
throw runtime_exception("scanner thread failed after too many timeouts for the same page");
}

View File

@@ -30,7 +30,7 @@ namespace alternator {
// expiration_service is a sharded service responsible for cleaning up expired
// items in all tables with per-item expiration enabled. Currently, this means
// Alternator tables with TTL configured via an UpdateTimeToLive request.
// Alternator tables with TTL configured via a UpdateTimeToLeave request.
class expiration_service final : public seastar::peering_sharded_service<expiration_service> {
public:
// Object holding per-shard statistics related to the expiration service.
@@ -52,7 +52,7 @@ private:
data_dictionary::database _db;
service::storage_proxy& _proxy;
gms::gossiper& _gossiper;
// _end is set by start(), and resolves when the background service
// _end is set by start(), and resolves when the the background service
// started by it ends. To ask the background service to end, _abort_source
// should be triggered. stop() below uses both _abort_source and _end.
std::optional<future<>> _end;

View File

@@ -112,21 +112,6 @@
}
]
},
{
"path":"/v2/error_injection/events",
"operations":[
{
"method":"GET",
"summary":"Subscribe to Server-Sent Events stream of error injection events",
"type":"void",
"nickname":"injection_events",
"produces":[
"text/event-stream"
],
"parameters":[]
}
]
},
{
"path":"/v2/error_injection/disconnect/{ip}",
"operations":[

View File

@@ -13,22 +13,12 @@
#include "utils/rjson.hh"
#include <seastar/core/future-util.hh>
#include <seastar/util/short_streams.hh>
#include <seastar/core/queue.hh>
#include <seastar/core/when_all.hh>
#include <seastar/core/sharded.hh>
namespace api {
using namespace seastar::httpd;
namespace hf = httpd::error_injection_json;
// Structure to hold error injection event data
struct injection_event {
sstring injection_name;
sstring injection_type;
unsigned shard_id;
};
void set_error_injection(http_context& ctx, routes& r) {
hf::enable_injection.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
@@ -111,79 +101,6 @@ void set_error_injection(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(json::json_void());
});
});
// Server-Sent Events endpoint for injection events
// This allows clients to subscribe to real-time injection events instead of log parsing
r.add(operation_type::GET, url("/v2/error_injection/events"), [](std::unique_ptr<request> req) -> future<json::json_return_type> {
// Create a shared foreign_ptr to a queue that will receive events from all shards
// Using a queue on the current shard to collect events
using event_queue_t = seastar::queue<injection_event>;
auto event_queue = make_lw_shared<event_queue_t>();
auto queue_ptr = make_foreign(event_queue);
// Register callback on all shards to send events to our queue
auto& errinj = utils::get_local_injector();
// Capture the current shard ID for event delivery
auto target_shard = this_shard_id();
// Setup event callback that forwards events to the queue on the target shard
// Note: We use shared_ptr wrapper for foreign_ptr to make it copyable
auto callback = [queue_ptr = queue_ptr.copy(), target_shard] (std::string_view name, std::string_view type) {
injection_event evt{
.injection_name = sstring(name),
.injection_type = sstring(type),
.shard_id = this_shard_id()
};
// Send event to the target shard's queue (discard future, fire-and-forget)
(void)smp::submit_to(target_shard, [queue_ptr = queue_ptr.copy(), evt = std::move(evt)] () mutable {
return queue_ptr->push_eventually(std::move(evt));
});
};
// Register the callback on all shards
co_await errinj.register_event_callback_on_all(callback);
// Return a streaming function that sends SSE events
noncopyable_function<future<>(output_stream<char>&&)> stream_func =
[event_queue](output_stream<char>&& os) -> future<> {
auto s = std::move(os);
std::exception_ptr ex;
try {
// Send initial SSE comment to establish connection
co_await s.write(": connected\n\n");
co_await s.flush();
// Stream events as they arrive from any shard
while (true) {
auto evt = co_await event_queue->pop_eventually();
// Format as SSE event
// data: {"injection":"name","type":"handler","shard":0}
auto json_data = format("{{\"injection\":\"{}\",\"type\":\"{}\",\"shard\":{}}}",
evt.injection_name, evt.injection_type, evt.shard_id);
co_await s.write(format("data: {}\n\n", json_data));
co_await s.flush();
}
} catch (...) {
ex = std::current_exception();
}
// Cleanup: clear callbacks on all shards
co_await utils::get_local_injector().clear_event_callbacks_on_all();
co_await s.close();
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
};
co_return json::json_return_type(std::move(stream_func));
});
}
} // namespace api

View File

@@ -515,15 +515,6 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
auto sstables = parsed.GetArray() |
std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
std::ranges::to<std::vector>();
apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
keyspace,
table,
endpoint,
bucket,
prefix,
sstables.size(),
scope,
primary_replica_only);
auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
co_return json::json_return_type(fmt::to_string(task_id));
});

View File

@@ -814,7 +814,8 @@ generation_service::generation_service(
config cfg, gms::gossiper& g, sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<db::system_keyspace>& sys_ks,
abort_source& abort_src, const locator::shared_token_metadata& stm, gms::feature_service& f,
replica::database& db)
replica::database& db,
std::function<bool()> raft_topology_change_enabled)
: _cfg(std::move(cfg))
, _gossiper(g)
, _sys_dist_ks(sys_dist_ks)
@@ -823,6 +824,7 @@ generation_service::generation_service(
, _token_metadata(stm)
, _feature_service(f)
, _db(db)
, _raft_topology_change_enabled(std::move(raft_topology_change_enabled))
{
}
@@ -876,7 +878,16 @@ future<> generation_service::on_join(gms::inet_address ep, locator::host_id id,
future<> generation_service::on_change(gms::inet_address ep, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
assert_shard_zero(__PRETTY_FUNCTION__);
return make_ready_future<>();
if (_raft_topology_change_enabled()) {
return make_ready_future<>();
}
return on_application_state_change(ep, id, states, gms::application_state::CDC_GENERATION_ID, pid, [this] (gms::inet_address ep, locator::host_id id, const gms::versioned_value& v, gms::permit_id) {
auto gen_id = gms::versioned_value::cdc_generation_id_from_string(v.value());
cdc_log.debug("Endpoint: {}, CDC generation ID change: {}", ep, gen_id);
return legacy_handle_cdc_generation(gen_id);
});
}
future<> generation_service::check_and_repair_cdc_streams() {

View File

@@ -79,12 +79,17 @@ private:
std::optional<cdc::generation_id> _gen_id;
future<> _cdc_streams_rewrite_complete = make_ready_future<>();
/* Returns true if raft topology changes are enabled.
* Can only be called from shard 0.
*/
std::function<bool()> _raft_topology_change_enabled;
public:
generation_service(config cfg, gms::gossiper&,
sharded<db::system_distributed_keyspace>&,
sharded<db::system_keyspace>& sys_ks,
abort_source&, const locator::shared_token_metadata&,
gms::feature_service&, replica::database& db);
gms::feature_service&, replica::database& db,
std::function<bool()> raft_topology_change_enabled);
future<> stop();
~generation_service();

View File

@@ -730,6 +730,28 @@ vector_search_tests = set([
'test/vector_search/rescoring_test'
])
vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
vector_search_validator_deps = set([
'test/vector_search_validator/build-validator',
'test/vector_search_validator/Cargo.toml',
'test/vector_search_validator/crates/validator/Cargo.toml',
'test/vector_search_validator/crates/validator/src/main.rs',
'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
])
vector_store_bin = 'vector-search-validator/bin/vector-store'
vector_store_deps = set([
'test/vector_search_validator/build-env',
'test/vector_search_validator/build-vector-store',
])
vector_search_validator_bins = set([
vector_search_validator_bin,
vector_store_bin,
])
wasms = set([
'wasm/return_input.wat',
'wasm/test_complex_null_values.wat',
@@ -763,7 +785,7 @@ other = set([
'iotune',
])
all_artifacts = apps | cpp_apps | tests | other | wasms
all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins
arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -1174,7 +1196,6 @@ scylla_core = (['message/messaging_service.cc',
'utils/gz/crc_combine.cc',
'utils/gz/crc_combine_table.cc',
'utils/http.cc',
'utils/http_client_error_processing.cc',
'utils/rest/client.cc',
'utils/s3/aws_error.cc',
'utils/s3/client.cc',
@@ -2564,10 +2585,11 @@ def write_build_file(f,
description = RUST_LIB $out
''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
f.write(
'build {mode}-build: phony {artifacts} {wasms}\n'.format(
'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
mode=mode,
artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
)
)
if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2597,7 +2619,7 @@ def write_build_file(f,
continue
profile_dep = modes[mode].get('profile_target', "")
if binary in other or binary in wasms:
if binary in other or binary in wasms or binary in vector_search_validator_bins:
continue
srcs = deps[binary]
# 'scylla'
@@ -2708,10 +2730,11 @@ def write_build_file(f,
)
f.write(
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
mode=mode,
test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
)
)
f.write(
@@ -2879,6 +2902,19 @@ def write_build_file(f,
'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
)
f.write(textwrap.dedent(f'''\
rule build-vector-search-validator
command = test/vector_search_validator/build-validator $builddir
rule build-vector-store
command = test/vector_search_validator/build-vector-store $builddir
'''))
f.write(
'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
)
f.write(
'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
)
f.write(textwrap.dedent(f'''\
build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
build dist-unified: phony dist-unified-tar

View File

@@ -389,10 +389,8 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
bool is_ann_ordering = false;
}
: K_SELECT (
( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
| (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
)?
( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
( K_DISTINCT { is_distinct = true; } )?
sclause=selectClause
)
K_FROM (
@@ -427,13 +425,13 @@ selector returns [shared_ptr<raw_selector> s]
unaliasedSelector returns [uexpression tmp]
: ( c=cident { tmp = unresolved_identifier{std::move(c)}; }
| v=value { tmp = std::move(v); }
| K_COUNT '(' countArgument ')' { tmp = make_count_rows_function_expression(); }
| K_WRITETIME '(' c=cident ')' { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
unresolved_identifier{std::move(c)}}; }
| K_TTL '(' c=cident ')' { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::ttl,
unresolved_identifier{std::move(c)}}; }
| f=functionName args=selectionFunctionArgs { tmp = function_call{std::move(f), std::move(args)}; }
| f=similarityFunctionName args=vectorSimilarityArgs { tmp = function_call{std::move(f), std::move(args)}; }
| K_CAST '(' arg=unaliasedSelector K_AS t=native_type ')' { tmp = cast{.style = cast::cast_style::sql, .arg = std::move(arg), .type = std::move(t)}; }
)
( '.' fi=cident { tmp = field_selection{std::move(tmp), std::move(fi)}; }
@@ -448,9 +446,23 @@ selectionFunctionArgs returns [std::vector<expression> a]
')'
;
// Argument list of a vector similarity function call: either an empty pair of
// parentheses or one or more comma-separated vectorSimilarityArg items.
// Every parsed argument is appended, in source order, to the returned vector `a`.
vectorSimilarityArgs returns [std::vector<expression> a]
: '(' ')'
| '(' v1=vectorSimilarityArg { a.push_back(std::move(v1)); }
( ',' vn=vectorSimilarityArg { a.push_back(std::move(vn)); } )*
')'
;
// A single argument of a vector similarity function: either an unaliasedSelector
// or a value. Whichever alternative matches is moved into the returned expression `a`.
vectorSimilarityArg returns [uexpression a]
: s=unaliasedSelector { a = std::move(s); }
| v=value { a = std::move(v); }
;
countArgument
: '*'
/* COUNT(1) is also allowed, it is recognized via the general function(args) path */
| i=INTEGER { if (i->getText() != "1") {
add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
} }
;
whereClause returns [uexpression clause]
@@ -1694,6 +1706,10 @@ functionName returns [cql3::functions::function_name s]
: (ks=keyspaceName '.')? f=allowedFunctionName { $s.keyspace = std::move(ks); $s.name = std::move(f); }
;
// Name of a vector similarity function. Unlike functionName, this rule accepts no
// keyspace prefix: the name matched by allowedSimilarityFunctionName is wrapped with
// function_name::native_function().
similarityFunctionName returns [cql3::functions::function_name s]
: f=allowedSimilarityFunctionName { $s = cql3::functions::function_name::native_function(std::move(f)); }
;
allowedFunctionName returns [sstring s]
: f=IDENT { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
| f=QUOTED_NAME { $s = $f.text; }
@@ -1702,6 +1718,11 @@ allowedFunctionName returns [sstring s]
| K_COUNT { $s = "count"; }
;
// Matches one of the similarity function keywords (K_SIMILARITY_COSINE,
// K_SIMILARITY_EUCLIDEAN, K_SIMILARITY_DOT_PRODUCT) and returns the matched token
// text converted to lower case.
allowedSimilarityFunctionName returns [sstring s]
: f=(K_SIMILARITY_COSINE | K_SIMILARITY_EUCLIDEAN | K_SIMILARITY_DOT_PRODUCT)
{ $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
;
functionArgs returns [std::vector<expression> a]
: '(' ')'
| '(' t1=term { a.push_back(std::move(t1)); }
@@ -2398,6 +2419,10 @@ K_MUTATION_FRAGMENTS: M U T A T I O N '_' F R A G M E N T S;
K_VECTOR_SEARCH_INDEXING: V E C T O R '_' S E A R C H '_' I N D E X I N G;
K_SIMILARITY_EUCLIDEAN: S I M I L A R I T Y '_' E U C L I D E A N;
K_SIMILARITY_COSINE: S I M I L A R I T Y '_' C O S I N E;
K_SIMILARITY_DOT_PRODUCT: S I M I L A R I T Y '_' D O T '_' P R O D U C T;
// Case-insensitive alpha characters
fragment A: ('a'|'A');
fragment B: ('b'|'B');

View File

@@ -10,7 +10,6 @@
#include "expr-utils.hh"
#include "evaluate.hh"
#include "cql3/functions/functions.hh"
#include "cql3/functions/aggregate_fcts.hh"
#include "cql3/functions/castas_fcts.hh"
#include "cql3/functions/scalar_function.hh"
#include "cql3/column_identifier.hh"
@@ -1048,47 +1047,8 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
return partially_prepared_args;
}
// Special case for count(1) - recognize it as the countRows() function. Note it is quite
// artificial and we might relax it to the more general count(expression) later.
//
// Returns:
//   - a function_call to the countRows() aggregate with an empty argument list when `fc`
//     is a not-yet-prepared call named "count" whose only argument is the untyped
//     integer literal 1;
//   - std::nullopt in every other non-throwing case (already prepared function, a
//     different function name, a different number of arguments, or an argument that is
//     not an untyped constant), so that the caller continues with regular preparation.
//
// Throws exceptions::invalid_request_exception when "count" is called with a single
// untyped constant other than the integer literal 1.
//
// The db, keyspace, schema_opt and receiver parameters are not used by this function.
static
std::optional<expression>
try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
    return std::visit(overloaded_functor{
        [&] (const functions::function_name& name) -> std::optional<expression> {
            // Names without a keyspace are converted with as_native_function()
            // before being compared against the native "count" name.
            auto native_name = name;
            if (!native_name.has_keyspace()) {
                native_name = name.as_native_function();
            }
            const bool is_count = native_name == functions::function_name::native_function("count");
            if (!is_count || fc.args.size() != 1) {
                return std::nullopt;
            }
            auto uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0]);
            if (!uc_arg) {
                // Not a literal (e.g. a column): leave it to the general function path.
                return std::nullopt;
            }
            // Collapse count(1) into countRows()
            if (uc_arg->partial_type == expr::untyped_constant::type_class::integer
                    && uc_arg->raw_text == "1") {
                return expr::function_call{
                    .func = functions::aggregate_fcts::make_count_rows_function(),
                    .args = {},
                };
            }
            // The message has no placeholders, so it is passed as-is instead of going
            // through format() with an argument that would never be rendered.
            throw exceptions::invalid_request_exception("count() expects a column or the literal 1 as an argument");
        },
        [] (const shared_ptr<functions::function>&) -> std::optional<expression> {
            // Already prepared, nothing to do
            return std::nullopt;
        },
    }, fc.func);
}
std::optional<expression>
prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
return prepared;
}
// Try to extract a column family name from the available information.
// Most functions can be prepared without information about the column family, usually just the keyspace is enough.
// One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,

View File

@@ -10,41 +10,9 @@
#include "types/types.hh"
#include "types/vector.hh"
#include "exceptions/exceptions.hh"
#include <span>
#include <bit>
namespace cql3 {
namespace functions {
namespace detail {
std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
if (!param) {
throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
}
const size_t expected_size = dimension * sizeof(float);
if (param->size() != expected_size) {
throw exceptions::invalid_request_exception(
fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
expected_size, dimension, param->size()));
}
std::vector<float> result;
result.reserve(dimension);
bytes_view view(*param);
for (size_t i = 0; i < dimension; ++i) {
// read_simple handles network byte order (big-endian) conversion
uint32_t raw = read_simple<uint32_t>(view);
result.push_back(std::bit_cast<float>(raw));
}
return result;
}
} // namespace detail
namespace {
// The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -54,14 +22,14 @@ namespace {
// You should only use this function if you need to preserve the original vectors and cannot normalize
// them in advance.
float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double dot_product = 0.0;
double squared_norm_a = 0.0;
double squared_norm_b = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
dot_product += a * b;
squared_norm_a += a * a;
@@ -78,12 +46,12 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
}
float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double sum = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
double diff = a - b;
sum += diff * diff;
@@ -97,12 +65,12 @@ float compute_euclidean_similarity(std::span<const float> v1, std::span<const fl
// Assumes that both vectors are L2-normalized.
// This similarity is intended as an optimized way to perform cosine similarity calculation.
float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double dot_product = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
dot_product += a * b;
}
@@ -168,15 +136,13 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
return std::nullopt;
}
// Extract dimension from the vector type
const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
size_t dimension = type.get_dimension();
const auto& type = arg_types()[0];
data_value v1 = type->deserialize(*parameters[0]);
data_value v2 = type->deserialize(*parameters[1]);
const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
const auto& v2_elements = value_cast<std::vector<data_value>>(v2);
// Optimized path: extract floats directly from bytes, bypassing data_value overhead
std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
return float_type->decompose(result);
}

View File

@@ -11,7 +11,6 @@
#include "native_scalar_function.hh"
#include "cql3/assignment_testable.hh"
#include "cql3/functions/function_name.hh"
#include <span>
namespace cql3 {
namespace functions {
@@ -20,7 +19,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");
using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;
std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -34,14 +33,5 @@ public:
virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
};
namespace detail {
// Extract float vector directly from serialized bytes, bypassing data_value overhead.
// This is an internal API exposed for testing purposes.
// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
} // namespace detail
} // namespace functions
} // namespace cql3

View File

@@ -23,7 +23,6 @@
#include "index/vector_index.hh"
#include "schema/schema.hh"
#include "service/client_state.hh"
#include "service/paxos/paxos_state.hh"
#include "types/types.hh"
#include "cql3/query_processor.hh"
#include "cql3/cql_statement.hh"
@@ -330,19 +329,6 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
"*/",
*table_desc.create_statement);
table_desc.create_statement = std::move(os).to_managed_string();
} else if (service::paxos::paxos_store::try_get_base_table(name)) {
// Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
// The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
fragmented_ostringstream os{};
fmt::format_to(os.to_iter(),
"/* Do NOT execute this statement! It's only for informational purposes.\n"
" A paxos state table is created automatically when enabling LWT on a base table.\n"
"\n{}\n"
"*/",
*table_desc.create_statement);
table_desc.create_statement = std::move(os).to_managed_string();
}
result.push_back(std::move(table_desc));
@@ -378,7 +364,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
auto& replica_db = db.real_database();
auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
}) | std::ranges::to<std::vector<schema_ptr>>();
std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));

View File

@@ -259,9 +259,11 @@ uint32_t select_statement::get_bound_terms() const {
future<> select_statement::check_access(query_processor& qp, const service::client_state& state) const {
try {
auto cdc = qp.db().get_cdc_base_table(*_schema);
auto& cf_name = _schema->is_view()
? _schema->view_info()->base_name()
const data_dictionary::database db = qp.db();
auto&& s = db.find_schema(keyspace(), column_family());
auto cdc = db.get_cdc_base_table(*s);
auto& cf_name = s->is_view()
? s->view_info()->base_name()
: (cdc ? cdc->cf_name() : column_family());
const schema_ptr& base_schema = cdc ? cdc : _schema;
bool is_vector_indexed = secondary_index::vector_index::has_vector_index(*base_schema);

View File

@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
}
continue;
} catch (shutdown_marker&) {
_reserve_segments.abort(std::current_exception());
break;
} catch (...) {
clogger.warn("Exception in segment reservation: {}", std::current_exception());
}
co_await sleep(100ms);
}
_reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
}
future<std::vector<db::commitlog::descriptor>>

View File

@@ -1498,7 +1498,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, index_cache_fraction(this, "index_cache_fraction", liveness::LiveUpdate, value_status::Used, 0.2,
"The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
, consistent_cluster_management(this, "consistent_cluster_management", value_status::Deprecated, true, "Use RAFT for cluster management and DDL.")
, force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Deprecated, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
, force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Used, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
, recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, utils::null_uuid(), "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
, wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
, wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache.")

View File

@@ -215,8 +215,6 @@ public:
static constexpr auto BUILT_VIEWS = "built_views";
static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
static constexpr auto CDC_LOCAL = "cdc_local";
static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
static constexpr auto CDC_STREAMS = "cdc_streams";
// auth
static constexpr auto ROLES = "roles";

View File

@@ -588,7 +588,11 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
utils::get_local_injector().inject("do_build_range_fail",
[] { throw std::runtime_error("do_build_range failed due to error injection"); });
return seastar::async([this, base_id, views_ids = std::move(views_ids), last_token, &as] {
// Run the view building in the streaming scheduling group
// so that it doesn't impact other tasks with higher priority.
seastar::thread_attributes attr;
attr.sched_group = _db.get_streaming_scheduling_group();
return seastar::async(std::move(attr), [this, base_id, views_ids = std::move(views_ids), last_token, &as] {
gc_clock::time_point now = gc_clock::now();
auto base_cf = _db.find_column_family(base_id).shared_from_this();
reader_permit permit = _db.get_reader_concurrency_semaphore().make_tracking_only_permit(nullptr, "build_views_range", db::no_timeout, {});

View File

@@ -67,7 +67,6 @@ public:
return schema_builder(system_keyspace::NAME, "cluster_status", std::make_optional(id))
.with_column("peer", inet_addr_type, column_kind::partition_key)
.with_column("dc", utf8_type)
.with_column("rack", utf8_type)
.with_column("up", boolean_type)
.with_column("draining", boolean_type)
.with_column("excluded", boolean_type)
@@ -112,9 +111,7 @@ public:
// Not all entries in gossiper are present in the topology
auto& node = tm.get_topology().get_node(hostid);
sstring dc = node.dc_rack().dc;
sstring rack = node.dc_rack().rack;
set_cell(cr, "dc", dc);
set_cell(cr, "rack", rack);
set_cell(cr, "draining", node.is_draining());
set_cell(cr, "excluded", node.is_excluded());
}
@@ -1348,8 +1345,8 @@ public:
private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1431,8 +1428,8 @@ public:
}
private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", timestamp_type, column_kind::clustering_key)

View File

@@ -12,6 +12,5 @@ namespace debug {
seastar::sharded<replica::database>* volatile the_database = nullptr;
seastar::scheduling_group streaming_scheduling_group;
seastar::scheduling_group gossip_scheduling_group;
}

View File

@@ -18,7 +18,6 @@ namespace debug {
extern seastar::sharded<replica::database>* volatile the_database;
extern seastar::scheduling_group streaming_scheduling_group;
extern seastar::scheduling_group gossip_scheduling_group;
}

View File

@@ -1,10 +1,6 @@
### a dictionary of redirections
#old path: new path
# Move the OS Support page
/stable/getting-started/os-support.html: https://docs.scylladb.com/stable/versioning/os-support-per-version.html
# Remove an outdated KB
/stable/kb/perftune-modes-sync.html: /stable/kb/index.html

View File

@@ -25,8 +25,6 @@ Querying data from data is done using a ``SELECT`` statement:
: | CAST '(' `selector` AS `cql_type` ')'
: | `function_name` '(' [ `selector` ( ',' `selector` )* ] ')'
: | COUNT '(' '*' ')'
: | literal
: | bind_marker
: )
: ( '.' `field_name` | '[' `term` ']' )*
where_clause: `relation` ( AND `relation` )*
@@ -37,8 +35,6 @@ Querying data from data is done using a ``SELECT`` statement:
operator: '=' | '<' | '>' | '<=' | '>=' | IN | NOT IN | CONTAINS | CONTAINS KEY
ordering_clause: `column_name` [ ASC | DESC ] ( ',' `column_name` [ ASC | DESC ] )*
timeout: `duration`
literal: number | 'string' | boolean | NULL | tuple_literal | list_literal | map_literal
bind_marker: '?' | ':' `identifier`
For instance::
@@ -85,13 +81,6 @@ A :token:`selector` can be one of the following:
- A casting, which allows you to convert a nested selector to a (compatible) type.
- A function call, where the arguments are selectors themselves.
- A call to the :ref:`COUNT function <count-function>`, which counts all non-null results.
- A literal value (constant).
- A bind variable (`?` or `:name`).
Note that due to a quirk of the type system, literals and bind markers cannot be
used as top-level selectors, as the parser cannot infer their type. However, they can be used
when nested inside functions, as the function formal parameter types provide the
necessary context.
Aliases
```````
@@ -292,8 +281,7 @@ For example::
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
or columns provided in a definition of the index.
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
For example::

View File

@@ -140,83 +140,17 @@ Vector Index :label-note:`ScyllaDB Cloud`
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.
ScyllaDB supports creating vector indexes on tables, allowing queries on the table to use those indexes for efficient
similarity search on vector data. A vector index can be either global, indexing vectors per table, or local,
indexing vectors per partition.
similarity search on vector data.
The vector index is the only custom type index supported in ScyllaDB. It is created using
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. It is also possible to
add additional columns to the index for filtering the search results. The partition column
specified in the global vector index definition must be the vector column, and any subsequent
columns are treated as filtering columns. A local vector index requires that the partition key
of the base table is also the partition key of the index, and that the vector column is the first
column listed after the partition key.
Example of a simple index:
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. Example:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed to enable similarity search using
a global vector index. Additional filtering can be performed on the primary key
columns of the base table.
Example of a global vector index with additional filtering:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding, category, info)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed to enable similarity search using
a global index. Additional columns are added for filtering the search results.
The filtering is possible on ``category``, ``info`` and all primary key columns
of the base table.
Example of a local vector index:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings ((id, created_at), embedding, category, info)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed for similarity search (a local
index) and additional columns are added for filtering the search results. The
filtering is possible on ``category``, ``info`` and all primary key columns of
the base table. The columns ``id`` and ``created_at`` must be the partition key
of the base table.
Vector indexes support additional filtering columns of native data types
(excluding counter and duration). The indexed column itself must be a vector
column, while the extra columns can be used to filter search results.
The supported types are:
* ``ascii``
* ``bigint``
* ``blob``
* ``boolean``
* ``date``
* ``decimal``
* ``double``
* ``float``
* ``inet``
* ``int``
* ``smallint``
* ``text``
* ``varchar``
* ``time``
* ``timestamp``
* ``timeuuid``
* ``tinyint``
* ``uuid``
* ``varint``
The following options are supported for vector indexes. All of them are optional.
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+

View File

@@ -1,132 +0,0 @@
# Error Injection Event Stream Implementation
## Overview
This implementation adds Server-Sent Events (SSE) support for error injection points, allowing tests to wait for injections to be triggered without log parsing.
## Architecture
### Backend (C++)
#### 1. Event Notification System (`utils/error_injection.hh`)
- **Callback Type**: `error_injection_event_callback` - function signature: `void(std::string_view injection_name, std::string_view injection_type)`
- **Storage**: Thread-local vector of callbacks (`_event_callbacks`)
- **Notification**: When any `inject()` method is called, `notify_event()` triggers all registered callbacks
- **Thread Safety**: Each shard has its own error_injection instance with its own callbacks
- **Cross-Shard**: Static methods use `smp::invoke_on_all()` to register callbacks on all shards
#### 2. SSE Endpoint (`api/error_injection.cc`)
```
GET /v2/error_injection/events
Content-Type: text/event-stream
```
**Flow**:
1. Client connects to SSE endpoint
2. Server creates a queue on the current shard
3. Callback registered on ALL shards that forwards events to this queue (using `smp::submit_to`)
4. Server streams events in SSE format: `data: {"injection":"name","type":"handler","shard":0}\n\n`
5. On disconnect (client closes or exception), callbacks are cleaned up
**Event Format**:
```json
{
"injection": "injection_name",
"type": "sleep|handler|exception|lambda",
"shard": 0
}
```
### Python Client (`test/pylib/rest_client.py`)
#### InjectionEventStream Class
```python
async with injection_event_stream(node_ip) as stream:
event = await stream.wait_for_injection("my_injection", timeout=30)
```
**Features**:
- Async context manager for automatic connection/disconnection
- Background task reads SSE events
- Queue-based event delivery
- `wait_for_injection()` method filters events by injection name
## Usage Examples
### Basic Usage
```python
async with injection_event_stream(server_ip) as event_stream:
# Enable injection
await api.enable_injection(server_ip, "my_injection", one_shot=True)
# Trigger operation that hits injection
# ... some operation ...
# Wait for injection without log parsing!
event = await event_stream.wait_for_injection("my_injection", timeout=30)
logger.info(f"Injection hit on shard {event['shard']}")
```
### Old vs New Approach
**Old (Log Parsing)**:
```python
log = await manager.server_open_log(server_id)
mark = await log.mark()
await api.enable_injection(ip, "my_injection", one_shot=True)
# ... operation ...
mark, _ = await log.wait_for('my_injection: waiting', from_mark=mark)
```
**New (Event Stream)**:
```python
async with injection_event_stream(ip) as stream:
await api.enable_injection(ip, "my_injection", one_shot=True)
# ... operation ...
event = await stream.wait_for_injection("my_injection", timeout=30)
```
## Benefits
1. **Performance**: No waiting for log flushes or buffer processing
2. **Reliability**: Direct event notifications, no regex matching failures
3. **Simplicity**: Clean async/await pattern
4. **Flexibility**: Can wait for multiple injections, get event metadata
5. **Backward Compatible**: Existing log-based tests continue to work
## Implementation Notes
### Thread Safety
- Each shard has independent error_injection instance
- Events from any shard are delivered to SSE client via `smp::submit_to`
- Queue operations are shard-local, avoiding cross-shard synchronization
### Cleanup
- Client disconnect triggers callback cleanup on all shards
- Cleanup happens automatically via RAII (try/finally in stream function)
- No callback leaks even if client disconnects abruptly
### Logging
- Injection triggers now log at INFO level (was DEBUG)
- This ensures events are visible in logs AND via SSE
- SSE provides machine-readable events, logs provide human-readable context
## Testing
See `test/cluster/test_error_injection_events.py` for example tests:
- `test_injection_event_stream_basic`: Basic functionality
- `test_injection_event_stream_multiple_injections`: Multiple injection tracking
- `test_injection_event_vs_log_parsing_comparison`: Old vs new comparison
## Future Enhancements
Possible improvements:
1. Filter events by injection name at server side (query parameter)
2. Include injection parameters in events
3. Add event timestamps
4. Support for event history/replay
5. WebSocket support (if bidirectional communication needed)

View File

@@ -156,7 +156,7 @@ How do I check the current version of ScyllaDB that I am running?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* On a regular system or VM (running Ubuntu, CentOS, or Red Hat Enterprise Linux): :code:`$ scylla --version`
Check the `Operating System Support Guide <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_ for a list of supported operating systems and versions.
Check the :doc:`Operating System Support Guide </getting-started/os-support>` for a list of supported operating systems and versions.
* On a docker node: :code:`$ docker exec -it Node_Z scylla --version`

View File

@@ -3,9 +3,9 @@
Automatic Repair
================
Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
Traditionally, launching `repairs </operating-scylla/procedures/maintenance/repair>`_ in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the `tablet </architecture/tablets>`_ table automatically.
Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.
To enable automatic repair, add this to the configuration (``scylla.yaml``):
@@ -20,4 +20,4 @@ More featureful configuration methods will be implemented in the future.
To disable, set ``auto_repair_enabled_default: false``.
Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
Automatic repair relies on `Incremental Repair </features/incremental-repair>`_ and as such it only works with `tablet </architecture/tablets>`_ tables.

View File

@@ -3,7 +3,7 @@
Incremental Repair
==================
ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
ScyllaDB's standard `repair </operating-scylla/procedures/maintenance/repair>`_ process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.
@@ -51,7 +51,7 @@ Benefits of Incremental Repair
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with `Automatic Repair </features/automatic-repair>`_.
Notes
-----

View File

@@ -18,7 +18,7 @@ Getting Started
:class: my-panel
* :doc:`ScyllaDB System Requirements Guide</getting-started/system-requirements/>`
* `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
* :doc:`OS Support by Platform and Version</getting-started/os-support/>`
.. panel-box::
:title: Install and Configure ScyllaDB

View File

@@ -17,7 +17,7 @@ This article will help you install ScyllaDB on Linux using platform-specific pac
Prerequisites
----------------
* Ubuntu, Debian, CentOS, or RHEL (see `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
* Ubuntu, Debian, CentOS, or RHEL (see :doc:`OS Support by Platform and Version </getting-started/os-support>`
for details about supported versions and architecture)
* Root or ``sudo`` access to the system
* Open :ref:`ports used by ScyllaDB <networking-ports>`

View File

@@ -10,7 +10,7 @@ Prerequisites
--------------
Ensure that your platform is supported by the ScyllaDB version you want to install.
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_.
See :doc:`OS Support by Platform and Version </getting-started/os-support/>`.
Install ScyllaDB with Web Installer
---------------------------------------

View File

@@ -12,8 +12,7 @@ the package manager (dnf and apt).
Prerequisites
---------------
Ensure your platform is supported by the ScyllaDB version you want to install.
See `OS Support <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
for information about supported Linux distributions and versions.
See :doc:`OS Support </getting-started/os-support>` for information about supported Linux distributions and versions.
Note that if you're on CentOS 7, only root offline installation is supported.

View File

@@ -0,0 +1,26 @@
OS Support by Linux Distributions and Version
==============================================
The following matrix shows which Linux distributions, containers, and images
are :ref:`supported <os-support-definition>` with which versions of ScyllaDB.
.. datatemplate:json:: /_static/data/os-support.json
:template: platforms.tmpl
``*`` 2024.1.9 and later
All releases are available as a Docker container and as EC2 (AMI), GCP, and Azure images.
.. _os-support-definition:
*Supported* means that:
- A binary installation package is available.
- The download and install procedures are tested as part of the ScyllaDB release process for each version.
- Automated installation is available through the :doc:`ScyllaDB Web Installer for Linux </getting-started/installation-common/scylla-web-installer>` tool (for the latest versions).
You can `build ScyllaDB from source <https://github.com/scylladb/scylladb#build-prerequisites>`_
on other x86_64 or aarch64 platforms, without any guarantees.

View File

@@ -8,12 +8,12 @@ ScyllaDB Requirements
:hidden:
system-requirements
OS Support <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>
OS Support <os-support>
Cloud Instance Recommendations <cloud-instance-recommendations>
scylla-in-a-shared-environment
* :doc:`System Requirements</getting-started/system-requirements/>`
* `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
* :doc:`OS Support by Platform and Version</getting-started/os-support/>`
* :doc:`Cloud Instance Recommendations AWS, GCP, and Azure </getting-started/cloud-instance-recommendations>`
* :doc:`Running ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`

View File

@@ -8,7 +8,7 @@ Supported Platforms
===================
ScyllaDB runs on 64-bit Linux. The x86_64 and AArch64 architectures are supported (AArch64 support includes AWS EC2 Graviton).
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_ for information about
See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about
supported operating systems, distros, and versions.
See :doc:`Cloud Instance Recommendations for AWS, GCP, and Azure </getting-started/cloud-instance-recommendations>` for information

View File

@@ -52,14 +52,18 @@ Row-level repair improves ScyllaDB in two ways:
* keeping the data in a temporary buffer.
* using the cached data to calculate the checksum and send it to the replicas.
See also the `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/>`_.
See also:
* `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/>`_
* `Blog: ScyllaDB Open Source 3.1: Efficiently Maintaining Consistency with Row-Level Repair <https://www.scylladb.com/2019/08/13/scylla-open-source-3-1-efficiently-maintaining-consistency-with-row-level-repair/>`_
Incremental Repair
------------------
Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.
Built on top of `Row-level Repair <row-level-repair_>`_ and `Tablets </architecture/tablets>`_, Incremental Repair enables frequent and quick repairs. For more details, see `Incremental Repair </features/incremental-repair>`_.
Automatic Repair
----------------
Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.
Built on top of `Incremental Repair </features/incremental-repair>`_, `Automatic Repair </features/automatic-repair>`_ offers repair scheduling and execution directly in ScyllaDB, without external processes.

View File

@@ -14,7 +14,7 @@ if necessary.
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
CentOS, Debian, and Ubuntu.
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
See :doc:`OS Support by Platform and Version </getting-started/os-support>`
for information about supported versions.
It also applies to the ScyllaDB official image on EC2, GCP, or Azure.

View File

@@ -17,7 +17,7 @@ This document describes a step-by-step procedure for upgrading from |SCYLLA_NAME
to |SCYLLA_NAME| |NEW_VERSION| and rollback to version |SRC_VERSION| if necessary.
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL), CentOS, Debian,
and Ubuntu. See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
and Ubuntu. See :doc:`OS Support by Platform and Version </getting-started/os-support>`
for information about supported versions.
It also applies when using the ScyllaDB official image on EC2, GCP, or Azure.

View File

@@ -2424,8 +2424,8 @@ bool gossiper::is_enabled() const {
void gossiper::add_expire_time_for_endpoint(locator::host_id endpoint, clk::time_point expire_time) {
auto now_ = now();
auto diff = std::chrono::duration_cast<std::chrono::seconds>(expire_time - now_).count();
logger.info("Node {} will be removed from gossip at [{:%Y-%m-%d %T %z}]: (expire = {}, now = {}, diff = {} seconds)",
endpoint, fmt::gmtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
logger.info("Node {} will be removed from gossip at [{:%Y-%m-%d %T}]: (expire = {}, now = {}, diff = {} seconds)",
endpoint, fmt::localtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
now_.time_since_epoch().count(), diff);
_expire_time_endpoint_map[endpoint] = expire_time;
}

View File

@@ -153,8 +153,6 @@ public:
}
const std::set<inet_address>& get_seeds() const noexcept;
seastar::scheduling_group get_scheduling_group() const noexcept { return _gcfg.gossip_scheduling_group; }
public:
static clk::time_point inline now() noexcept { return clk::now(); }
public:

View File

@@ -17,11 +17,11 @@
#include "index/secondary_index.hh"
#include "index/secondary_index_manager.hh"
#include "types/concrete_types.hh"
#include "types/types.hh"
#include "utils/managed_string.hh"
#include <seastar/core/sstring.hh>
#include <boost/algorithm/string.hpp>
namespace secondary_index {
static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
@@ -147,88 +147,17 @@ std::optional<cql3::description> vector_index::describe(const index_metadata& im
}
void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
struct validate_visitor {
const class schema& schema;
bool& is_vector;
/// Vector indexes support filtering on native types that can be used as primary key columns.
/// There is no counter (it cannot be used with vector columns)
/// and no duration (it cannot be used as a primary key or in secondary indexes).
static bool is_supported_filtering_column(abstract_type const & kind_type) {
switch (kind_type.get_kind()) {
case abstract_type::kind::ascii:
case abstract_type::kind::boolean:
case abstract_type::kind::byte:
case abstract_type::kind::bytes:
case abstract_type::kind::date:
case abstract_type::kind::decimal:
case abstract_type::kind::double_kind:
case abstract_type::kind::float_kind:
case abstract_type::kind::inet:
case abstract_type::kind::int32:
case abstract_type::kind::long_kind:
case abstract_type::kind::short_kind:
case abstract_type::kind::simple_date:
case abstract_type::kind::time:
case abstract_type::kind::timestamp:
case abstract_type::kind::timeuuid:
case abstract_type::kind::utf8:
case abstract_type::kind::uuid:
case abstract_type::kind::varint:
return true;
default:
break;
}
return false;
}
void validate(cql3::column_identifier const& column, bool is_vector) const {
auto const& c_name = column.to_string();
auto const* c_def = schema.get_column_definition(column.name());
if (c_def == nullptr) {
throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
}
auto type = c_def->type;
if (is_vector) {
auto const* vector_type = dynamic_cast<const vector_type_impl*>(type.get());
if (vector_type == nullptr) {
throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
}
auto elements_type = vector_type->get_elements_type();
if (elements_type->get_kind() != abstract_type::kind::float_kind) {
throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
}
return;
}
if (!is_supported_filtering_column(*type)) {
throw exceptions::invalid_request_exception(format("Unsupported vector index filtering column {} type", c_name));
}
}
void operator()(const std::vector<::shared_ptr<cql3::column_identifier>>& columns) const {
for (const auto& column : columns) {
// CQL restricts the secondary local index to have multiple columns with partition key only.
// Vectors shouldn't be partition key columns and they aren't supported as a filtering column,
// so we can assume here that these are non-vectors filtering columns.
validate(*column, false);
}
}
void operator()(const ::shared_ptr<cql3::column_identifier>& column) {
validate(*column, is_vector);
// The first column is the vector column, the rest mustn't be vectors.
is_vector = false;
}
};
bool is_vector = true;
for (const auto& target : targets) {
std::visit(validate_visitor{.schema = schema, .is_vector = is_vector}, target->value);
if (targets.size() != 1) {
throw exceptions::invalid_request_exception("Vector index can only be created on a single column");
}
auto target = targets[0];
auto c_def = schema.get_column_definition(to_bytes(target->column_name()));
if (!c_def) {
throw exceptions::invalid_request_exception(format("Column {} not found in schema", target->column_name()));
}
auto type = c_def->type;
if (!type->is_vector() || static_cast<const vector_type_impl*>(type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
throw exceptions::invalid_request_exception(format("Vector indexes are only supported on columns of vectors of floats", target->column_name()));
}
}

17
init.cc
View File

@@ -11,6 +11,7 @@
#include "seastarx.hh"
#include "db/config.hh"
#include <boost/algorithm/string/trim.hpp>
#include <seastar/core/coroutine.hh>
#include "sstables/sstable_compressor_factory.hh"
#include "gms/feature_service.hh"
@@ -29,7 +30,11 @@ std::set<gms::inet_address> get_seeds_from_db_config(const db::config& cfg,
std::set<gms::inet_address> seeds;
if (seed_provider.parameters.contains("seeds")) {
for (const auto& seed : utils::split_comma_separated_list(seed_provider.parameters.at("seeds"))) {
size_t begin = 0;
size_t next = 0;
sstring seeds_str = seed_provider.parameters.find("seeds")->second;
while (begin < seeds_str.length() && begin != (next=seeds_str.find(",",begin))) {
auto seed = boost::trim_copy(seeds_str.substr(begin,next-begin));
try {
seeds.emplace(gms::inet_address::lookup(seed, family, preferred).get());
} catch (...) {
@@ -41,10 +46,11 @@ std::set<gms::inet_address> get_seeds_from_db_config(const db::config& cfg,
seed,
std::current_exception());
}
begin = next+1;
}
}
if (seeds.empty()) {
seeds.emplace("127.0.0.1");
seeds.emplace(gms::inet_address("127.0.0.1"));
}
startlog.info("seeds={{{}}}, listen_address={}, broadcast_address={}",
fmt::join(seeds, ", "), listen, broadcast_address);
@@ -96,6 +102,13 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
if (!cfg.check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES)) {
disabled.insert("STRONGLY_CONSISTENT_TABLES"s);
}
if (cfg.force_gossip_topology_changes()) {
if (cfg.enable_tablets_by_default()) {
throw std::runtime_error("Tablets cannot be enabled with gossip topology changes. Use either --tablets-mode-for-new-keyspaces=enabled|enforced or --force-gossip-topology-changes, but not both.");
}
startlog.warn("The tablets feature is disabled due to forced gossip topology changes");
disabled.insert("TABLETS"s);
}
if (!cfg.table_digest_insensitive_to_expiry()) {
disabled.insert("TABLE_DIGEST_INSENSITIVE_TO_EXPIRY"s);
}

View File

@@ -150,6 +150,7 @@ fedora_packages=(
llvm
openldap-servers
openldap-devel
toxiproxy
cyrus-sasl
fipscheck
cpp-jwt-devel
@@ -157,10 +158,7 @@ fedora_packages=(
podman
buildah
# for cassandra-stress
java-openjdk-headless
snappy
https://github.com/scylladb/cassandra-stress/releases/download/v3.18.1/cassandra-stress-java21-3.18.1-1.noarch.rpm
elfutils
jq
@@ -297,7 +295,6 @@ print_usage() {
echo " --print-pip-runtime-packages Print required pip packages for Scylla"
echo " --print-pip-symlinks Print list of pip provided commands which need to install to /usr/bin"
echo " --print-node-exporter-filename Print node_exporter filename"
echo " --future Install dependencies for future toolchain (Fedora rawhide based)"
exit 1
}
@@ -305,7 +302,6 @@ PRINT_PYTHON3=false
PRINT_PIP=false
PRINT_PIP_SYMLINK=false
PRINT_NODE_EXPORTER=false
FUTURE=false
while [ $# -gt 0 ]; do
case "$1" in
"--print-python3-runtime-packages")
@@ -324,10 +320,6 @@ while [ $# -gt 0 ]; do
PRINT_NODE_EXPORTER=true
shift 1
;;
"--future")
FUTURE=true
shift 1
;;
*)
print_usage
;;
@@ -358,10 +350,6 @@ if $PRINT_NODE_EXPORTER; then
exit 0
fi
if ! $FUTURE; then
fedora_packages+=(toxiproxy)
fi
umask 0022
./seastar/install-dependencies.sh
@@ -389,10 +377,6 @@ elif [ "$ID" = "fedora" ]; then
exit 1
fi
dnf install -y "${fedora_packages[@]}" "${fedora_python3_packages[@]}"
# Fedora 45 tightened key checks, and cassandra-stress is not signed yet.
dnf install --no-gpgchecks -y https://github.com/scylladb/cassandra-stress/releases/download/v3.18.1/cassandra-stress-java21-3.18.1-1.noarch.rpm
PIP_DEFAULT_ARGS="--only-binary=:all: -v"
pip_constrained_packages=""
for package in "${!pip_packages[@]}"
@@ -463,11 +447,3 @@ if [ ! -z "${CURL_ARGS}" ]; then
else
echo "Minio server and client are up-to-date, skipping download"
fi
if $FUTURE ; then
toxyproxy_version="v2.12.0"
for bin in toxiproxy-cli toxiproxy-server; do
curl -fSL -o "/usr/local/bin/${bin}" "https://github.com/Shopify/toxiproxy/releases/download/${toxyproxy_version}/${bin}-linux-$(go_arch)"
chmod +x "/usr/local/bin/${bin}"
done
fi

11
main.cc
View File

@@ -571,7 +571,7 @@ sharded<service::storage_proxy> *the_storage_proxy;
// This is used by perf-alternator to allow running scylla together with the tool
// in a single process. So that it's easier to measure internals. It's not added
// to main_func_type to not complicate common flow as no other tool needs such logic.
std::function<void(lw_shared_ptr<db::config>)> after_init_func;
std::function<future<>(lw_shared_ptr<db::config>, sharded<abort_source>&)> after_init_func;
static locator::host_id initialize_local_info_thread(sharded<db::system_keyspace>& sys_ks,
sharded<locator::snitch_ptr>& snitch,
@@ -1150,7 +1150,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
dbcfg.memtable_scheduling_group = create_scheduling_group("memtable", "mt", 1000).get();
dbcfg.memtable_to_cache_scheduling_group = create_scheduling_group("memtable_to_cache", "mt2c", 200).get();
dbcfg.gossip_scheduling_group = create_scheduling_group("gossip", "gms", 1000).get();
debug::gossip_scheduling_group = dbcfg.gossip_scheduling_group;
dbcfg.commitlog_scheduling_group = create_scheduling_group("commitlog", "clog", 1000).get();
dbcfg.schema_commitlog_scheduling_group = create_scheduling_group("schema_commitlog", "sclg", 1000).get();
dbcfg.available_memory = memory::stats().total_memory();
@@ -2042,7 +2041,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
cdc_config.ring_delay = std::chrono::milliseconds(cfg->ring_delay_ms());
cdc_config.dont_rewrite_streams = cfg->cdc_dont_rewrite_streams();
cdc_generation_service.start(std::move(cdc_config), std::ref(gossiper), std::ref(sys_dist_ks), std::ref(sys_ks),
std::ref(stop_signal.as_sharded_abort_source()), std::ref(token_metadata), std::ref(feature_service), std::ref(db)).get();
std::ref(stop_signal.as_sharded_abort_source()), std::ref(token_metadata), std::ref(feature_service), std::ref(db),
[&ss] () -> bool { return ss.local().raft_topology_change_enabled(); }).get();
auto stop_cdc_generation_service = defer_verbose_shutdown("CDC Generation Management service", [] {
cdc_generation_service.stop().get();
});
@@ -2077,6 +2077,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
perm_cache_config.refresh = std::chrono::milliseconds(cfg->permissions_update_interval_in_ms());
auto start_auth_service = [&mm] (sharded<auth::service>& auth_service, std::any& stop_auth_service, const char* what) {
supervisor::notify(fmt::format("starting {}", what));
auth_service.invoke_on_all(&auth::service::start, std::ref(mm), std::ref(sys_ks)).get();
stop_auth_service = defer_verbose_shutdown(what, [&auth_service] {
@@ -2581,11 +2582,13 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
supervisor::notify("serving");
startlog.info("Scylla version {} initialization completed.", scylla_version());
future<> after_init_fut = make_ready_future<>();
if (after_init_func) {
after_init_func(cfg);
after_init_fut = after_init_func(cfg, stop_signal.as_sharded_abort_source());
}
stop_signal.wait().get();
startlog.info("Signal received; shutting down");
std::move(after_init_fut).get();
// At this point, all objects destructors and all shutdown hooks registered with defer() are executed
} catch (const sleep_aborted&) {
startlog.info("Startup interrupted");

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9034610470ff645fab03da5ad6c690e5b41f3307ea4b529c7e63b0786a1289ed
size 6539600
oid sha256:a4710f1f0b0bb329721c21d133618e811e820f2e70553b0aca28fb278bff89c9
size 6492280

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0c4bbf51dbe01d684ea5b9a9157781988ed499604d2fde90143bad0b9a5594f0
size 6543944
oid sha256:2433f7a1fc5cda0dd990ab59587eb6046dca0fe1ae48d599953d1936fe014ed9
size 6492176

View File

@@ -103,8 +103,8 @@ thread_local dirty_memory_manager default_dirty_memory_manager;
inline
flush_controller
make_flush_controller(const db::config& cfg, const database_config& dbcfg, std::function<double()> fn) {
return flush_controller(dbcfg.memtable_scheduling_group, cfg.memtable_flush_static_shares(), 50ms, cfg.unspooled_dirty_soft_limit(), std::move(fn));
make_flush_controller(const db::config& cfg, backlog_controller::scheduling_group& sg, std::function<double()> fn) {
return flush_controller(sg, cfg.memtable_flush_static_shares(), 50ms, cfg.unspooled_dirty_soft_limit(), std::move(fn));
}
keyspace::keyspace(config cfg, locator::effective_replication_map_factory& erm_factory)
@@ -394,7 +394,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
, _system_dirty_memory_manager(*this, 10 << 20, cfg.unspooled_dirty_soft_limit(), default_scheduling_group())
, _dirty_memory_manager(*this, dbcfg.available_memory * 0.50, cfg.unspooled_dirty_soft_limit(), dbcfg.statement_scheduling_group)
, _dbcfg(dbcfg)
, _memtable_controller(make_flush_controller(_cfg, _dbcfg, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
, _flush_sg(dbcfg.memtable_scheduling_group)
, _memtable_controller(make_flush_controller(_cfg, _flush_sg, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
auto backlog = (_dirty_memory_manager.unspooled_dirty_memory()) / limit;
if (_dirty_memory_manager.has_extraneous_flushes_requested()) {
backlog = std::max(backlog, _memtable_controller.backlog_of_shares(200));

View File

@@ -1617,6 +1617,7 @@ private:
dirty_memory_manager _dirty_memory_manager;
database_config _dbcfg;
backlog_controller::scheduling_group _flush_sg;
flush_controller _memtable_controller;
drain_progress _drain_progress {};

View File

@@ -20,7 +20,7 @@ set -e
trap 'echo "error $? in $0 line $LINENO"' ERR
SCRIPT_NAME=$(basename $0)
SCYLLA_S3_RELOC_SERVER_DEFAULT_URL=https://api.backtrace.scylladb.com
SCYLLA_S3_RELOC_SERVER_DEFAULT_URL=http://backtrace.scylladb.com
function print_usage {
cat << EOF
@@ -284,8 +284,7 @@ then
log "Build id: ${BUILD_ID}"
# https://api.backtrace.scylladb.com/api/docs#/default/search_by_build_id_search_build_id_get
BUILD=$(curl "${SCYLLA_S3_RELOC_SERVER_URL}/api/search/build_id?build_id=${BUILD_ID}" -H 'accept: application/json')
BUILD=$(curl -s -X GET "${SCYLLA_S3_RELOC_SERVER_URL}/build.json?build_id=${BUILD_ID}")
if [[ -z "$BUILD" ]]
then
@@ -294,16 +293,12 @@ then
fi
RESPONSE_BUILD_ID=$(get_json_field "$BUILD" "build_id")
BUILD_MODE=$(get_json_field "$BUILD" "build_type")
PACKAGE_URL=$(get_json_field "$BUILD" "unstripped_url")
BUILD_DATA=$(get_json_field "$BUILD" "build_data")
VERSION=$(get_json_field "$BUILD_DATA" "version")
PRODUCT=$(get_json_field "$BUILD_DATA" "product")
RELEASE=$(get_json_field "$BUILD_DATA" "release")
ARCH=$(get_json_field "$BUILD_DATA" "platform")
TIMESTAMP=$(get_json_field "$BUILD_DATA" "timestamp")
VERSION=$(get_json_field "$BUILD" "version")
PRODUCT=$(get_json_field "$BUILD" "product")
RELEASE=$(get_json_field "$BUILD" "release")
ARCH=$(get_json_field "$BUILD" "arch")
BUILD_MODE=$(get_json_field "$BUILD" "build_mode")
PACKAGE_URL=$(get_json_field "$BUILD" "package_url" 1)
if [[ "$RESPONSE_BUILD_ID" != "$BUILD_ID" ]]
then
@@ -311,7 +306,7 @@ then
exit 1
fi
log "Matching build is ${PRODUCT}-${VERSION} ${RELEASE} ${BUILD_MODE}-${ARCH} from ${TIMESTAMP}"
log "Matching build is ${PRODUCT}-${VERSION} ${RELEASE} ${BUILD_MODE}-${ARCH}"
fi
if ! [[ -d ${ARTIFACT_DIR}/scylla.package ]]

Submodule seastar updated: d2953d2ad1...f55dc7ebed

View File

@@ -217,8 +217,6 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
static const std::unordered_set<auth::resource> vector_search_system_resources = {
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_STREAMS),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_TIMESTAMPS),
};
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||

View File

@@ -56,9 +56,6 @@ static future<schema_ptr> get_schema_definition(table_schema_version v, locator:
migration_manager::migration_manager(migration_notifier& notifier, gms::feature_service& feat, netw::messaging_service& ms,
service::storage_proxy& storage_proxy, gms::gossiper& gossiper, service::raft_group0_client& group0_client, sharded<db::system_keyspace>& sysks) :
_notifier(notifier)
, _background_tasks("migration_manager::background_tasks")
, _feat(feat), _messaging(ms), _storage_proxy(storage_proxy), _ss("migration_manager::storage_service"), _gossiper(gossiper), _group0_client(group0_client)
, _sys_ks(sysks)
, _group0_barrier(this_shard_id() == 0 ?
std::function<future<>()>([this] () -> future<> {
if ((co_await _group0_client.get_group0_upgrade_state()).second == group0_upgrade_state::use_pre_raft_procedures) {
@@ -66,7 +63,7 @@ migration_manager::migration_manager(migration_notifier& notifier, gms::feature_
}
// This will run raft barrier and will sync schema with the leader
co_await with_scheduling_group(_gossiper.get_scheduling_group(), [this] {
co_await with_scheduling_group(_storage_proxy.get_db().local().get_gossip_scheduling_group(), [this] {
return start_group0_operation().discard_result();
});
}) :
@@ -77,6 +74,9 @@ migration_manager::migration_manager(migration_notifier& notifier, gms::feature_
});
})
)
, _background_tasks("migration_manager::background_tasks")
, _feat(feat), _messaging(ms), _storage_proxy(storage_proxy), _ss("migration_manager::storage_service"), _gossiper(gossiper), _group0_client(group0_client)
, _sys_ks(sysks)
, _schema_push([this] { return passive_announce(); })
, _concurrent_ddl_retries{10}
{

View File

@@ -57,6 +57,7 @@ private:
migration_notifier& _notifier;
std::unordered_map<locator::host_id, serialized_action> _schema_pulls;
serialized_action _group0_barrier;
std::vector<gms::feature::listener_registration> _feature_listeners;
seastar::named_gate _background_tasks;
static const std::chrono::milliseconds migration_delay;
@@ -68,7 +69,6 @@ private:
seastar::abort_source _as;
service::raft_group0_client& _group0_client;
sharded<db::system_keyspace>& _sys_ks;
serialized_action _group0_barrier;
serialized_action _schema_push;
table_schema_version _schema_version_to_publish;

View File

@@ -123,7 +123,12 @@ utils::small_vector<locator::host_id, N> addr_vector_to_id(const gms::gossiper&
// Check the effective replication map consistency:
// we have an inconsistent effective replication map in case the number of
// read replicas is higher than the replication factor.
[[maybe_unused]] void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
// Skip for non-debug builds.
if constexpr (!tools::build_info::is_debug_build()) {
return;
}
const sstring error = erm.get_replication_strategy().sanity_check_read_replicas(erm, read_replicas);
if (!error.empty()) {
on_internal_error(slogger, error);
@@ -6967,12 +6972,7 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
return host_id_vector_replica_set{my_host_id(erm)};
}
auto endpoints = erm.get_replicas_for_reading(token);
// Skip for non-debug builds and maintenance mode.
if constexpr (tools::build_info::is_debug_build()) {
if (!_db.local().get_config().maintenance_mode()) {
validate_read_replicas(erm, endpoints);
}
}
validate_read_replicas(erm, endpoints);
auto it = std::ranges::remove_if(endpoints, std::not_fn(std::bind_front(&storage_proxy::is_alive, this, std::cref(erm)))).begin();
endpoints.erase(it, endpoints.end());
sort_endpoints_by_proximity(erm, endpoints);

View File

@@ -125,7 +125,6 @@
#include "utils/labels.hh"
#include "view_info.hh"
#include "raft/raft.hh"
#include "debug.hh"
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
@@ -174,10 +173,11 @@ void check_raft_rpc_scheduling_group(const replica::database& db, const gms::fea
return;
}
if (current_scheduling_group() != debug::gossip_scheduling_group) {
const auto gossip_scheduling_group = db.get_gossip_scheduling_group();
if (current_scheduling_group() != gossip_scheduling_group) {
on_internal_error_noexcept(
slogger, seastar::format("Raft group0 RPCs should be executed in the gossip scheduling group, current group is [{}], operation [{}].",
current_scheduling_group().name(), rpc_name));
slogger, seastar::format("Raft group0 RPCs should be executed in the gossip scheduling group [{}], current group is [{}], operation [{}].",
gossip_scheduling_group.name(), current_scheduling_group().name(), rpc_name));
}
}
@@ -532,16 +532,9 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
}
static std::unordered_set<locator::host_id> get_released_nodes(const service::topology& topology, const locator::token_metadata& tm) {
return boost::join(topology.left_nodes, topology.ignored_nodes)
| std::views::transform([] (const auto& raft_id) { return locator::host_id(raft_id.uuid()); })
| std::views::filter([&] (const auto& h) { return !tm.get_topology().has_node(h); })
| std::ranges::to<std::unordered_set<locator::host_id>>();
}
// Synchronizes the local node state (token_metadata, system.peers/system.local tables,
// gossiper) to align it with the other raft topology nodes.
future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released) {
future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal) {
nodes_to_notify_after_sync nodes_to_notify;
rtlogger.trace("Start sync_raft_topology_nodes");
@@ -695,10 +688,13 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
}
}
if (prev_released) {
auto nodes_to_release = get_released_nodes(t, *tmptr);
std::erase_if(nodes_to_release, [&] (const auto& host_id) { return prev_released->contains(host_id); });
std::copy(nodes_to_release.begin(), nodes_to_release.end(), std::back_inserter(nodes_to_notify.released));
auto nodes_to_release = t.left_nodes;
nodes_to_release.insert(t.ignored_nodes.begin(), t.ignored_nodes.end());
for (const auto& id: nodes_to_release) {
auto host_id = locator::host_id(id.uuid());
if (!tmptr->get_topology().find_node(host_id)) {
nodes_to_notify.released.push_back(host_id);
}
}
co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
@@ -736,10 +732,6 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
rtlogger.debug("reload raft topology state");
std::unordered_set<raft::server_id> prev_normal = _topology_state_machine._topology.normal_nodes | std::views::keys | std::ranges::to<std::unordered_set>();
std::optional<std::unordered_set<locator::host_id>> prev_released;
if (!_topology_state_machine._topology.is_empty()) {
prev_released = get_released_nodes(_topology_state_machine._topology, get_token_metadata());
}
std::unordered_set<locator::host_id> tablet_hosts = co_await replica::read_required_hosts(_qp);
@@ -840,7 +832,7 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
}, topology.tstate);
tmptr->set_read_new(read_new);
auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal), std::move(prev_released));
auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal));
std::optional<locator::tablet_metadata> tablets;
if (hint.tablets_hint) {
@@ -3197,6 +3189,9 @@ future<> storage_service::join_cluster(sharded<service::storage_proxy>& proxy,
throw std::runtime_error(
"Cannot start in the Raft-based recovery procedure - Raft-based topology has not been enabled");
}
if (_db.local().get_config().force_gossip_topology_changes()) {
throw std::runtime_error("Cannot force gossip topology changes in the Raft-based recovery procedure");
}
}
}
@@ -3220,6 +3215,9 @@ future<> storage_service::join_cluster(sharded<service::storage_proxy>& proxy,
} else if (_group0->joined_group0()) {
// We are a part of group 0.
set_topology_change_kind(upgrade_state_to_topology_op_kind(_topology_state_machine._topology.upgrade_state));
if (_db.local().get_config().force_gossip_topology_changes() && raft_topology_change_enabled()) {
throw std::runtime_error("Cannot force gossip topology changes - the cluster is using raft-based topology");
}
slogger.info("The node is already in group 0 and will restart in {} mode", raft_topology_change_enabled() ? "raft" : "legacy");
} else if (_sys_ks.local().bootstrap_complete()) {
if (co_await _sys_ks.local().load_topology_features_state()) {
@@ -3240,8 +3238,13 @@ future<> storage_service::join_cluster(sharded<service::storage_proxy>& proxy,
if (_group0->load_my_id() == g0_info.id) {
// We're creating the group 0.
slogger.info("We are creating the group 0. Start in raft topology operations mode");
set_topology_change_kind(topology_change_kind::raft);
if (_db.local().get_config().force_gossip_topology_changes()) {
slogger.info("We are creating the group 0. Start in legacy topology operations mode by force");
set_topology_change_kind(topology_change_kind::legacy);
} else {
slogger.info("We are creating the group 0. Start in raft topology operations mode");
set_topology_change_kind(topology_change_kind::raft);
}
} else {
// Ask the current member of the raft group about which mode to use
auto params = join_node_query_params {};
@@ -3249,6 +3252,9 @@ future<> storage_service::join_cluster(sharded<service::storage_proxy>& proxy,
&_messaging.local(), netw::msg_addr(g0_info.ip_addr), g0_info.id, std::move(params));
switch (result.topo_mode) {
case join_node_query_result::topology_mode::raft:
if (_db.local().get_config().force_gossip_topology_changes()) {
throw std::runtime_error("Cannot force gossip topology changes - joining the cluster that is using raft-based topology");
}
slogger.info("Will join existing cluster in raft topology operations mode");
set_topology_change_kind(topology_change_kind::raft);
break;
@@ -6269,7 +6275,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
}
break;
case raft_topology_cmd::command::stream_ranges: {
co_await with_scheduling_group(_stream_manager.local().get_scheduling_group(), coroutine::lambda([&] () -> future<> {
co_await with_scheduling_group(_db.local().get_streaming_scheduling_group(), coroutine::lambda([&] () -> future<> {
const auto rs = _topology_state_machine._topology.find(id)->second;
auto tstate = _topology_state_machine._topology.tstate;
auto session = _topology_state_machine._topology.session;
@@ -8425,7 +8431,6 @@ future<> storage_service::start_maintenance_mode() {
set_mode(mode::MAINTENANCE);
return mutate_token_metadata([this] (mutable_token_metadata_ptr token_metadata) -> future<> {
token_metadata->update_topology(my_host_id(), _snitch.local()->get_location(), locator::node::state::normal, smp::count);
return token_metadata->update_normal_tokens({ dht::token{} }, my_host_id());
}, acquire_merge_lock::yes);
}

View File

@@ -1115,7 +1115,7 @@ private:
// gossiper) to align it with the other raft topology nodes.
// Optional target_node can be provided to restrict the synchronization to the specified node.
// Returns a structure that describes which notifications to trigger after token metadata is updated.
future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released);
future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal);
// Triggers notifications (on_joined, on_left) based on the recent changes to token metadata, as described by the passed in structure.
// This function should be called on the result of `sync_raft_topology_nodes`, after the global token metadata is updated.
future<> notify_nodes_after_sync(nodes_to_notify_after_sync&& nodes_to_notify);

View File

@@ -1865,8 +1865,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
.last_token = dht::token::to_int64(tmap.get_last_token(gid.tablet)),
.table_uuid = gid.table,
};
auto request_type = tinfo.repair_task_info.request_type;
rtlogger.info("Initiating tablet repair host={} tablet={} request_type={}", dst, gid, request_type);
rtlogger.info("Initiating tablet repair host={} tablet={}", dst, gid);
auto session_id = utils::get_local_injector().enter("handle_tablet_migration_repair_random_session") ?
service::session_id::create_random_id() : trinfo->session_id;
auto res = co_await ser::storage_service_rpc_verbs::send_tablet_repair(&_messaging,
@@ -1878,8 +1877,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
entry.timestamp = db_clock::now();
tablet_state.repair_task_updates = co_await _sys_ks.get_update_repair_task_mutations(entry, api::new_timestamp());
}
rtlogger.info("Finished tablet repair host={} tablet={} duration={} repair_time={} request_type={}",
dst, tablet, duration, res.repair_time, request_type);
rtlogger.info("Finished tablet repair host={} tablet={} duration={} repair_time={}",
dst, tablet, duration, res.repair_time);
})) {
if (utils::get_local_injector().enter("delay_end_repair_update")) {
break;
@@ -3697,7 +3696,7 @@ public:
, _vb_coordinator(std::make_unique<db::view::view_building_coordinator>(_db, _raft, _group0, _sys_ks, _gossiper, _messaging, _vb_sm, _topo_sm, _term, _as))
, _cdc_gens(cdc_gens)
, _tablet_load_stats_refresh([this] {
return with_scheduling_group(_gossiper.get_scheduling_group(), [this] {
return with_scheduling_group(_db.get_gossip_scheduling_group(), [this] {
return refresh_tablet_load_stats();
});
})
@@ -3877,9 +3876,6 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
for (auto& [table_id, table_stats] : dc_stats.tables) {
co_await coroutine::maybe_yield();
if (!_db.column_family_exists(table_id)) {
continue;
}
auto& t = _db.find_column_family(table_id);
auto& rs = t.get_effective_replication_map()->get_replication_strategy();
if (!rs.uses_tablets()) {
@@ -3903,9 +3899,6 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
}
for (auto& [table_id, table_load_stats] : stats.tables) {
if (!total_replicas.contains(table_id)) {
continue;
}
auto table_total_replicas = total_replicas.at(table_id);
if (table_total_replicas == 0) {
continue;

View File

@@ -436,10 +436,7 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
stream_options.buffer_size = file_stream_buffer_size;
stream_options.read_ahead = file_stream_read_ahead;
for (auto&& source_info : sources) {
// Keep stream_blob_info alive only at duration of streaming. Allowing the file descriptor
// of the sstable component to be released right after it has been streamed.
auto info = std::exchange(source_info, {});
for (auto& info : sources) {
auto& filename = info.filename;
std::optional<input_stream<char>> fstream;
bool fstream_closed = false;
@@ -620,7 +617,6 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
ops_id, filename, targets, total_size, get_bw(total_size, start_time));
}
}
co_await utils::get_local_injector().inject("tablet_stream_files_end_wait", utils::wait_for_message(std::chrono::seconds(60)));
if (error) {
blogger.warn("fstream[{}] Master failed sending files_nr={} files={} targets={} send_size={} bw={} error={}",
ops_id, sources.size(), sources, targets, ops_total_size, get_bw(ops_total_size, ops_start_time), error);
@@ -684,20 +680,15 @@ future<stream_files_response> tablet_stream_files_handler(replica::database& db,
if (files.empty()) {
co_return resp;
}
auto sstable_nr = sstables.size();
// Release reference to sstables to be streamed here. Since one sstable is streamed at a time,
// a sstable - that has been compacted - can have its space released from disk right after
// that sstable's content has been fully streamed.
sstables.clear();
blogger.debug("stream_sstables[{}] Started sending sstable_nr={} files_nr={} files={} range={}",
req.ops_id, sstable_nr, files.size(), files, req.range);
req.ops_id, sstables.size(), files.size(), files, req.range);
auto ops_start_time = std::chrono::steady_clock::now();
auto files_nr = files.size();
size_t stream_bytes = co_await tablet_stream_files(ms, std::move(files), req.targets, req.table, req.ops_id, req.topo_guard);
resp.stream_bytes = stream_bytes;
auto duration = std::chrono::steady_clock::now() - ops_start_time;
blogger.info("stream_sstables[{}] Finished sending sstable_nr={} files_nr={} range={} stream_bytes={} stream_time={} stream_bw={}",
req.ops_id, sstable_nr, files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
req.ops_id, sstables.size(), files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
co_return resp;
}

View File

@@ -196,8 +196,6 @@ public:
}
future<> fail_stream_plan(streaming::plan_id plan_id);
// Returns the scheduling group held in _streaming_group.
scheduling_group get_scheduling_group() const noexcept { return _streaming_group; }
};
} // namespace streaming

View File

@@ -415,7 +415,7 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
auto nodes = module->get_nodes();
co_await utils::get_local_injector().inject("tasks_vt_get_children", [] (auto& handler) -> future<> {
tmlogger.info("tasks_vt_get_children: waiting");
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{60});
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{10});
});
co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
if (is_host_alive(host_id)) {

View File

@@ -61,6 +61,7 @@ PYTEST_RUNNER_DIRECTORIES = [
TEST_DIR / 'raft',
TEST_DIR / 'unit',
TEST_DIR / 'vector_search',
TEST_DIR / 'vector_search_validator',
TEST_DIR / 'alternator',
TEST_DIR / 'broadcast_tables',
TEST_DIR / 'cql',

View File

@@ -103,6 +103,7 @@ if(BUILD_TESTING)
add_subdirectory(raft)
add_subdirectory(resource/wasm)
add_subdirectory(vector_search)
add_subdirectory(vector_search_validator)
if(CMAKE_CONFIGURATION_TYPES)
foreach(config ${CMAKE_CONFIGURATION_TYPES})

View File

@@ -581,7 +581,8 @@ def test_update_item_many_items_fall_into_appropriate_buckets(dynamodb, test_tab
# Verify that only the new item size is counted in the histogram if RBW is
# disabled, and both sizes if it is enabled. The WCU is calculated as the
# maximum of the old and new item sizes.
@pytest.mark.parametrize("force_rbw", [pytest.param(True, marks=pytest.mark.xfail(reason="Updates don't consider the larger of the old item size and the new item size.")), False])
@pytest.mark.xfail(reason="Updates don't consider the larger of the old item size and the new item size. This will be fixed in a next PR.")
@pytest.mark.parametrize("force_rbw", [True, False])
def test_update_item_increases_metrics_for_new_item_size_only(dynamodb, test_table_s, metrics, force_rbw):
with scylla_config_temporary(dynamodb, 'alternator_force_read_before_write', str(force_rbw).lower()):
if force_rbw:

View File

@@ -482,7 +482,6 @@ def test_get_records_nonexistent_iterator(dynamodbstreams):
# and if in the future we can work around the DynamoDB problem, we can return
# these fixtures to module scope.
@contextmanager
def create_table_ss(dynamodb, dynamodbstreams, type):
table = create_test_table(dynamodb,
Tags=TAGS,
@@ -530,23 +529,19 @@ def test_table_sss_new_and_old_images_lsi(dynamodb, dynamodbstreams):
@pytest.fixture(scope="function")
def test_table_ss_keys_only(dynamodb, dynamodbstreams):
with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as stream:
yield stream
yield from create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY')
@pytest.fixture(scope="function")
def test_table_ss_new_image(dynamodb, dynamodbstreams):
with create_table_ss(dynamodb, dynamodbstreams, 'NEW_IMAGE') as stream:
yield stream
yield from create_table_ss(dynamodb, dynamodbstreams, 'NEW_IMAGE')
@pytest.fixture(scope="function")
def test_table_ss_old_image(dynamodb, dynamodbstreams):
with create_table_ss(dynamodb, dynamodbstreams, 'OLD_IMAGE') as stream:
yield stream
yield from create_table_ss(dynamodb, dynamodbstreams, 'OLD_IMAGE')
@pytest.fixture(scope="function")
def test_table_ss_new_and_old_images(dynamodb, dynamodbstreams):
with create_table_ss(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES') as stream:
yield stream
yield from create_table_ss(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES')
@pytest.fixture(scope="function")
def test_table_s_no_ck_keys_only(dynamodb, dynamodbstreams):
@@ -659,17 +654,6 @@ def fetch_more(dynamodbstreams, iterators, output):
assert len(set(new_iterators)) == len(new_iterators)
return new_iterators
def print_events(expected_events, output, failed_at=None):
    """Print the expected and the actual stream events, to help debug a failure.

    expected_events is a list of [type, key, old_image, new_image] entries.
    output is a list of stream event dicts. failed_at is the index of the
    output event at which the comparison failed, or None if the comparison
    never succeeded before the caller gave up waiting.

    This helper is used while a test failure is being reported, so it must
    not raise on malformed output events: missing fields print as None.
    """
    if failed_at is None:
        print('compare_events: timed out')
    else:
        print(f'compare_events: failed at output event {failed_at}')
    for index, (expected_type, expected_key, expected_old_image, expected_new_image) in enumerate(expected_events):
        print(f'expected event {index}: type={expected_type}, key={expected_key}, old_image={expected_old_image}, new_image={expected_new_image}')
    for index, event in enumerate(output):
        # Use .get() throughout: an event lacking 'eventName' or 'dynamodb'
        # may be exactly what we are trying to diagnose, and a KeyError here
        # would hide the original failure.
        record = event.get('dynamodb', {})
        print(f'output event {index}: type={event.get("eventName")}, key={record.get("Keys")}, old_image={record.get("OldImage")}, new_image={record.get("NewImage")}')
# Utility function for comparing "output" as fetched by fetch_more(), to a list
# expected_events, each of which looks like:
# [type, keys, old_image, new_image]
@@ -702,75 +686,70 @@ def compare_events(expected_events, output, mode, expected_region):
# Iterate over the events in output. An event for a certain key needs to
# be the *first* remaining event for this key in expected_events_map (and
# then we remove this matched event from expected_events_map)
for e, event in enumerate(output):
try:
# In DynamoDB, eventSource is 'aws:dynamodb'. We decided to set it to
# a *different* value - 'scylladb:alternator'. Issue #6931.
assert 'eventSource' in event
# For lack of a direct equivalent of a region, Alternator provides the
# DC name instead. Reproduces #6931.
assert 'awsRegion' in event
assert event['awsRegion'] == expected_region
# Reproduces #6931.
assert 'eventVersion' in event
assert event['eventVersion'] in ['1.0', '1.1']
# Check that eventID appears, but can't check much on what it is.
assert 'eventID' in event
op = event['eventName']
record = event['dynamodb']
# record['Keys'] is "serialized" JSON, ({'S', 'thestring'}), so we
# want to deserialize it to match our expected_events content.
deserializer = TypeDeserializer()
key = {x:deserializer.deserialize(y) for (x,y) in record['Keys'].items()}
expected_type, expected_key, expected_old_image, expected_new_image = expected_events_map[freeze(key)].pop(0)
assert op == expected_type
assert record['StreamViewType'] == mode
# We don't know what ApproximateCreationDateTime should be, but we do
# know it needs to be a timestamp - there is conflicting documentation
# in what format (ISO 8601?). In any case, boto3 parses this timestamp
# for us, so we can't check it here, beyond checking it exists.
assert 'ApproximateCreationDateTime' in record
# We don't know what SequenceNumber is supposed to be, but the DynamoDB
# documentation requires that it contains only numeric characters and
# some libraries rely on this. This reproduces issue #7158:
assert 'SequenceNumber' in record
assert record['SequenceNumber'].isdecimal()
# Alternator doesn't set the SizeBytes member. Issue #6931.
#assert 'SizeBytes' in record
if mode == 'KEYS_ONLY':
for event in output:
# In DynamoDB, eventSource is 'aws:dynamodb'. We decided to set it to
# a *different* value - 'scylladb:alternator'. Issue #6931.
assert 'eventSource' in event
# For lack of a direct equivalent of a region, Alternator provides the
# DC name instead. Reproduces #6931.
assert 'awsRegion' in event
assert event['awsRegion'] == expected_region
# Reproduces #6931.
assert 'eventVersion' in event
assert event['eventVersion'] in ['1.0', '1.1']
# Check that eventID appears, but can't check much on what it is.
assert 'eventID' in event
op = event['eventName']
record = event['dynamodb']
# record['Keys'] is "serialized" JSON, ({'S', 'thestring'}), so we
# want to deserialize it to match our expected_events content.
deserializer = TypeDeserializer()
key = {x:deserializer.deserialize(y) for (x,y) in record['Keys'].items()}
expected_type, expected_key, expected_old_image, expected_new_image = expected_events_map[freeze(key)].pop(0)
assert op == expected_type
assert record['StreamViewType'] == mode
# We don't know what ApproximateCreationDateTime should be, but we do
# know it needs to be a timestamp - there is conflicting documentation
# in what format (ISO 8601?). In any case, boto3 parses this timestamp
# for us, so we can't check it here, beyond checking it exists.
assert 'ApproximateCreationDateTime' in record
# We don't know what SequenceNumber is supposed to be, but the DynamoDB
# documentation requires that it contains only numeric characters and
# some libraries rely on this. This reproduces issue #7158:
assert 'SequenceNumber' in record
assert record['SequenceNumber'].isdecimal()
# Alternator doesn't set the SizeBytes member. Issue #6931.
#assert 'SizeBytes' in record
if mode == 'KEYS_ONLY':
assert not 'NewImage' in record
assert not 'OldImage' in record
elif mode == 'NEW_IMAGE':
assert not 'OldImage' in record
if expected_new_image == None:
assert not 'NewImage' in record
assert not 'OldImage' in record
elif mode == 'NEW_IMAGE':
assert not 'OldImage' in record
if expected_new_image == None:
assert not 'NewImage' in record
else:
new_image = {x:deserializer.deserialize(y) for (x,y) in record['NewImage'].items()}
assert expected_new_image == new_image
elif mode == 'OLD_IMAGE':
assert not 'NewImage' in record
if expected_old_image == None:
assert not 'OldImage' in record
else:
old_image = {x:deserializer.deserialize(y) for (x,y) in record['OldImage'].items()}
assert expected_old_image == old_image
elif mode == 'NEW_AND_OLD_IMAGES':
if expected_new_image == None:
assert not 'NewImage' in record
else:
new_image = {x:deserializer.deserialize(y) for (x,y) in record['NewImage'].items()}
assert expected_new_image == new_image
if expected_old_image == None:
assert not 'OldImage' in record
else:
old_image = {x:deserializer.deserialize(y) for (x,y) in record['OldImage'].items()}
assert expected_old_image == old_image
else:
pytest.fail('cannot happen')
except AssertionError:
print_events(expected_events, output, failed_at=e)
raise
new_image = {x:deserializer.deserialize(y) for (x,y) in record['NewImage'].items()}
assert expected_new_image == new_image
elif mode == 'OLD_IMAGE':
assert not 'NewImage' in record
if expected_old_image == None:
assert not 'OldImage' in record
else:
old_image = {x:deserializer.deserialize(y) for (x,y) in record['OldImage'].items()}
assert expected_old_image == old_image
elif mode == 'NEW_AND_OLD_IMAGES':
if expected_new_image == None:
assert not 'NewImage' in record
else:
new_image = {x:deserializer.deserialize(y) for (x,y) in record['NewImage'].items()}
assert expected_new_image == new_image
if expected_old_image == None:
assert not 'OldImage' in record
else:
old_image = {x:deserializer.deserialize(y) for (x,y) in record['OldImage'].items()}
assert expected_old_image == old_image
else:
pytest.fail('cannot happen')
# After the above loop, expected_events_map should remain empty arrays.
# If it isn't, one of the expected events did not yet happen. Return False.
for entry in expected_events_map.values():
@@ -799,7 +778,6 @@ def fetch_and_compare_events(dynamodb, dynamodbstreams, iterators, expected_even
return
time.sleep(0.5)
# If we're still here, the last compare_events returned false.
print_events(expected_events, output)
pytest.fail('missing events in output: {}'.format(output))
# Convenience function used to implement several tests below. It runs a given
@@ -2016,33 +1994,6 @@ def test_stream_table_name_length_192_update(dynamodb, dynamodbstreams):
# is in the process of being added
wait_for_active_stream(dynamodbstreams, table)
# In earlier tests, we tested the stream events logged for BatchWriteItem,
# but it was usually a single item in the batch or in do_batch_test(),
# it was multiple items in different partitions. This test checks the
# remaining case, of a batch writing multiple items in one partition -
# and checks that the correct events appear for them on the stream.
# Turns out we had a bug (#28439) in this case, but *only* in always_use_lwt
# write isolation mode, which writes all the items in the batch with the
# same timestamp. The test is parameterized to try all write isolation
# modes, and reproduces #28439, which fails only in always_use_lwt mode.
# This is a Scylla-only test because it checks write isolation modes, which
# don't exist in DynamoDB.
@pytest.mark.parametrize('mode', ['only_rmw_uses_lwt', pytest.param('always_use_lwt', marks=pytest.mark.xfail(reason='#28439')), 'unsafe_rmw', 'forbid_rmw'])
def test_streams_multiple_items_one_partition(dynamodb, dynamodbstreams, scylla_only, mode):
    """Check the stream events of one BatchWriteItem writing three items to
    the same partition, under the write isolation mode given by "mode".
    """
    with create_table_ss(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES') as stream:
        table, _ = stream
        # Tag the table with the requested write isolation mode.
        client = table.meta.client
        table_arn = client.describe_table(TableName=table.name)['Table']['TableArn']
        isolation_tag = {'Key': 'system:write_isolation', 'Value': mode}
        client.tag_resource(ResourceArn=table_arn, Tags=[isolation_tag])

        def do_updates(table, p, c):
            # One batch, three items, all in partition p with sort keys
            # c+'1', c+'2' and c+'3'. Each is expected to produce an INSERT
            # event with no old image.
            put_requests = []
            expected = []
            for suffix in ('1', '2', '3'):
                sort_key = c + suffix
                item = {'p': p, 'c': sort_key, 'x': sort_key}
                put_requests.append({'PutRequest': {'Item': item}})
                expected.append(['INSERT', {'p': p, 'c': sort_key}, None, {'p': p, 'c': sort_key, 'x': sort_key}])
            table.meta.client.batch_write_item(RequestItems={table.name: put_requests})
            return expected

        do_test(stream, dynamodb, dynamodbstreams, do_updates, 'NEW_AND_OLD_IMAGES')
# TODO: tests on multiple partitions
# TODO: write a test that disabling the stream and re-enabling it works, but
# requires the user to wait for the first stream to become DISABLED before

View File

@@ -679,48 +679,3 @@ def test_create_table_spurious_attribute_definitions(dynamodb):
AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' },
{ 'AttributeName': 'c', 'AttributeType': 'S' }]) as table:
pass
# DynamoDB supports many different types, but the documentation claims that
# for keys, "The only data types allowed for primary key attributes are
# string, number, or binary.". We have many tests for these types (and
# shared test tables with those key types defined in conftest.py) - in this
# test we verify that indeed all other types are NOT allowed - for neither
# partition key nor sort key.
# See also test_gsi.py::test_gsi_invalid_key_types which checks that the
# same types are also forbidden as GSI keys.
def test_forbidden_key_types(dynamodb):
    """Check that creating a table fails when a key attribute has one of the
    types BOOL, BS, L, M, NS, NULL or SS - first as the partition key, then
    as the sort key.
    """
    forbidden_types = ['BOOL', 'BS', 'L', 'M', 'NS', 'NULL', 'SS']
    for t in forbidden_types:
        # DynamoDB and Alternator word the error differently, but both
        # mention the requested type in the message in single quotes.
        expected_error = f"ValidationException.*'{t}'"

        # The forbidden type as the partition key's type:
        hash_schema = [{'AttributeName': 'p', 'KeyType': 'HASH'}]
        hash_attrs = [{'AttributeName': 'p', 'AttributeType': t}]
        with pytest.raises(ClientError, match=expected_error):
            with new_test_table(dynamodb,
                    KeySchema=hash_schema,
                    AttributeDefinitions=hash_attrs):
                pass

        # The forbidden type as the sort key's type (partition key is a
        # valid string):
        range_schema = [{'AttributeName': 'p', 'KeyType': 'HASH'},
                        {'AttributeName': 'c', 'KeyType': 'RANGE'}]
        range_attrs = [{'AttributeName': 'p', 'AttributeType': 'S'},
                       {'AttributeName': 'c', 'AttributeType': t}]
        with pytest.raises(ClientError, match=expected_error):
            with new_test_table(dynamodb,
                    KeySchema=range_schema,
                    AttributeDefinitions=range_attrs):
                pass
# Although as we tested in the previous test (test_forbidden_key_types) most
# DynamoDB types are not allowed as key types (only S, B and N are allowed),
# strangely the GetItem documentation claims that the Key parameter can
# actually allow any type. This is a mistake in the documentation - this
# test shows that when you try to GetItem with one of the forbidden types,
# it fails. Note that actually what both DynamoDB and Alternator test is
# whether the Key type is the same as the one in the table's schema - so
# because we can't create a table with these types, GetItem with those
# types is bound to fail.
def test_forbidden_key_types_getitem(test_table_s):
    """Check that GetItem fails with a ValidationException when the key 'p'
    is given a boolean, binary set, list, map, number set, null or string set.
    """
    bad_key_values = [False, {b'hi', b'there'}, ['hi',3], {'hi': 3}, {1,2}, None, {'hi', 'there'}]
    for bad_value in bad_key_values:
        # The messages of DynamoDB ("The provided key element does not match
        # the schema") and Alternator ("Type mismatch: expected type S for
        # key column p, got type "BOOL") share nothing but the word "match",
        # so that is all we can look for.
        with pytest.raises(ClientError, match='ValidationException.*match'):
            test_table_s.get_item(Key={'p': bad_value})

View File

@@ -51,7 +51,7 @@
import pytest
from botocore.exceptions import ClientError
from .util import create_test_table, random_string, new_test_table
from .util import create_test_table, random_string
@pytest.fixture(scope="function", autouse=True)
def all_tests_are_scylla_only(scylla_only):
@@ -430,53 +430,3 @@ def test_isolation_updateitem_returnvalues(table_forbid_rmw, tables_permit_rmw):
UpdateExpression='SET a = :val',
ExpressionAttributeValues={':val': 1},
ReturnValues=returnvalues)
#############################################################################
# BatchWriteItem tests.
# BatchWriteItem writes are always pure write - never RMW (read-modify-write)
# operations - because none of the RMW options are supported: Batch writes
# don't support an UpdateExpression, a ConditionExpression or ReturnValues.
# Still, even in the pure write case, the write code paths are different for
# the different write isolation modes, and we need to exercise them.
# For completeness, this test exercises a single batch with more than one
# partition, more than one clustering key in the same partition, and a
# combination of PutRequest and DeleteRequest.
def test_isolation_batchwriteitem(dynamodb):
    """Exercise BatchWriteItem under each of the four write isolation modes.

    For every mode a fresh table with a partition key and a sort key is
    created (the shared fixtures only have a partition key). A single batch
    then writes to two partitions, writes several items to one partition,
    and mixes PutRequest with DeleteRequest.
    """
    isolation_modes = ['only_rmw_uses_lwt', 'always_use_lwt', 'unsafe_rmw', 'forbid_rmw']
    for mode in isolation_modes:
        with new_test_table(dynamodb,
                Tags=[{'Key': 'system:write_isolation', 'Value': mode}],
                KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'},
                           {'AttributeName': 'c', 'KeyType': 'RANGE'}],
                AttributeDefinitions=[
                    {'AttributeName': 'p', 'AttributeType': 'S'},
                    {'AttributeName': 'c', 'AttributeType': 'S'}]) as table:

            def read(p, c):
                # Consistent read of the item with key (p, c); returns the
                # whole GetItem response.
                return table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)

            p1 = random_string()
            p2 = random_string()

            # Seed partition p1 with two items. Only item1 is deleted by the
            # batch below.
            table.put_item(Item={'p': p1, 'c': 'item1', 'x': 'hello'})
            assert read(p1, 'item1')['Item'] == {'p': p1, 'c': 'item1', 'x': 'hello'}
            table.put_item(Item={'p': p1, 'c': 'item2', 'x': 'hi'})
            assert read(p1, 'item2')['Item'] == {'p': p1, 'c': 'item2', 'x': 'hi'}

            # The batch: two puts and one delete in p1, one put in p2.
            batch = [
                {'PutRequest': {'Item': {'p': p1, 'c': 'item3', 'x': 'dog'}}},
                {'PutRequest': {'Item': {'p': p1, 'c': 'item4', 'x': 'cat'}}},
                {'DeleteRequest': {'Key': {'p': p1, 'c': 'item1'}}},
                {'PutRequest': {'Item': {'p': p2, 'c': 'item5', 'x': 'mouse'}}},
            ]
            table.meta.client.batch_write_item(RequestItems={table.name: batch})

            # item1 must be gone; item2..item5 must exist with the right
            # content.
            assert 'Item' not in read(p1, 'item1')
            assert read(p1, 'item2')['Item'] == {'p': p1, 'c': 'item2', 'x': 'hi'}
            assert read(p1, 'item3')['Item'] == {'p': p1, 'c': 'item3', 'x': 'dog'}
            assert read(p1, 'item4')['Item'] == {'p': p1, 'c': 'item4', 'x': 'cat'}
            assert read(p2, 'item5')['Item'] == {'p': p2, 'c': 'item5', 'x': 'mouse'}

View File

@@ -51,17 +51,17 @@ BOOST_AUTO_TEST_CASE(TestXmlErrorPayload) {
auto error = aws::aws_error::parse(build_xml_response("IncompleteSignatureException", message, requestId)).value();
BOOST_REQUIRE_EQUAL(aws::aws_error_type::INCOMPLETE_SIGNATURE, error.get_error_type());
BOOST_REQUIRE_EQUAL(message, error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::no);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);
error = aws::aws_error::parse(build_xml_response("InternalFailure", message, requestId, message_style::plural)).value();
BOOST_REQUIRE_EQUAL(aws::aws_error_type::INTERNAL_FAILURE, error.get_error_type());
BOOST_REQUIRE_EQUAL(message, error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::yes);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
error = aws::aws_error::parse(build_xml_response("IDontExist", message, requestId, message_style::plural)).value();
BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
BOOST_REQUIRE_EQUAL(message, error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::no);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);
auto no_error = aws::aws_error::parse("");
BOOST_REQUIRE_EQUAL(no_error.has_value(), false);
@@ -75,7 +75,7 @@ BOOST_AUTO_TEST_CASE(TestXmlErrorPayload) {
error = aws::aws_error::parse(response).value();
BOOST_REQUIRE_EQUAL(aws::aws_error_type::INTERNAL_FAILURE, error.get_error_type());
BOOST_REQUIRE_EQUAL(message, error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::yes);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
}
BOOST_AUTO_TEST_CASE(TestErrorsWithPrefixParse) {
@@ -92,7 +92,7 @@ BOOST_AUTO_TEST_CASE(TestErrorsWithPrefixParse) {
auto error = aws::aws_error::parse(build_xml_response(exceptionPrefix + "IDon'tExist", "JunkMessage", requestId)).value();
BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
BOOST_REQUIRE_EQUAL("JunkMessage", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::no);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);
}
BOOST_AUTO_TEST_CASE(TestErrorsWithoutPrefixParse) {
@@ -107,15 +107,7 @@ BOOST_AUTO_TEST_CASE(TestErrorsWithoutPrefixParse) {
auto error = aws::aws_error::parse(build_xml_response("IDon'tExist", "JunkMessage", requestId)).value();
BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
BOOST_REQUIRE_EQUAL("JunkMessage", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::no);
}
BOOST_AUTO_TEST_CASE(TestHelperFunctions) {
BOOST_REQUIRE_EQUAL(utils::http::from_http_code(seastar::http::reply::status_type::service_unavailable), utils::http::retryable::yes);
BOOST_REQUIRE_EQUAL(utils::http::from_http_code(seastar::http::reply::status_type::unauthorized), utils::http::retryable::no);
BOOST_REQUIRE_EQUAL(utils::http::from_system_error(std::system_error(ECONNRESET, std::system_category())), utils::http::retryable::yes);
BOOST_REQUIRE_EQUAL(utils::http::from_system_error(std::system_error(EADDRINUSE, std::system_category())), utils::http::retryable::no);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);
}
BOOST_AUTO_TEST_CASE(TestNestedException) {
@@ -134,7 +126,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
auto error = aws::aws_error::from_exception_ptr(std::current_exception());
BOOST_REQUIRE_EQUAL(aws::aws_error_type::NETWORK_CONNECTION, error.get_error_type());
BOOST_REQUIRE_EQUAL("Software caused connection abort", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::yes);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
}
// Test nested exceptions where the innermost is NOT a system_error
@@ -148,7 +140,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
auto error = aws::aws_error::from_exception_ptr(std::current_exception());
BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
BOOST_REQUIRE_EQUAL("Higher level runtime_error", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::no);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);
}
// Test single exception which is NOT a nested exception
@@ -158,7 +150,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
auto error = aws::aws_error::from_exception_ptr(std::current_exception());
BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
BOOST_REQUIRE_EQUAL("Something bad happened", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::no);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);
}
// Test with non-std::exception
@@ -168,7 +160,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
auto error = aws::aws_error::from_exception_ptr(std::current_exception());
BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
BOOST_REQUIRE_EQUAL("No error message was provided, exception content: char const*", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::no);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);
}
// Test system_error
@@ -178,7 +170,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
auto error = aws::aws_error::from_exception_ptr(std::current_exception());
BOOST_REQUIRE_EQUAL(aws::aws_error_type::NETWORK_CONNECTION, error.get_error_type());
BOOST_REQUIRE_EQUAL("Software caused connection abort", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::yes);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
}
// Test aws_exception
@@ -188,7 +180,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
auto error = aws::aws_error::from_exception_ptr(std::current_exception());
BOOST_REQUIRE_EQUAL(aws::aws_error_type::HTTP_TOO_MANY_REQUESTS, error.get_error_type());
BOOST_REQUIRE_EQUAL("", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::yes);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
}
// Test httpd::unexpected_status_error
@@ -198,6 +190,6 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
auto error = aws::aws_error::from_exception_ptr(std::current_exception());
BOOST_REQUIRE_EQUAL(aws::aws_error_type::HTTP_NETWORK_CONNECT_TIMEOUT, error.get_error_type());
BOOST_REQUIRE_EQUAL(" HTTP code: 599 Network Connect Timeout", error.get_error_message());
BOOST_REQUIRE_EQUAL(error.is_retryable(), utils::http::retryable::yes);
BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
}
}

View File

@@ -391,31 +391,21 @@ SEASTAR_TEST_CASE(select_from_vector_search_system_table) {
return do_with_cql_env_thread(
[](auto&& env) {
create_user_if_not_exists(env, bob);
// All tables in vector_search_system_resources from client_state.cc
const std::vector<sstring> vector_search_system_tables = {
"system.group0_history",
"system.versions",
"system.cdc_streams",
"system.cdc_timestamps",
};
// Without VECTOR_SEARCH_INDEXING permission, bob cannot select from these tables
for (const auto& table : vector_search_system_tables) {
with_user(env, bob, [&env, &table] {
BOOST_REQUIRE_EXCEPTION(env.execute_cql(format("SELECT * FROM {}", table)).get(), exceptions::unauthorized_exception,
exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
});
}
with_user(env, bob, [&env] {
BOOST_REQUIRE_EXCEPTION(env.execute_cql("SELECT * FROM system.group0_history").get(), exceptions::unauthorized_exception,
exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
});
with_user(env, bob, [&env] {
BOOST_REQUIRE_EXCEPTION(env.execute_cql("SELECT * FROM system.versions").get(), exceptions::unauthorized_exception,
exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
});
cquery_nofail(env, "GRANT VECTOR_SEARCH_INDEXING ON ALL KEYSPACES TO bob");
// With VECTOR_SEARCH_INDEXING permission, bob can select from these tables
for (const auto& table : vector_search_system_tables) {
with_user(env, bob, [&env, &table] {
cquery_nofail(env, format("SELECT * FROM {}", table));
});
}
with_user(env, bob, [&env] {
cquery_nofail(env, "SELECT * FROM system.group0_history");
});
with_user(env, bob, [&env] {
cquery_nofail(env, "SELECT * FROM system.versions");
});
},
db_config_with_auth());
}

View File

@@ -29,7 +29,6 @@
#include "types/list.hh"
#include "types/set.hh"
#include "schema/schema_builder.hh"
#include "cql3/functions/vector_similarity_fcts.hh"
BOOST_AUTO_TEST_SUITE(cql_functions_test)
@@ -423,96 +422,4 @@ SEASTAR_TEST_CASE(test_aggregate_functions_vector_type) {
});
}
// Checks cql3::functions::detail::extract_float_vector() against the generic
// vector_type deserialization: both must yield the same floats for every test
// vector, and the extraction must throw invalid_request_exception for a null
// parameter and for a serialized vector whose dimension differs from the
// expected one.
SEASTAR_THREAD_TEST_CASE(test_extract_float_vector) {
// Compare standard deserialization path vs optimized extraction path
// Produces the serialized bytes of `values` as a float vector of dimension `dim`.
auto serialize = [](size_t dim, const std::vector<float>& values) {
auto vector_type = vector_type_impl::get_instance(float_type, dim);
std::vector<data_value> data_vals;
data_vals.reserve(values.size());
for (float f : values) {
data_vals.push_back(data_value(f));
}
return vector_type->decompose(make_list_value(vector_type, data_vals));
};
// Reference path: deserialize into data_value elements, then cast each one
// back to float.
auto deserialize_standard = [](size_t dim, const bytes_opt& serialized) {
auto vector_type = vector_type_impl::get_instance(float_type, dim);
data_value v = vector_type->deserialize(*serialized);
const auto& elements = value_cast<std::vector<data_value>>(v);
std::vector<float> result;
result.reserve(elements.size());
for (const auto& elem : elements) {
result.push_back(value_cast<float>(elem));
}
return result;
};
// Element-wise exact comparison. A NaN at the same index in both vectors is
// accepted explicitly, because NaN never compares equal to itself.
auto compare_vectors = [](const std::vector<float>& a, const std::vector<float>& b) {
BOOST_REQUIRE_EQUAL(a.size(), b.size());
for (size_t i = 0; i < a.size(); ++i) {
if (std::isnan(a[i]) && std::isnan(b[i])) {
continue; // Both NaN, consider equal
}
BOOST_REQUIRE_EQUAL(a[i], b[i]);
}
};
// Prepare test cases
std::vector<std::vector<float>> test_vectors = {
// Small vectors with explicit values
{1.0f, 2.5f},
{-1.5f, 0.0f, 3.14159f},
// Special floating-point values
{
std::numeric_limits<float>::infinity(),
-std::numeric_limits<float>::infinity(),
0.0f,
-0.0f,
std::numeric_limits<float>::min(),
std::numeric_limits<float>::max()
},
// NaN values (require special comparison)
{
std::numeric_limits<float>::quiet_NaN(),
1.0f,
std::numeric_limits<float>::signaling_NaN()
}
};
// Add common embedding dimensions with pattern-generated data
for (size_t dim : {128, 384, 768, 1024, 1536}) {
std::vector<float> vec(dim);
for (size_t i = 0; i < dim; ++i) {
// Repeating pattern 0.00, 0.01, ..., 0.99
vec[i] = static_cast<float>(i % 100) * 0.01f;
}
test_vectors.push_back(std::move(vec));
}
// Run tests for all test vectors
for (const auto& vec : test_vectors) {
size_t dim = vec.size();
auto serialized = serialize(dim, vec);
auto standard = deserialize_standard(dim, serialized);
compare_vectors(standard, cql3::functions::detail::extract_float_vector(serialized, dim));
}
// Null parameter should throw
BOOST_REQUIRE_EXCEPTION(
cql3::functions::detail::extract_float_vector(std::nullopt, 3),
exceptions::invalid_request_exception,
seastar::testing::exception_predicate::message_contains("Cannot extract float vector from null parameter")
);
// Size mismatch should throw
// Covers both directions: serialized dimension smaller (2) and larger (4)
// than the expected dimension (3).
for (auto [actual_dim, expected_dim] : {std::pair{2, 3}, {4, 3}}) {
std::vector<float> vec(actual_dim, 1.0f);
auto serialized = serialize(actual_dim, vec);
BOOST_REQUIRE_EXCEPTION(
cql3::functions::detail::extract_float_vector(serialized, expected_dim),
exceptions::invalid_request_exception,
seastar::testing::exception_predicate::message_contains("Invalid vector size")
);
}
}
BOOST_AUTO_TEST_SUITE_END()

View File

@@ -113,23 +113,15 @@ static future<> compare_object_data(const local_gcs_wrapper& env, std::string_vi
BOOST_REQUIRE_EQUAL(read, total);
}
using namespace std::string_literals;
static constexpr auto prefix = "bork/ninja/"s;
// #28398 include a prefix in all names.
// Returns a fresh object name: the shared `prefix` followed by a newly
// generated time-based UUID.
static std::string make_name() {
return fmt::format("{}{}", prefix, utils::UUID_gen::get_time_UUID());
}
static future<> test_read_write_helper(const local_gcs_wrapper& env, size_t dest_size, std::optional<size_t> specific_buffer_size = std::nullopt) {
auto& c = env.client();
auto name = make_name();
auto uuid = fmt::format("{}", utils::UUID_gen::get_time_UUID());
std::vector<temporary_buffer<char>> written;
// ensure we remove the object
env.objects_to_delete.emplace_back(name);
co_await create_object_of_size(c, env.bucket, name, dest_size, &written, specific_buffer_size);
co_await compare_object_data(env, name, std::move(written));
env.objects_to_delete.emplace_back(uuid);
co_await create_object_of_size(c, env.bucket, uuid, dest_size, &written, specific_buffer_size);
co_await compare_object_data(env, uuid, std::move(written));
}
BOOST_AUTO_TEST_SUITE(gcs_tests, *seastar::testing::async_fixture<gcs_fixture>())
@@ -155,28 +147,21 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_list_objects, local_gcs_wrapper, *che
auto& c = env.client();
std::unordered_map<std::string, uint64_t> names;
for (size_t i = 0; i < 10; ++i) {
auto name = make_name();
auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
auto size = tests::random::get_int(size_t(1), size_t(2*1024*1024));
env.objects_to_delete.emplace_back(name);
co_await create_object_of_size(c, env.bucket, name, size);
names.emplace(name, size);
}
utils::gcp::storage::bucket_paging paging;
auto infos = co_await c.list_objects(env.bucket);
size_t n_found = 0;
for (;;) {
auto infos = co_await c.list_objects(env.bucket, "", paging);
for (auto& info : infos) {
auto i = names.find(info.name);
if (i != names.end()) {
BOOST_REQUIRE_EQUAL(info.size, i->second);
++n_found;
}
}
if (infos.empty()) {
break;
for (auto& info : infos) {
auto i = names.find(info.name);
if (i != names.end()) {
BOOST_REQUIRE_EQUAL(info.size, i->second);
++n_found;
}
}
BOOST_REQUIRE_EQUAL(n_found, names.size());
@@ -185,7 +170,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_list_objects, local_gcs_wrapper, *che
SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_delete_object, local_gcs_wrapper, *check_gcp_storage_test_enabled()) {
auto& env = *this;
auto& c = env.client();
auto name = make_name();
auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
env.objects_to_delete.emplace_back(name);
co_await create_object_of_size(c, env.bucket, name, 128);
{
@@ -205,7 +190,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_delete_object, local_gcs_wrapper, *ch
SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_skip_read, local_gcs_wrapper, *check_gcp_storage_test_enabled()) {
auto& env = *this;
auto& c = env.client();
auto name = make_name();
auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
std::vector<temporary_buffer<char>> bufs;
constexpr size_t file_size = 12*1024*1024 + 384*7 + 31;
@@ -258,7 +243,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_merge_objects, local_gcs_wrapper, *check_gcp_stor
size_t total = 0;
for (size_t i = 0; i < 32; ++i) {
auto name = make_name();
auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
auto size = tests::random::get_int(size_t(1), size_t(2*1024*1024));
env.objects_to_delete.emplace_back(name);
co_await create_object_of_size(c, env.bucket, name, size, &bufs);
@@ -266,7 +251,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_merge_objects, local_gcs_wrapper, *check_gcp_stor
total += size;
}
auto name = make_name();
auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
env.objects_to_delete.emplace_back(name);
auto info = co_await c.merge_objects(env.bucket, name, names);

View File

@@ -42,7 +42,6 @@
#include "test/lib/key_utils.hh"
#include "test/lib/test_utils.hh"
#include <boost/test/unit_test.hpp>
#include "dht/sharder.hh"
#include "schema/schema_builder.hh"
#include "replica/cell_locking.hh"
@@ -70,8 +69,6 @@
BOOST_AUTO_TEST_SUITE(mutation_reader_test)
namespace test_label = boost::unit_test;
static schema_ptr make_schema() {
return schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
@@ -1242,7 +1239,7 @@ SEASTAR_TEST_CASE(test_combined_mutation_source_is_a_mutation_source) {
}
// Best run with SMP >= 2
SEASTAR_THREAD_TEST_CASE(test_foreign_reader_as_mutation_source, *test_label::label("nightly")) {
SEASTAR_THREAD_TEST_CASE(test_foreign_reader_as_mutation_source) {
if (smp::count < 2) {
std::cerr << "Cannot run test " << get_name() << " with smp::count < 2" << std::endl;
return;

View File

@@ -980,88 +980,3 @@ BOOST_AUTO_TEST_CASE(s3_fqn_manipulation) {
BOOST_REQUIRE_EQUAL(bucket_name, "bucket");
BOOST_REQUIRE_EQUAL(object_name, "prefix1/prefix2/foo.bar");
}
// Exercises s3::calc_part_size(total_size, part_size), which returns a
// (parts, size) pair. The limits named below are taken from the expected error
// messages: at most 10000 parts, part size between 5 MiB and 5 GiB, and a
// maximum object size of 53687091200000 bytes.
// NOTE(review): the cases passing 0 as part_size look like "let the function
// choose the part size" — confirm against s3::calc_part_size.
BOOST_AUTO_TEST_CASE(part_size_calculation_test) {
{
// 490 GiB in 5 MiB parts would need 100352 parts, above the 10000 limit.
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 5_MiB), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with("too many parts: 100352 > 10000");
});
}
{
// An acceptable explicit part size is kept; 490 GiB / 100 MiB rounds up to 5018 parts.
auto [parts, size] = s3::calc_part_size(490_GiB, 100_MiB);
BOOST_REQUIRE_EQUAL(size, 100_MiB);
BOOST_REQUIRE(parts == 5018);
}
{
// An explicit part size below the 5 MiB minimum is rejected.
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 4_MiB), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with("part_size too small: 4194304 is smaller than minimum part size: 5242880");
});
}
{
// part_size 0: a 50 MiB object is expected as one 50 MiB part.
auto [parts, size] = s3::calc_part_size(50_MiB, 0);
BOOST_REQUIRE_EQUAL(size, 50_MiB);
BOOST_REQUIRE_EQUAL(parts, 1);
}
{
// part_size 0: an object just under 50 MiB still reports a 50 MiB part size.
auto [parts, size] = s3::calc_part_size(49_MiB, 0);
BOOST_REQUIRE_EQUAL(size, 50_MiB);
BOOST_REQUIRE_EQUAL(parts, 1);
}
{
// part_size 0: 490 GiB is expected as 9839 parts of 51 MiB.
auto [parts, size] = s3::calc_part_size(490_GiB, 0);
BOOST_REQUIRE_EQUAL(size, 51_MiB);
BOOST_REQUIRE(parts == 9839);
}
{
// Boundary: exactly 10000 parts of 50 MiB.
auto [parts, size] = s3::calc_part_size(50_MiB * 10000, 0);
BOOST_REQUIRE_EQUAL(size, 50_MiB);
BOOST_REQUIRE_EQUAL(parts, 10000);
}
{
// One byte past that boundary: the part size must grow so the part count
// stays within 10000.
auto [parts, size] = s3::calc_part_size(50_MiB * 10000 + 1, 0);
BOOST_REQUIRE(size > 50_MiB);
BOOST_REQUIRE(parts <= 10000);
}
{
// 50 TiB exceeds the maximum object size named in the message.
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(50_TiB, 0), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with("object size too large: 54975581388800 is larger than maximum S3 object size: 53687091200000");
});
}
{
// An explicit part size one byte above 5 GiB is rejected.
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(1_TiB, 5_GiB + 1), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with("part_size too large: 5368709121 is larger than maximum part size: 5368709120");
});
}
{
// part_size 0: 5 TiB is expected as 9987 parts of 525 MiB.
auto [parts, size] = s3::calc_part_size(5_TiB, 0);
BOOST_REQUIRE_EQUAL(parts, 9987);
BOOST_REQUIRE_EQUAL(size, 525_MiB);
}
{
// Explicit 5 MiB parts, exactly at the 10000-part limit: accepted.
auto [parts, size] = s3::calc_part_size(5_MiB * 10000, 5_MiB);
BOOST_REQUIRE_EQUAL(size, 5_MiB);
BOOST_REQUIRE_EQUAL(parts, 10000);
}
{
// Explicit 5 MiB parts, one part over the limit: rejected.
size_t total = 5_MiB * 10001; // 10001 parts at 5 MiB
BOOST_REQUIRE_EXCEPTION(
s3::calc_part_size(total, 5_MiB), std::runtime_error, [](auto& e) { return std::string(e.what()).starts_with("too many parts: 10001 > 10000"); });
}
{
// The computed part size must be a whole number of MiB even when the
// total size is not.
size_t total = 500_GiB + 123; // odd size to force non-MiB alignment
auto [parts, size] = s3::calc_part_size(total, 0);
BOOST_REQUIRE(size % 1_MiB == 0); // aligned
BOOST_REQUIRE(parts <= 10000);
}
{
// part_size 0: a small (6 MiB) object also reports a 50 MiB part size.
auto [parts, size] = s3::calc_part_size(6_MiB, 0);
BOOST_REQUIRE_EQUAL(size, 50_MiB);
BOOST_REQUIRE_EQUAL(parts, 1);
}
{
// An explicit part size larger than the object is kept and gives one part.
auto [parts, size] = s3::calc_part_size(100_MiB, 200_MiB);
BOOST_REQUIRE_EQUAL(parts, 1);
BOOST_REQUIRE_EQUAL(size, 200_MiB);
}
}

View File

@@ -14,11 +14,15 @@ from test.pylib.manager_client import ManagerClient
from test.cluster.auth_cluster import extra_scylla_config_options as auth_config
@pytest.mark.asyncio
async def test_attach_service_level_to_user(request, manager: ManagerClient):
async def __test_attach_service_level_to_user(request, manager: ManagerClient, is_raft: bool):
user = f"test_user_{unique_name()}"
# Start nodes with correct topology
servers = await manager.servers_add(3, config=auth_config)
if is_raft:
servers = await manager.servers_add(3, config=auth_config)
else:
conf = {**auth_config, 'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled'}
servers = [await manager.server_add(config=conf) for _ in range(3)]
cql = manager.get_cql()
logging.info("Waiting until driver connects to every server")
@@ -42,9 +46,28 @@ async def test_attach_service_level_to_user(request, manager: ManagerClient):
for sl in sls:
await cql.run_async(f"ATTACH SERVICE LEVEL {sl} TO {user}")
#if we are not using raft we have to switch the tenant and wait for it to take effect
if not is_raft:
for ip in ips:
await manager.api.client.post('/service_levels/switch_tenants', host=ip)
# Switching tenants may be blocked if a connection is waiting for a request (see 'generic_server::connection::process_until_tenant_switch()').
# Execute enough cheap statements, so that connection on each shard will process at one statement and update its tenant.
for _ in range(100):
read_barrier(manager.api, ip)
assert verify_service_level(sl), f"All connections should be in {sl} service level"
await cql.run_async(f"DETACH SERVICE LEVEL FROM {user}")
await cql.run_async(f"DROP ROLE {user}")
for sl in sls:
await cql.run_async(f"DROP SERVICE LEVEL {sl}")
await cql.run_async(f"DROP SERVICE LEVEL {sl}")
# Runs the shared attach-service-level scenario with is_raft=True, i.e. on three
# nodes started together via manager.servers_add() with the plain auth config.
@pytest.mark.asyncio
async def test_attach_service_level_with_raft(request, manager: ManagerClient):
await __test_attach_service_level_to_user(request, manager, is_raft=True)
# Runs the shared attach-service-level scenario with is_raft=False, i.e. on three
# nodes started one by one with 'force_gossip_topology_changes' enabled and
# tablets disabled for new keyspaces.
@pytest.mark.asyncio
async def test_attach_service_level_with_gossip(request, manager: ManagerClient):
await __test_attach_service_level_to_user(request, manager, is_raft=False)

View File

@@ -146,6 +146,47 @@ async def check_auth_v2_works(manager: ManagerClient, hosts):
await asyncio.gather(*(cql.run_async(f"LIST ROLES OF {username}", host=host) for host in hosts))
await cql.run_async(f"DROP ROLE {username}")
# Starts a 3-node cluster whose topology is reported as "not_upgraded", fills it
# with auth v1 data, triggers the upgrade to raft topology, and then checks both
# the migrated auth data and that auth statements work on every host.
@pytest.mark.asyncio
async def test_auth_v2_migration(request, manager: ManagerClient):
# First, force the first node to start in legacy mode
cfg = {**auth_config, 'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled'}
servers = [await manager.server_add(config=cfg)]
# Enable raft-based node operations for subsequent nodes - they should fall back to
# using gossiper-based node operations
cfg.pop('force_gossip_topology_changes')
servers += [await manager.server_add(config=cfg) for _ in range(2)]
cql = manager.cql
assert(cql)
logging.info("Waiting until driver connects to every server")
# Time limits are passed as absolute time.time()-based values (now + 60 s / + 30 s).
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30)
logging.info("Checking the upgrade state on all nodes")
# Precondition: no node may have upgraded on its own before the test asks for it.
for host in hosts:
status = await manager.api.raft_topology_upgrade_status(host.address)
assert status == "not_upgraded"
# Both helpers run while the cluster still reports "not_upgraded".
# NOTE(review): presumably they create the v1 state that the checks after the
# upgrade look for — confirm against the helper definitions.
await populate_auth_v1_data(manager)
await warmup_v1_static_values(manager, hosts)
logging.info("Triggering upgrade to raft topology")
# The upgrade is requested on the first host only; completion is awaited on all hosts.
await manager.api.upgrade_to_raft_topology(hosts[0].address)
logging.info("Waiting until upgrade finishes")
await asyncio.gather(*(wait_until_topology_upgrade_finishes(manager, h.address, time.time() + 60) for h in hosts))
logging.info("Checking migrated data in system")
await check_auth_v2_data_migration(manager, hosts)
logging.info("Checking auth statements after migration")
await check_auth_v2_works(manager, hosts)
@pytest.mark.asyncio
async def test_auth_v2_during_recovery(manager: ManagerClient):
# FIXME: move this test to the Raft-based recovery procedure or remove it if unneeded.

View File

@@ -8,8 +8,6 @@ from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster, NoHostAvailable
from cassandra import Unauthorized
from cassandra.connection import UnixSocketEndPoint
from cassandra.policies import WhiteListRoundRobinPolicy
from test.cluster.conftest import cluster_con
from test.pylib.manager_client import ManagerClient
@@ -60,7 +58,7 @@ async def test_maintenance_socket(manager: ManagerClient):
else:
pytest.fail("User 'john' has no permissions to access ks2.t1")
maintenance_cluster = cluster_con([UnixSocketEndPoint(socket)], load_balancing_policy=WhiteListRoundRobinPolicy([UnixSocketEndPoint(socket)]))
maintenance_cluster = cluster_con([UnixSocketEndPoint(socket)])
maintenance_session = maintenance_cluster.connect()
# check that the maintenance session has superuser permissions

View File

@@ -62,6 +62,127 @@ async def test_service_levels_snapshot(manager: ManagerClient):
assert set([sl.service_level for sl in result]) == set([sl.service_level for sl in new_result])
# Creates service levels while the cluster reports "not_upgraded" (they are read
# back from system_distributed.service_levels), triggers the upgrade to raft
# topology, and then checks that the same service levels, plus DRIVER_SL_NAME,
# are present in system.service_levels_v2 and that a service level created after
# the upgrade shows up there as well.
@pytest.mark.asyncio
async def test_service_levels_upgrade(request, manager: ManagerClient, build_mode: str):
# First, force the first node to start in legacy mode
cfg = {**auth_config, 'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled'}
servers = [await manager.server_add(config=cfg)]
# Enable raft-based node operations for subsequent nodes - they should fall back to
# using gossiper-based node operations
cfg.pop('force_gossip_topology_changes')
servers += [await manager.server_add(config=cfg) for _ in range(2)]
cql = manager.get_cql()
assert(cql)
logging.info("Waiting until driver connects to every server")
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
logging.info("Checking the upgrade state on all nodes")
for host in hosts:
status = await manager.api.raft_topology_upgrade_status(host.address)
assert status == "not_upgraded"
# Five uniquely named service levels, created before the upgrade.
sls = ["sl" + unique_name() for _ in range(5)]
for sl in sls:
await cql.run_async(f"CREATE SERVICE LEVEL {sl}")
result = await cql.run_async("SELECT service_level FROM system_distributed.service_levels")
assert set([sl.service_level for sl in result]) == set(sls)
# The injection is only enabled in the "debug" and "dev" build modes.
if build_mode in ("debug", "dev"):
# See scylladb/scylladb/#24963 for more details
logging.info("Enabling an error injection in legacy role manager, to check that we don't query auth in system_auth")
await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "standard_role_manager_fail_legacy_query", one_shot=False) for s in servers))
logging.info("Triggering upgrade to raft topology")
await manager.api.upgrade_to_raft_topology(hosts[0].address)
logging.info("Waiting until upgrade finishes")
await asyncio.gather(*(wait_until_topology_upgrade_finishes(manager, h.address, time.time() + 60) for h in hosts))
# After the upgrade the v2 table is expected to also contain DRIVER_SL_NAME,
# so wait for it before comparing the sets.
await wait_until_driver_service_level_created(manager, time.time() + 60)
result_v2 = await cql.run_async("SELECT service_level FROM system.service_levels_v2")
assert set([sl.service_level for sl in result_v2]) == set(sls + [DRIVER_SL_NAME])
sl_v2 = "sl" + unique_name()
await cql.run_async(f"CREATE SERVICE LEVEL {sl_v2}")
# Run a read barrier on every host before reading the table back.
await asyncio.gather(*(read_barrier(manager.api, get_host_api_address(host)) for host in hosts))
result_with_sl_v2 = await cql.run_async(f"SELECT service_level FROM system.service_levels_v2")
assert set([sl.service_level for sl in result_with_sl_v2]) == set(sls + [DRIVER_SL_NAME] + [sl_v2])
# Checks service levels across a recovery cycle: in recovery mode they must be
# listed from the v2 table (a row inserted only into the old table must not
# appear) and CREATE/ALTER/DROP must be rejected; after the cluster is restored
# and upgraded to raft topology again, creating service levels must work.
@pytest.mark.asyncio
async def test_service_levels_work_during_recovery(manager: ManagerClient):
# FIXME: move this test to the Raft-based recovery procedure or remove it if unneeded.
servers = await manager.servers_add(3, config=auth_config, auto_rack_dc="dc1")
logging.info("Waiting until driver connects to every server")
cql = manager.get_cql()
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
logging.info("Creating a bunch of service levels")
sls = ["sl" + unique_name() for _ in range(5)]
for sl in sls:
await cql.run_async(f"CREATE SERVICE LEVEL {sl}")
# Insert a service level into the old table, as if it had been created before the upgrade to v2 and removed after the upgrade
sl_v1 = "sl" + unique_name()
await cql.run_async(f"INSERT INTO system_distributed.service_levels (service_level) VALUES ('{sl_v1}')")
logging.info("Validating service levels were created in v2 table")
result = await cql.run_async("SELECT service_level FROM system.service_levels_v2")
for sl in result:
assert sl.service_level in sls + [DRIVER_SL_NAME]
logging.info(f"Restarting hosts {hosts} in recovery mode")
await asyncio.gather(*(enter_recovery_state(cql, h) for h in hosts))
await manager.rolling_restart(servers)
# The session and host list are re-created after the restart.
cql = await reconnect_driver(manager)
logging.info("Cluster restarted, waiting until driver reconnects to every server")
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
logging.info("Checking service levels can be read and v2 table is used")
recovery_result = await cql.run_async("LIST ALL SERVICE LEVELS")
# sl_v1 exists only in the old table, so its absence shows the v2 table is the source.
assert sl_v1 not in [sl.service_level for sl in recovery_result]
assert set([sl.service_level for sl in recovery_result]) == set(sls + [DRIVER_SL_NAME])
logging.info("Checking changes to service levels are forbidden during recovery")
with pytest.raises(InvalidRequest, match="The cluster is in recovery mode. Changes to service levels are not allowed."):
await cql.run_async(f"CREATE SERVICE LEVEL sl_{unique_name()}")
with pytest.raises(InvalidRequest, match="The cluster is in recovery mode. Changes to service levels are not allowed."):
await cql.run_async(f"ALTER SERVICE LEVEL {sls[0]} WITH timeout = 1h")
with pytest.raises(InvalidRequest, match="The cluster is in recovery mode. Changes to service levels are not allowed."):
await cql.run_async(f"DROP SERVICE LEVEL {sls[0]}")
logging.info("Restoring cluster to normal status")
await asyncio.gather(*(delete_raft_topology_state(cql, h) for h in hosts))
await asyncio.gather(*(delete_raft_data_and_upgrade_state(cql, h) for h in hosts))
await manager.rolling_restart(servers)
cql = await reconnect_driver(manager)
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
await asyncio.gather(*(wait_until_upgrade_finishes(cql, h, time.time() + 60) for h in hosts))
# After the raft state was deleted, every node must report "not_upgraded"
# before the topology upgrade is triggered again.
for host in hosts:
status = await manager.api.raft_topology_upgrade_status(host.address)
assert status == "not_upgraded"
await manager.servers_see_each_other(servers)
await manager.api.upgrade_to_raft_topology(hosts[0].address)
await asyncio.gather(*(wait_until_topology_upgrade_finishes(manager, h.address, time.time() + 60) for h in hosts))
await wait_until_driver_service_level_created(manager, time.time() + 60)
logging.info("Validating service levels works in v2 mode after leaving recovery")
new_sl = "sl" + unique_name()
await cql.run_async(f"CREATE SERVICE LEVEL {new_sl}")
sls_list = await cql.run_async("LIST ALL SERVICE LEVELS")
assert sl_v1 not in [sl.service_level for sl in sls_list]
assert set([sl.service_level for sl in sls_list]) == set(sls + [new_sl] + [DRIVER_SL_NAME])
def default_timeout(mode):
if mode == "dev":
return "30s"
@@ -263,6 +384,50 @@ async def test_shares_check(manager: ManagerClient):
await cql.run_async(f"CREATE SERVICE LEVEL {sl2} WITH shares=500")
await cql.run_async(f"ALTER SERVICE LEVEL {sl1} WITH shares=100")
# Starts three nodes with the WORKLOAD_PRIORITIZATION feature suppressed and the
# v1 service levels table created without `shares`, verifies that `shares` is
# unavailable, then removes the injections, restarts the cluster and verifies
# that the column exists and can be used.
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injection is not supported in release mode')
async def test_workload_prioritization_upgrade(manager: ManagerClient):
# This test simulates OSS->enterprise upgrade in v1 service levels.
# Using error injection, the test disables WORKLOAD_PRIORITIZATION feature
# and removes `shares` column from system_distributed.service_levels table.
config = {
**auth_config,
'authenticator': 'AllowAllAuthenticator',
'authorizer': 'AllowAllAuthorizer',
'force_gossip_topology_changes': True,
'tablets_mode_for_new_keyspaces': 'disabled',
'error_injections_at_startup': [
{
'name': 'suppress_features',
'value': 'WORKLOAD_PRIORITIZATION'
},
{
'name': 'service_levels_v1_table_without_shares'
}
]
}
servers = [await manager.server_add(config=config) for _ in range(3)]
cql = manager.get_cql()
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
# Validate that service levels' table has no `shares` column
sl_schema = await cql.run_async("DESC TABLE system_distributed.service_levels")
assert "shares int" not in sl_schema[0].create_statement
# Without the column, a statement that sets `shares` must be rejected.
with pytest.raises(InvalidRequest):
await cql.run_async("CREATE SERVICE LEVEL sl1 WITH shares = 100")
# Do rolling restart of the cluster and remove error injections
for server in servers:
await manager.server_update_config(server.server_id, 'error_injections_at_startup', [])
await manager.rolling_restart(servers)
# Validate that `shares` column was added
logs = [await manager.server_open_log(server.server_id) for server in servers]
# Only the first server's log is waited on; either of the two messages is accepted.
await logs[0].wait_for("Workload prioritization v1 started|Workload prioritization v1 is already started", timeout=10)
sl_schema_upgraded = await cql.run_async("DESC TABLE system_distributed.service_levels")
assert "shares int" in sl_schema_upgraded[0].create_statement
await cql.run_async("CREATE SERVICE LEVEL sl2 WITH shares = 100")
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injection is disabled in release mode')
async def test_service_levels_over_limit(manager: ManagerClient):

View File

@@ -262,17 +262,14 @@ async def manager(request: pytest.FixtureRequest,
# Check if the test has the check_nodes_for_errors marker
found_errors = await manager_client.check_all_errors(check_all_errors=(request.node.get_closest_marker("check_nodes_for_errors") is not None))
failed = failed or found_errors
failed_test_dir_path = None
if failed or found_errors:
if failed:
# Save scylladb logs for failed tests in a separate directory and copy XML report to the same directory to have
# all related logs in one dir.
# Then add property to the XML report with the path to the directory, so it can be visible in Jenkins
failed_test_dir_path = testpy_test.suite.log_dir / "failed_test" / test_case_name.translate(
str.maketrans('[]', '()'))
failed_test_dir_path = testpy_test.suite.log_dir / "failed_test" / test_case_name.translate(str.maketrans('[]', '()'))
failed_test_dir_path.mkdir(parents=True, exist_ok=True)
if failed:
await manager_client.gather_related_logs(
failed_test_dir_path,
{'pytest.log': test_log, 'test_py.log': test_py_log_test}
@@ -288,9 +285,7 @@ async def manager(request: pytest.FixtureRequest,
cluster_status = await manager_client.after_test(test_case_name, not failed)
await manager_client.stop() # Stop client session and close driver after each test
if cluster_status["server_broken"] and not failed:
failed = True
if cluster_status["server_broken"]:
pytest.fail(
f"test case {test_case_name} left unfinished tasks on Scylla server. Server marked as broken,"
f" server_broken_reason: {cluster_status["message"]}"
@@ -329,8 +324,7 @@ async def manager(request: pytest.FixtureRequest,
with open(failed_test_dir_path / "found_errors.txt", "w") as f:
f.write("\n".join(full_message))
if not failed:
pytest.fail(f"\n{'\n'.join(full_message)}")
pytest.fail(f"\n{'\n'.join(full_message)}")
# "cql" fixture: set up client object for communicating with the CQL API.
# Since connection is managed by manager just return that object

View File

@@ -13,6 +13,7 @@ class DTestConfig:
self.num_tokens = -1
self.experimental_features = []
self.tablets = False
self.force_gossip_topology_changes = False
self.scylla_features = set()
def setup(self, request):
@@ -20,6 +21,7 @@ class DTestConfig:
self.num_tokens = request.config.getoption("--num-tokens")
self.experimental_features = request.config.getoption("--experimental-features") or set()
self.tablets = request.config.getoption("--tablets", default=False)
self.force_gossip_topology_changes = request.config.getoption("--force-gossip-topology-changes", default=False)
self.scylla_features = request.config.scylla_features
@property

View File

@@ -526,6 +526,10 @@ class DTestSetup:
experimental_features.append(f)
self.scylla_features |= set(values.get("experimental_features", []))
if self.dtest_config.force_gossip_topology_changes:
logger.debug("Forcing gossip topology changes")
values["force_gossip_topology_changes"] = True
logger.debug("Setting 'enable_tablets' to %s", self.dtest_config.tablets)
values["enable_tablets"] = self.dtest_config.tablets
values["tablets_mode_for_new_keyspaces"] = "enabled" if self.dtest_config.tablets else "disabled"

View File

@@ -1,102 +0,0 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import pytest
from cassandra.cluster import Session
from cassandra.protocol import ConfigurationException, InvalidRequest
from dtest_class import Tester
logger = logging.getLogger(__name__)
def create_ks_and_assert_warning(session, query, ks_name, key_warn_msg_words):
    """Run `query`, optionally check its warnings, then switch to `ks_name`.

    When `key_warn_msg_words` is non-empty, the response must carry at least
    one warning, and at least one of those warnings (lower-cased) must contain
    every word from `key_warn_msg_words`. When it is empty, warnings are not
    inspected at all. In both cases `USE <ks_name>` is executed afterwards.
    """
    future = session.execute_async(query)
    future.result()  # block until the statement completes

    if len(key_warn_msg_words) > 0:
        assert len(future.warnings) >= 1, "Expected RF guardrail warning"
        matched = any(
            all(word in text.lower() for word in key_warn_msg_words)
            for text in future.warnings
        )
        assert matched, "Didn't match all required keywords"

    session.execute(f"USE {ks_name}")
def assert_creating_ks_fails(session, query, ks_name):
    """Assert that `query` is rejected and that `ks_name` cannot be used afterwards.

    `query` must raise ConfigurationException, and the follow-up
    `USE <ks_name>` must raise InvalidRequest.
    """
    expectations = (
        (ConfigurationException, query),
        (InvalidRequest, f"USE {ks_name}"),
    )
    for expected_error, statement in expectations:
        with pytest.raises(expected_error):
            session.execute(statement)
@pytest.mark.next_gating
class TestGuardrails(Tester):
    """Tests for the replication-factor guardrails applied to CREATE KEYSPACE."""

    def test_default_rf(self):
        """
        As of now, the only RF guardrail enabled is a soft limit checking that RF >= 3. Not complying with this soft
        limit results in the CQL statement being executed, but with a warning. Also, whatever the guardrails' values,
        RF = 0 is always OK.
        """
        cluster = self.cluster

        # FIXME: This test verifies that guardrails work. However, if we set `rf_rack_valid_keyspaces` to true,
        # we'll get a different error, so let's disable it for now. For more context, see issues:
        # scylladb/scylladb#23071 and scylladb/scylla-dtest#5633.
        cluster.set_configuration_options(values={"rf_rack_valid_keyspaces": False})

        # Three datacenters with one node each.
        cluster.populate([1, 1, 1]).start(wait_other_notice=True)
        session_dc1: Session = self.patient_cql_connection(cluster.nodelist()[0])

        ks_name = "ks"
        rf = {"dc1": 2, "dc2": 3, "dc3": 0}
        query = "CREATE KEYSPACE %s WITH REPLICATION={%s}"
        options = ", ".join(["'%s':%d" % (dc_value, rf_value) for dc_value, rf_value in rf.items()])
        query = query % (ks_name, "'class':'NetworkTopologyStrategy', %s" % options)

        # Only dc1 (RF = 2) is expected to be mentioned in the warning.
        create_ks_and_assert_warning(session_dc1, query, ks_name, ["warn", "min", "replication", "factor", "3", "dc1", "2"])

    def test_all_rf_limits(self):
        """
        There are 4 limits for RF: soft/hard min and soft/hard max limits. Breaking soft limits issues a warning,
        breaking the hard limits prevents the query from being executed.
        """
        cluster = self.cluster

        MIN_FAIL_THRESHOLD = 2
        MIN_WARN_THRESHOLD = 3
        MAX_WARN_THRESHOLD = 4
        MAX_FAIL_THRESHOLD = 5

        # FIXME: This test verifies that guardrails work. However, if we set `rf_rack_valid_keyspaces` to true,
        # we'll get a different error, so let's disable it for now. For more context, see issues:
        # scylladb/scylladb#23071 and scylladb/scylla-dtest#5633.
        cluster.set_configuration_options(values={"rf_rack_valid_keyspaces": False})
        cluster.set_configuration_options(
            values={
                "minimum_replication_factor_fail_threshold": MIN_FAIL_THRESHOLD,
                "minimum_replication_factor_warn_threshold": MIN_WARN_THRESHOLD,
                "maximum_replication_factor_warn_threshold": MAX_WARN_THRESHOLD,
                "maximum_replication_factor_fail_threshold": MAX_FAIL_THRESHOLD,
            }
        )

        query = "CREATE KEYSPACE %s WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1': %s}"
        cluster.populate([1]).start()
        node = cluster.nodelist()[0]
        session = self.patient_cql_connection(node)

        def check_rf(rf):
            # Each RF gets its own keyspace so the outcomes do not interfere.
            ks_name = f"ks_{rf}"
            if rf < MIN_FAIL_THRESHOLD or rf > MAX_FAIL_THRESHOLD:
                assert_creating_ks_fails(session, query % (ks_name, rf), ks_name)
            elif rf < MIN_WARN_THRESHOLD:
                # The warning is expected to mention the RF that was actually requested.
                create_ks_and_assert_warning(session, query % (ks_name, rf), ks_name, ["warn", "min", "replication", "factor", str(MIN_WARN_THRESHOLD), "dc1", str(rf)])
            elif rf > MAX_WARN_THRESHOLD:
                create_ks_and_assert_warning(session, query % (ks_name, rf), ks_name, ["warn", "max", "replication", "factor", str(MAX_WARN_THRESHOLD), "dc1", str(rf)])
            else:
                create_ks_and_assert_warning(session, query % (ks_name, rf), ks_name, [])

        # Cover one value below the hard minimum through one value above the hard
        # maximum. range() excludes its stop value, so `+ 2` is required for
        # MAX_FAIL_THRESHOLD + 1 to be exercised.
        for rf in range(MIN_FAIL_THRESHOLD - 1, MAX_FAIL_THRESHOLD + 2):
            check_rf(rf)

View File

@@ -33,7 +33,8 @@ logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_mv_topology_change(manager: ManagerClient):
cfg = {'tablets_mode_for_new_keyspaces': 'disabled',
cfg = {'force_gossip_topology_changes': True,
'tablets_mode_for_new_keyspaces': 'disabled',
'error_injections_at_startup': ['delay_before_get_view_natural_endpoint']}
servers = [await manager.server_add(config=cfg) for _ in range(3)]

View File

@@ -9,6 +9,8 @@ extra_scylla_config_options:
rf_rack_valid_keyspaces: True
tablets_mode_for_new_keyspaces: enabled
run_first:
- test_raft_recovery_stuck
- test_raft_recovery_basic
- test_group0_schema_versioning
- test_tablets_migration
- test_zero_token_nodes_topology_ops
@@ -36,6 +38,8 @@ run_in_release:
run_in_dev:
- test_raft_ignore_nodes
- test_group0_schema_versioning
- test_different_group0_ids
- test_replace_ignore_nodes
- test_zero_token_nodes_no_replication
- test_not_enough_token_owners
- test_replace_alive_node
@@ -47,6 +51,5 @@ run_in_dev:
- dtest/commitlog_test
- dtest/cfid_test
- dtest/rebuild_test
- dtest/guardrails_test
run_in_debug:
- random_failures/test_random_failures

View File

@@ -0,0 +1,52 @@
#
# Copyright (C) 2023-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import time
import pytest
import logging
from test.pylib.manager_client import ManagerClient
from test.cluster.util import wait_for_token_ring_and_group0_consistency
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
async def test_boot_after_ip_change(manager: ManagerClient) -> None:
    """Add a node to a cluster in which an existing node has changed its IP.

    Regression test for #14468. Does not apply to Raft-topology mode.
    """
    cfg = {
        'enable_user_defined_functions': False,
        'force_gossip_topology_changes': True,
        'tablets_mode_for_new_keyspaces': 'disabled',
    }

    logger.info("Booting initial cluster")
    servers = []
    for _ in range(2):
        servers.append(await manager.server_add(config=cfg))
    await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30)

    moved = servers[1]
    logger.info(f"Stopping server {moved}")
    await manager.server_stop_gracefully(moved.server_id)

    logger.info(f"Changing IP of server {moved}")
    new_ip = await manager.server_change_ip(moved.server_id)
    moved = moved._replace(ip_addr = new_ip)
    servers[1] = moved
    logger.info(f"New IP: {new_ip}")

    logger.info(f"Restarting server {moved}")
    await manager.server_start(moved.server_id)

    # The wait below has to happen before a new node is booted. Without it,
    # the booting node could reach servers[0] before servers[0] has learned
    # the new IP of servers[1]; it would then wait for servers[1] to become
    # alive under its old IP and eventually time out.
    #
    # The test remains a valid regression test for #14468: there, a booting
    # node kept waiting for the old IP of servers[0] even after every existing
    # server had observed the IP change.
    logger.info(f"Wait until {servers[0]} sees the new IP of {servers[1]}")
    await manager.server_sees_other_server(servers[0].ip_addr, servers[1].ip_addr)

    logger.info("Booting new node")
    await manager.server_add(config=cfg)

View File

@@ -6,30 +6,52 @@
from test.pylib.manager_client import ManagerClient
import asyncio
import pytest
from test.pylib.util import wait_for_first_completed
@pytest.mark.asyncio
async def test_different_group0_ids(manager: ManagerClient):
"""
The test starts two single-node clusters (with different group0_ids). Node B (the
node from the second cluster) is restarted with seeds containing node A (the node
from the first cluster), and thus it tries to gossip node A. The test checks that
node A rejects gossip_digest_syn.
The reproducer for #14448.
Note: this test relies on the fact that the only node in a single-node cluster
always gossips with its seeds. This can be considered a bug, although a mild one.
If we ever fix it, this test can be rewritten by starting a two-node cluster and
recreating group0 on one of the nodes via the recovery procedure.
The test starts two nodes with different group0_ids. The second node
is restarted and tries to join the cluster consisting of the first node.
gossip_digest_syn message should be rejected by the first node, so
the second node will not be able to join the cluster.
This test uses repair-based node operations to make this test easier.
If the second node successfully joins the cluster, their tokens metadata
will be merged and the repair service will allow to decommission the second node.
If not - decommissioning the second node will fail with an exception
"zero replica after the removal" thrown by the repair service.
"""
scylla_a = await manager.server_add()
scylla_b = await manager.server_add(start=False)
# Consistent topology changes are disabled to use repair based node operations.
cfg = {'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled'}
scylla_a = await manager.server_add(config = cfg)
scylla_b = await manager.server_add(start=False, config = cfg)
await manager.server_start(scylla_b.server_id, seeds=[scylla_b.ip_addr])
id_b = await manager.get_host_id(scylla_b.server_id)
await manager.server_stop(scylla_b.server_id)
await manager.server_start(scylla_b.server_id, seeds=[scylla_a.ip_addr])
await manager.server_start(scylla_b.server_id, seeds=[scylla_a.ip_addr, scylla_b.ip_addr])
log_file_a = await manager.server_open_log(scylla_a.server_id)
await log_file_a.wait_for(f'Group0Id mismatch from {id_b}', timeout=30)
log_file_b = await manager.server_open_log(scylla_b.server_id)
# Wait for a gossip round to finish
await wait_for_first_completed([
log_file_b.wait_for(f'InetAddress {scylla_a.ip_addr} is now UP'), # The second node joins the cluster
log_file_a.wait_for(f'Group0Id mismatch') # The first node discards gossip from the second node
])
# Check if decommissioning the second node fails.
# Repair service throws a runtime exception "zero replica after the removal"
# when it tries to remove the only one node from the cluster.
# If it is not thrown, it means that the second node successfully sent a gossip message
# to the first node and they merged their token metadata.
with pytest.raises(Exception, match='zero replica after the removal'):
await manager.decommission_node(scylla_b.server_id)

View File

@@ -1,140 +0,0 @@
#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
"""
Test for error injection event stream functionality.
This test demonstrates the new SSE-based error injection event system
that eliminates the need for log parsing in tests.
"""
import asyncio
import logging
import pytest
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import injection_event_stream
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@pytest.mark.skip_mode('release', 'error injections are not supported in release mode')
async def test_injection_event_stream_basic(manager: ManagerClient):
    """
    Smoke test for the error injection event stream.

    Checks that:
    1. a connection to the SSE event stream can be opened,
    2. an event arrives after an injection is triggered,
    3. a specific injection can be awaited without reading server logs.
    """
    server_ip = (await manager.servers_add(1))[0].ip_addr
    test_injection_name = "test_injection_event_basic"

    async with injection_event_stream(server_ip) as event_stream:
        logger.info("Connected to injection event stream")

        # Arm a one-shot injection and fire it explicitly through the REST API.
        # In real tests the injection would be hit by the code under test.
        await manager.api.enable_injection(server_ip, test_injection_name, one_shot=True)
        await manager.api.message_injection(server_ip, test_injection_name)

        try:
            event = await event_stream.wait_for_injection(test_injection_name, timeout=10.0)
            logger.info(f"Received injection event: {event}")

            # The event must name the injection and carry `type` and `shard` fields.
            assert event['injection'] == test_injection_name
            for required_key in ('type', 'shard'):
                assert required_key in event

            logger.info(f"✓ Injection triggered on shard {event['shard']} with type {event['type']}")
        except asyncio.TimeoutError:
            pytest.fail(f"Injection event for '{test_injection_name}' not received within timeout")
@pytest.mark.asyncio
@pytest.mark.skip_mode('release', 'error injections are not supported in release mode')
async def test_injection_event_stream_multiple_injections(manager: ManagerClient):
    """
    Verify that several distinct injections can be followed on one event stream.
    """
    server_ip = (await manager.servers_add(1))[0].ip_addr
    injection_names = [f"test_injection_{index}" for index in (1, 2, 3)]

    async with injection_event_stream(server_ip) as event_stream:
        logger.info("Connected to injection event stream")

        # Arm all injections up front (not one-shot, so they stay enabled).
        for injection in injection_names:
            await manager.api.enable_injection(server_ip, injection, one_shot=False)

        # Fire them one at a time and wait for the matching event each time.
        for injection in injection_names:
            await manager.api.message_injection(server_ip, injection)
            event = await event_stream.wait_for_injection(injection, timeout=10.0)
            logger.info(f"✓ Received event for {injection}: type={event['type']}, shard={event['shard']}")

        # Cleanup
        for injection in injection_names:
            await manager.api.disable_injection(server_ip, injection)

        logger.info("✓ All injection events received successfully")
@pytest.mark.asyncio
@pytest.mark.skip_mode('release', 'error injections are not supported in release mode')
async def test_injection_event_vs_log_parsing_comparison(manager: ManagerClient):
    """
    Demonstration test: log-parsing approach vs. event-stream approach.

    Only the event-stream variant is executed; the log-parsing variant is kept
    below as a comment for comparison.
    """
    server = (await manager.servers_add(1))[0]
    server_ip = server.ip_addr
    injection_name = "test_comparison_injection"

    # Log-parsing pattern, shown for reference only (not executed):
    #
    #   log = await manager.server_open_log(server.server_id)
    #   mark = await log.mark()
    #   await manager.api.enable_injection(server.ip_addr, injection_name, one_shot=True)
    #   # ... trigger some operation that hits the injection ...
    #   mark, _ = await log.wait_for(f'{injection_name}: waiting', from_mark=mark)
    #   # at this point the log line tells us the injection was hit

    # Event-stream pattern: subscribe, arm and fire the injection, then wait
    # for the corresponding event instead of searching the log.
    async with injection_event_stream(server_ip) as event_stream:
        logger.info("✓ Connected to injection event stream (no log parsing needed)")

        await manager.api.enable_injection(server_ip, injection_name, one_shot=True)
        await manager.api.message_injection(server_ip, injection_name)

        event = await event_stream.wait_for_injection(injection_name, timeout=10.0)
        logger.info(f"✓ Injection detected via event stream: {event}")

    logger.info("✓ New event stream approach is faster and more reliable than log parsing!")

View File

@@ -0,0 +1,21 @@
import pytest
from test.pylib.manager_client import ManagerClient
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_gossip_boot(manager: ManagerClient):
    """
    Regression test for scylladb/scylladb#17493.

    Boots three nodes in gossip topology mode with the
    `gossiper_replicate_sleep` injection enabled at startup, then waits until
    every node's log reports that `handle_state_normal` finished for every node.
    """
    cfg = {
        'error_injections_at_startup': ['gossiper_replicate_sleep'],
        'force_gossip_topology_changes': True,
        'tablets_mode_for_new_keyspaces': 'disabled',
    }

    servers = []
    for _ in range(3):
        servers.append(await manager.server_add(config=cfg, timeout=60))

    logs = []
    for node in servers:
        logs.append(await manager.server_open_log(node.server_id))

    # One pattern per node; each log must eventually match all of them.
    expected_lines = [f'handle_state_normal for {node.ip_addr}.*finished' for node in servers]
    for node_log in logs:
        for pattern in expected_lines:
            await node_log.wait_for(pattern, timeout=60)

View File

@@ -0,0 +1,358 @@
#
# Copyright (C) 2023-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import asyncio
import time
import pytest
import logging
import re
from uuid import UUID
from cassandra.cluster import Session, ConsistencyLevel # type: ignore
from cassandra.query import SimpleStatement # type: ignore
from cassandra.pool import Host # type: ignore
from test.pylib.manager_client import ManagerClient, ServerInfo
from test.pylib.util import wait_for, wait_for_cql_and_get_hosts
from test.pylib.log_browsing import ScyllaLogFile
from test.cluster.util import reconnect_driver, wait_until_upgrade_finishes, \
enter_recovery_state, delete_raft_data_and_upgrade_state, new_test_keyspace
logger = logging.getLogger(__name__)
async def get_local_schema_version(cql: Session, h: Host) -> UUID:
    """Return `schema_version` from the `system.local` row of host `h`.

    Asserts that the query returned at least one row.
    """
    rows = await cql.run_async("select schema_version from system.local where key = 'local'", host=h)
    assert rows
    local_row = rows[0]
    return local_row.schema_version
async def get_group0_schema_version(cql: Session, h: Host) -> UUID | None:
    """Return the `group0_schema_version` stored in `system.scylla_local` on host `h`.

    Returns None when the key has no row.
    """
    rows = await cql.run_async("select value from system.scylla_local where key = 'group0_schema_version'", host=h)
    if not rows:
        return None
    return UUID(rows[0].value)
async def get_scylla_tables_versions(cql: Session, h: Host) -> list[tuple[str, str, UUID | None]]:
    """Return `(keyspace_name, table_name, version)` for every row of
    `system_schema.scylla_tables` as seen by host `h`."""
    rows = await cql.run_async("select keyspace_name, table_name, version from system_schema.scylla_tables", host=h)
    result = []
    for row in rows:
        result.append((row.keyspace_name, row.table_name, row.version))
    return result
async def get_scylla_tables_version(cql: Session, h: Host, keyspace_name: str, table_name: str) -> UUID | None:
    """Return the `version` column of one `system_schema.scylla_tables` row on host `h`.

    Fails the test if no row exists for `keyspace_name.table_name`.
    """
    query = (
        "select version from system_schema.scylla_tables"
        f" where keyspace_name = '{keyspace_name}' and table_name = '{table_name}'"
    )
    rows = await cql.run_async(query, host=h)
    if rows:
        return rows[0].version
    pytest.fail(f"No scylla_tables row found for {keyspace_name}.{table_name}")
async def verify_local_schema_versions_synced(cql: Session, hs: list[Host]) -> None:
    """Wait until all hosts in `hs` report the same `system.local` schema version.

    The check is retried through `wait_for` with a 5 second deadline and a
    1 second period.
    """
    async def all_versions_equal():
        by_host = {}
        for host in hs:
            by_host[host] = await get_local_schema_version(cql, host)
        logger.info(f"system.local schema_versions: {by_host}")

        # Compare every host against the first one.
        ref_host, ref_version = next(iter(by_host.items()))
        for host, version in by_host.items():
            if version != ref_version:
                logger.info(f"{ref_host}'s system.local schema_version {ref_version} is different than {host}'s version {version}; retrying")
                # None makes wait_for try again.
                return None
        return True

    await wait_for(all_versions_equal, deadline=time.time() + 5.0, period=1.0)
async def verify_group0_schema_versions_synced(cql: Session, hs: list[Host]) -> None:
    """Fail the test unless all hosts in `hs` store the same `group0_schema_version`.

    Unlike the `system.local` check, this one is performed once, without retries.
    """
    by_host = {}
    for host in hs:
        by_host[host] = await get_group0_schema_version(cql, host)
    logger.info(f"system.scylla_local group0_schema_versions: {by_host}")

    # Compare every host against the first one.
    ref_host, ref_version = next(iter(by_host.items()))
    for host, version in by_host.items():
        if version == ref_version:
            continue
        pytest.fail(f"{ref_host}'s system.scylla_local group0_schema_version {ref_version} is different than {host}'s version {version}")
async def verify_scylla_tables_versions_synced(cql: Session, hs: list[Host], ignore_system_tables: bool) -> None:
    """Fail the test unless `system_schema.scylla_tables` is identical on all hosts in `hs`.

    When `ignore_system_tables` is true, differences in rows whose keyspace is
    `system` are ignored.
    """
    tables_by_host = {}
    for host in hs:
        tables_by_host[host] = set(await get_scylla_tables_versions(cql, host))
    logger.info(f"system_schema.scylla_tables: {tables_by_host}")

    # Compare every host against the first one.
    ref_host, ref_tables = next(iter(tables_by_host.items()))
    for host, tables in tables_by_host.items():
        mismatch = tables ^ ref_tables
        if ignore_system_tables:
            mismatch = {entry for entry in mismatch if entry[0] != "system"}
        if mismatch:
            pytest.fail(f"{ref_host}'s system_schema.scylla_tables contents is different than {host}'s, symmetric diff: {mismatch}")
async def verify_table_versions_synced(cql: Session, hs: list[Host], ignore_system_tables: bool = False) -> None:
    """Run the three table-stored schema version checks on hosts `hs`.

    Order: group 0 schema version, `system.local` schema version, then
    `system_schema.scylla_tables` contents. `ignore_system_tables` is passed
    to the last check only.
    """
    logger.info("Verifying that versions stored in tables are in sync")
    for check in (verify_group0_schema_versions_synced, verify_local_schema_versions_synced):
        await check(cql, hs)
    await verify_scylla_tables_versions_synced(cql, hs, ignore_system_tables)
async def verify_in_memory_table_versions(srvs: list[ServerInfo], logs: list[ScyllaLogFile], marks: list[int], table):
    """
    Check that every server logged the same version in its `Altering {table}...` message.

    Assumes that `logs` are the log files of servers `srvs`, in the same order.
    Assumes that `marks` are log markers (obtained by `ScyllaLogFile.mark()`) for `logs`, in the same order.
    Assumes that an `alter table {table} ...` statement was executed after `marks` were taken.
    """
    logger.info("Verifying that in-memory table schema versions are in sync")

    # Collect the matching log lines of every server first.
    pattern = f"Altering {table}.*version=(.*)"
    matches_per_server = []
    for log, mark in zip(logs, marks):
        matches_per_server.append(await log.grep(pattern, from_mark=mark))

    # Take the version from the first matching line of each server.
    versions = {}
    for srv, srv_matches in zip(srvs, matches_per_server):
        if not srv_matches:
            pytest.fail(f"Server {srv} didn't log 'Altering' message")
        _, first_match = srv_matches[0]
        versions[srv] = UUID(first_match.group(1))
    logger.info(f"In-memory table versions: {versions}")

    # Compare every server against the first one.
    ref_srv, ref_version = next(iter(versions.items()))
    for srv, version in versions.items():
        if version != ref_version:
            pytest.fail(f"{ref_srv}'s in-memory table version {ref_version} is different than {srv}'s version {version}")
@pytest.mark.asyncio
async def test_schema_versioning_with_recovery(manager: ManagerClient):
    """
    Perform schema changes while mixing nodes in RECOVERY mode with nodes in group 0 mode.
    Schema changes originating from RECOVERY node use digest-based schema versioning.
    Schema changes originating from group 0 nodes use persisted versions committed through group 0.
    Verify that schema versions are in sync after each schema change.
    """
    cfg = {'enable_user_defined_functions': False,
           'force_gossip_topology_changes': True,
           'tablets_mode_for_new_keyspaces': 'disabled'}
    logger.info("Booting cluster")
    # Must bootstrap sequentially because of gossip topology changes
    servers = [await manager.server_add(config=cfg, property_file={"dc":"dc1", "rack":f"rack{i+1}"}) for i in range(3)]
    cql = manager.get_cql()
    hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

    logger.info("Creating keyspace and table")
    async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") as ks_name:
        await verify_table_versions_synced(cql, hosts)
        table_name = "t"
        table = f"{ks_name}.{table_name}"
        await cql.run_async(f"create table {table} (pk int primary key)")

        logger.info("Waiting for driver")
        await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
        await verify_table_versions_synced(cql, hosts)
        ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name)
        assert ks_t_version

        logs = [await manager.server_open_log(srv.server_id) for srv in servers]
        marks = [await log.mark() for log in logs]

        # Phase 1: ALTER while all nodes are in group 0 mode. The persisted
        # version must be set and must change.
        logger.info("Altering table")
        await cql.run_async(f"alter table {table} with comment = ''")
        await verify_table_versions_synced(cql, hosts)
        await verify_in_memory_table_versions(servers, logs, marks, table)
        new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name)
        assert new_ks_t_version
        assert new_ks_t_version != ks_t_version
        ks_t_version = new_ks_t_version

        # Phase 2: put the first node into RECOVERY mode.
        # We still have a group 0 majority, don't do this at home.
        srv1 = servers[0]
        logger.info(f"Rebooting {srv1} in RECOVERY mode")
        h1 = next(h for h in hosts if h.address == srv1.ip_addr)
        await cql.run_async("update system.scylla_local set value = 'recovery' where key = 'group0_upgrade_state'", host=h1)
        await manager.server_restart(srv1.server_id)
        cql = await reconnect_driver(manager)
        logger.info("Waiting for driver")
        await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
        await verify_table_versions_synced(cql, hosts)

        # We're doing a schema change on RECOVERY node while we have two nodes running in group 0 mode.
        # Don't do this at home.
        #
        # Now, the two nodes are not doing any schema changes right now, so this doesn't actually break anything:
        # the RECOVERY node is operating using the old schema change procedure, which means
        # that it pushes the schema mutations to other nodes directly with RPC, modifying
        # the group 0 state machine on other two nodes.
        #
        # There is one problem with this however. If the RECOVERY node considers some other node
        # as DOWN, it will silently *not* push the schema change, completing the operation
        # "successfully" nevertheless (it will return to the driver without error).
        # Usually in this case we rely on eventual convergence of schema through gossip,
        # which will not happen here, because the group 0 nodes are not doing schema pulls!
        # So we need to make sure that the RECOVERY node sees the other nodes as UP before
        # we perform the schema change, so it pushes the mutations to them.
        logger.info(f"Waiting until RECOVERY node ({srv1}) sees other servers as UP")
        await manager.server_sees_others(srv1.server_id, 2)

        marks = [await log.mark() for log in logs]
        logger.info(f"Altering table on RECOVERY node ({srv1})")
        await cql.run_async(f"alter table {table} with comment = ''", host=h1)
        await verify_table_versions_synced(cql, hosts)
        await verify_in_memory_table_versions(servers, logs, marks, table)
        # A change made by the RECOVERY node is expected to leave the persisted version unset.
        new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name)
        assert not new_ks_t_version
        ks_t_version = new_ks_t_version

        # Phase 3: ALTER through group 0 while the RECOVERY node is down.
        logger.info(f"Stopping {srv1} gracefully")
        await manager.server_stop_gracefully(srv1.server_id)
        srv2 = servers[1]
        logger.info(f"Waiting until {srv2} sees {srv1} as dead")
        await manager.server_not_sees_other_server(srv2.ip_addr, srv1.ip_addr)

        # Now we modify schema through group 0 while the RECOVERY node is dead.
        # Don't do this at home.
        marks = [await log.mark() for log in logs]
        h2 = next(h for h in hosts if h.address == srv2.ip_addr)
        logger.info(f"Altering table on group 0 node {srv2}")
        await cql.run_async(f"alter table {table} with comment = ''", host=h2)

        await manager.server_start(srv1.server_id)
        cql = await reconnect_driver(manager)
        logger.info("Waiting for driver")
        await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
        logger.info(f"Waiting until {srv2} sees {srv1} as UP")
        await manager.server_sees_other_server(srv2.ip_addr, srv1.ip_addr)

        # The RECOVERY node will pull schema when it gets a write.
        # The other group 0 node will do a barrier so it will also sync schema before the write returns.
        logger.info("Forcing schema sync through CL=ALL INSERT")
        await cql.run_async(SimpleStatement(f"insert into {table} (pk) values (0)", consistency_level=ConsistencyLevel.ALL),
                            host=h2)
        await verify_table_versions_synced(cql, hosts)
        await verify_in_memory_table_versions(servers, logs, marks, table)
        new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name)
        assert new_ks_t_version
        ks_t_version = new_ks_t_version

        # Phase 4: finish the recovery procedure. The remaining two nodes are
        # switched to the 'recovery' state as well, then Raft data and upgrade
        # state are deleted on all three and the cluster is restarted.
        srv3 = servers[2]
        h3 = next(h for h in hosts if h.address == srv3.ip_addr)
        logger.info("Finishing recovery")
        for h in [h2, h3]:
            await cql.run_async(
                "update system.scylla_local set value = 'recovery' where key = 'group0_upgrade_state'", host=h)
        await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in [srv2, srv3]))
        cql = await reconnect_driver(manager)
        logger.info("Waiting for driver")
        await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

        for h in [h1, h2, h3]:
            await delete_raft_data_and_upgrade_state(cql, h)

        logger.info("Restarting servers")
        await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in servers))
        cql = await reconnect_driver(manager)
        logger.info("Waiting for driver")
        await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

        # Use the module logger here like everywhere else in this test.
        logger.info("Waiting until upgrade finishes")
        for h in [h1, h2, h3]:
            await wait_until_upgrade_finishes(cql, h, time.time() + 60)
        await verify_table_versions_synced(cql, hosts)

        # Phase 5: back in group 0 mode, every ALTER must produce a new,
        # non-null persisted version that is in sync on all nodes.
        for change in [
                f"alter table {table} with comment = ''",
                f"alter table {table} add v int",
                f"alter table {table} alter v type blob"]:
            marks = [await log.mark() for log in logs]
            logger.info(f"Altering table with \"{change}\"")
            await cql.run_async(change)
            new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name)
            assert new_ks_t_version
            assert new_ks_t_version != ks_t_version
            ks_t_version = new_ks_t_version
            await verify_table_versions_synced(cql, hosts)
            await verify_in_memory_table_versions(servers, logs, marks, table)
@pytest.mark.asyncio
async def test_upgrade(manager: ManagerClient):
    """
    This test uses the gossip-based recovery procedure.
    While Raft is disabled, we use digest-based schema versioning.
    Once Raft upgrade is complete, we use persisted versions committed through group 0.
    """
    # Raft upgrade tests had to be replaced with recovery tests (scylladb/scylladb#16192)
    # as prerequisite for getting rid of `consistent_cluster_management` flag.
    # So we do the same here: start a cluster in Raft mode, then enter recovery
    # to simulate a non-Raft cluster.
    cfg = {'enable_user_defined_functions': False,
           'force_gossip_topology_changes': True,
           'tablets_mode_for_new_keyspaces': 'disabled'}
    logger.info("Booting cluster")
    servers = [await manager.server_add(config=cfg, property_file={"dc":"dc1", "rack":f"rack{i+1}"}) for i in range(3)]
    cql = manager.get_cql()

    logger.info("Waiting until driver connects to every server")
    hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

    logger.info(f"Setting recovery state on {hosts} and restarting")
    await asyncio.gather(*(enter_recovery_state(cql, h) for h in hosts))
    await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in servers))
    cql = await reconnect_driver(manager)

    logger.info("Waiting until driver connects to every server")
    await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

    logger.info("Creating keyspace and table")
    async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}") as ks_name:
        table = f"{ks_name}.t"
        await verify_table_versions_synced(cql, hosts)
        await cql.run_async(f"create table {table} (pk int primary key)")

        logger.info(f"Deleting Raft data and upgrade state on {hosts}")
        await asyncio.gather(*(delete_raft_data_and_upgrade_state(cql, h) for h in hosts))

        logger.info(f"Restarting {servers}")
        await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in servers))
        cql = await reconnect_driver(manager)

        logger.info("Waiting for driver")
        await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

        logger.info("Waiting until Raft upgrade procedure finishes")
        await asyncio.gather(*(wait_until_upgrade_finishes(cql, h, time.time() + 60) for h in hosts))

        logs = [await manager.server_open_log(srv.server_id) for srv in servers]
        marks = [await log.mark() for log in logs]

        logger.info("Altering table")
        await cql.run_async(f"alter table {table} with comment = ''")
        await verify_table_versions_synced(cql, hosts)
        await verify_in_memory_table_versions(servers, logs, marks, table)

        # `group0_schema_version` should be present
        # and the version column for `{table}` should be non-null.
        for h in hosts:
            logger.info(f"Checking that `group0_schema_version` is set on {h}")
            assert (await get_group0_schema_version(cql, h)) is not None

        for h in hosts:
            logger.info(f"Checking that `version` column for `{table}` is set on {h}")
            versions = await get_scylla_tables_versions(cql, h)
            for ks, _, v in versions:
                # Compare against the keyspace actually created above; its name
                # comes from new_test_keyspace, so a literal "ks" would make
                # this assertion vacuous.
                if ks == ks_name:
                    assert v is not None

View File

@@ -7,14 +7,15 @@ import asyncio
import pytest
import time
import logging
import requests
import re
from cassandra.cluster import NoHostAvailable # type: ignore
from cassandra.cluster import ConnectionException, NoHostAvailable # type: ignore
from cassandra.query import SimpleStatement, ConsistencyLevel
from test.pylib.internal_types import IPAddress
from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import ScyllaMetricsClient, TCPRESTClient, inject_error
from test.pylib.rest_client import inject_error
from test.pylib.tablets import get_tablet_replicas
from test.pylib.scylla_cluster import ReplaceConfig
from test.pylib.util import wait_for
@@ -24,21 +25,26 @@ from test.cluster.util import get_topology_coordinator, find_server_by_host_id,
logger = logging.getLogger(__name__)
async def get_hint_metrics(client: ScyllaMetricsClient, server_ip: IPAddress, metric_name: str):
    """Look up a single hints-manager metric on one server.

    Queries `client` for the metrics of `server_ip` and returns whatever its
    `get()` yields for the name `scylla_hints_manager_<metric_name>`.
    """
    full_name = f"scylla_hints_manager_{metric_name}"
    snapshot = await client.query(server_ip)
    return snapshot.get(full_name)
def get_hint_manager_metric(server: ServerInfo, metric_name: str) -> int:
    """Sum all samples of one hints-manager metric exposed by `server`.

    Downloads the text metrics page from port 9180 of `server.ip_addr` and adds
    up the value (second whitespace-separated field, truncated to int) of every
    line that starts with `scylla_hints_manager_<metric_name>`.

    Raises:
        requests.HTTPError: if the metrics endpoint answers with an error status.
        requests.Timeout: if the endpoint does not answer within the timeout.
    """
    # requests waits indefinitely by default; bound the wait so an unresponsive
    # node fails the test instead of hanging it.
    response = requests.get(f"http://{server.ip_addr}:9180/metrics", timeout=60)
    # Without this, an error page would contain no matching lines and be
    # reported as a metric value of 0.
    response.raise_for_status()

    # `metric_name` is a literal name, not a pattern, so escape it.
    pattern = re.compile(f"^scylla_hints_manager_{re.escape(metric_name)}")

    result = 0
    for line in response.text.split('\n'):
        if pattern.match(line) is not None:
            result += int(float(line.split()[1]))
    return result
async def create_sync_point(client: TCPRESTClient, server_ip: IPAddress) -> str:
    """Create a hinted-handoff sync point on `server_ip`.

    POSTs to `/hinted_handoff/sync_point` on port 10000 of the server through
    `client` and returns the decoded JSON response.
    """
    return await client.post_json("/hinted_handoff/sync_point", host=server_ip, port=10_000)
# Creates a sync point for ALL hosts.
def create_sync_point(node: ServerInfo) -> str:
    """Create a hinted-handoff sync point through the REST API of `node`.

    POSTs to `/hinted_handoff/sync_point/` on port 10000 of `node.ip_addr` and
    returns the decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    # Bound the wait: requests has no default timeout and would block forever
    # on an unresponsive node.
    response = requests.post(f"http://{node.ip_addr}:10000/hinted_handoff/sync_point/", timeout=60)
    # Fail here rather than hand an error payload to callers as a sync point id.
    response.raise_for_status()
    return response.json()
async def await_sync_point(client: TCPRESTClient, server_ip: IPAddress, sync_point: str, timeout: int) -> bool:
def await_sync_point(node: ServerInfo, sync_point: str, timeout: int) -> bool:
params = {
"id": sync_point,
"timeout": str(timeout)
}
response = await client.get_json("/hinted_handoff/sync_point", host=server_ip, port=10_000, params=params)
response = requests.get(f"http://{node.ip_addr}:10000/hinted_handoff/sync_point", params=params).json()
match response:
case "IN_PROGRESS":
return False
@@ -60,7 +66,10 @@ async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient)
await manager.server_stop_gracefully(servers[1].server_id)
hints_before = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
def get_hints_written_count(server):
return get_hint_manager_metric(server, "written")
hints_before = get_hints_written_count(servers[0])
# Some of the inserts will be targeted to the dead node.
# The coordinator doesn't have live targets to send the write to, but it should write a hint.
@@ -68,7 +77,7 @@ async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient)
await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i+1})", consistency_level=ConsistencyLevel.ANY))
# Verify hints are written
hints_after = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
hints_after = get_hints_written_count(servers[0])
assert hints_after > hints_before
# For dropping the keyspace
@@ -134,29 +143,24 @@ async def test_sync_point(manager: ManagerClient):
# Mutations need to be applied to hinted handoff's commitlog before we create the sync point.
# Otherwise, the sync point will correspond to no hints at all.
async def check_written_hints(min_count: int) -> bool:
errors = await get_hint_metrics(manager.metrics, node1.ip_addr, "errors")
assert errors == 0, "Writing hints to disk failed"
hints = await get_hint_metrics(manager.metrics, node1.ip_addr, "written")
if hints >= min_count:
return True
return None
# We need to wrap the function in an async function to make `wait_for` be able to use it below.
async def check_no_hints_in_progress_node1() -> bool:
return get_hint_manager_metric(node1, "size_of_hints_in_progress") == 0
deadline = time.time() + 30
await wait_for(lambda: check_written_hints(2 * mutation_count), deadline)
await wait_for(check_no_hints_in_progress_node1, deadline)
sync_point1 = await create_sync_point(manager.api.client, node1.ip_addr)
sync_point1 = create_sync_point(node1)
await manager.server_start(node2.server_id)
await manager.server_sees_other_server(node1.ip_addr, node2.ip_addr)
assert not (await await_sync_point(manager.api.client, node1.ip_addr, sync_point1, 3))
assert not await_sync_point(node1, sync_point1, 30)
await manager.server_start(node3.server_id)
await manager.server_sees_other_server(node1.ip_addr, node3.ip_addr)
assert await await_sync_point(manager.api.client, node1.ip_addr, sync_point1, 30)
assert await_sync_point(node1, sync_point1, 30)
@pytest.mark.asyncio
@@ -202,8 +206,7 @@ async def test_hints_consistency_during_decommission(manager: ManagerClient):
await manager.servers_see_each_other([server1, server2, server3])
# Record the current position of hints so that we can wait for them later
sync_points = await asyncio.gather(*[create_sync_point(manager.api.client, srv.ip_addr) for srv in (server1, server2)])
sync_points = list(sync_points)
sync_points = [create_sync_point(srv) for srv in (server1, server2)]
async with asyncio.TaskGroup() as tg:
coord = await get_topology_coordinator(manager)
@@ -229,8 +232,7 @@ async def test_hints_consistency_during_decommission(manager: ManagerClient):
await manager.api.disable_injection(srv.ip_addr, "hinted_handoff_pause_hint_replay")
logger.info("Wait until hints are replayed from nodes 1 and 2")
await asyncio.gather(*(await_sync_point(manager.api.client, srv.ip_addr, pt, timeout=30)
for srv, pt in zip((server1, server2), sync_points)))
await asyncio.gather(*(asyncio.to_thread(await_sync_point, srv, pt, timeout=30) for srv, pt in zip((server1, server2), sync_points)))
# Unpause streaming and let decommission finish
logger.info("Unpause streaming")
@@ -268,11 +270,11 @@ async def test_hints_consistency_during_replace(manager: ManagerClient):
# Write 100 rows with CL=ANY. Some of the rows will only be stored as hints because of RF=1
for i in range(100):
await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i + 1})", consistency_level=ConsistencyLevel.ANY))
sync_point = await create_sync_point(manager.api.client, servers[0].ip_addr)
sync_point = create_sync_point(servers[0])
await manager.server_add(replace_cfg=ReplaceConfig(replaced_id = servers[2].server_id, reuse_ip_addr = False, use_host_id = True))
assert await await_sync_point(manager.api.client, servers[0].ip_addr, sync_point, 30)
assert await_sync_point(servers[0], sync_point, 30)
# Verify that all rows were recovered by the hint replay
for i in range(100):
assert list(await cql.run_async(f"SELECT v FROM {table} WHERE pk = {i}")) == [(i + 1,)]
@@ -297,12 +299,16 @@ async def test_draining_hints(manager: ManagerClient):
for i in range(1000):
await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({i}, {i + 1})", consistency_level=ConsistencyLevel.ANY))
sync_point = await create_sync_point(manager.api.client, s1.ip_addr)
sync_point = create_sync_point(s1)
await manager.server_start(s2.server_id)
async def wait():
assert await_sync_point(s1, sync_point, 60)
async with asyncio.TaskGroup() as tg:
_ = tg.create_task(manager.decommission_node(s1.server_id, timeout=60))
_ = tg.create_task(await_sync_point(manager.api.client, s1.ip_addr, sync_point, 60))
_ = tg.create_task(wait())
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
@@ -328,7 +334,7 @@ async def test_canceling_hint_draining(manager: ManagerClient):
for i in range(1000):
await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({i}, {i + 1})", consistency_level=ConsistencyLevel.ANY))
sync_point = await create_sync_point(manager.api.client, s1.ip_addr)
sync_point = create_sync_point(s1)
await manager.api.enable_injection(s1.ip_addr, "hinted_handoff_pause_hint_replay", False, {})
await manager.remove_node(s1.server_id, s2.server_id)
@@ -346,7 +352,7 @@ async def test_canceling_hint_draining(manager: ManagerClient):
await s1_log.wait_for(f"Draining starts for {host_id2}", from_mark=s1_mark)
# Make sure draining finishes successfully.
assert await await_sync_point(manager.api.client, s1.ip_addr, sync_point, 60)
assert await_sync_point(s1, sync_point, 60)
await s1_log.wait_for(f"Removed hint directory for {host_id2}")
@pytest.mark.asyncio
@@ -385,7 +391,7 @@ async def test_hint_to_pending(manager: ManagerClient):
await manager.api.enable_injection(servers[0].ip_addr, "hinted_handoff_pause_hint_replay", False)
await manager.server_start(servers[1].server_id)
sync_point = await create_sync_point(manager.api.client, servers[0].ip_addr)
sync_point = create_sync_point(servers[0])
await manager.api.enable_injection(servers[0].ip_addr, "pause_after_streaming_tablet", False)
tablet_migration = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "t", host_ids[1], 0, host_ids[0], 0, 0))
@@ -397,7 +403,7 @@ async def test_hint_to_pending(manager: ManagerClient):
await wait_for(migration_reached_streaming, time.time() + 60)
await manager.api.disable_injection(servers[0].ip_addr, "hinted_handoff_pause_hint_replay")
assert await await_sync_point(manager.api.client, servers[0].ip_addr, sync_point, 30)
assert await_sync_point(servers[0], sync_point, 30)
await manager.api.message_injection(servers[0].ip_addr, "pause_after_streaming_tablet")
done, pending = await asyncio.wait([tablet_migration])

View File

@@ -21,6 +21,13 @@ async def test_create_keyspace_with_default_replication_factor(manager: ManagerC
def get_pf(dc: str, rack: str) -> dict:
return {'dc': dc, 'rack': rack}
logging.info("Trying to add a zero-token server in the gossip-based topology")
await manager.server_add(config={'join_ring': False,
'force_gossip_topology_changes': True,
'tablets_mode_for_new_keyspaces': 'disabled'},
property_file={'dc': 'dc1', 'rack': 'rz'},
expected_error='the raft-based topology is disabled')
normal_cfg = {
'tablets_mode_for_new_keyspaces': 'enabled' if tablets_enabled else 'disabled',
'rf_rack_valid_keyspaces': rf_rack_valid_keyspaces

View File

@@ -7,19 +7,16 @@
from cassandra.protocol import ConfigurationException
from cassandra.connection import UnixSocketEndPoint
from cassandra.policies import WhiteListRoundRobinPolicy
from cassandra.query import SimpleStatement, ConsistencyLevel
from test.pylib.manager_client import ManagerClient
from test.pylib.tablets import get_all_tablet_replicas
from test.cluster.conftest import cluster_con
from test.pylib.util import gather_safely, wait_for_cql_and_get_hosts
from test.cluster.util import create_new_test_keyspace
from test.pylib.util import wait_for_cql_and_get_hosts
from test.cluster.util import new_test_keyspace
import pytest
import logging
import socket
import time
from typing import TypeAlias
logger = logging.getLogger(__name__)
@@ -28,166 +25,80 @@ async def test_maintenance_mode(manager: ManagerClient):
"""
The test checks that in maintenance mode server A is not available for other nodes and for clients.
It is possible to connect by the maintenance socket to server A and perform local CQL operations.
The test is run with multiple keyspaces with different configurations (replication strategy, RF, tablets enabled).
It initially used only SimpleStrategy and RF=1, which hid https://github.com/scylladb/scylladb/issues/27988. To keep
the test fast, the tasks for different keyspaces are performed concurrently, and server A is started in maintenance
mode only once.
"""
max_rf = 3
servers = await manager.servers_add(max_rf, auto_rack_dc='dc1')
server_a = servers[0]
host_id_a = await manager.get_host_id(server_a.server_id)
server_a, server_b = await manager.server_add(), await manager.server_add()
socket_endpoint = UnixSocketEndPoint(await manager.server_get_maintenance_socket_path(server_a.server_id))
# For the move_tablet API.
await manager.disable_tablet_balancing()
# An exclusive connection to server A is needed for requests with LocalStrategy.
cluster = cluster_con([server_a.ip_addr], load_balancing_policy=WhiteListRoundRobinPolicy([server_a.ip_addr]))
cluster = cluster_con([server_b.ip_addr])
cql = cluster.connect()
# (replication strategy, Optional[replication factor], tablets enabled)
KeyspaceOptions: TypeAlias = tuple[str, int | None, bool]
keyspace_options: list[KeyspaceOptions] = []
keyspace_options.append(('EverywhereStrategy', None, False))
keyspace_options.append(('LocalStrategy', None, False))
for rf in range(1, max_rf + 1):
keyspace_options.append(('SimpleStrategy', rf, False))
for tablets_enabled in [True, False]:
keyspace_options.append(('NetworkTopologyStrategy', rf, tablets_enabled))
key_on_server_a_per_table: dict[str, int] = dict()
async def prepare_table(options: KeyspaceOptions):
replication_strategy, rf, tablets_enabled = options
rf_string = "" if rf is None else f", 'replication_factor': {rf}"
ks = await create_new_test_keyspace(cql,
f"""WITH REPLICATION = {{'class': '{replication_strategy}'{rf_string}}}
AND tablets = {{'enabled': {str(tablets_enabled).lower()}, 'initial': 1}}""")
rf_tag = "" if rf is None else f"rf{rf}"
tablets_tag = "tablets" if tablets_enabled else "vnodes"
table_suffix = f"{replication_strategy.lower()}_{rf_tag}_{tablets_tag}"
table = f"{ks}.{table_suffix}"
async with new_test_keyspace(manager, "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks:
table = f"{ks}.t"
await cql.run_async(f"CREATE TABLE {table} (k int PRIMARY KEY, v int)")
logger.info(f"Created table {table}")
async def insert_one(cl: ConsistencyLevel):
key = 1
insert_stmt = SimpleStatement(f"INSERT INTO {table} (k, v) VALUES ({key}, {key})",
consistency_level=cl)
await cql.run_async(insert_stmt)
key_on_server_a_per_table[table] = key
if replication_strategy == 'LocalStrategy':
await insert_one(ConsistencyLevel.ONE)
return
if tablets_enabled:
await insert_one(ConsistencyLevel.ALL)
logger.info(f"Ensuring that a tablet replica is on {server_a} for table {table}")
[tablet] = await get_all_tablet_replicas(manager, server_a, ks, table_suffix)
if host_id_a not in [r[0] for r in tablet.replicas]:
assert rf < max_rf
any_replica = tablet.replicas[0]
logger.info(f"Moving tablet from {any_replica} to {server_a} for table {table}")
await manager.api.move_tablet(server_a.ip_addr, ks, table_suffix,
any_replica[0], any_replica[1],
host_id_a, 0,
tablet.last_token)
return
# This path is executed only for vnodes-based keyspaces.
# Token ranges of the server A
# [(start_token, end_token)]
ranges = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"""SELECT start_token, end_token
ranges = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"""SELECT start_token, end_token, endpoint
FROM system.token_ring WHERE keyspace_name = '{ks}'
AND endpoint = '{server_a.ip_addr}' ALLOW FILTERING""")]
# Insert data to the cluster until a key is stored on server A.
new_key = 0
while table not in key_on_server_a_per_table:
if new_key == 1000:
# The probability of reaching this code is (2/3)^1000 for RF=1 and lower for greater RFs. This is much
# less than, for example, the probability of a UUID collision, so worrying about this would be silly.
# It could still happen due to a bug, and then we want to know about it, so we fail the test.
pytest.fail(f"Could not find a key on server {server_a} after inserting 1000 keys")
new_key += 1
# Insert data to the cluster and find a key that is stored on server A.
for i in range(256):
await cql.run_async(f"INSERT INTO {table} (k, v) VALUES ({i}, {i})")
insert_stmt = SimpleStatement(f"INSERT INTO {table} (k, v) VALUES ({new_key}, {new_key})",
consistency_level=ConsistencyLevel.ALL)
await cql.run_async(insert_stmt)
# [(key, token of this key)]
keys_with_tokens = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"SELECT k, token(k) FROM {table}")]
key_on_server_a = None
res = await cql.run_async(f"SELECT token(k) FROM {table} WHERE k = {new_key}")
assert len(res) == 1
token = res[0][0]
for key, token in keys_with_tokens:
for start, end in ranges:
if (start < end and start < token <= end) or (start >= end and (token <= end or start < token)):
logger.info(f"Found key {new_key} with token {token} on server {server_a} for table {table}")
key_on_server_a_per_table[table] = new_key
key_on_server_a = key
logger.info("Preparing tables")
await gather_safely(*(prepare_table(options) for options in keyspace_options))
if key_on_server_a is None:
# There is only a chance ~(1/2)^256 that all keys are stored on the server B
# In this case we skip the test
pytest.skip("All keys are stored on the server B")
# Start server A in maintenance mode
await manager.server_stop_gracefully(server_a.server_id)
await manager.server_update_config(server_a.server_id, "maintenance_mode", True)
await manager.server_start(server_a.server_id)
# Start server A in maintenance mode
await manager.server_stop_gracefully(server_a.server_id)
await manager.server_update_config(server_a.server_id, "maintenance_mode", "true")
await manager.server_start(server_a.server_id)
log = await manager.server_open_log(server_a.server_id)
await log.wait_for(r"initialization completed \(maintenance mode\)")
log = await manager.server_open_log(server_a.server_id)
await log.wait_for(r"initialization completed \(maintenance mode\)")
# Check that the regular CQL port is not available
assert socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex((server_a.ip_addr, 9042)) != 0
# Check that the regular CQL port is not available
assert socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex((server_a.ip_addr, 9042)) != 0
maintenance_cluster = cluster_con([socket_endpoint],
load_balancing_policy=WhiteListRoundRobinPolicy([socket_endpoint]))
maintenance_cql = maintenance_cluster.connect()
maintenance_cluster = cluster_con([socket_endpoint],
load_balancing_policy=WhiteListRoundRobinPolicy([socket_endpoint]))
maintenance_cql = maintenance_cluster.connect()
async def update_table_in_maintenance_mode(table: str, key: int):
# Check that local data is available in maintenance mode
select_stm = SimpleStatement(f"SELECT v FROM {table} WHERE k = {key}", consistency_level=ConsistencyLevel.ONE)
res = await maintenance_cql.run_async(select_stm)
assert len(res) == 1 and res[0][0] == key, f"Expected {key} for table {table}"
res = await maintenance_cql.run_async(f"SELECT v FROM {table} WHERE k = {key_on_server_a}")
assert res[0][0] == key_on_server_a
update_stm = SimpleStatement(f"UPDATE {table} SET v = {key + 1} WHERE k = {key}",
consistency_level=ConsistencyLevel.ONE)
await maintenance_cql.run_async(update_stm)
# Check that group0 operations are disabled
with pytest.raises(ConfigurationException):
await maintenance_cql.run_async(f"CREATE TABLE {ks}.t2 (k int PRIMARY KEY, v int)")
logger.info("Updating tables in maintenance mode")
await gather_safely(*(update_table_in_maintenance_mode(table, key)
for table, key in key_on_server_a_per_table.items()))
await maintenance_cql.run_async(f"UPDATE {table} SET v = {key_on_server_a + 1} WHERE k = {key_on_server_a}")
# Check that group0 operations are disabled
with pytest.raises(ConfigurationException, match="cannot start group0 operation in the maintenance mode"):
await create_new_test_keyspace(
maintenance_cql, "WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
# Ensure that server B recognizes server A as being shutdown, not as being alive.
res = await cql.run_async(f"SELECT status FROM system.cluster_status WHERE peer = '{server_a.ip_addr}'")
assert res[0][0] == "shutdown"
# Ensure that another server recognizes server A as being shutdown, not as being alive.
cql_b, [host_b] = await manager.get_ready_cql([servers[1]])
res = await cql_b.run_async(f"SELECT status FROM system.cluster_status WHERE peer = '{server_a.ip_addr}'",
host=host_b)
assert len(res) == 1
assert res[0][0] == "shutdown"
await manager.server_stop_gracefully(server_a.server_id)
await manager.server_stop_gracefully(server_a.server_id)
# Restart in normal mode to see if the changes made in maintenance mode are persisted
await manager.server_update_config(server_a.server_id, "maintenance_mode", False)
await manager.server_start(server_a.server_id, wait_others=1)
await wait_for_cql_and_get_hosts(cql, [server_a], time.time() + 60)
await manager.servers_see_each_other([server_a, server_b])
# Restart in normal mode
await manager.server_update_config(server_a.server_id, "maintenance_mode", False)
await manager.server_start(server_a.server_id, wait_others=1)
await wait_for_cql_and_get_hosts(cql, [server_a], time.time() + 60)
await manager.servers_see_each_other(servers)
res = await cql.run_async(f"SELECT v FROM {table} WHERE k = {key_on_server_a}")
assert res[0][0] == key_on_server_a + 1
async def check_table_in_normal_mode(table: str, key: int):
# Check if the changes made in maintenance mode are persisted
select_stm = SimpleStatement(f"SELECT v FROM {table} WHERE k = {key}", consistency_level=ConsistencyLevel.ALL)
res = await cql.run_async(select_stm)
assert len(res) == 1 and res[0][0] == key + 1, f"Expected {key + 1} for table {table}"
logger.info("Checking tables in normal mode")
await gather_safely(*(check_table_in_normal_mode(table, key) for table, key in key_on_server_a_per_table.items()))
cluster.shutdown()
maintenance_cluster.shutdown()

Some files were not shown because too many files have changed in this diff Show More