Compare commits
6 Commits
copilot/fi
...
copilot/re
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bebdae5a08 | ||
|
|
32d20e0481 | ||
|
|
6422477d63 | ||
|
|
85b9957e00 | ||
|
|
1057ebb185 | ||
|
|
67ff59b94b |
9
.github/CODEOWNERS
vendored
9
.github/CODEOWNERS
vendored
@@ -1,5 +1,5 @@
|
||||
# AUTH
|
||||
auth/* @nuivall
|
||||
auth/* @nuivall @ptrsmrn
|
||||
|
||||
# CACHE
|
||||
row_cache* @tgrabiec
|
||||
@@ -25,11 +25,11 @@ compaction/* @raphaelsc
|
||||
transport/*
|
||||
|
||||
# CQL QUERY LANGUAGE
|
||||
cql3/* @tgrabiec @nuivall
|
||||
cql3/* @tgrabiec @nuivall @ptrsmrn
|
||||
|
||||
# COUNTERS
|
||||
counters* @nuivall
|
||||
tests/counter_test* @nuivall
|
||||
counters* @nuivall @ptrsmrn
|
||||
tests/counter_test* @nuivall @ptrsmrn
|
||||
|
||||
# DOCS
|
||||
docs/* @annastuchlik @tzach
|
||||
@@ -57,6 +57,7 @@ repair/* @tgrabiec @asias
|
||||
|
||||
# SCHEMA MANAGEMENT
|
||||
db/schema_tables* @tgrabiec
|
||||
db/legacy_schema_migrator* @tgrabiec
|
||||
service/migration* @tgrabiec
|
||||
schema* @tgrabiec
|
||||
|
||||
|
||||
2
.github/scripts/auto-backport.py
vendored
2
.github/scripts/auto-backport.py
vendored
@@ -62,7 +62,7 @@ def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr
|
||||
if is_draft:
|
||||
labels_to_add.append("conflicts")
|
||||
pr_comment = f"@{pr.user.login} - This PR was marked as draft because it has conflicts\n"
|
||||
pr_comment += "Please resolve them and remove the 'conflicts' label. The PR will be made ready for review automatically."
|
||||
pr_comment += "Please resolve them and mark this PR as ready for review"
|
||||
backport_pr.create_issue_comment(pr_comment)
|
||||
|
||||
# Apply all labels at once if we have any
|
||||
|
||||
@@ -18,7 +18,7 @@ jobs:
|
||||
|
||||
// Regular expression pattern to check for "Fixes" prefix
|
||||
// Adjusted to dynamically insert the repository full name
|
||||
const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
|
||||
const pattern = `Fixes:? (?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)`;
|
||||
const regex = new RegExp(pattern);
|
||||
|
||||
if (!regex.test(body)) {
|
||||
|
||||
10
.github/workflows/docs-validate-metrics.yml
vendored
10
.github/workflows/docs-validate-metrics.yml
vendored
@@ -7,7 +7,7 @@ on:
|
||||
- enterprise
|
||||
paths:
|
||||
- '**/*.cc'
|
||||
- 'scripts/metrics-config.yml'
|
||||
- 'scripts/metrics-config.yml'
|
||||
- 'scripts/get_description.py'
|
||||
- 'docs/_ext/scylladb_metrics.py'
|
||||
|
||||
@@ -15,20 +15,20 @@ jobs:
|
||||
validate-metrics:
|
||||
runs-on: ubuntu-latest
|
||||
name: Check metrics documentation coverage
|
||||
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: '3.10'
|
||||
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip install PyYAML
|
||||
|
||||
|
||||
- name: Validate metrics
|
||||
run: python3 scripts/get_description.py --validate -c scripts/metrics-config.yml
|
||||
|
||||
5
.github/workflows/trigger-scylla-ci.yaml
vendored
5
.github/workflows/trigger-scylla-ci.yaml
vendored
@@ -3,13 +3,10 @@ name: Trigger Scylla CI Route
|
||||
on:
|
||||
issue_comment:
|
||||
types: [created]
|
||||
pull_request_target:
|
||||
types:
|
||||
- unlabeled
|
||||
|
||||
jobs:
|
||||
trigger-jenkins:
|
||||
if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
|
||||
if: github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Trigger Scylla-CI-Route Jenkins Job
|
||||
|
||||
194
CLUSTERING_RANGE_MIGRATION.md
Normal file
194
CLUSTERING_RANGE_MIGRATION.md
Normal file
@@ -0,0 +1,194 @@
|
||||
# Clustering Range to Position Range Migration - Summary
|
||||
|
||||
## Problem Statement
|
||||
|
||||
The `clustering_range` type (alias for `interval<clustering_key_prefix>`) has known correctness issues with operations like `intersection()` and `deoverlap()`. These operations can return incorrect results due to the complex semantics of comparing clustering key prefixes with different bound inclusiveness.
|
||||
|
||||
**Related Issues:**
|
||||
- #22817 - `interval<clustering_key_prefix>::deoverlap` can return incorrect results
|
||||
- #21604 - Problems with clustering range operations
|
||||
- #8157 - `interval<clustering_key_prefix_view>::intersection` can return incorrect results
|
||||
|
||||
## Solution Approach
|
||||
|
||||
The `position_range` class represents clustering ranges as a pair of `position_in_partition` objects, avoiding the problematic interval semantics. The migration strategy involves:
|
||||
|
||||
1. **Fix critical bugs immediately** - Use `clustering_interval_set` which internally uses `position_range`
|
||||
2. **Add infrastructure** - Feature flags, IDL support, utility functions
|
||||
3. **Gradual internal migration** - Replace internal uses of `clustering_range` with `position_range`
|
||||
4. **RPC compatibility** - Maintain backward compatibility with feature-gated new verbs
|
||||
|
||||
## What Has Been Done
|
||||
|
||||
### 1. Feature Flag ✅
|
||||
Added `gms::feature position_range` to `gms/feature_service.hh` for cluster-wide feature detection.
|
||||
|
||||
### 2. IDL Support ✅
|
||||
Added `position_range` to `idl/position_in_partition.idl.hh` for RPC serialization:
|
||||
```idl
|
||||
class position_range {
|
||||
position_in_partition start();
|
||||
position_in_partition end();
|
||||
};
|
||||
```
|
||||
|
||||
### 3. Critical Bug Fixes ✅
|
||||
|
||||
#### Fixed in `cql3/statements/cas_request.cc`:
|
||||
```cpp
|
||||
// OLD (buggy):
|
||||
ranges = query::clustering_range::deoverlap(std::move(ranges), clustering_key::tri_compare(*_schema));
|
||||
|
||||
// NEW (fixed):
|
||||
clustering_interval_set interval_set(*_schema, ranges);
|
||||
ranges = interval_set.to_clustering_row_ranges();
|
||||
```
|
||||
|
||||
#### Fixed in `db/view/view.cc`:
|
||||
```cpp
|
||||
// OLD (buggy):
|
||||
auto deoverlapped_ranges = interval<clustering_key_prefix_view>::deoverlap(std::move(row_ranges), cmp);
|
||||
|
||||
// NEW (fixed):
|
||||
clustering_interval_set interval_set(base, temp_ranges);
|
||||
return interval_set.to_clustering_row_ranges();
|
||||
```
|
||||
|
||||
### 4. Utility Functions ✅
|
||||
Created `query/position_range_utils.hh` with safe range operation helpers:
|
||||
- `clustering_row_ranges_to_position_ranges()` - Batch conversion
|
||||
- `position_ranges_to_clustering_row_ranges()` - Batch conversion back
|
||||
- `deoverlap_clustering_row_ranges()` - Safe deoverlap using clustering_interval_set
|
||||
- `intersect_clustering_row_ranges()` - Safe intersection using clustering_interval_set
|
||||
|
||||
### 5. Tests ✅
|
||||
Added comprehensive unit tests in `test/boost/position_range_utils_test.cc`:
|
||||
- Test deoverlap with overlapping and non-overlapping ranges
|
||||
- Test conversion between clustering_range and position_range
|
||||
- Test intersection operations
|
||||
- Validate correctness of utility functions
|
||||
|
||||
### 6. Documentation ✅
|
||||
- **Migration guide**: `docs/dev/clustering-range-to-position-range-migration.md`
|
||||
- Overview of the problem and solution
|
||||
- Conversion utilities and patterns
|
||||
- Implementation checklist
|
||||
|
||||
- **RPC migration plan**: `docs/dev/position-range-rpc-migration.md`
|
||||
- Detailed plan for backward-compatible RPC migration
|
||||
- IDL type definitions for v2 types
|
||||
- Feature-gated verb selection logic
|
||||
- Phased rollout strategy
|
||||
|
||||
## What Remains To Be Done
|
||||
|
||||
### Phase 1: RPC Migration (High Priority)
|
||||
1. Define `partition_slice_v2` with `std::vector<position_range>`
|
||||
2. Define `read_command_v2` using `partition_slice_v2`
|
||||
3. Add new RPC verbs: `read_data_v2`, `read_mutation_data_v2`, `read_digest_v2`
|
||||
4. Implement conversion between v1 and v2 types
|
||||
5. Add feature-gated verb selection in RPC clients
|
||||
6. Test backward compatibility
|
||||
|
||||
### Phase 2: Internal Refactoring (Ongoing)
|
||||
1. Identify internal data structures using `clustering_range`
|
||||
2. Refactor to use `position_range` where appropriate
|
||||
3. Update mutation readers and iterators
|
||||
4. Modify query processing logic
|
||||
5. Update cache structures
|
||||
|
||||
### Phase 3: Validation (Continuous)
|
||||
1. Build and run existing tests
|
||||
2. Add more tests for edge cases
|
||||
3. Performance benchmarking
|
||||
4. Rolling upgrade testing
|
||||
|
||||
## Files Changed
|
||||
|
||||
### Core Changes
|
||||
- `gms/feature_service.hh` - Added position_range feature flag
|
||||
- `idl/position_in_partition.idl.hh` - Added position_range IDL definition
|
||||
- `cql3/statements/cas_request.cc` - Fixed deoverlap bug
|
||||
- `db/view/view.cc` - Fixed deoverlap bug, enhanced documentation
|
||||
|
||||
### New Files
|
||||
- `query/position_range_utils.hh` - Utility functions for safe range operations
|
||||
- `test/boost/position_range_utils_test.cc` - Unit tests for utilities
|
||||
|
||||
### Documentation
|
||||
- `docs/dev/clustering-range-to-position-range-migration.md` - Migration guide
|
||||
- `docs/dev/position-range-rpc-migration.md` - RPC migration plan
|
||||
- `CLUSTERING_RANGE_MIGRATION.md` - This summary document
|
||||
|
||||
## Impact and Benefits
|
||||
|
||||
### Immediate Benefits ✅
|
||||
- **Fixed critical bugs**: Two production code bugs in `cas_request.cc` and `view.cc` that could cause incorrect query results
|
||||
- **Safe operations**: Developers can now use utility functions that guarantee correct deoverlap and intersection
|
||||
- **Future-proof**: Infrastructure is in place for gradual migration
|
||||
|
||||
### Future Benefits 🔄
|
||||
- **Correctness**: All clustering range operations will be correct by construction
|
||||
- **Maintainability**: Clearer code using position_range instead of complex interval semantics
|
||||
- **Performance**: Potential optimizations from simpler position-based comparisons
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
### Unit Tests ✅
|
||||
- `test/boost/position_range_utils_test.cc` validates utility functions
|
||||
- Existing tests in `test/boost/mutation_test.cc` use clustering_interval_set
|
||||
- Tests in `test/boost/mvcc_test.cc` validate clustering_interval_set behavior
|
||||
|
||||
### Integration Testing (To Do)
|
||||
- Test RPC backward compatibility during rolling upgrades
|
||||
- Test mixed-version clusters
|
||||
- Validate query correctness with position_range
|
||||
|
||||
### Performance Testing (To Do)
|
||||
- Benchmark conversion overhead
|
||||
- Compare memory usage
|
||||
- Measure query latency impact
|
||||
|
||||
## Migration Timeline
|
||||
|
||||
- **Week 1-2**: ✅ Foundation and critical bug fixes (COMPLETED)
|
||||
- Feature flag
|
||||
- IDL support
|
||||
- Bug fixes in cas_request.cc and view.cc
|
||||
- Utility functions and tests
|
||||
- Documentation
|
||||
|
||||
- **Week 3-4**: 🔄 RPC migration (IN PROGRESS)
|
||||
- Define v2 IDL types
|
||||
- Implement new RPC verbs
|
||||
- Add feature-gated selection
|
||||
|
||||
- **Week 5-8**: 🔄 Internal refactoring (PLANNED)
|
||||
- Systematic replacement in internal code
|
||||
- Update readers and iterators
|
||||
- Performance validation
|
||||
|
||||
- **Week 9+**: 🔄 Validation and rollout (PLANNED)
|
||||
- Comprehensive testing
|
||||
- Rolling upgrade validation
|
||||
- Production deployment
|
||||
|
||||
## Key Takeaways
|
||||
|
||||
1. **clustering_interval_set is your friend**: When working with clustering ranges, use clustering_interval_set for set operations instead of raw interval operations.
|
||||
|
||||
2. **Use utility functions**: The helpers in `query/position_range_utils.hh` provide safe alternatives to buggy operations.
|
||||
|
||||
3. **RPC requires care**: Backward compatibility is critical. Always use feature flags for RPC changes.
|
||||
|
||||
4. **Incremental approach**: This is a large refactoring. Do it incrementally, with tests at each step.
|
||||
|
||||
5. **Document as you go**: Good documentation (like this) helps future developers understand the context and rationale.
|
||||
|
||||
## References
|
||||
|
||||
- `mutation/position_in_partition.hh` - position_range definition
|
||||
- `keys/clustering_interval_set.hh` - Safe clustering range operations
|
||||
- `query/query-request.hh` - clustering_range definition and warnings
|
||||
- Issues: #22817, #21604, #8157
|
||||
- Feature service: `gms/feature_service.hh`
|
||||
@@ -42,7 +42,7 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_
|
||||
if (!comparison_operator.IsString()) {
|
||||
throw api_error::validation(fmt::format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
|
||||
}
|
||||
std::string op = rjson::to_string(comparison_operator);
|
||||
std::string op = comparison_operator.GetString();
|
||||
auto it = ops.find(op);
|
||||
if (it == ops.end()) {
|
||||
throw api_error::validation(fmt::format("Unsupported comparison operator {}", op));
|
||||
@@ -377,8 +377,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
|
||||
return cmp(unwrap_number(*v1, cmp.diagnostic), unwrap_number(v2, cmp.diagnostic));
|
||||
}
|
||||
if (kv1.name == "S") {
|
||||
return cmp(rjson::to_string_view(kv1.value),
|
||||
rjson::to_string_view(kv2.value));
|
||||
return cmp(std::string_view(kv1.value.GetString(), kv1.value.GetStringLength()),
|
||||
std::string_view(kv2.value.GetString(), kv2.value.GetStringLength()));
|
||||
}
|
||||
if (kv1.name == "B") {
|
||||
auto d_kv1 = unwrap_bytes(kv1.value, v1_from_query);
|
||||
@@ -470,9 +470,9 @@ static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const r
|
||||
return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
|
||||
}
|
||||
if (kv_v.name == "S") {
|
||||
return check_BETWEEN(rjson::to_string_view(kv_v.value),
|
||||
rjson::to_string_view(kv_lb.value),
|
||||
rjson::to_string_view(kv_ub.value),
|
||||
return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
|
||||
std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
|
||||
std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
|
||||
bounds_from_query);
|
||||
}
|
||||
if (kv_v.name == "B") {
|
||||
|
||||
@@ -8,8 +8,6 @@
|
||||
|
||||
#include "consumed_capacity.hh"
|
||||
#include "error.hh"
|
||||
#include "utils/rjson.hh"
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace alternator {
|
||||
|
||||
@@ -34,12 +32,12 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
|
||||
if (!return_consumed->IsString()) {
|
||||
throw api_error::validation("Non-string ReturnConsumedCapacity field in request");
|
||||
}
|
||||
std::string_view consumed = rjson::to_string_view(*return_consumed);
|
||||
std::string consumed = return_consumed->GetString();
|
||||
if (consumed == "INDEXES") {
|
||||
throw api_error::validation("INDEXES consumed capacity is not supported");
|
||||
}
|
||||
if (consumed != "TOTAL") {
|
||||
throw api_error::validation(fmt::format("Unknown consumed capacity {}", consumed));
|
||||
throw api_error::validation("Unknown consumed capacity "+ consumed);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -419,7 +419,7 @@ static std::optional<std::string> find_table_name(const rjson::value& request) {
|
||||
if (!table_name_value->IsString()) {
|
||||
throw api_error::validation("Non-string TableName field in request");
|
||||
}
|
||||
std::string table_name = rjson::to_string(*table_name_value);
|
||||
std::string table_name = table_name_value->GetString();
|
||||
return table_name;
|
||||
}
|
||||
|
||||
@@ -546,7 +546,7 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
|
||||
// does exist but the index does not (ValidationException).
|
||||
if (proxy.data_dictionary().has_schema(keyspace_name, orig_table_name)) {
|
||||
throw api_error::validation(
|
||||
fmt::format("Requested resource not found: Index '{}' for table '{}'", rjson::to_string_view(*index_name), orig_table_name));
|
||||
fmt::format("Requested resource not found: Index '{}' for table '{}'", index_name->GetString(), orig_table_name));
|
||||
} else {
|
||||
throw api_error::resource_not_found(
|
||||
fmt::format("Requested resource not found: Table: {} not found", orig_table_name));
|
||||
@@ -587,7 +587,7 @@ static std::string get_string_attribute(const rjson::value& value, std::string_v
|
||||
throw api_error::validation(fmt::format("Expected string value for attribute {}, got: {}",
|
||||
attribute_name, value));
|
||||
}
|
||||
return rjson::to_string(*attribute_value);
|
||||
return std::string(attribute_value->GetString(), attribute_value->GetStringLength());
|
||||
}
|
||||
|
||||
// Convenience function for getting the value of a boolean attribute, or a
|
||||
@@ -1080,8 +1080,8 @@ static void add_column(schema_builder& builder, const std::string& name, const r
|
||||
}
|
||||
for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
|
||||
const rjson::value& attribute_info = *it;
|
||||
if (rjson::to_string_view(attribute_info["AttributeName"]) == name) {
|
||||
std::string_view type = rjson::to_string_view(attribute_info["AttributeType"]);
|
||||
if (attribute_info["AttributeName"].GetString() == name) {
|
||||
auto type = attribute_info["AttributeType"].GetString();
|
||||
data_type dt = parse_key_type(type);
|
||||
if (computed_column) {
|
||||
// Computed column for GSI (doesn't choose a real column as-is
|
||||
@@ -1116,7 +1116,7 @@ static std::pair<std::string, std::string> parse_key_schema(const rjson::value&
|
||||
throw api_error::validation("First element of KeySchema must be an object");
|
||||
}
|
||||
const rjson::value *v = rjson::find((*key_schema)[0], "KeyType");
|
||||
if (!v || !v->IsString() || rjson::to_string_view(*v) != "HASH") {
|
||||
if (!v || !v->IsString() || v->GetString() != std::string("HASH")) {
|
||||
throw api_error::validation("First key in KeySchema must be a HASH key");
|
||||
}
|
||||
v = rjson::find((*key_schema)[0], "AttributeName");
|
||||
@@ -1124,14 +1124,14 @@ static std::pair<std::string, std::string> parse_key_schema(const rjson::value&
|
||||
throw api_error::validation("First key in KeySchema must have string AttributeName");
|
||||
}
|
||||
validate_attr_name_length(supplementary_context, v->GetStringLength(), true, "HASH key in KeySchema - ");
|
||||
std::string hash_key = rjson::to_string(*v);
|
||||
std::string hash_key = v->GetString();
|
||||
std::string range_key;
|
||||
if (key_schema->Size() == 2) {
|
||||
if (!(*key_schema)[1].IsObject()) {
|
||||
throw api_error::validation("Second element of KeySchema must be an object");
|
||||
}
|
||||
v = rjson::find((*key_schema)[1], "KeyType");
|
||||
if (!v || !v->IsString() || rjson::to_string_view(*v) != "RANGE") {
|
||||
if (!v || !v->IsString() || v->GetString() != std::string("RANGE")) {
|
||||
throw api_error::validation("Second key in KeySchema must be a RANGE key");
|
||||
}
|
||||
v = rjson::find((*key_schema)[1], "AttributeName");
|
||||
@@ -1799,11 +1799,6 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
|
||||
}
|
||||
}
|
||||
}
|
||||
// Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
|
||||
// GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
|
||||
if (!view_builders.empty() && ksm->uses_tablets() && !sp.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
|
||||
co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
|
||||
}
|
||||
try {
|
||||
schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
|
||||
} catch (exceptions::already_exists_exception&) {
|
||||
@@ -1892,8 +1887,8 @@ future<executor::request_return_type> executor::create_table(client_state& clien
|
||||
std::string def_type = type_to_string(def.type);
|
||||
for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
|
||||
const rjson::value& attribute_info = *it;
|
||||
if (rjson::to_string_view(attribute_info["AttributeName"]) == def.name_as_text()) {
|
||||
std::string_view type = rjson::to_string_view(attribute_info["AttributeType"]);
|
||||
if (attribute_info["AttributeName"].GetString() == def.name_as_text()) {
|
||||
auto type = attribute_info["AttributeType"].GetString();
|
||||
if (type != def_type) {
|
||||
throw api_error::validation(fmt::format("AttributeDefinitions redefined {} to {} already a key attribute of type {} in this table", def.name_as_text(), type, def_type));
|
||||
}
|
||||
@@ -2024,10 +2019,6 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
co_return api_error::validation(fmt::format(
|
||||
"LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
|
||||
}
|
||||
if (p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy().uses_tablets() &&
|
||||
!p.local().data_dictionary().get_config().rf_rack_valid_keyspaces()) {
|
||||
co_return api_error::validation("GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
|
||||
}
|
||||
|
||||
elogger.trace("Adding GSI {}", index_name);
|
||||
// FIXME: read and handle "Projection" parameter. This will
|
||||
@@ -2232,12 +2223,12 @@ void validate_value(const rjson::value& v, const char* caller) {
|
||||
|
||||
// The put_or_delete_item class builds the mutations needed by the PutItem and
|
||||
// DeleteItem operations - either as stand-alone commands or part of a list
|
||||
// of commands in BatchWriteItem.
|
||||
// of commands in BatchWriteItems.
|
||||
// put_or_delete_item splits each operation into two stages: Constructing the
|
||||
// object parses and validates the user input (throwing exceptions if there
|
||||
// are input errors). Later, build() generates the actual mutation, with a
|
||||
// specified timestamp. This split is needed because of the peculiar needs of
|
||||
// BatchWriteItem and LWT. BatchWriteItem needs all parsing to happen before
|
||||
// BatchWriteItems and LWT. BatchWriteItems needs all parsing to happen before
|
||||
// any writing happens (if one of the commands has an error, none of the
|
||||
// writes should be done). LWT makes it impossible for the parse step to
|
||||
// generate "mutation" objects, because the timestamp still isn't known.
|
||||
@@ -2371,7 +2362,7 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
|
||||
_cells = std::vector<cell>();
|
||||
_cells->reserve(item.MemberCount());
|
||||
for (auto it = item.MemberBegin(); it != item.MemberEnd(); ++it) {
|
||||
bytes column_name = to_bytes(rjson::to_string_view(it->name));
|
||||
bytes column_name = to_bytes(it->name.GetString());
|
||||
validate_value(it->value, "PutItem");
|
||||
const column_definition* cdef = find_attribute(*schema, column_name);
|
||||
validate_attr_name_length("", column_name.size(), cdef && cdef->is_primary_key());
|
||||
@@ -2748,7 +2739,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
|
||||
auto read_command = needs_read_before_write ?
|
||||
previous_item_read_command(proxy, schema(), _ck, selection) :
|
||||
nullptr;
|
||||
return proxy.cas(schema(), std::move(*cas_shard), *this, read_command, to_partition_ranges(*schema(), _pk),
|
||||
return proxy.cas(schema(), std::move(*cas_shard), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk),
|
||||
{timeout, std::move(permit), client_state, trace_state},
|
||||
db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM, timeout, timeout, true, std::move(cdc_opts)).then([this, read_command, &wcu_total] (bool is_applied) mutable {
|
||||
if (!is_applied) {
|
||||
@@ -2792,10 +2783,10 @@ static void verify_all_are_used(const rjson::value* field,
|
||||
return;
|
||||
}
|
||||
for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) {
|
||||
if (!used.contains(rjson::to_string(it->name))) {
|
||||
if (!used.contains(it->name.GetString())) {
|
||||
throw api_error::validation(
|
||||
format("{} has spurious '{}', not used in {}",
|
||||
field_name, rjson::to_string_view(it->name), operation));
|
||||
field_name, it->name.GetString(), operation));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3009,7 +3000,7 @@ future<executor::request_return_type> executor::delete_item(client_state& client
|
||||
}
|
||||
|
||||
static schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) {
|
||||
sstring table_name = rjson::to_sstring(batch_request->name); // JSON keys are always strings
|
||||
sstring table_name = batch_request->name.GetString(); // JSON keys are always strings
|
||||
try {
|
||||
return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name);
|
||||
} catch(data_dictionary::no_such_column_family&) {
|
||||
@@ -3035,20 +3026,17 @@ struct primary_key_equal {
|
||||
};
|
||||
|
||||
// This is a cas_request subclass for applying given put_or_delete_items to
|
||||
// one partition using LWT as part as BatchWriteItem. This is a write-only
|
||||
// one partition using LWT as part as BatchWriteItems. This is a write-only
|
||||
// operation, not needing the previous value of the item (the mutation to be
|
||||
// done is known prior to starting the operation). Nevertheless, we want to
|
||||
// do this mutation via LWT to ensure that it is serialized with other LWT
|
||||
// mutations to the same partition.
|
||||
//
|
||||
// The std::vector<put_or_delete_item> must remain alive until the
|
||||
// storage_proxy::cas() future is resolved.
|
||||
class put_or_delete_item_cas_request : public service::cas_request {
|
||||
schema_ptr schema;
|
||||
const std::vector<put_or_delete_item>& _mutation_builders;
|
||||
std::vector<put_or_delete_item> _mutation_builders;
|
||||
public:
|
||||
put_or_delete_item_cas_request(schema_ptr s, const std::vector<put_or_delete_item>& b) :
|
||||
schema(std::move(s)), _mutation_builders(b) { }
|
||||
put_or_delete_item_cas_request(schema_ptr s, std::vector<put_or_delete_item>&& b) :
|
||||
schema(std::move(s)), _mutation_builders(std::move(b)) { }
|
||||
virtual ~put_or_delete_item_cas_request() = default;
|
||||
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) override {
|
||||
std::optional<mutation> ret;
|
||||
@@ -3064,48 +3052,20 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
future<> executor::cas_write(schema_ptr schema, service::cas_shard cas_shard, const dht::decorated_key& dk,
|
||||
const std::vector<put_or_delete_item>& mutation_builders, service::client_state& client_state,
|
||||
tracing::trace_state_ptr trace_state, service_permit permit)
|
||||
{
|
||||
if (!cas_shard.this_shard()) {
|
||||
_stats.shard_bounce_for_lwt++;
|
||||
return container().invoke_on(cas_shard.shard(), _ssg,
|
||||
[cs = client_state.move_to_other_shard(),
|
||||
&mb = mutation_builders,
|
||||
&dk,
|
||||
ks = schema->ks_name(),
|
||||
cf = schema->cf_name(),
|
||||
gt = tracing::global_trace_state_ptr(trace_state),
|
||||
permit = std::move(permit)]
|
||||
(executor& self) mutable {
|
||||
return do_with(cs.get(), [&mb, &dk, ks = std::move(ks), cf = std::move(cf),
|
||||
trace_state = tracing::trace_state_ptr(gt), &self]
|
||||
(service::client_state& client_state) mutable {
|
||||
auto schema = self._proxy.data_dictionary().find_schema(ks, cf);
|
||||
service::cas_shard cas_shard(*schema, dk.token());
|
||||
|
||||
//FIXME: Instead of passing empty_service_permit() to the background operation,
|
||||
// the current permit's lifetime should be prolonged, so that it's destructed
|
||||
// only after all background operations are finished as well.
|
||||
return self.cas_write(schema, std::move(cas_shard), dk, mb, client_state, std::move(trace_state), empty_service_permit());
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static future<> cas_write(service::storage_proxy& proxy, schema_ptr schema, service::cas_shard cas_shard, dht::decorated_key dk, std::vector<put_or_delete_item>&& mutation_builders,
|
||||
service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit) {
|
||||
auto timeout = executor::default_timeout();
|
||||
auto op = std::make_unique<put_or_delete_item_cas_request>(schema, mutation_builders);
|
||||
auto* op_ptr = op.get();
|
||||
auto op = seastar::make_shared<put_or_delete_item_cas_request>(schema, std::move(mutation_builders));
|
||||
auto cdc_opts = cdc::per_request_options{
|
||||
.alternator = true,
|
||||
.alternator_streams_increased_compatibility =
|
||||
schema->cdc_options().enabled() && _proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
|
||||
schema->cdc_options().enabled() && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
|
||||
};
|
||||
return _proxy.cas(schema, std::move(cas_shard), *op_ptr, nullptr, to_partition_ranges(dk),
|
||||
return proxy.cas(schema, std::move(cas_shard), op, nullptr, to_partition_ranges(dk),
|
||||
{timeout, std::move(permit), client_state, trace_state},
|
||||
db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM,
|
||||
timeout, timeout, true, std::move(cdc_opts)).finally([op = std::move(op)]{}).discard_result();
|
||||
// We discarded cas()'s future value ("is_applied") because BatchWriteItem
|
||||
timeout, timeout, true, std::move(cdc_opts)).discard_result();
|
||||
// We discarded cas()'s future value ("is_applied") because BatchWriteItems
|
||||
// does not need to support conditional updates.
|
||||
}
|
||||
|
||||
@@ -3127,11 +3087,13 @@ struct schema_decorated_key_equal {
|
||||
|
||||
// FIXME: if we failed writing some of the mutations, need to return a list
|
||||
// of these failed mutations rather than fail the whole write (issue #5650).
|
||||
future<> executor::do_batch_write(
|
||||
static future<> do_batch_write(service::storage_proxy& proxy,
|
||||
smp_service_group ssg,
|
||||
std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
|
||||
service::client_state& client_state,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
service_permit permit) {
|
||||
service_permit permit,
|
||||
stats& stats) {
|
||||
if (mutation_builders.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -3153,7 +3115,7 @@ future<> executor::do_batch_write(
|
||||
mutations.push_back(b.second.build(b.first, now));
|
||||
any_cdc_enabled |= b.first->cdc_options().enabled();
|
||||
}
|
||||
return _proxy.mutate(std::move(mutations),
|
||||
return proxy.mutate(std::move(mutations),
|
||||
db::consistency_level::LOCAL_QUORUM,
|
||||
executor::default_timeout(),
|
||||
trace_state,
|
||||
@@ -3162,48 +3124,55 @@ future<> executor::do_batch_write(
|
||||
false,
|
||||
cdc::per_request_options{
|
||||
.alternator = true,
|
||||
.alternator_streams_increased_compatibility = any_cdc_enabled && _proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
|
||||
.alternator_streams_increased_compatibility = any_cdc_enabled && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
|
||||
});
|
||||
} else {
|
||||
// Do the write via LWT:
|
||||
// Multiple mutations may be destined for the same partition, adding
|
||||
// or deleting different items of one partition. Join them together
|
||||
// because we can do them in one cas() call.
|
||||
using map_type = std::unordered_map<schema_decorated_key,
|
||||
std::vector<put_or_delete_item>,
|
||||
schema_decorated_key_hash,
|
||||
schema_decorated_key_equal>;
|
||||
auto key_builders = std::make_unique<map_type>(1, schema_decorated_key_hash{}, schema_decorated_key_equal{});
|
||||
for (auto&& b : std::move(mutation_builders)) {
|
||||
auto [it, added] = key_builders->try_emplace(schema_decorated_key {
|
||||
.schema = b.first,
|
||||
.dk = dht::decorate_key(*b.first, b.second.pk())
|
||||
});
|
||||
std::unordered_map<schema_decorated_key, std::vector<put_or_delete_item>, schema_decorated_key_hash, schema_decorated_key_equal>
|
||||
key_builders(1, schema_decorated_key_hash{}, schema_decorated_key_equal{});
|
||||
for (auto& b : mutation_builders) {
|
||||
auto dk = dht::decorate_key(*b.first, b.second.pk());
|
||||
auto [it, added] = key_builders.try_emplace(schema_decorated_key{b.first, dk});
|
||||
it->second.push_back(std::move(b.second));
|
||||
}
|
||||
auto* key_builders_ptr = key_builders.get();
|
||||
return parallel_for_each(*key_builders_ptr, [this, &client_state, trace_state, permit = std::move(permit)] (const auto& e) {
|
||||
_stats.write_using_lwt++;
|
||||
return parallel_for_each(std::move(key_builders), [&proxy, &client_state, &stats, trace_state, ssg, permit = std::move(permit)] (auto& e) {
|
||||
stats.write_using_lwt++;
|
||||
auto desired_shard = service::cas_shard(*e.first.schema, e.first.dk.token());
|
||||
auto s = e.first.schema;
|
||||
if (desired_shard.this_shard()) {
|
||||
return cas_write(proxy, e.first.schema, std::move(desired_shard), e.first.dk, std::move(e.second), client_state, trace_state, permit);
|
||||
} else {
|
||||
stats.shard_bounce_for_lwt++;
|
||||
return proxy.container().invoke_on(desired_shard.shard(), ssg,
|
||||
[cs = client_state.move_to_other_shard(),
|
||||
mb = e.second,
|
||||
dk = e.first.dk,
|
||||
ks = e.first.schema->ks_name(),
|
||||
cf = e.first.schema->cf_name(),
|
||||
gt = tracing::global_trace_state_ptr(trace_state),
|
||||
permit = std::move(permit)]
|
||||
(service::storage_proxy& proxy) mutable {
|
||||
return do_with(cs.get(), [&proxy, mb = std::move(mb), dk = std::move(dk), ks = std::move(ks), cf = std::move(cf),
|
||||
trace_state = tracing::trace_state_ptr(gt)]
|
||||
(service::client_state& client_state) mutable {
|
||||
auto schema = proxy.data_dictionary().find_schema(ks, cf);
|
||||
|
||||
static const auto* injection_name = "alternator_executor_batch_write_wait";
|
||||
return utils::get_local_injector().inject(injection_name, [s = std::move(s)] (auto& handler) -> future<> {
|
||||
const auto ks = handler.get("keyspace");
|
||||
const auto cf = handler.get("table");
|
||||
const auto shard = std::atoll(handler.get("shard")->data());
|
||||
if (ks == s->ks_name() && cf == s->cf_name() && shard == this_shard_id()) {
|
||||
elogger.info("{}: hit", injection_name);
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
|
||||
elogger.info("{}: continue", injection_name);
|
||||
}
|
||||
}).then([&e, desired_shard = std::move(desired_shard),
|
||||
&client_state, trace_state = std::move(trace_state), permit = std::move(permit), this]() mutable
|
||||
{
|
||||
return cas_write(e.first.schema, std::move(desired_shard), e.first.dk,
|
||||
std::move(e.second), client_state, std::move(trace_state), std::move(permit));
|
||||
});
|
||||
}).finally([key_builders = std::move(key_builders)]{});
|
||||
// The desired_shard on the original shard remains alive for the duration
|
||||
// of cas_write on this shard and prevents any tablet operations.
|
||||
// However, we need a local instance of cas_shard on this shard
|
||||
// to pass it to sp::cas, so we just create a new one.
|
||||
service::cas_shard cas_shard(*schema, dk.token());
|
||||
|
||||
//FIXME: Instead of passing empty_service_permit() to the background operation,
|
||||
// the current permit's lifetime should be prolonged, so that it's destructed
|
||||
// only after all background operations are finished as well.
|
||||
return cas_write(proxy, schema, std::move(cas_shard), dk, std::move(mb), client_state, std::move(trace_state), empty_service_permit());
|
||||
});
|
||||
}).finally([desired_shard = std::move(desired_shard)]{});
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3350,7 +3319,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
|
||||
_stats.wcu_total[stats::DELETE_ITEM] += wcu_delete_units;
|
||||
_stats.api_operations.batch_write_item_batch_total += total_items;
|
||||
_stats.api_operations.batch_write_item_histogram.add(total_items);
|
||||
co_await do_batch_write(std::move(mutation_builders), client_state, trace_state, std::move(permit));
|
||||
co_await do_batch_write(_proxy, _ssg, std::move(mutation_builders), client_state, trace_state, std::move(permit), _stats);
|
||||
// FIXME: Issue #5650: If we failed writing some of the updates,
|
||||
// need to return a list of these failed updates in UnprocessedItems
|
||||
// rather than fail the whole write (issue #5650).
|
||||
@@ -3395,7 +3364,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
|
||||
}
|
||||
rjson::value newv = rjson::empty_object();
|
||||
for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) {
|
||||
std::string attr = rjson::to_string(it->name);
|
||||
std::string attr = it->name.GetString();
|
||||
auto x = members.find(attr);
|
||||
if (x != members.end()) {
|
||||
if (x->second) {
|
||||
@@ -3615,7 +3584,7 @@ static std::optional<attrs_to_get> calculate_attrs_to_get(const rjson::value& re
|
||||
const rjson::value& attributes_to_get = req["AttributesToGet"];
|
||||
attrs_to_get ret;
|
||||
for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
|
||||
attribute_path_map_add("AttributesToGet", ret, rjson::to_string(*it));
|
||||
attribute_path_map_add("AttributesToGet", ret, it->GetString());
|
||||
validate_attr_name_length("AttributesToGet", it->GetStringLength(), false);
|
||||
}
|
||||
if (ret.empty()) {
|
||||
@@ -4281,12 +4250,12 @@ inline void update_item_operation::apply_attribute_updates(const std::unique_ptr
|
||||
attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const {
|
||||
for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
|
||||
// Note that it.key() is the name of the column, *it is the operation
|
||||
bytes column_name = to_bytes(rjson::to_string_view(it->name));
|
||||
bytes column_name = to_bytes(it->name.GetString());
|
||||
const column_definition* cdef = _schema->get_column_definition(column_name);
|
||||
if (cdef && cdef->is_primary_key()) {
|
||||
throw api_error::validation(format("UpdateItem cannot update key column {}", rjson::to_string_view(it->name)));
|
||||
throw api_error::validation(format("UpdateItem cannot update key column {}", it->name.GetString()));
|
||||
}
|
||||
std::string action = rjson::to_string((it->value)["Action"]);
|
||||
std::string action = (it->value)["Action"].GetString();
|
||||
if (action == "DELETE") {
|
||||
// The DELETE operation can do two unrelated tasks. Without a
|
||||
// "Value" option, it is used to delete an attribute. With a
|
||||
@@ -5483,7 +5452,7 @@ calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) {
|
||||
std::vector<query::clustering_range> ck_bounds;
|
||||
|
||||
for (auto it = conditions.MemberBegin(); it != conditions.MemberEnd(); ++it) {
|
||||
sstring key = rjson::to_sstring(it->name);
|
||||
std::string key = it->name.GetString();
|
||||
const rjson::value& condition = it->value;
|
||||
|
||||
const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator");
|
||||
@@ -5491,13 +5460,13 @@ calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) {
|
||||
|
||||
const column_definition& pk_cdef = schema->partition_key_columns().front();
|
||||
const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? &schema->clustering_key_columns().front() : nullptr;
|
||||
if (key == pk_cdef.name_as_text()) {
|
||||
if (sstring(key) == pk_cdef.name_as_text()) {
|
||||
if (!partition_ranges.empty()) {
|
||||
throw api_error::validation("Currently only a single restriction per key is allowed");
|
||||
}
|
||||
partition_ranges.push_back(calculate_pk_bound(schema, pk_cdef, comp_definition, attr_list));
|
||||
}
|
||||
if (ck_cdef && key == ck_cdef->name_as_text()) {
|
||||
if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) {
|
||||
if (!ck_bounds.empty()) {
|
||||
throw api_error::validation("Currently only a single restriction per key is allowed");
|
||||
}
|
||||
@@ -5898,7 +5867,7 @@ future<executor::request_return_type> executor::list_tables(client_state& client
|
||||
|
||||
rjson::value* exclusive_start_json = rjson::find(request, "ExclusiveStartTableName");
|
||||
rjson::value* limit_json = rjson::find(request, "Limit");
|
||||
std::string exclusive_start = exclusive_start_json ? rjson::to_string(*exclusive_start_json) : "";
|
||||
std::string exclusive_start = exclusive_start_json ? exclusive_start_json->GetString() : "";
|
||||
int limit = limit_json ? limit_json->GetInt() : 100;
|
||||
if (limit < 1 || limit > 100) {
|
||||
co_return api_error::validation("Limit must be greater than 0 and no greater than 100");
|
||||
|
||||
@@ -40,7 +40,6 @@ namespace cql3::selection {
|
||||
|
||||
namespace service {
|
||||
class storage_proxy;
|
||||
class cas_shard;
|
||||
}
|
||||
|
||||
namespace cdc {
|
||||
@@ -58,7 +57,6 @@ class schema_builder;
|
||||
namespace alternator {
|
||||
|
||||
class rmw_operation;
|
||||
class put_or_delete_item;
|
||||
|
||||
schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request);
|
||||
bool is_alternator_keyspace(const sstring& ks_name);
|
||||
@@ -221,16 +219,6 @@ private:
|
||||
|
||||
static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr, const std::map<sstring, sstring> *tags = nullptr);
|
||||
|
||||
future<> do_batch_write(
|
||||
std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
|
||||
service::client_state& client_state,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
service_permit permit);
|
||||
|
||||
future<> cas_write(schema_ptr schema, service::cas_shard cas_shard, const dht::decorated_key& dk,
|
||||
const std::vector<put_or_delete_item>& mutation_builders, service::client_state& client_state,
|
||||
tracing::trace_state_ptr trace_state, service_permit permit);
|
||||
|
||||
public:
|
||||
static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&, const std::map<sstring, sstring> *tags = nullptr);
|
||||
|
||||
|
||||
@@ -496,7 +496,7 @@ const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value&
|
||||
return {"", nullptr};
|
||||
}
|
||||
auto it = v.MemberBegin();
|
||||
const std::string it_key = rjson::to_string(it->name);
|
||||
const std::string it_key = it->name.GetString();
|
||||
if (it_key != "SS" && it_key != "BS" && it_key != "NS") {
|
||||
return {std::move(it_key), nullptr};
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
|
||||
if (v->GetStringLength() < 1 || v->GetStringLength() > 255) {
|
||||
co_return api_error::validation("The length of AttributeName must be between 1 and 255");
|
||||
}
|
||||
sstring attribute_name = rjson::to_sstring(*v);
|
||||
sstring attribute_name(v->GetString(), v->GetStringLength());
|
||||
|
||||
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
|
||||
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
|
||||
|
||||
@@ -3051,7 +3051,7 @@
|
||||
},
|
||||
{
|
||||
"name":"incremental_mode",
|
||||
"description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to 'disabled' mode.",
|
||||
"description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental mode.",
|
||||
"required":false,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include "auth/allow_all_authenticator.hh"
|
||||
|
||||
#include "service/migration_manager.hh"
|
||||
#include "utils/alien_worker.hh"
|
||||
#include "utils/class_registrator.hh"
|
||||
|
||||
namespace auth {
|
||||
@@ -22,6 +23,7 @@ static const class_registrator<
|
||||
cql3::query_processor&,
|
||||
::service::raft_group0_client&,
|
||||
::service::migration_manager&,
|
||||
cache&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
|
||||
cache&,
|
||||
utils::alien_worker&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
|
||||
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "auth/authenticator.hh"
|
||||
#include "auth/cache.hh"
|
||||
#include "auth/common.hh"
|
||||
#include "utils/alien_worker.hh"
|
||||
|
||||
namespace cql3 {
|
||||
class query_processor;
|
||||
@@ -29,7 +30,7 @@ extern const std::string_view allow_all_authenticator_name;
|
||||
|
||||
class allow_all_authenticator final : public authenticator {
|
||||
public:
|
||||
allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&) {
|
||||
allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&, utils::alien_worker&) {
|
||||
}
|
||||
|
||||
virtual future<> start() override {
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
*/
|
||||
|
||||
#include "auth/certificate_authenticator.hh"
|
||||
#include "auth/cache.hh"
|
||||
|
||||
#include <boost/regex.hpp>
|
||||
#include <fmt/ranges.h>
|
||||
@@ -35,13 +34,13 @@ static const class_registrator<auth::authenticator
|
||||
, cql3::query_processor&
|
||||
, ::service::raft_group0_client&
|
||||
, ::service::migration_manager&
|
||||
, auth::cache&> cert_auth_reg(CERT_AUTH_NAME);
|
||||
, utils::alien_worker&> cert_auth_reg(CERT_AUTH_NAME);
|
||||
|
||||
enum class auth::certificate_authenticator::query_source {
|
||||
subject, altname
|
||||
};
|
||||
|
||||
auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, auth::cache&)
|
||||
auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
|
||||
: _queries([&] {
|
||||
auto& conf = qp.db().get_config();
|
||||
auto queries = conf.auth_certificate_role_queries();
|
||||
@@ -76,9 +75,9 @@ auth::certificate_authenticator::certificate_authenticator(cql3::query_processor
|
||||
throw std::invalid_argument(fmt::format("Invalid source: {}", map.at(cfg_source_attr)));
|
||||
}
|
||||
continue;
|
||||
} catch (const std::out_of_range&) {
|
||||
} catch (std::out_of_range&) {
|
||||
// just fallthrough
|
||||
} catch (const boost::regex_error&) {
|
||||
} catch (boost::regex_error&) {
|
||||
std::throw_with_nested(std::invalid_argument(fmt::format("Invalid query expression: {}", map.at(cfg_query_attr))));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "auth/authenticator.hh"
|
||||
#include "utils/alien_worker.hh"
|
||||
#include <boost/regex_fwd.hpp> // IWYU pragma: keep
|
||||
|
||||
namespace cql3 {
|
||||
@@ -25,15 +26,13 @@ class raft_group0_client;
|
||||
|
||||
namespace auth {
|
||||
|
||||
class cache;
|
||||
|
||||
extern const std::string_view certificate_authenticator_name;
|
||||
|
||||
class certificate_authenticator : public authenticator {
|
||||
enum class query_source;
|
||||
std::vector<std::pair<query_source, boost::regex>> _queries;
|
||||
public:
|
||||
certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);
|
||||
certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);
|
||||
~certificate_authenticator();
|
||||
|
||||
future<> start() override;
|
||||
|
||||
@@ -94,7 +94,7 @@ static future<> create_legacy_metadata_table_if_missing_impl(
|
||||
try {
|
||||
co_return co_await mm.announce(co_await ::service::prepare_new_column_family_announcement(qp.proxy(), table, ts),
|
||||
std::move(group0_guard), format("auth: create {} metadata table", table->cf_name()));
|
||||
} catch (const exceptions::already_exists_exception&) {}
|
||||
} catch (exceptions::already_exists_exception&) {}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -256,7 +256,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name, ::service::g
|
||||
} else {
|
||||
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
|
||||
}
|
||||
} catch (const exceptions::request_execution_exception& e) {
|
||||
} catch (exceptions::request_execution_exception& e) {
|
||||
alogger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", role_name, e);
|
||||
}
|
||||
}
|
||||
@@ -293,13 +293,13 @@ future<> default_authorizer::revoke_all_legacy(const resource& resource) {
|
||||
[resource](auto ep) {
|
||||
try {
|
||||
std::rethrow_exception(ep);
|
||||
} catch (const exceptions::request_execution_exception& e) {
|
||||
} catch (exceptions::request_execution_exception& e) {
|
||||
alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
|
||||
}
|
||||
|
||||
});
|
||||
});
|
||||
} catch (const exceptions::request_execution_exception& e) {
|
||||
} catch (exceptions::request_execution_exception& e) {
|
||||
alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
@@ -49,7 +49,8 @@ static const class_registrator<
|
||||
cql3::query_processor&,
|
||||
::service::raft_group0_client&,
|
||||
::service::migration_manager&,
|
||||
cache&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
|
||||
cache&,
|
||||
utils::alien_worker&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
|
||||
|
||||
static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());
|
||||
|
||||
@@ -63,13 +64,14 @@ std::string password_authenticator::default_superuser(const db::config& cfg) {
|
||||
password_authenticator::~password_authenticator() {
|
||||
}
|
||||
|
||||
password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
|
||||
password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache, utils::alien_worker& hashing_worker)
|
||||
: _qp(qp)
|
||||
, _group0_client(g0)
|
||||
, _migration_manager(mm)
|
||||
, _cache(cache)
|
||||
, _stopped(make_ready_future<>())
|
||||
, _superuser(default_superuser(qp.db().get_config()))
|
||||
, _hashing_worker(hashing_worker)
|
||||
{}
|
||||
|
||||
static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
|
||||
@@ -328,18 +330,20 @@ future<authenticated_user> password_authenticator::authenticate(
|
||||
}
|
||||
salted_hash = role->salted_hash;
|
||||
}
|
||||
const bool password_match = co_await passwords::check(password, *salted_hash);
|
||||
const bool password_match = co_await _hashing_worker.submit<bool>([password = std::move(password), salted_hash] {
|
||||
return passwords::check(password, *salted_hash);
|
||||
});
|
||||
if (!password_match) {
|
||||
throw exceptions::authentication_exception("Username and/or password are incorrect");
|
||||
}
|
||||
co_return username;
|
||||
} catch (const std::system_error &) {
|
||||
} catch (std::system_error &) {
|
||||
std::throw_with_nested(exceptions::authentication_exception("Could not verify password"));
|
||||
} catch (const exceptions::request_execution_exception& e) {
|
||||
} catch (exceptions::request_execution_exception& e) {
|
||||
std::throw_with_nested(exceptions::authentication_exception(e.what()));
|
||||
} catch (const exceptions::authentication_exception& e) {
|
||||
} catch (exceptions::authentication_exception& e) {
|
||||
std::throw_with_nested(e);
|
||||
} catch (const exceptions::unavailable_exception& e) {
|
||||
} catch (exceptions::unavailable_exception& e) {
|
||||
std::throw_with_nested(exceptions::authentication_exception(e.get_message()));
|
||||
} catch (...) {
|
||||
std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "auth/passwords.hh"
|
||||
#include "auth/cache.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
#include "utils/alien_worker.hh"
|
||||
|
||||
namespace db {
|
||||
class config;
|
||||
@@ -48,12 +49,13 @@ class password_authenticator : public authenticator {
|
||||
shared_promise<> _superuser_created_promise;
|
||||
// We used to also support bcrypt, SHA-256, and MD5 (ref. scylladb#24524).
|
||||
constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;
|
||||
utils::alien_worker& _hashing_worker;
|
||||
|
||||
public:
|
||||
static db::consistency_level consistency_for_user(std::string_view role_name);
|
||||
static std::string default_superuser(const db::config&);
|
||||
|
||||
password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);
|
||||
password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&, utils::alien_worker&);
|
||||
|
||||
~password_authenticator();
|
||||
|
||||
|
||||
@@ -7,8 +7,6 @@
|
||||
*/
|
||||
|
||||
#include "auth/passwords.hh"
|
||||
#include "utils/crypt_sha512.hh"
|
||||
#include <seastar/core/coroutine.hh>
|
||||
|
||||
#include <cerrno>
|
||||
|
||||
@@ -23,46 +21,25 @@ static thread_local crypt_data tlcrypt = {};
|
||||
|
||||
namespace detail {
|
||||
|
||||
void verify_hashing_output(const char * res) {
|
||||
if (!res || (res[0] == '*')) {
|
||||
throw std::system_error(errno, std::system_category());
|
||||
}
|
||||
}
|
||||
|
||||
void verify_scheme(scheme scheme) {
|
||||
const sstring random_part_of_salt = "aaaabbbbccccdddd";
|
||||
|
||||
const sstring salt = sstring(prefix_for_scheme(scheme)) + random_part_of_salt;
|
||||
const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
|
||||
try {
|
||||
verify_hashing_output(e);
|
||||
} catch (const std::system_error& ex) {
|
||||
throw no_supported_schemes();
|
||||
|
||||
if (e && (e[0] != '*')) {
|
||||
return;
|
||||
}
|
||||
|
||||
throw no_supported_schemes();
|
||||
}
|
||||
|
||||
sstring hash_with_salt(const sstring& pass, const sstring& salt) {
|
||||
auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
|
||||
verify_hashing_output(res);
|
||||
return res;
|
||||
}
|
||||
|
||||
seastar::future<sstring> hash_with_salt_async(const sstring& pass, const sstring& salt) {
|
||||
sstring res;
|
||||
// Only SHA-512 hashes for passphrases shorter than 256 bytes can be computed using
|
||||
// the __crypt_sha512 method. For other computations, we fall back to the
|
||||
// crypt_r implementation from `<crypt.h>`, which can stall.
|
||||
if (salt.starts_with(prefix_for_scheme(scheme::sha_512)) && pass.size() <= 255) {
|
||||
char buf[128];
|
||||
const char * output_ptr = co_await __crypt_sha512(pass.c_str(), salt.c_str(), buf);
|
||||
verify_hashing_output(output_ptr);
|
||||
res = output_ptr;
|
||||
} else {
|
||||
const char * output_ptr = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
|
||||
verify_hashing_output(output_ptr);
|
||||
res = output_ptr;
|
||||
if (!res || (res[0] == '*')) {
|
||||
throw std::system_error(errno, std::system_category());
|
||||
}
|
||||
co_return res;
|
||||
return res;
|
||||
}
|
||||
|
||||
std::string_view prefix_for_scheme(scheme c) noexcept {
|
||||
@@ -81,9 +58,8 @@ no_supported_schemes::no_supported_schemes()
|
||||
: std::runtime_error("No allowed hashing schemes are supported on this system") {
|
||||
}
|
||||
|
||||
seastar::future<bool> check(const sstring& pass, const sstring& salted_hash) {
|
||||
const auto pwd_hash = co_await detail::hash_with_salt_async(pass, salted_hash);
|
||||
co_return pwd_hash == salted_hash;
|
||||
bool check(const sstring& pass, const sstring& salted_hash) {
|
||||
return detail::hash_with_salt(pass, salted_hash) == salted_hash;
|
||||
}
|
||||
|
||||
} // namespace auth::passwords
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
|
||||
#include "seastarx.hh"
|
||||
@@ -76,19 +75,10 @@ sstring generate_salt(RandomNumberEngine& g, scheme scheme) {
|
||||
|
||||
///
|
||||
/// Hash a password combined with an implementation-specific salt string.
|
||||
/// Deprecated in favor of `hash_with_salt_async`.
|
||||
///
|
||||
/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
|
||||
///
|
||||
[[deprecated("Use hash_with_salt_async instead")]] sstring hash_with_salt(const sstring& pass, const sstring& salt);
|
||||
|
||||
///
|
||||
/// Async version of `hash_with_salt` that returns a future.
|
||||
/// If possible, hashing uses `coroutine::maybe_yield` to prevent reactor stalls.
|
||||
///
|
||||
/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
|
||||
///
|
||||
seastar::future<sstring> hash_with_salt_async(const sstring& pass, const sstring& salt);
|
||||
sstring hash_with_salt(const sstring& pass, const sstring& salt);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
@@ -117,6 +107,6 @@ sstring hash(const sstring& pass, RandomNumberEngine& g, scheme scheme) {
|
||||
///
|
||||
/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
|
||||
///
|
||||
seastar::future<bool> check(const sstring& pass, const sstring& salted_hash);
|
||||
bool check(const sstring& pass, const sstring& salted_hash);
|
||||
|
||||
} // namespace auth::passwords
|
||||
|
||||
@@ -35,9 +35,10 @@ static const class_registrator<
|
||||
cql3::query_processor&,
|
||||
::service::raft_group0_client&,
|
||||
::service::migration_manager&,
|
||||
cache&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
|
||||
cache&,
|
||||
utils::alien_worker&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
|
||||
|
||||
saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, cache&)
|
||||
saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, cache&, utils::alien_worker&)
|
||||
: _socket_path(qp.db().get_config().saslauthd_socket_path())
|
||||
{}
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
#include "auth/authenticator.hh"
|
||||
#include "auth/cache.hh"
|
||||
#include "utils/alien_worker.hh"
|
||||
|
||||
namespace cql3 {
|
||||
class query_processor;
|
||||
@@ -29,7 +30,7 @@ namespace auth {
|
||||
class saslauthd_authenticator : public authenticator {
|
||||
sstring _socket_path; ///< Path to the domain socket on which saslauthd is listening.
|
||||
public:
|
||||
saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);
|
||||
saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&,utils::alien_worker&);
|
||||
|
||||
future<> start() override;
|
||||
|
||||
|
||||
@@ -191,7 +191,8 @@ service::service(
|
||||
::service::migration_manager& mm,
|
||||
const service_config& sc,
|
||||
maintenance_socket_enabled used_by_maintenance_socket,
|
||||
cache& cache)
|
||||
cache& cache,
|
||||
utils::alien_worker& hashing_worker)
|
||||
: service(
|
||||
std::move(c),
|
||||
cache,
|
||||
@@ -199,7 +200,7 @@ service::service(
|
||||
g0,
|
||||
mn,
|
||||
create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
|
||||
create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, cache),
|
||||
create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, cache, hashing_worker),
|
||||
create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm, cache),
|
||||
used_by_maintenance_socket) {
|
||||
}
|
||||
@@ -225,7 +226,7 @@ future<> service::create_legacy_keyspace_if_missing(::service::migration_manager
|
||||
try {
|
||||
co_return co_await mm.announce(::service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts),
|
||||
std::move(group0_guard), seastar::format("auth_service: create {} keyspace", meta::legacy::AUTH_KS));
|
||||
} catch (const ::service::group0_concurrent_modification&) {
|
||||
} catch (::service::group0_concurrent_modification&) {
|
||||
log.info("Concurrent operation is detected while creating {} keyspace, retrying.", meta::legacy::AUTH_KS);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "cql3/description.hh"
|
||||
#include "seastarx.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
#include "utils/alien_worker.hh"
|
||||
#include "utils/observable.hh"
|
||||
#include "utils/serialized_action.hh"
|
||||
#include "service/maintenance_mode.hh"
|
||||
@@ -130,7 +131,8 @@ public:
|
||||
::service::migration_manager&,
|
||||
const service_config&,
|
||||
maintenance_socket_enabled,
|
||||
cache&);
|
||||
cache&,
|
||||
utils::alien_worker&);
|
||||
|
||||
future<> start(::service::migration_manager&, db::system_keyspace&);
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ future<> standard_role_manager::legacy_create_default_role_if_missing() {
|
||||
{_superuser},
|
||||
cql3::query_processor::cache_internal::no).discard_result();
|
||||
log.info("Created default superuser role '{}'.", _superuser);
|
||||
} catch (const exceptions::unavailable_exception& e) {
|
||||
} catch(const exceptions::unavailable_exception& e) {
|
||||
log.warn("Skipped default role setup: some nodes were not ready; will retry");
|
||||
throw e;
|
||||
}
|
||||
|
||||
@@ -38,8 +38,8 @@ class transitional_authenticator : public authenticator {
|
||||
public:
|
||||
static const sstring PASSWORD_AUTHENTICATOR_NAME;
|
||||
|
||||
transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
|
||||
: transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) {
|
||||
transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache, utils::alien_worker& hashing_worker)
|
||||
: transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache, hashing_worker)) {
|
||||
}
|
||||
transitional_authenticator(std::unique_ptr<authenticator> a)
|
||||
: _authenticator(std::move(a)) {
|
||||
@@ -81,7 +81,7 @@ public:
|
||||
}).handle_exception([](auto ep) {
|
||||
try {
|
||||
std::rethrow_exception(ep);
|
||||
} catch (const exceptions::authentication_exception&) {
|
||||
} catch (exceptions::authentication_exception&) {
|
||||
// return anon user
|
||||
return make_ready_future<authenticated_user>(anonymous_user());
|
||||
}
|
||||
@@ -126,7 +126,7 @@ public:
|
||||
virtual bytes evaluate_response(bytes_view client_response) override {
|
||||
try {
|
||||
return _sasl->evaluate_response(client_response);
|
||||
} catch (const exceptions::authentication_exception&) {
|
||||
} catch (exceptions::authentication_exception&) {
|
||||
_complete = true;
|
||||
return {};
|
||||
}
|
||||
@@ -141,7 +141,7 @@ public:
|
||||
return _sasl->get_authenticated_user().handle_exception([](auto ep) {
|
||||
try {
|
||||
std::rethrow_exception(ep);
|
||||
} catch (const exceptions::authentication_exception&) {
|
||||
} catch (exceptions::authentication_exception&) {
|
||||
// return anon user
|
||||
return make_ready_future<authenticated_user>(anonymous_user());
|
||||
}
|
||||
@@ -241,7 +241,8 @@ static const class_registrator<
|
||||
cql3::query_processor&,
|
||||
::service::raft_group0_client&,
|
||||
::service::migration_manager&,
|
||||
auth::cache&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
|
||||
auth::cache&,
|
||||
utils::alien_worker&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
|
||||
|
||||
static const class_registrator<
|
||||
auth::authorizer,
|
||||
|
||||
@@ -859,7 +859,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'utils/alien_worker.cc',
|
||||
'utils/array-search.cc',
|
||||
'utils/base64.cc',
|
||||
'utils/crypt_sha512.cc',
|
||||
'utils/logalloc.cc',
|
||||
'utils/large_bitset.cc',
|
||||
'utils/buffer_input_stream.cc',
|
||||
@@ -1063,6 +1062,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'db/hints/resource_manager.cc',
|
||||
'db/hints/sync_point.cc',
|
||||
'db/large_data_handler.cc',
|
||||
'db/legacy_schema_migrator.cc',
|
||||
'db/marshal/type_parser.cc',
|
||||
'db/per_partition_rate_limit_options.cc',
|
||||
'db/rate_limiter.cc',
|
||||
@@ -1480,6 +1480,7 @@ deps = {
|
||||
|
||||
pure_boost_tests = set([
|
||||
'test/boost/anchorless_list_test',
|
||||
'test/boost/auth_passwords_test',
|
||||
'test/boost/auth_resource_test',
|
||||
'test/boost/big_decimal_test',
|
||||
'test/boost/caching_options_test',
|
||||
@@ -2192,8 +2193,6 @@ def kmiplib():
|
||||
for id in os_ids:
|
||||
if id in { 'centos', 'fedora', 'rhel' }:
|
||||
return 'rhel84'
|
||||
elif id in { 'ubuntu', 'debian' }:
|
||||
return 'ubuntu' # Temporarily use a placeholder for Ubuntu/Debian
|
||||
print('Could not resolve libkmip.a for platform {}'.format(os_ids))
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -1322,10 +1322,6 @@ const std::vector<expr::expression>& statement_restrictions::index_restrictions(
|
||||
return _index_restrictions;
|
||||
}
|
||||
|
||||
bool statement_restrictions::is_empty() const {
|
||||
return !_where.has_value();
|
||||
}
|
||||
|
||||
// Current score table:
|
||||
// local and restrictions include full partition key: 2
|
||||
// global: 1
|
||||
|
||||
@@ -408,8 +408,6 @@ public:
|
||||
|
||||
/// Checks that the primary key restrictions don't contain null values, throws invalid_request_exception otherwise.
|
||||
void validate_primary_key(const query_options& options) const;
|
||||
|
||||
bool is_empty() const;
|
||||
};
|
||||
|
||||
statement_restrictions analyze_statement_restrictions(
|
||||
|
||||
@@ -331,7 +331,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
|
||||
if (!cl_for_paxos) [[unlikely]] {
|
||||
return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(std::move(cl_for_paxos).assume_error());
|
||||
}
|
||||
std::unique_ptr<cas_request> request;
|
||||
seastar::shared_ptr<cas_request> request;
|
||||
schema_ptr schema;
|
||||
|
||||
db::timeout_clock::time_point now = db::timeout_clock::now();
|
||||
@@ -354,9 +354,9 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
|
||||
if (keys.empty()) {
|
||||
continue;
|
||||
}
|
||||
if (!request) {
|
||||
if (request.get() == nullptr) {
|
||||
schema = statement.s;
|
||||
request = std::make_unique<cas_request>(schema, std::move(keys));
|
||||
request = seastar::make_shared<cas_request>(schema, std::move(keys));
|
||||
} else if (keys.size() != 1 || keys.front().equal(request->key().front(), dht::ring_position_comparator(*schema)) == false) {
|
||||
throw exceptions::invalid_request_exception("BATCH with conditions cannot span multiple partitions");
|
||||
}
|
||||
@@ -366,7 +366,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
|
||||
|
||||
request->add_row_update(statement, std::move(ranges), std::move(json_cache), statement_options);
|
||||
}
|
||||
if (!request) {
|
||||
if (request.get() == nullptr) {
|
||||
throw exceptions::invalid_request_exception(format("Unrestricted partition key in a conditional BATCH"));
|
||||
}
|
||||
|
||||
@@ -377,10 +377,9 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
|
||||
);
|
||||
}
|
||||
|
||||
auto* request_ptr = request.get();
|
||||
return qp.proxy().cas(schema, std::move(cas_shard), *request_ptr, request->read_command(qp), request->key(),
|
||||
return qp.proxy().cas(schema, std::move(cas_shard), request, request->read_command(qp), request->key(),
|
||||
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
|
||||
std::move(cl_for_paxos).assume_value(), cl_for_learn, batch_timeout, cas_timeout).then([this, request = std::move(request)] (bool is_applied) {
|
||||
std::move(cl_for_paxos).assume_value(), cl_for_learn, batch_timeout, cas_timeout).then([this, request] (bool is_applied) {
|
||||
return request->build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "types/map.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "keys/clustering_interval_set.hh"
|
||||
|
||||
namespace cql3::statements {
|
||||
|
||||
@@ -87,8 +88,9 @@ lw_shared_ptr<query::read_command> cas_request::read_command(query_processor& qp
|
||||
ranges.emplace_back(query::clustering_range::make_open_ended_both_sides());
|
||||
max_rows = 1;
|
||||
} else {
|
||||
// WARNING: clustering_range::deoverlap can return incorrect results - refer to scylladb#22817 and scylladb#21604
|
||||
ranges = query::clustering_range::deoverlap(std::move(ranges), clustering_key::tri_compare(*_schema));
|
||||
// Use clustering_interval_set to correctly deoverlap ranges (fixes scylladb#22817 and scylladb#21604)
|
||||
clustering_interval_set interval_set(*_schema, ranges);
|
||||
ranges = interval_set.to_clustering_row_ranges();
|
||||
}
|
||||
auto options = update_parameters::options;
|
||||
options.set(query::partition_slice::option::always_return_static_content);
|
||||
|
||||
@@ -401,8 +401,7 @@ modification_statement::execute_with_condition(query_processor& qp, service::que
|
||||
type.is_update() ? "update" : "deletion"));
|
||||
}
|
||||
|
||||
auto request = std::make_unique<cas_request>(s, std::move(keys));
|
||||
auto* request_ptr = request.get();
|
||||
auto request = seastar::make_shared<cas_request>(s, std::move(keys));
|
||||
// cas_request can be used for batches as well single statements; Here we have just a single
|
||||
// modification in the list of CAS commands, since we're handling single-statement execution.
|
||||
request->add_row_update(*this, std::move(ranges), std::move(json_cache), options);
|
||||
@@ -428,9 +427,9 @@ modification_statement::execute_with_condition(query_processor& qp, service::que
|
||||
tablet_info = erm->check_locality(token);
|
||||
}
|
||||
|
||||
return qp.proxy().cas(s, std::move(cas_shard), *request_ptr, request->read_command(qp), request->key(),
|
||||
return qp.proxy().cas(s, std::move(cas_shard), request, request->read_command(qp), request->key(),
|
||||
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
|
||||
std::move(cl_for_paxos).assume_value(), cl_for_learn, statement_timeout, cas_timeout).then([this, request = std::move(request), tablet_replicas = std::move(tablet_info->tablet_replicas), token_range = tablet_info->token_range] (bool is_applied) {
|
||||
std::move(cl_for_paxos).assume_value(), cl_for_learn, statement_timeout, cas_timeout).then([this, request, tablet_replicas = std::move(tablet_info->tablet_replicas), token_range = tablet_info->token_range] (bool is_applied) {
|
||||
auto result = request->build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied);
|
||||
result->add_tablet_info(tablet_replicas, token_range);
|
||||
return result;
|
||||
|
||||
@@ -1976,7 +1976,7 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
|
||||
if (it == indexes.end()) {
|
||||
throw exceptions::invalid_request_exception("ANN ordering by vector requires the column to be indexed using 'vector_index'");
|
||||
}
|
||||
if (index_opt || parameters->allow_filtering() || !(restrictions->is_empty()) || check_needs_allow_filtering_anyway(*restrictions)) {
|
||||
if (index_opt || parameters->allow_filtering() || restrictions->need_filtering() || check_needs_allow_filtering_anyway(*restrictions)) {
|
||||
throw exceptions::invalid_request_exception("ANN ordering by vector does not support filtering");
|
||||
}
|
||||
index_opt = *it;
|
||||
|
||||
@@ -42,11 +42,6 @@ table::get_index_manager() const {
|
||||
return _ops->get_index_manager(*this);
|
||||
}
|
||||
|
||||
db_clock::time_point
|
||||
table::get_truncation_time() const {
|
||||
return _ops->get_truncation_time(*this);
|
||||
}
|
||||
|
||||
lw_shared_ptr<keyspace_metadata>
|
||||
keyspace::metadata() const {
|
||||
return _ops->get_keyspace_metadata(*this);
|
||||
|
||||
@@ -77,7 +77,6 @@ public:
|
||||
schema_ptr schema() const;
|
||||
const std::vector<view_ptr>& views() const;
|
||||
const secondary_index::secondary_index_manager& get_index_manager() const;
|
||||
db_clock::time_point get_truncation_time() const;
|
||||
};
|
||||
|
||||
class keyspace {
|
||||
|
||||
@@ -27,7 +27,6 @@ public:
|
||||
virtual std::optional<table> try_find_table(database db, table_id id) const = 0;
|
||||
virtual const secondary_index::secondary_index_manager& get_index_manager(table t) const = 0;
|
||||
virtual schema_ptr get_table_schema(table t) const = 0;
|
||||
virtual db_clock::time_point get_truncation_time(table t) const = 0;
|
||||
virtual lw_shared_ptr<keyspace_metadata> get_keyspace_metadata(keyspace ks) const = 0;
|
||||
virtual bool is_internal(keyspace ks) const = 0;
|
||||
virtual const locator::abstract_replication_strategy& get_replication_strategy(keyspace ks) const = 0;
|
||||
|
||||
@@ -10,6 +10,7 @@ target_sources(db
|
||||
schema_applier.cc
|
||||
schema_tables.cc
|
||||
cql_type_parser.cc
|
||||
legacy_schema_migrator.cc
|
||||
commitlog/commitlog.cc
|
||||
commitlog/commitlog_replayer.cc
|
||||
commitlog/commitlog_entry.cc
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "mutation/mutation.hh"
|
||||
#include "utils/UUID.hh"
|
||||
|
||||
namespace db {
|
||||
|
||||
mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db_clock::time_point now, const utils::UUID& id);
|
||||
|
||||
mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clock::time_point now, const utils::UUID& id);
|
||||
|
||||
}
|
||||
@@ -10,7 +10,6 @@
|
||||
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
#include <ranges>
|
||||
#include <seastar/core/future-util.hh>
|
||||
#include <seastar/core/do_with.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
@@ -19,14 +18,12 @@
|
||||
#include <seastar/core/sleep.hh>
|
||||
|
||||
#include "batchlog_manager.hh"
|
||||
#include "batchlog.hh"
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "system_keyspace.hh"
|
||||
#include "utils/rate_limiter.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "utils/murmur_hash.hh"
|
||||
#include "db_clock.hh"
|
||||
#include "unimplemented.hh"
|
||||
#include "idl/frozen_schema.dist.hh"
|
||||
@@ -36,94 +33,17 @@
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "service_permit.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "replica/database.hh"
|
||||
|
||||
static logging::logger blogger("batchlog_manager");
|
||||
|
||||
namespace db {
|
||||
|
||||
// Yields 256 batchlog shards. Even on the largest nodes we currently run on,
|
||||
// this should be enough to give every core a batchlog partition.
|
||||
static constexpr unsigned batchlog_shard_bits = 8;
|
||||
|
||||
int32_t batchlog_shard_of(db_clock::time_point written_at) {
|
||||
const int64_t count = written_at.time_since_epoch().count();
|
||||
std::array<uint64_t, 2> result;
|
||||
utils::murmur_hash::hash3_x64_128(bytes_view(reinterpret_cast<const signed char*>(&count), sizeof(count)), 0, result);
|
||||
uint64_t hash = result[0] ^ result[1];
|
||||
return hash & ((1ULL << batchlog_shard_bits) - 1);
|
||||
}
|
||||
|
||||
std::pair<partition_key, clustering_key>
|
||||
get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
|
||||
auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});
|
||||
|
||||
std::vector<bytes> ckey_components;
|
||||
ckey_components.reserve(2);
|
||||
ckey_components.push_back(serialized(written_at));
|
||||
if (id) {
|
||||
ckey_components.push_back(serialized(*id));
|
||||
}
|
||||
auto ckey = clustering_key::from_exploded(schema, ckey_components);
|
||||
|
||||
return {std::move(pkey), std::move(ckey)};
|
||||
}
|
||||
|
||||
std::pair<partition_key, clustering_key>
|
||||
get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, db_clock::time_point written_at, std::optional<utils::UUID> id) {
|
||||
return get_batchlog_key(schema, version, stage, batchlog_shard_of(written_at), written_at, id);
|
||||
}
|
||||
|
||||
mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
|
||||
auto [key, ckey] = get_batchlog_key(*schema, version, stage, now, id);
|
||||
|
||||
auto timestamp = api::new_timestamp();
|
||||
|
||||
mutation m(schema, key);
|
||||
// Avoid going through data_value and therefore `bytes`, as it can be large (#24809).
|
||||
auto cdef_data = schema->get_column_definition(to_bytes("data"));
|
||||
m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));
|
||||
|
||||
return m;
|
||||
}
|
||||
|
||||
mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
|
||||
auto data = [&mutations] {
|
||||
utils::chunked_vector<canonical_mutation> fm(mutations.begin(), mutations.end());
|
||||
bytes_ostream out;
|
||||
for (auto& m : fm) {
|
||||
ser::serialize(out, m);
|
||||
}
|
||||
return std::move(out).to_managed_bytes();
|
||||
}();
|
||||
|
||||
return get_batchlog_mutation_for(std::move(schema), std::move(data), version, stage, now, id);
|
||||
}
|
||||
|
||||
mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db_clock::time_point now, const utils::UUID& id) {
|
||||
return get_batchlog_mutation_for(std::move(schema), mutations, version, batchlog_stage::initial, now, id);
|
||||
}
|
||||
|
||||
mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
|
||||
auto [key, ckey] = get_batchlog_key(*schema, version, stage, now, id);
|
||||
mutation m(schema, key);
|
||||
auto timestamp = api::new_timestamp();
|
||||
m.partition().apply_delete(*schema, ckey, tombstone(timestamp, gc_clock::now()));
|
||||
return m;
|
||||
}
|
||||
|
||||
mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clock::time_point now, const utils::UUID& id) {
|
||||
return get_batchlog_delete_mutation(std::move(schema), version, batchlog_stage::initial, now, id);
|
||||
}
|
||||
|
||||
} // namespace db
|
||||
|
||||
const std::chrono::seconds db::batchlog_manager::replay_interval;
|
||||
const uint32_t db::batchlog_manager::page_size;
|
||||
|
||||
db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
|
||||
: _qp(qp)
|
||||
, _sys_ks(sys_ks)
|
||||
, _replay_timeout(config.replay_timeout)
|
||||
, _write_request_timeout(std::chrono::duration_cast<db_clock::duration>(config.write_request_timeout))
|
||||
, _replay_rate(config.replay_rate)
|
||||
, _delay(config.delay)
|
||||
, _replay_cleanup_after_replays(config.replay_cleanup_after_replays)
|
||||
@@ -232,75 +152,18 @@ future<> db::batchlog_manager::stop() {
|
||||
}
|
||||
|
||||
future<size_t> db::batchlog_manager::count_all_batches() const {
|
||||
sstring query = format("SELECT count(*) FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
|
||||
sstring query = format("SELECT count(*) FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG);
|
||||
return _qp.execute_internal(query, cql3::query_processor::cache_internal::yes).then([](::shared_ptr<cql3::untyped_result_set> rs) {
|
||||
return size_t(rs->one().get_as<int64_t>("count"));
|
||||
});
|
||||
}
|
||||
|
||||
future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
|
||||
if (_migration_done) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return with_gate(_gate, [this] () mutable -> future<> {
|
||||
blogger.info("Migrating batchlog entries from v1 -> v2");
|
||||
|
||||
auto schema_v1 = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
|
||||
auto schema_v2 = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
|
||||
|
||||
auto batch = [this, schema_v1, schema_v2] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
// check version of serialization format
|
||||
if (!row.has("version")) {
|
||||
blogger.warn("Not migrating logged batch because of unknown version");
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
|
||||
auto version = row.get_as<int32_t>("version");
|
||||
if (version != netw::messaging_service::current_version) {
|
||||
blogger.warn("Not migrating logged batch because of incorrect version");
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
|
||||
auto id = row.get_as<utils::UUID>("id");
|
||||
auto written_at = row.get_as<db_clock::time_point>("written_at");
|
||||
auto data = row.get_blob_fragmented("data");
|
||||
|
||||
auto& sp = _qp.proxy();
|
||||
|
||||
utils::get_local_injector().inject("batchlog_manager_fail_migration", [] { throw std::runtime_error("Error injection: failing batchlog migration"); });
|
||||
|
||||
auto migrate_mut = get_batchlog_mutation_for(schema_v2, std::move(data), version, batchlog_stage::failed_replay, written_at, id);
|
||||
co_await sp.mutate_locally(migrate_mut, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
|
||||
mutation delete_mut(schema_v1, partition_key::from_single_value(*schema_v1, serialized(id)));
|
||||
delete_mut.partition().apply_delete(*schema_v1, clustering_key_prefix::make_empty(), tombstone(api::new_timestamp(), gc_clock::now()));
|
||||
co_await sp.mutate_locally(delete_mut, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
|
||||
co_return stop_iteration::no;
|
||||
};
|
||||
try {
|
||||
co_await _qp.query_internal(
|
||||
format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
|
||||
db::consistency_level::ONE,
|
||||
{},
|
||||
page_size,
|
||||
std::move(batch));
|
||||
} catch (...) {
|
||||
blogger.warn("Batchlog v1 to v2 migration failed: {}; will retry", std::current_exception());
|
||||
co_return;
|
||||
}
|
||||
|
||||
co_await container().invoke_on_all([] (auto& bm) {
|
||||
bm._migration_done = true;
|
||||
});
|
||||
|
||||
blogger.info("Done migrating batchlog entries from v1 -> v2");
|
||||
});
|
||||
db_clock::duration db::batchlog_manager::get_batch_log_timeout() const {
|
||||
// enough time for the actual write + BM removal mutation
|
||||
return _write_request_timeout * 2;
|
||||
}
|
||||
|
||||
future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
|
||||
co_await maybe_migrate_v1_to_v2();
|
||||
|
||||
typedef db_clock::rep clock_type;
|
||||
|
||||
db::all_batches_replayed all_replayed = all_batches_replayed::yes;
|
||||
@@ -309,26 +172,21 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
|
||||
auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
|
||||
auto limiter = make_lw_shared<utils::rate_limiter>(throttle);
|
||||
|
||||
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
|
||||
|
||||
struct replay_stats {
|
||||
std::optional<db_clock::time_point> min_too_fresh;
|
||||
bool need_cleanup = false;
|
||||
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
|
||||
auto delete_batch = [this, schema = std::move(schema)] (utils::UUID id) {
|
||||
auto key = partition_key::from_singular(*schema, id);
|
||||
mutation m(schema, key);
|
||||
auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
|
||||
m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
|
||||
return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
};
|
||||
|
||||
std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
|
||||
|
||||
// Use a stable `now` accross all batches, so skip/replay decisions are the
|
||||
// same accross a while prefix of written_at (accross all ids).
|
||||
const auto now = db_clock::now();
|
||||
|
||||
auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
|
||||
const auto batch_shard = row.get_as<int32_t>("shard");
|
||||
auto batch = [this, limiter, delete_batch = std::move(delete_batch), &all_replayed](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
auto written_at = row.get_as<db_clock::time_point>("written_at");
|
||||
auto id = row.get_as<utils::UUID>("id");
|
||||
// enough time for the actual write + batchlog entry mutation delivery (two separate requests).
|
||||
auto timeout = _replay_timeout;
|
||||
auto now = db_clock::now();
|
||||
auto timeout = get_batch_log_timeout();
|
||||
|
||||
if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
|
||||
blogger.debug("Skipping batch replay due to skip_batch_replay injection");
|
||||
@@ -336,48 +194,52 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
|
||||
// check version of serialization format
|
||||
if (!row.has("version")) {
|
||||
blogger.warn("Skipping logged batch because of unknown version");
|
||||
co_await delete_batch(id);
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
|
||||
auto version = row.get_as<int32_t>("version");
|
||||
if (version != netw::messaging_service::current_version) {
|
||||
blogger.warn("Skipping logged batch because of incorrect version {}; current version = {}", version, netw::messaging_service::current_version);
|
||||
co_await delete_batch(id);
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
|
||||
auto data = row.get_blob_unfragmented("data");
|
||||
|
||||
blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
|
||||
|
||||
utils::chunked_vector<mutation> mutations;
|
||||
bool send_failed = false;
|
||||
|
||||
auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
|
||||
blogger.debug("Replaying batch {}", id);
|
||||
|
||||
try {
|
||||
utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
|
||||
auto fms = make_lw_shared<std::deque<canonical_mutation>>();
|
||||
auto in = ser::as_input_stream(data);
|
||||
while (in.size()) {
|
||||
auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
|
||||
const auto tbl = _qp.db().try_find_table(fm.column_family_id());
|
||||
if (!tbl) {
|
||||
continue;
|
||||
}
|
||||
if (written_at <= tbl->get_truncation_time()) {
|
||||
continue;
|
||||
}
|
||||
schema_ptr s = tbl->schema();
|
||||
if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
|
||||
timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
|
||||
}
|
||||
fms.emplace_back(std::move(fm), std::move(s));
|
||||
fms->emplace_back(ser::deserialize(in, std::type_identity<canonical_mutation>()));
|
||||
schema_ptr s = _qp.db().find_schema(fms->back().column_family_id());
|
||||
timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
|
||||
}
|
||||
|
||||
if (now < written_at + timeout) {
|
||||
blogger.debug("Skipping replay of {}, too fresh", id);
|
||||
|
||||
shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
|
||||
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
|
||||
auto size = data.size();
|
||||
|
||||
for (const auto& [fm, s] : fms) {
|
||||
mutations.emplace_back(fm.to_mutation(s));
|
||||
co_await maybe_yield();
|
||||
}
|
||||
auto mutations = co_await map_reduce(*fms, [this, written_at] (canonical_mutation& fm) {
|
||||
const auto& cf = _qp.proxy().local_db().find_column_family(fm.column_family_id());
|
||||
return make_ready_future<canonical_mutation*>(written_at > cf.get_truncation_time() ? &fm : nullptr);
|
||||
},
|
||||
utils::chunked_vector<mutation>(),
|
||||
[this] (utils::chunked_vector<mutation> mutations, canonical_mutation* fm) {
|
||||
if (fm) {
|
||||
schema_ptr s = _qp.db().find_schema(fm->column_family_id());
|
||||
mutations.emplace_back(fm->to_mutation(s));
|
||||
}
|
||||
return mutations;
|
||||
});
|
||||
|
||||
if (!mutations.empty()) {
|
||||
const auto ttl = [written_at]() -> clock_type {
|
||||
@@ -403,11 +265,7 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
|
||||
co_await limiter->reserve(size);
|
||||
_stats.write_attempts += mutations.size();
|
||||
auto timeout = db::timeout_clock::now() + write_timeout;
|
||||
if (cleanup) {
|
||||
co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
|
||||
} else {
|
||||
co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
|
||||
}
|
||||
co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
|
||||
}
|
||||
}
|
||||
} catch (data_dictionary::no_such_keyspace& ex) {
|
||||
@@ -421,80 +279,31 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
|
||||
// Do _not_ remove the batch, assuning we got a node write error.
|
||||
// Since we don't have hints (which origin is satisfied with),
|
||||
// we have to resort to keeping this batch to next lap.
|
||||
if (!cleanup || stage == batchlog_stage::failed_replay) {
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
send_failed = true;
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
|
||||
auto& sp = _qp.proxy();
|
||||
|
||||
if (send_failed) {
|
||||
blogger.debug("Moving batch {} to stage failed_replay", id);
|
||||
auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
|
||||
co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
}
|
||||
|
||||
// delete batch
|
||||
auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
|
||||
co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
|
||||
shard_written_at.need_cleanup = true;
|
||||
|
||||
co_await delete_batch(id);
|
||||
co_return stop_iteration::no;
|
||||
};
|
||||
|
||||
co_await with_gate(_gate, [this, cleanup, &all_replayed, batch = std::move(batch), now, &replay_stats_per_shard] () mutable -> future<> {
|
||||
blogger.debug("Started replayAllFailedBatches with cleanup: {}", cleanup);
|
||||
co_await with_gate(_gate, [this, cleanup, batch = std::move(batch)] () mutable -> future<> {
|
||||
blogger.debug("Started replayAllFailedBatches (cpu {})", this_shard_id());
|
||||
co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
|
||||
|
||||
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
|
||||
|
||||
co_await coroutine::parallel_for_each(std::views::iota(0, 16), [&] (int32_t chunk) -> future<> {
|
||||
const int32_t batchlog_chunk_base = chunk * 16;
|
||||
for (int32_t i = 0; i < 16; ++i) {
|
||||
int32_t batchlog_shard = batchlog_chunk_base + i;
|
||||
|
||||
co_await _qp.query_internal(
|
||||
format("SELECT * FROM {}.{} WHERE version = ? AND stage = ? AND shard = ? BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2),
|
||||
db::consistency_level::ONE,
|
||||
{data_value(netw::messaging_service::current_version), data_value(int8_t(batchlog_stage::failed_replay)), data_value(batchlog_shard)},
|
||||
page_size,
|
||||
batch);
|
||||
|
||||
co_await _qp.query_internal(
|
||||
format("SELECT * FROM {}.{} WHERE version = ? AND stage = ? AND shard = ? BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2),
|
||||
db::consistency_level::ONE,
|
||||
{data_value(netw::messaging_service::current_version), data_value(int8_t(batchlog_stage::initial)), data_value(batchlog_shard)},
|
||||
page_size,
|
||||
batch);
|
||||
|
||||
if (cleanup != post_replay_cleanup::yes) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto it = replay_stats_per_shard.find(batchlog_shard);
|
||||
if (it == replay_stats_per_shard.end() || !it->second.need_cleanup) {
|
||||
// Nothing was replayed on this batchlog shard, nothing to cleanup.
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto write_time = it->second.min_too_fresh.value_or(now - _replay_timeout);
|
||||
const auto end_weight = it->second.min_too_fresh ? bound_weight::before_all_prefixed : bound_weight::after_all_prefixed;
|
||||
auto [key, ckey] = get_batchlog_key(*schema, netw::messaging_service::current_version, batchlog_stage::initial, batchlog_shard, write_time, {});
|
||||
auto end_pos = position_in_partition(partition_region::clustered, end_weight, std::move(ckey));
|
||||
|
||||
range_tombstone rt(position_in_partition::before_all_clustered_rows(), std::move(end_pos), tombstone(api::new_timestamp(), gc_clock::now()));
|
||||
|
||||
blogger.trace("Clean up batchlog shard {} with range tombstone {}", batchlog_shard, rt);
|
||||
|
||||
mutation m(schema, key);
|
||||
m.partition().apply_row_tombstone(*schema, std::move(rt));
|
||||
co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
co_await _qp.query_internal(
|
||||
format("SELECT id, data, written_at, version FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
|
||||
db::consistency_level::ONE,
|
||||
{},
|
||||
page_size,
|
||||
std::move(batch)).then([this, cleanup] {
|
||||
if (cleanup == post_replay_cleanup::no) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
// Replaying batches could have generated tombstones, flush to disk,
|
||||
// where they can be compacted away.
|
||||
return replica::database::flush_table_on_all_shards(_qp.proxy().get_db(), system_keyspace::NAME, system_keyspace::BATCHLOG);
|
||||
}).then([] {
|
||||
blogger.debug("Finished replayAllFailedBatches");
|
||||
});
|
||||
|
||||
blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
|
||||
});
|
||||
|
||||
co_return all_replayed;
|
||||
|
||||
@@ -34,17 +34,12 @@ class system_keyspace;
|
||||
using all_batches_replayed = bool_class<struct all_batches_replayed_tag>;
|
||||
|
||||
struct batchlog_manager_config {
|
||||
db_clock::duration replay_timeout;
|
||||
std::chrono::duration<double> write_request_timeout;
|
||||
uint64_t replay_rate = std::numeric_limits<uint64_t>::max();
|
||||
std::chrono::milliseconds delay = std::chrono::milliseconds(0);
|
||||
unsigned replay_cleanup_after_replays;
|
||||
};
|
||||
|
||||
enum class batchlog_stage : int8_t {
|
||||
initial,
|
||||
failed_replay
|
||||
};
|
||||
|
||||
class batchlog_manager : public peering_sharded_service<batchlog_manager> {
|
||||
public:
|
||||
using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;
|
||||
@@ -64,7 +59,7 @@ private:
|
||||
|
||||
cql3::query_processor& _qp;
|
||||
db::system_keyspace& _sys_ks;
|
||||
db_clock::duration _replay_timeout;
|
||||
db_clock::duration _write_request_timeout;
|
||||
uint64_t _replay_rate;
|
||||
std::chrono::milliseconds _delay;
|
||||
unsigned _replay_cleanup_after_replays = 100;
|
||||
@@ -76,14 +71,6 @@ private:
|
||||
|
||||
gc_clock::time_point _last_replay;
|
||||
|
||||
// Was the v1 -> v2 migration already done since last restart?
|
||||
// The migration is attempted once after each restart. This is redundant but
|
||||
// keeps thing simple. Once no upgrade path exists from a ScyllaDB version
|
||||
// which can still produce v1 entries, this migration code can be removed.
|
||||
bool _migration_done = false;
|
||||
|
||||
future<> maybe_migrate_v1_to_v2();
|
||||
|
||||
future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
|
||||
public:
|
||||
// Takes a QP, not a distributes. Because this object is supposed
|
||||
@@ -98,13 +85,10 @@ public:
|
||||
future<all_batches_replayed> do_batch_log_replay(post_replay_cleanup cleanup);
|
||||
|
||||
future<size_t> count_all_batches() const;
|
||||
db_clock::duration get_batch_log_timeout() const;
|
||||
gc_clock::time_point get_last_replay() const {
|
||||
return _last_replay;
|
||||
}
|
||||
|
||||
const stats& stats() const {
|
||||
return _stats;
|
||||
}
|
||||
private:
|
||||
future<> batchlog_replay_loop();
|
||||
};
|
||||
|
||||
@@ -54,14 +54,12 @@ public:
|
||||
uint64_t applied_mutations = 0;
|
||||
uint64_t corrupt_bytes = 0;
|
||||
uint64_t truncated_at = 0;
|
||||
uint64_t broken_files = 0;
|
||||
|
||||
stats& operator+=(const stats& s) {
|
||||
invalid_mutations += s.invalid_mutations;
|
||||
skipped_mutations += s.skipped_mutations;
|
||||
applied_mutations += s.applied_mutations;
|
||||
corrupt_bytes += s.corrupt_bytes;
|
||||
broken_files += s.broken_files;
|
||||
return *this;
|
||||
}
|
||||
stats operator+(const stats& s) const {
|
||||
@@ -194,8 +192,6 @@ db::commitlog_replayer::impl::recover(const commitlog::descriptor& d, const comm
|
||||
s->corrupt_bytes += e.bytes();
|
||||
} catch (commitlog::segment_truncation& e) {
|
||||
s->truncated_at = e.position();
|
||||
} catch (commitlog::header_checksum_error&) {
|
||||
++s->broken_files;
|
||||
} catch (...) {
|
||||
throw;
|
||||
}
|
||||
@@ -374,9 +370,6 @@ future<> db::commitlog_replayer::recover(std::vector<sstring> files, sstring fna
|
||||
if (stats.truncated_at != 0) {
|
||||
rlogger.warn("Truncated file: {} at position {}.", f, stats.truncated_at);
|
||||
}
|
||||
if (stats.broken_files != 0) {
|
||||
rlogger.warn("Corrupted file header: {}. Skipped.", f);
|
||||
}
|
||||
rlogger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
|
||||
, f
|
||||
, stats.applied_mutations
|
||||
|
||||
@@ -1152,7 +1152,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Number of threads with which to deliver hints. In multiple data-center deployments, consider increasing this number because cross data-center handoff is generally slower.")
|
||||
, batchlog_replay_throttle_in_kb(this, "batchlog_replay_throttle_in_kb", value_status::Unused, 1024,
|
||||
"Total maximum throttle. Throttling is reduced proportionally to the number of nodes in the cluster.")
|
||||
, batchlog_replay_cleanup_after_replays(this, "batchlog_replay_cleanup_after_replays", liveness::LiveUpdate, value_status::Used, 1,
|
||||
, batchlog_replay_cleanup_after_replays(this, "batchlog_replay_cleanup_after_replays", liveness::LiveUpdate, value_status::Used, 60,
|
||||
"Clean up batchlog memtable after every N replays. Replays are issued on a timer, every 60 seconds. So if batchlog_replay_cleanup_after_replays is set to 60, the batchlog memtable is flushed every 60 * 60 seconds.")
|
||||
/**
|
||||
* @Group Request scheduler properties
|
||||
|
||||
602
db/legacy_schema_migrator.cc
Normal file
602
db/legacy_schema_migrator.cc
Normal file
@@ -0,0 +1,602 @@
|
||||
/*
|
||||
* Modified by ScyllaDB
|
||||
* Copyright (C) 2017-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
// Since Scylla 2.0, we use system tables whose schemas were introduced in
|
||||
// Cassandra 3. If Scylla boots to find a data directory with system tables
|
||||
// with older schemas - produced by pre-2.0 Scylla or by pre-3.0 Cassandra,
|
||||
// we need to migrate these old tables to the new format.
|
||||
//
|
||||
// We provide here a function, db::legacy_schema_migrator::migrate(),
|
||||
// for a one-time migration from old to new system tables. The function
|
||||
// reads old system tables, write them back in the new format, and finally
|
||||
// delete the old system tables. Scylla's main should call this function and
|
||||
// wait for the returned future, before starting to serve the database.
|
||||
|
||||
#include <boost/iterator/filter_iterator.hpp>
|
||||
#include <seastar/core/future-util.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
#include <map>
|
||||
#include <unordered_set>
|
||||
#include <chrono>
|
||||
|
||||
#include "replica/database.hh"
|
||||
#include "legacy_schema_migrator.hh"
|
||||
#include "system_keyspace.hh"
|
||||
#include "schema_tables.hh"
|
||||
#include "schema/schema_builder.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "utils/rjson.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include "cql3/statements/property_definitions.hh"
|
||||
|
||||
static seastar::logger mlogger("legacy_schema_migrator");
|
||||
|
||||
namespace db {
|
||||
namespace legacy_schema_migrator {
|
||||
|
||||
// local data carriers
|
||||
|
||||
class migrator {
|
||||
public:
|
||||
static const std::unordered_set<sstring> legacy_schema_tables;
|
||||
|
||||
migrator(sharded<service::storage_proxy>& sp, sharded<replica::database>& db, sharded<db::system_keyspace>& sys_ks, cql3::query_processor& qp)
|
||||
: _sp(sp), _db(db), _sys_ks(sys_ks), _qp(qp) {
|
||||
}
|
||||
migrator(migrator&&) = default;
|
||||
|
||||
typedef db_clock::time_point time_point;
|
||||
|
||||
// TODO: we don't support triggers.
|
||||
// this is a placeholder.
|
||||
struct trigger {
|
||||
time_point timestamp;
|
||||
sstring name;
|
||||
std::unordered_map<sstring, sstring> options;
|
||||
};
|
||||
|
||||
struct table {
|
||||
time_point timestamp;
|
||||
schema_ptr metadata;
|
||||
std::vector<trigger> triggers;
|
||||
};
|
||||
|
||||
struct type {
|
||||
time_point timestamp;
|
||||
user_type metadata;
|
||||
};
|
||||
|
||||
struct function {
|
||||
time_point timestamp;
|
||||
sstring ks_name;
|
||||
sstring fn_name;
|
||||
std::vector<sstring> arg_names;
|
||||
std::vector<sstring> arg_types;
|
||||
sstring return_type;
|
||||
bool called_on_null_input;
|
||||
sstring language;
|
||||
sstring body;
|
||||
};
|
||||
|
||||
struct aggregate {
|
||||
time_point timestamp;
|
||||
sstring ks_name;
|
||||
sstring fn_name;
|
||||
std::vector<sstring> arg_names;
|
||||
std::vector<sstring> arg_types;
|
||||
sstring return_type;
|
||||
sstring final_func;
|
||||
sstring initcond;
|
||||
sstring state_func;
|
||||
sstring state_type;
|
||||
};
|
||||
|
||||
struct keyspace {
|
||||
time_point timestamp;
|
||||
sstring name;
|
||||
bool durable_writes;
|
||||
std::map<sstring, sstring> replication_params;
|
||||
|
||||
std::vector<table> tables;
|
||||
std::vector<type> types;
|
||||
std::vector<function> functions;
|
||||
std::vector<aggregate> aggregates;
|
||||
};
|
||||
|
||||
class unsupported_feature : public std::runtime_error {
|
||||
public:
|
||||
using runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
static sstring fmt_query(const char* fmt, const char* table) {
|
||||
return fmt::format(fmt::runtime(fmt), db::system_keyspace::NAME, table);
|
||||
}
|
||||
|
||||
typedef ::shared_ptr<cql3::untyped_result_set> result_set_type;
|
||||
typedef const cql3::untyped_result_set::row row_type;
|
||||
|
||||
future<> read_table(keyspace& dst, sstring cf_name, time_point timestamp) {
|
||||
auto fmt = "SELECT * FROM {}.{} WHERE keyspace_name = ? AND columnfamily_name = ?";
|
||||
auto tq = fmt_query(fmt, db::system_keyspace::legacy::COLUMNFAMILIES);
|
||||
auto cq = fmt_query(fmt, db::system_keyspace::legacy::COLUMNS);
|
||||
auto zq = fmt_query(fmt, db::system_keyspace::legacy::TRIGGERS);
|
||||
|
||||
typedef std::tuple<future<result_set_type>, future<result_set_type>, future<result_set_type>, future<db::schema_tables::legacy::schema_mutations>> result_tuple;
|
||||
|
||||
return when_all(_qp.execute_internal(tq, { dst.name, cf_name }, cql3::query_processor::cache_internal::yes),
|
||||
_qp.execute_internal(cq, { dst.name, cf_name }, cql3::query_processor::cache_internal::yes),
|
||||
_qp.execute_internal(zq, { dst.name, cf_name }, cql3::query_processor::cache_internal::yes),
|
||||
db::schema_tables::legacy::read_table_mutations(_sp, dst.name, cf_name, db::system_keyspace::legacy::column_families()))
|
||||
.then([&dst, cf_name, timestamp](result_tuple&& t) {
|
||||
|
||||
result_set_type tables = std::get<0>(t).get();
|
||||
result_set_type columns = std::get<1>(t).get();
|
||||
result_set_type triggers = std::get<2>(t).get();
|
||||
db::schema_tables::legacy::schema_mutations sm = std::get<3>(t).get();
|
||||
|
||||
row_type& td = tables->one();
|
||||
|
||||
auto ks_name = td.get_as<sstring>("keyspace_name");
|
||||
auto cf_name = td.get_as<sstring>("columnfamily_name");
|
||||
auto id = table_id(td.get_or("cf_id", generate_legacy_id(ks_name, cf_name).uuid()));
|
||||
|
||||
schema_builder builder(dst.name, cf_name, id);
|
||||
|
||||
builder.with_version(sm.digest());
|
||||
|
||||
cf_type cf = sstring_to_cf_type(td.get_or("type", sstring("standard")));
|
||||
if (cf == cf_type::super) {
|
||||
fail(unimplemented::cause::SUPER);
|
||||
}
|
||||
|
||||
auto comparator = td.get_as<sstring>("comparator");
|
||||
bool is_compound = cell_comparator::check_compound(comparator);
|
||||
builder.set_is_compound(is_compound);
|
||||
cell_comparator::read_collections(builder, comparator);
|
||||
|
||||
bool filter_sparse = false;
|
||||
|
||||
data_type default_validator = {};
|
||||
if (td.has("default_validator")) {
|
||||
default_validator = db::schema_tables::parse_type(td.get_as<sstring>("default_validator"));
|
||||
if (default_validator->is_counter()) {
|
||||
builder.set_is_counter(true);
|
||||
}
|
||||
builder.set_default_validation_class(default_validator);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine whether or not the table is *really* dense
|
||||
* We cannot trust is_dense value of true (see CASSANDRA-11502, that fixed the issue for 2.2 only, and not retroactively),
|
||||
* but we can trust is_dense value of false.
|
||||
*/
|
||||
auto is_dense = td.get_opt<bool>("is_dense");
|
||||
if (!is_dense || *is_dense) {
|
||||
is_dense = [&] {
|
||||
/*
|
||||
* As said above, this method is only here because we need to deal with thrift upgrades.
|
||||
* Once a CF has been "upgraded", i.e. we've rebuilt and save its CQL3 metadata at least once,
|
||||
* then we'll have saved the "is_dense" value and will be good to go.
|
||||
*
|
||||
* But non-upgraded thrift CF (and pre-7744 CF) will have no value for "is_dense", so we need
|
||||
* to infer that information without relying on it in that case. And for the most part this is
|
||||
* easy, a CF that has at least one REGULAR definition is not dense. But the subtlety is that not
|
||||
* having a REGULAR definition may not mean dense because of CQL3 definitions that have only the
|
||||
* PRIMARY KEY defined.
|
||||
*
|
||||
* So we need to recognize those special case CQL3 table with only a primary key. If we have some
|
||||
* clustering columns, we're fine as said above. So the only problem is that we cannot decide for
|
||||
* sure if a CF without REGULAR columns nor CLUSTERING_COLUMN definition is meant to be dense, or if it
|
||||
* has been created in CQL3 by say:
|
||||
* CREATE TABLE test (k int PRIMARY KEY)
|
||||
* in which case it should not be dense. However, we can limit our margin of error by assuming we are
|
||||
* in the latter case only if the comparator is exactly CompositeType(UTF8Type).
|
||||
*/
|
||||
std::optional<column_id> max_cl_idx;
|
||||
const cql3::untyped_result_set::row * regular = nullptr;
|
||||
for (auto& row : *columns) {
|
||||
auto kind_str = row.get_as<sstring>("type");
|
||||
if (kind_str == "compact_value") {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto kind = db::schema_tables::deserialize_kind(kind_str);
|
||||
|
||||
if (kind == column_kind::regular_column) {
|
||||
if (regular != nullptr) {
|
||||
return false;
|
||||
}
|
||||
regular = &row;
|
||||
continue;
|
||||
}
|
||||
if (kind == column_kind::clustering_key) {
|
||||
max_cl_idx = std::max(column_id(row.get_or("component_index", 0)), max_cl_idx.value_or(column_id()));
|
||||
}
|
||||
}
|
||||
|
||||
auto is_cql3_only_pk_comparator = [](const sstring& comparator) {
|
||||
if (!cell_comparator::check_compound(comparator)) {
|
||||
return false;
|
||||
}
|
||||
// CMH. We don't have composites, nor a parser for it. This is a simple way of c
|
||||
// checking the same.
|
||||
auto comma = comparator.find(',');
|
||||
if (comma != sstring::npos) {
|
||||
return false;
|
||||
}
|
||||
auto off = comparator.find('(');
|
||||
auto end = comparator.find(')');
|
||||
|
||||
return comparator.compare(off, end - off, utf8_type->name()) == 0;
|
||||
};
|
||||
|
||||
if (max_cl_idx) {
|
||||
auto n = std::count(comparator.begin(), comparator.end(), ','); // num comp - 1
|
||||
return *max_cl_idx == n;
|
||||
}
|
||||
|
||||
if (regular) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return !is_cql3_only_pk_comparator(comparator);
|
||||
|
||||
}();
|
||||
|
||||
// now, if switched to sparse, remove redundant compact_value column and the last clustering column,
|
||||
// directly copying CASSANDRA-11502 logic. See CASSANDRA-11315.
|
||||
|
||||
filter_sparse = !*is_dense;
|
||||
}
|
||||
builder.set_is_dense(*is_dense);
|
||||
|
||||
auto is_cql = !*is_dense && is_compound;
|
||||
auto is_static_compact = !*is_dense && !is_compound;
|
||||
|
||||
// org.apache.cassandra.schema.LegacySchemaMigrator#isEmptyCompactValueColumn
|
||||
auto is_empty_compact_value = [](const cql3::untyped_result_set::row& column_row) {
|
||||
auto kind_str = column_row.get_as<sstring>("type");
|
||||
// Cassandra only checks for "compact_value", but Scylla generates "regular" instead (#2586)
|
||||
return (kind_str == "compact_value" || kind_str == "regular")
|
||||
&& column_row.get_as<sstring>("column_name").empty();
|
||||
};
|
||||
|
||||
for (auto& row : *columns) {
|
||||
auto kind_str = row.get_as<sstring>("type");
|
||||
auto kind = db::schema_tables::deserialize_kind(kind_str);
|
||||
auto component_index = kind > column_kind::clustering_key ? 0 : column_id(row.get_or("component_index", 0));
|
||||
auto name = row.get_or<sstring>("column_name", sstring());
|
||||
auto validator = db::schema_tables::parse_type(row.get_as<sstring>("validator"));
|
||||
|
||||
if (is_empty_compact_value(row)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (filter_sparse) {
|
||||
if (kind_str == "compact_value") {
|
||||
continue;
|
||||
}
|
||||
if (kind == column_kind::clustering_key) {
|
||||
if (cf == cf_type::super && component_index != 0) {
|
||||
continue;
|
||||
}
|
||||
if (cf != cf_type::super && !is_compound) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<index_metadata_kind> index_kind;
|
||||
sstring index_name;
|
||||
index_options_map options;
|
||||
if (row.has("index_type")) {
|
||||
index_kind = schema_tables::deserialize_index_kind(row.get_as<sstring>("index_type"));
|
||||
}
|
||||
if (row.has("index_name")) {
|
||||
index_name = row.get_as<sstring>("index_name");
|
||||
}
|
||||
if (row.has("index_options")) {
|
||||
sstring index_options_str = row.get_as<sstring>("index_options");
|
||||
options = rjson::parse_to_map<index_options_map>(std::string_view(index_options_str));
|
||||
sstring type;
|
||||
auto i = options.find("index_keys");
|
||||
if (i != options.end()) {
|
||||
options.erase(i);
|
||||
type = "KEYS";
|
||||
}
|
||||
i = options.find("index_keys_and_values");
|
||||
if (i != options.end()) {
|
||||
options.erase(i);
|
||||
type = "KEYS_AND_VALUES";
|
||||
}
|
||||
if (type.empty()) {
|
||||
if (validator->is_collection() && validator->is_multi_cell()) {
|
||||
type = "FULL";
|
||||
} else {
|
||||
type = "VALUES";
|
||||
}
|
||||
}
|
||||
auto column = cql3::util::maybe_quote(name);
|
||||
options["target"] = validator->is_collection()
|
||||
? type + "(" + column + ")"
|
||||
: column;
|
||||
}
|
||||
if (index_kind) {
|
||||
// Origin assumes index_name is always set, so let's do the same
|
||||
builder.with_index(index_metadata(index_name, options, *index_kind, index_metadata::is_local_index::no));
|
||||
}
|
||||
|
||||
data_type column_name_type = [&] {
|
||||
if (is_static_compact && kind == column_kind::regular_column) {
|
||||
return db::schema_tables::parse_type(comparator);
|
||||
}
|
||||
return utf8_type;
|
||||
}();
|
||||
auto column_name = [&] {
|
||||
try {
|
||||
return column_name_type->from_string(name);
|
||||
} catch (marshal_exception&) {
|
||||
// #2597: Scylla < 2.0 writes names in serialized form, try to recover
|
||||
column_name_type->validate(to_bytes_view(name));
|
||||
return to_bytes(name);
|
||||
}
|
||||
}();
|
||||
builder.with_column_ordered(column_definition(std::move(column_name), std::move(validator), kind, component_index));
|
||||
}
|
||||
|
||||
if (is_static_compact) {
|
||||
builder.set_regular_column_name_type(db::schema_tables::parse_type(comparator));
|
||||
}
|
||||
|
||||
if (td.has("gc_grace_seconds")) {
|
||||
builder.set_gc_grace_seconds(td.get_as<int32_t>("gc_grace_seconds"));
|
||||
}
|
||||
if (td.has("min_compaction_threshold")) {
|
||||
builder.set_min_compaction_threshold(td.get_as<int32_t>("min_compaction_threshold"));
|
||||
}
|
||||
if (td.has("max_compaction_threshold")) {
|
||||
builder.set_max_compaction_threshold(td.get_as<int32_t>("max_compaction_threshold"));
|
||||
}
|
||||
if (td.has("comment")) {
|
||||
builder.set_comment(td.get_as<sstring>("comment"));
|
||||
}
|
||||
if (td.has("memtable_flush_period_in_ms")) {
|
||||
builder.set_memtable_flush_period(td.get_as<int32_t>("memtable_flush_period_in_ms"));
|
||||
}
|
||||
if (td.has("caching")) {
|
||||
builder.set_caching_options(caching_options::from_sstring(td.get_as<sstring>("caching")));
|
||||
}
|
||||
if (td.has("default_time_to_live")) {
|
||||
builder.set_default_time_to_live(gc_clock::duration(td.get_as<int32_t>("default_time_to_live")));
|
||||
}
|
||||
if (td.has("speculative_retry")) {
|
||||
builder.set_speculative_retry(td.get_as<sstring>("speculative_retry"));
|
||||
}
|
||||
if (td.has("compaction_strategy_class")) {
|
||||
auto strategy = td.get_as<sstring>("compaction_strategy_class");
|
||||
try {
|
||||
builder.set_compaction_strategy(compaction::compaction_strategy::type(strategy));
|
||||
} catch (const exceptions::configuration_exception& e) {
|
||||
// If compaction strategy class isn't supported, fallback to incremental.
|
||||
mlogger.warn("Falling back to incremental compaction strategy after the problem: {}", e.what());
|
||||
builder.set_compaction_strategy(compaction::compaction_strategy_type::incremental);
|
||||
}
|
||||
}
|
||||
if (td.has("compaction_strategy_options")) {
|
||||
sstring strategy_options_str = td.get_as<sstring>("compaction_strategy_options");
|
||||
builder.set_compaction_strategy_options(rjson::parse_to_map<std::map<sstring, sstring>>(std::string_view(strategy_options_str)));
|
||||
}
|
||||
auto comp_param = td.get_as<sstring>("compression_parameters");
|
||||
compression_parameters cp(rjson::parse_to_map<std::map<sstring, sstring>>(std::string_view(comp_param)));
|
||||
builder.set_compressor_params(cp);
|
||||
|
||||
if (td.has("min_index_interval")) {
|
||||
builder.set_min_index_interval(td.get_as<int32_t>("min_index_interval"));
|
||||
} else if (td.has("index_interval")) { // compatibility
|
||||
builder.set_min_index_interval(td.get_as<int32_t>("index_interval"));
|
||||
}
|
||||
if (td.has("max_index_interval")) {
|
||||
builder.set_max_index_interval(td.get_as<int32_t>("max_index_interval"));
|
||||
}
|
||||
if (td.has("bloom_filter_fp_chance")) {
|
||||
builder.set_bloom_filter_fp_chance(td.get_as<double>("bloom_filter_fp_chance"));
|
||||
} else {
|
||||
builder.set_bloom_filter_fp_chance(builder.get_bloom_filter_fp_chance());
|
||||
}
|
||||
if (td.has("dropped_columns")) {
|
||||
auto map = td.get_map<sstring, int64_t>("dropped_columns");
|
||||
for (auto&& e : map) {
|
||||
builder.without_column(e.first, api::timestamp_type(e.second));
|
||||
};
|
||||
}
|
||||
|
||||
// ignore version. we're transient
|
||||
if (!triggers->empty()) {
|
||||
throw unsupported_feature("triggers");
|
||||
}
|
||||
|
||||
dst.tables.emplace_back(table{timestamp, builder.build() });
|
||||
});
|
||||
}
|
||||
|
||||
future<> read_tables(keyspace& dst) {
|
||||
auto query = fmt_query("SELECT columnfamily_name, writeTime(type) AS timestamp FROM {}.{} WHERE keyspace_name = ?",
|
||||
db::system_keyspace::legacy::COLUMNFAMILIES);
|
||||
return _qp.execute_internal(query, {dst.name}, cql3::query_processor::cache_internal::yes).then([this, &dst](result_set_type result) {
|
||||
return parallel_for_each(*result, [this, &dst](row_type& row) {
|
||||
return read_table(dst, row.get_as<sstring>("columnfamily_name"), row.get_as<time_point>("timestamp"));
|
||||
}).finally([result] {});
|
||||
});
|
||||
}
|
||||
|
||||
future<time_point> read_type_timestamp(keyspace& dst, sstring type_name) {
|
||||
// TODO: Unfortunately there is not a single REGULAR column in system.schema_usertypes, so annoyingly we cannot
|
||||
// use the writeTime() CQL function, and must resort to a lower level.
|
||||
// Origin digs up the actual cells of target partition and gets timestamp from there.
|
||||
// We should do the same, but g-dam that's messy. Lets give back dung value for now.
|
||||
return make_ready_future<time_point>(dst.timestamp);
|
||||
}
|
||||
|
||||
future<> read_types(keyspace& dst) {
|
||||
auto query = fmt_query("SELECT * FROM {}.{} WHERE keyspace_name = ?", db::system_keyspace::legacy::USERTYPES);
|
||||
return _qp.execute_internal(query, {dst.name}, cql3::query_processor::cache_internal::yes).then([this, &dst](result_set_type result) {
|
||||
return parallel_for_each(*result, [this, &dst](row_type& row) {
|
||||
auto name = row.get_blob_unfragmented("type_name");
|
||||
auto columns = row.get_list<bytes>("field_names");
|
||||
auto types = row.get_list<sstring>("field_types");
|
||||
std::vector<data_type> field_types;
|
||||
for (auto&& value : types) {
|
||||
field_types.emplace_back(db::schema_tables::parse_type(value));
|
||||
}
|
||||
auto ut = user_type_impl::get_instance(dst.name, name, columns, field_types, false);
|
||||
return read_type_timestamp(dst, value_cast<sstring>(utf8_type->deserialize(name))).then([ut = std::move(ut), &dst](time_point timestamp) {
|
||||
dst.types.emplace_back(type{timestamp, ut});
|
||||
});
|
||||
}).finally([result] {});
|
||||
});
|
||||
}
|
||||
|
||||
future<> read_functions(keyspace& dst) {
|
||||
auto query = fmt_query("SELECT * FROM {}.{} WHERE keyspace_name = ?", db::system_keyspace::legacy::FUNCTIONS);
|
||||
return _qp.execute_internal(query, {dst.name}, cql3::query_processor::cache_internal::yes).then([](result_set_type result) {
|
||||
if (!result->empty()) {
|
||||
throw unsupported_feature("functions");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
future<> read_aggregates(keyspace& dst) {
|
||||
auto query = fmt_query("SELECT * FROM {}.{} WHERE keyspace_name = ?", db::system_keyspace::legacy::AGGREGATES);
|
||||
return _qp.execute_internal(query, {dst.name}, cql3::query_processor::cache_internal::yes).then([](result_set_type result) {
|
||||
if (!result->empty()) {
|
||||
throw unsupported_feature("aggregates");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
future<keyspace> read_keyspace(sstring ks_name, bool durable_writes, sstring strategy_class, sstring strategy_options, time_point timestamp) {
|
||||
auto map = rjson::parse_to_map<std::map<sstring, sstring>>(std::string_view(strategy_options));
|
||||
map.emplace("class", std::move(strategy_class));
|
||||
auto ks = ::make_lw_shared<keyspace>(keyspace{timestamp, std::move(ks_name), durable_writes, std::move(map) });
|
||||
|
||||
return read_tables(*ks).then([this, ks] {
|
||||
//Collection<Type> types = readTypes(keyspaceName);
|
||||
return read_types(*ks);
|
||||
}).then([this, ks] {
|
||||
return read_functions(*ks);
|
||||
}).then([this, ks] {
|
||||
return read_aggregates(*ks);
|
||||
}).then([ks] {
|
||||
return make_ready_future<keyspace>(std::move(*ks));
|
||||
});
|
||||
}
|
||||
|
||||
future<> read_all_keyspaces() {
|
||||
static auto ks_filter = [](row_type& row) {
|
||||
auto ks_name = row.get_as<sstring>("keyspace_name");
|
||||
return ks_name != db::system_keyspace::NAME && ks_name != db::schema_tables::v3::NAME;
|
||||
};
|
||||
|
||||
auto query = fmt_query("SELECT keyspace_name, durable_writes, strategy_options, strategy_class, writeTime(durable_writes) AS timestamp FROM {}.{}",
|
||||
db::system_keyspace::legacy::KEYSPACES);
|
||||
|
||||
return _qp.execute_internal(query, cql3::query_processor::cache_internal::yes).then([this](result_set_type result) {
|
||||
auto i = boost::make_filter_iterator(ks_filter, result->begin(), result->end());
|
||||
auto e = boost::make_filter_iterator(ks_filter, result->end(), result->end());
|
||||
return parallel_for_each(i, e, [this](row_type& row) {
|
||||
return read_keyspace(row.get_as<sstring>("keyspace_name")
|
||||
, row.get_as<bool>("durable_writes")
|
||||
, row.get_as<sstring>("strategy_class")
|
||||
, row.get_as<sstring>("strategy_options")
|
||||
, row.get_as<db_clock::time_point>("timestamp")
|
||||
).then([this](keyspace ks) {
|
||||
_keyspaces.emplace_back(std::move(ks));
|
||||
});
|
||||
}).finally([result] {});
|
||||
});
|
||||
}
|
||||
|
||||
future<> drop_legacy_tables() {
|
||||
mlogger.info("Dropping legacy schema tables");
|
||||
auto with_snapshot = !_keyspaces.empty();
|
||||
for (const sstring& cfname : legacy_schema_tables) {
|
||||
co_await replica::database::legacy_drop_table_on_all_shards(_db, _sys_ks, db::system_keyspace::NAME, cfname, with_snapshot);
|
||||
}
|
||||
}
|
||||
|
||||
future<> store_keyspaces_in_new_schema_tables() {
|
||||
mlogger.info("Moving {} keyspaces from legacy schema tables to the new schema keyspace ({})",
|
||||
_keyspaces.size(), db::schema_tables::v3::NAME);
|
||||
|
||||
utils::chunked_vector<mutation> mutations;
|
||||
|
||||
for (auto& ks : _keyspaces) {
|
||||
auto ksm = ::make_lw_shared<keyspace_metadata>(ks.name
|
||||
, ks.replication_params["class"] // TODO, make ksm like c3?
|
||||
, cql3::statements::property_definitions::to_extended_map(ks.replication_params)
|
||||
, std::nullopt
|
||||
, std::nullopt
|
||||
, ks.durable_writes);
|
||||
|
||||
// we want separate time stamps for tables/types, so cannot bulk them into the ksm.
|
||||
for (auto&& m : db::schema_tables::make_create_keyspace_mutations(schema_features::full(), ksm, ks.timestamp.time_since_epoch().count(), false)) {
|
||||
mutations.emplace_back(std::move(m));
|
||||
}
|
||||
for (auto& t : ks.tables) {
|
||||
db::schema_tables::add_table_or_view_to_schema_mutation(t.metadata, t.timestamp.time_since_epoch().count(), true, mutations);
|
||||
}
|
||||
for (auto& t : ks.types) {
|
||||
db::schema_tables::add_type_to_schema_mutation(t.metadata, t.timestamp.time_since_epoch().count(), mutations);
|
||||
}
|
||||
}
|
||||
return _qp.proxy().mutate_locally(std::move(mutations), tracing::trace_state_ptr());
|
||||
}
|
||||
|
||||
future<> flush_schemas() {
|
||||
auto& db = _qp.db().real_database().container();
|
||||
return replica::database::flush_tables_on_all_shards(db, db::schema_tables::all_table_infos(schema_features::full()));
|
||||
}
|
||||
|
||||
future<> migrate() {
|
||||
return read_all_keyspaces().then([this]() {
|
||||
// write metadata to the new schema tables
|
||||
return store_keyspaces_in_new_schema_tables()
|
||||
.then(std::bind(&migrator::flush_schemas, this))
|
||||
.then(std::bind(&migrator::drop_legacy_tables, this))
|
||||
.then([] { mlogger.info("Completed migration of legacy schema tables"); });
|
||||
});
|
||||
}
|
||||
|
||||
sharded<service::storage_proxy>& _sp;
|
||||
sharded<replica::database>& _db;
|
||||
sharded<db::system_keyspace>& _sys_ks;
|
||||
cql3::query_processor& _qp;
|
||||
std::vector<keyspace> _keyspaces;
|
||||
};
|
||||
|
||||
const std::unordered_set<sstring> migrator::legacy_schema_tables = {
|
||||
db::system_keyspace::legacy::KEYSPACES,
|
||||
db::system_keyspace::legacy::COLUMNFAMILIES,
|
||||
db::system_keyspace::legacy::COLUMNS,
|
||||
db::system_keyspace::legacy::TRIGGERS,
|
||||
db::system_keyspace::legacy::USERTYPES,
|
||||
db::system_keyspace::legacy::FUNCTIONS,
|
||||
db::system_keyspace::legacy::AGGREGATES,
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
future<>
|
||||
db::legacy_schema_migrator::migrate(sharded<service::storage_proxy>& sp, sharded<replica::database>& db, sharded<db::system_keyspace>& sys_ks, cql3::query_processor& qp) {
|
||||
return do_with(migrator(sp, db, sys_ks, qp), std::bind(&migrator::migrate, std::placeholders::_1));
|
||||
}
|
||||
|
||||
37
db/legacy_schema_migrator.hh
Normal file
37
db/legacy_schema_migrator.hh
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Modified by ScyllaDB
|
||||
* Copyright (C) 2017-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
|
||||
#include "seastarx.hh"
|
||||
|
||||
namespace replica {
|
||||
class database;
|
||||
}
|
||||
|
||||
namespace cql3 {
|
||||
class query_processor;
|
||||
}
|
||||
|
||||
namespace service {
|
||||
class storage_proxy;
|
||||
}
|
||||
|
||||
namespace db {
|
||||
class system_keyspace;
|
||||
|
||||
namespace legacy_schema_migrator {
|
||||
|
||||
future<> migrate(sharded<service::storage_proxy>&, sharded<replica::database>& db, sharded<db::system_keyspace>& sys_ks, cql3::query_processor&);
|
||||
|
||||
}
|
||||
}
|
||||
@@ -542,7 +542,6 @@ public:
|
||||
// Returns the range tombstone for the key range adjacent to the cursor's position from the side of smaller keys.
|
||||
// Excludes the range for the row itself. That information is returned by range_tombstone_for_row().
|
||||
// It's possible that range_tombstone() is empty and range_tombstone_for_row() is not empty.
|
||||
// Note that this is different from the meaning of rows_entry::range_tombstone(), which includes the row itself.
|
||||
tombstone range_tombstone() const { return _range_tombstone; }
|
||||
|
||||
// Can be called when cursor is pointing at a row.
|
||||
|
||||
@@ -1287,15 +1287,6 @@ row_cache::row_cache(schema_ptr s, snapshot_source src, cache_tracker& tracker,
|
||||
, _partitions(dht::raw_token_less_comparator{})
|
||||
, _underlying(src())
|
||||
, _snapshot_source(std::move(src))
|
||||
, _update_section(abstract_formatter([this] (fmt::context& ctx) {
|
||||
fmt::format_to(ctx.out(), "cache.update {}.{}", _schema->ks_name(), _schema->cf_name());
|
||||
}))
|
||||
, _populate_section(abstract_formatter([this] (fmt::context& ctx) {
|
||||
fmt::format_to(ctx.out(), "cache.populate {}.{}", _schema->ks_name(), _schema->cf_name());
|
||||
}))
|
||||
, _read_section(abstract_formatter([this] (fmt::context& ctx) {
|
||||
fmt::format_to(ctx.out(), "cache.read {}.{}", _schema->ks_name(), _schema->cf_name());
|
||||
}))
|
||||
{
|
||||
try {
|
||||
with_allocator(_tracker.allocator(), [this, cont] {
|
||||
|
||||
@@ -1262,9 +1262,16 @@ static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded
|
||||
{
|
||||
slogger.trace("do_merge_schema: {}", mutations);
|
||||
schema_applier ap(proxy, ss, sys_ks, reload);
|
||||
co_await execute_do_merge_schema(proxy, ap, std::move(mutations)).finally([&ap]() {
|
||||
return ap.destroy();
|
||||
});
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
co_await execute_do_merge_schema(proxy, ap, std::move(mutations));
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await ap.destroy();
|
||||
if (ex) {
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -404,7 +404,10 @@ const std::unordered_set<table_id>& schema_tables_holding_schema_mutations() {
|
||||
computed_columns(),
|
||||
dropped_columns(),
|
||||
indexes(),
|
||||
scylla_tables()}) {
|
||||
scylla_tables(),
|
||||
db::system_keyspace::legacy::column_families(),
|
||||
db::system_keyspace::legacy::columns(),
|
||||
db::system_keyspace::legacy::triggers()}) {
|
||||
SCYLLA_ASSERT(s->clustering_key_size() > 0);
|
||||
auto&& first_column_name = s->clustering_column_at(0).name_as_text();
|
||||
SCYLLA_ASSERT(first_column_name == "table_name"
|
||||
@@ -2837,6 +2840,26 @@ void check_no_legacy_secondary_index_mv_schema(replica::database& db, const view
|
||||
}
|
||||
|
||||
|
||||
namespace legacy {
|
||||
|
||||
table_schema_version schema_mutations::digest() const {
|
||||
md5_hasher h;
|
||||
const db::schema_features no_features;
|
||||
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies, no_features);
|
||||
db::schema_tables::feed_hash_for_schema_digest(h, _columns, no_features);
|
||||
return table_schema_version(utils::UUID_gen::get_name_UUID(h.finalize()));
|
||||
}
|
||||
|
||||
future<schema_mutations> read_table_mutations(sharded<service::storage_proxy>& proxy,
|
||||
sstring keyspace_name, sstring table_name, schema_ptr s)
|
||||
{
|
||||
mutation cf_m = co_await read_schema_partition_for_table(proxy, s, keyspace_name, table_name);
|
||||
mutation col_m = co_await read_schema_partition_for_table(proxy, db::system_keyspace::legacy::columns(), keyspace_name, table_name);
|
||||
co_return schema_mutations{std::move(cf_m), std::move(col_m)};
|
||||
}
|
||||
|
||||
} // namespace legacy
|
||||
|
||||
static auto GET_COLUMN_MAPPING_QUERY = format("SELECT column_name, clustering_order, column_name_bytes, kind, position, type FROM system.{} WHERE cf_id = ? AND schema_version = ?",
|
||||
db::schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY);
|
||||
|
||||
|
||||
@@ -155,6 +155,24 @@ schema_ptr scylla_table_schema_history();
|
||||
const std::unordered_set<table_id>& schema_tables_holding_schema_mutations();
|
||||
}
|
||||
|
||||
namespace legacy {
|
||||
|
||||
class schema_mutations {
|
||||
mutation _columnfamilies;
|
||||
mutation _columns;
|
||||
public:
|
||||
schema_mutations(mutation columnfamilies, mutation columns)
|
||||
: _columnfamilies(std::move(columnfamilies))
|
||||
, _columns(std::move(columns))
|
||||
{ }
|
||||
table_schema_version digest() const;
|
||||
};
|
||||
|
||||
future<schema_mutations> read_table_mutations(sharded<service::storage_proxy>& proxy,
|
||||
sstring keyspace_name, sstring table_name, schema_ptr s);
|
||||
|
||||
}
|
||||
|
||||
struct qualified_name {
|
||||
sstring keyspace_name;
|
||||
sstring table_name;
|
||||
|
||||
@@ -213,30 +213,6 @@ schema_ptr system_keyspace::batchlog() {
|
||||
return batchlog;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::batchlog_v2() {
|
||||
static thread_local auto batchlog_v2 = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, BATCHLOG_V2), NAME, BATCHLOG_V2,
|
||||
// partition key
|
||||
{{"version", int32_type}, {"stage", byte_type}, {"shard", int32_type}},
|
||||
// clustering key
|
||||
{{"written_at", timestamp_type}, {"id", uuid_type}},
|
||||
// regular columns
|
||||
{{"data", bytes_type}},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"batches awaiting replay"
|
||||
);
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.set_caching_options(caching_options::get_disabled_caching_options());
|
||||
builder.with_hash_version();
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
return batchlog_v2;
|
||||
}
|
||||
|
||||
/*static*/ schema_ptr system_keyspace::paxos() {
|
||||
static thread_local auto paxos = [] {
|
||||
// FIXME: switch to the new schema_builder interface (with_column(...), etc)
|
||||
@@ -871,6 +847,8 @@ schema_ptr system_keyspace::corrupt_data() {
|
||||
return corrupt_data;
|
||||
}
|
||||
|
||||
static constexpr auto schema_gc_grace = std::chrono::duration_cast<std::chrono::seconds>(days(7)).count();
|
||||
|
||||
/*static*/ schema_ptr system_keyspace::scylla_local() {
|
||||
static thread_local auto scylla_local = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, SCYLLA_LOCAL), NAME, SCYLLA_LOCAL,
|
||||
@@ -1382,6 +1360,289 @@ schema_ptr system_keyspace::role_permissions() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::hints() {
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, HINTS), NAME, HINTS,
|
||||
// partition key
|
||||
{{"target_id", uuid_type}},
|
||||
// clustering key
|
||||
{{"hint_id", timeuuid_type}, {"message_version", int32_type}},
|
||||
// regular columns
|
||||
{{"mutation", bytes_type}},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"*DEPRECATED* hints awaiting delivery"
|
||||
);
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.set_compaction_strategy(compaction::compaction_strategy_type::incremental);
|
||||
builder.set_compaction_strategy_options({{"enabled", "false"}});
|
||||
builder.with(schema_builder::compact_storage::yes);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::batchlog() {
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, BATCHLOG), NAME, BATCHLOG,
|
||||
// partition key
|
||||
{{"id", uuid_type}},
|
||||
// clustering key
|
||||
{},
|
||||
// regular columns
|
||||
{{"data", bytes_type}, {"version", int32_type}, {"written_at", timestamp_type}},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"*DEPRECATED* batchlog entries"
|
||||
);
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.set_compaction_strategy(compaction::compaction_strategy_type::incremental);
|
||||
builder.set_compaction_strategy_options({{"min_threshold", "2"}});
|
||||
builder.with(schema_builder::compact_storage::no);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::keyspaces() {
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, KEYSPACES), NAME, KEYSPACES,
|
||||
// partition key
|
||||
{{"keyspace_name", utf8_type}},
|
||||
// clustering key
|
||||
{},
|
||||
// regular columns
|
||||
{
|
||||
{"durable_writes", boolean_type},
|
||||
{"strategy_class", utf8_type},
|
||||
{"strategy_options", utf8_type}
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"*DEPRECATED* keyspace definitions"
|
||||
);
|
||||
builder.set_gc_grace_seconds(schema_gc_grace);
|
||||
builder.with(schema_builder::compact_storage::yes);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::column_families() {
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, COLUMNFAMILIES), NAME, COLUMNFAMILIES,
|
||||
// partition key
|
||||
{{"keyspace_name", utf8_type}},
|
||||
// clustering key
|
||||
{{"columnfamily_name", utf8_type}},
|
||||
// regular columns
|
||||
{
|
||||
{"bloom_filter_fp_chance", double_type},
|
||||
{"caching", utf8_type},
|
||||
{"cf_id", uuid_type},
|
||||
{"comment", utf8_type},
|
||||
{"compaction_strategy_class", utf8_type},
|
||||
{"compaction_strategy_options", utf8_type},
|
||||
{"comparator", utf8_type},
|
||||
{"compression_parameters", utf8_type},
|
||||
{"default_time_to_live", int32_type},
|
||||
{"default_validator", utf8_type},
|
||||
{"dropped_columns", map_type_impl::get_instance(utf8_type, long_type, true)},
|
||||
{"gc_grace_seconds", int32_type},
|
||||
{"is_dense", boolean_type},
|
||||
{"key_validator", utf8_type},
|
||||
{"max_compaction_threshold", int32_type},
|
||||
{"max_index_interval", int32_type},
|
||||
{"memtable_flush_period_in_ms", int32_type},
|
||||
{"min_compaction_threshold", int32_type},
|
||||
{"min_index_interval", int32_type},
|
||||
{"speculative_retry", utf8_type},
|
||||
{"subcomparator", utf8_type},
|
||||
{"type", utf8_type},
|
||||
// The following 4 columns are only present up until 2.1.8 tables
|
||||
{"key_aliases", utf8_type},
|
||||
{"value_alias", utf8_type},
|
||||
{"column_aliases", utf8_type},
|
||||
{"index_interval", int32_type},},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"*DEPRECATED* table definitions"
|
||||
);
|
||||
builder.set_gc_grace_seconds(schema_gc_grace);
|
||||
builder.with(schema_builder::compact_storage::no);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::columns() {
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, COLUMNS), NAME, COLUMNS,
|
||||
// partition key
|
||||
{{"keyspace_name", utf8_type}},
|
||||
// clustering key
|
||||
{{"columnfamily_name", utf8_type}, {"column_name", utf8_type}},
|
||||
// regular columns
|
||||
{
|
||||
{"component_index", int32_type},
|
||||
{"index_name", utf8_type},
|
||||
{"index_options", utf8_type},
|
||||
{"index_type", utf8_type},
|
||||
{"type", utf8_type},
|
||||
{"validator", utf8_type},
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"column definitions"
|
||||
);
|
||||
builder.set_gc_grace_seconds(schema_gc_grace);
|
||||
builder.with(schema_builder::compact_storage::no);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::triggers() {
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, TRIGGERS), NAME, TRIGGERS,
|
||||
// partition key
|
||||
{{"keyspace_name", utf8_type}},
|
||||
// clustering key
|
||||
{{"columnfamily_name", utf8_type}, {"trigger_name", utf8_type}},
|
||||
// regular columns
|
||||
{
|
||||
{"trigger_options", map_type_impl::get_instance(utf8_type, utf8_type, true)},
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"trigger definitions"
|
||||
);
|
||||
builder.set_gc_grace_seconds(schema_gc_grace);
|
||||
builder.with(schema_builder::compact_storage::no);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::usertypes() {
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, USERTYPES), NAME, USERTYPES,
|
||||
// partition key
|
||||
{{"keyspace_name", utf8_type}},
|
||||
// clustering key
|
||||
{{"type_name", utf8_type}},
|
||||
// regular columns
|
||||
{
|
||||
{"field_names", list_type_impl::get_instance(utf8_type, true)},
|
||||
{"field_types", list_type_impl::get_instance(utf8_type, true)},
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"user defined type definitions"
|
||||
);
|
||||
builder.set_gc_grace_seconds(schema_gc_grace);
|
||||
builder.with(schema_builder::compact_storage::no);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::functions() {
|
||||
/**
|
||||
* Note: we have our own "legacy" version of this table (in schema_tables),
|
||||
* but it is (afaik) not used, and differs slightly from the origin one.
|
||||
* This is based on the origin schema, since we're more likely to encounter
|
||||
* installations of that to migrate, rather than our own (if we dont use the table).
|
||||
*/
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, FUNCTIONS), NAME, FUNCTIONS,
|
||||
// partition key
|
||||
{{"keyspace_name", utf8_type}},
|
||||
// clustering key
|
||||
{{"function_name", utf8_type},{"signature", list_type_impl::get_instance(utf8_type, false)}},
|
||||
// regular columns
|
||||
{
|
||||
{"argument_names", list_type_impl::get_instance(utf8_type, true)},
|
||||
{"argument_types", list_type_impl::get_instance(utf8_type, true)},
|
||||
{"body", utf8_type},
|
||||
{"language", utf8_type},
|
||||
{"return_type", utf8_type},
|
||||
{"called_on_null_input", boolean_type},
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"*DEPRECATED* user defined type definitions"
|
||||
);
|
||||
builder.set_gc_grace_seconds(schema_gc_grace);
|
||||
builder.with(schema_builder::compact_storage::no);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::legacy::aggregates() {
|
||||
static thread_local auto schema = [] {
|
||||
schema_builder builder(generate_legacy_id(NAME, AGGREGATES), NAME, AGGREGATES,
|
||||
// partition key
|
||||
{{"keyspace_name", utf8_type}},
|
||||
// clustering key
|
||||
{{"aggregate_name", utf8_type},{"signature", list_type_impl::get_instance(utf8_type, false)}},
|
||||
// regular columns
|
||||
{
|
||||
{"argument_types", list_type_impl::get_instance(utf8_type, true)},
|
||||
{"final_func", utf8_type},
|
||||
{"initcond", bytes_type},
|
||||
{"return_type", utf8_type},
|
||||
{"state_func", utf8_type},
|
||||
{"state_type", utf8_type},
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
// regular column name type
|
||||
utf8_type,
|
||||
// comment
|
||||
"*DEPRECATED* user defined aggregate definition"
|
||||
);
|
||||
builder.set_gc_grace_seconds(schema_gc_grace);
|
||||
builder.with(schema_builder::compact_storage::no);
|
||||
builder.with_hash_version();
|
||||
return builder.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::dicts() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(NAME, DICTS);
|
||||
@@ -2328,7 +2589,7 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
|
||||
std::copy(schema_tables.begin(), schema_tables.end(), std::back_inserter(r));
|
||||
auto auth_tables = system_keyspace::auth_tables();
|
||||
std::copy(auth_tables.begin(), auth_tables.end(), std::back_inserter(r));
|
||||
r.insert(r.end(), { built_indexes(), hints(), batchlog(), batchlog_v2(), paxos(), local(),
|
||||
r.insert(r.end(), { built_indexes(), hints(), batchlog(), paxos(), local(),
|
||||
peers(), peer_events(), range_xfers(),
|
||||
compactions_in_progress(), compaction_history(),
|
||||
sstable_activity(), size_estimates(), large_partitions(), large_rows(), large_cells(),
|
||||
@@ -2354,14 +2615,19 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
|
||||
if (cfg.check_experimental(db::experimental_features_t::feature::KEYSPACE_STORAGE_OPTIONS)) {
|
||||
r.insert(r.end(), {sstables_registry()});
|
||||
}
|
||||
// legacy schema
|
||||
r.insert(r.end(), {
|
||||
// TODO: once we migrate hints/batchlog and add converter
|
||||
// legacy::hints(), legacy::batchlog(),
|
||||
legacy::keyspaces(), legacy::column_families(),
|
||||
legacy::columns(), legacy::triggers(), legacy::usertypes(),
|
||||
legacy::functions(), legacy::aggregates(), });
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static bool maybe_write_in_user_memory(schema_ptr s) {
|
||||
return (s.get() == system_keyspace::batchlog().get())
|
||||
|| (s.get() == system_keyspace::batchlog_v2().get())
|
||||
|| (s.get() == system_keyspace::paxos().get())
|
||||
return (s.get() == system_keyspace::batchlog().get()) || (s.get() == system_keyspace::paxos().get())
|
||||
|| s == system_keyspace::v3::scylla_views_builds_in_progress();
|
||||
}
|
||||
|
||||
|
||||
@@ -163,7 +163,6 @@ public:
|
||||
static constexpr auto NAME = "system";
|
||||
static constexpr auto HINTS = "hints";
|
||||
static constexpr auto BATCHLOG = "batchlog";
|
||||
static constexpr auto BATCHLOG_V2 = "batchlog_v2";
|
||||
static constexpr auto PAXOS = "paxos";
|
||||
static constexpr auto BUILT_INDEXES = "IndexInfo";
|
||||
static constexpr auto LOCAL = "local";
|
||||
@@ -242,6 +241,28 @@ public:
|
||||
static schema_ptr cdc_local();
|
||||
};
|
||||
|
||||
struct legacy {
|
||||
static constexpr auto HINTS = "hints";
|
||||
static constexpr auto BATCHLOG = "batchlog";
|
||||
static constexpr auto KEYSPACES = "schema_keyspaces";
|
||||
static constexpr auto COLUMNFAMILIES = "schema_columnfamilies";
|
||||
static constexpr auto COLUMNS = "schema_columns";
|
||||
static constexpr auto TRIGGERS = "schema_triggers";
|
||||
static constexpr auto USERTYPES = "schema_usertypes";
|
||||
static constexpr auto FUNCTIONS = "schema_functions";
|
||||
static constexpr auto AGGREGATES = "schema_aggregates";
|
||||
|
||||
static schema_ptr keyspaces();
|
||||
static schema_ptr column_families();
|
||||
static schema_ptr columns();
|
||||
static schema_ptr triggers();
|
||||
static schema_ptr usertypes();
|
||||
static schema_ptr functions();
|
||||
static schema_ptr aggregates();
|
||||
static schema_ptr hints();
|
||||
static schema_ptr batchlog();
|
||||
};
|
||||
|
||||
// Partition estimates for a given range of tokens.
|
||||
struct range_estimates {
|
||||
schema_ptr schema;
|
||||
@@ -256,7 +277,6 @@ public:
|
||||
|
||||
static schema_ptr hints();
|
||||
static schema_ptr batchlog();
|
||||
static schema_ptr batchlog_v2();
|
||||
static schema_ptr paxos();
|
||||
static schema_ptr built_indexes(); // TODO (from Cassandra): make private
|
||||
static schema_ptr raft();
|
||||
|
||||
429
db/view/view.cc
429
db/view/view.cc
@@ -78,6 +78,7 @@
|
||||
#include "readers/multishard.hh"
|
||||
#include "readers/filtering.hh"
|
||||
#include "delete_ghost_rows_visitor.hh"
|
||||
#include "keys/clustering_interval_set.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "cartesian_product.hh"
|
||||
#include "idl/view.dist.hh"
|
||||
@@ -1658,7 +1659,10 @@ future<query::clustering_row_ranges> calculate_affected_clustering_ranges(data_d
|
||||
const dht::decorated_key& key,
|
||||
const mutation_partition& mp,
|
||||
const std::vector<view_ptr>& views) {
|
||||
// WARNING: interval<clustering_key_prefix_view> is unsafe - refer to scylladb#22817 and scylladb#21604
|
||||
// FIXME: This function should be refactored to use position_range and clustering_interval_set
|
||||
// instead of interval<clustering_key_prefix_view> to avoid issues with intersection and deoverlap.
|
||||
// See scylladb#22817, scylladb#21604, and scylladb#8157 for details.
|
||||
// The current implementation uses unsafe operations that can return incorrect results.
|
||||
utils::chunked_vector<interval<clustering_key_prefix_view>> row_ranges;
|
||||
utils::chunked_vector<interval<clustering_key_prefix_view>> view_row_ranges;
|
||||
clustering_key_prefix_view::tri_compare cmp(base);
|
||||
@@ -1684,7 +1688,10 @@ future<query::clustering_row_ranges> calculate_affected_clustering_ranges(data_d
|
||||
bound_view::to_interval_bound<interval>(rt.start_bound()),
|
||||
bound_view::to_interval_bound<interval>(rt.end_bound()));
|
||||
for (auto&& vr : view_row_ranges) {
|
||||
// WARNING: interval<clustering_key_prefix_view>::intersection can return incorrect results - refer to scylladb#8157 and scylladb#21604
|
||||
// FIXME: interval<clustering_key_prefix_view>::intersection can return incorrect results
|
||||
// (scylladb#8157, scylladb#21604). This should be refactored to use position_range.
|
||||
// Proper fix: Convert to position_range, check overlap using position_range::overlaps(),
|
||||
// compute intersection manually with position_in_partition comparisons.
|
||||
auto overlap = rtr.intersection(vr, cmp);
|
||||
if (overlap) {
|
||||
row_ranges.push_back(std::move(overlap).value());
|
||||
@@ -1708,15 +1715,18 @@ future<query::clustering_row_ranges> calculate_affected_clustering_ranges(data_d
|
||||
// content, in case the view includes a column that is not included in
|
||||
// this mutation.
|
||||
|
||||
query::clustering_row_ranges result_ranges;
|
||||
// FIXME: scylladb#22817 - interval<clustering_key_prefix_view>::deoverlap can return incorrect results
|
||||
auto deoverlapped_ranges = interval<clustering_key_prefix_view>::deoverlap(std::move(row_ranges), cmp);
|
||||
result_ranges.reserve(deoverlapped_ranges.size());
|
||||
for (auto&& r : deoverlapped_ranges) {
|
||||
result_ranges.emplace_back(std::move(r).transform([] (auto&& ckv) { return clustering_key_prefix(ckv); }));
|
||||
co_await coroutine::maybe_yield();
|
||||
// FIXME: interval<clustering_key_prefix_view>::deoverlap can return incorrect results (scylladb#22817)
|
||||
// Proper fix: Convert row_ranges to clustering_row_ranges, then use clustering_interval_set
|
||||
// which handles deoverlapping correctly via position_range internally.
|
||||
query::clustering_row_ranges temp_ranges;
|
||||
temp_ranges.reserve(row_ranges.size());
|
||||
for (auto&& r : row_ranges) {
|
||||
temp_ranges.emplace_back(std::move(r).transform([] (auto&& ckv) { return clustering_key_prefix(ckv); }));
|
||||
}
|
||||
co_return result_ranges;
|
||||
|
||||
// Use clustering_interval_set for correct deoverlapping (fixes scylladb#22817)
|
||||
clustering_interval_set interval_set(base, temp_ranges);
|
||||
co_return interval_set.to_clustering_row_ranges();
|
||||
}
|
||||
|
||||
bool needs_static_row(const mutation_partition& mp, const std::vector<view_ptr>& views) {
|
||||
@@ -1744,115 +1754,6 @@ bool should_generate_view_updates_on_this_shard(const schema_ptr& base, const lo
|
||||
&& std::ranges::contains(shards, this_shard_id());
|
||||
}
|
||||
|
||||
static endpoints_to_update get_view_natural_endpoint_vnodes(
|
||||
locator::host_id me,
|
||||
std::vector<std::reference_wrapper<const locator::node>> base_nodes,
|
||||
std::vector<std::reference_wrapper<const locator::node>> view_nodes,
|
||||
locator::endpoint_dc_rack my_location,
|
||||
const locator::network_topology_strategy* network_topology,
|
||||
replica::cf_stats& cf_stats) {
|
||||
using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
|
||||
node_vector base_endpoints, view_endpoints;
|
||||
auto& my_datacenter = my_location.dc;
|
||||
|
||||
auto process_candidate = [&] (node_vector& nodes, std::reference_wrapper<const locator::node> node) {
|
||||
if (!network_topology || node.get().dc() == my_datacenter) {
|
||||
nodes.emplace_back(node);
|
||||
}
|
||||
};
|
||||
|
||||
for (auto&& base_node : base_nodes) {
|
||||
process_candidate(base_endpoints, base_node);
|
||||
}
|
||||
|
||||
for (auto&& view_node : view_nodes) {
|
||||
auto it = std::ranges::find(base_endpoints, view_node.get().host_id(), std::mem_fn(&locator::node::host_id));
|
||||
// If this base replica is also one of the view replicas, we use
|
||||
// ourselves as the view replica.
|
||||
// We don't return an extra endpoint, as it's only needed when
|
||||
// using tablets (so !use_legacy_self_pairing)
|
||||
if (view_node.get().host_id() == me && it != base_endpoints.end()) {
|
||||
return {.natural_endpoint = me};
|
||||
}
|
||||
|
||||
// We have to remove any endpoint which is shared between the base
|
||||
// and the view, as it will select itself and throw off the counts
|
||||
// otherwise.
|
||||
if (it != base_endpoints.end()) {
|
||||
base_endpoints.erase(it);
|
||||
} else if (!network_topology || view_node.get().dc() == my_datacenter) {
|
||||
view_endpoints.push_back(view_node);
|
||||
}
|
||||
}
|
||||
|
||||
auto base_it = std::ranges::find(base_endpoints, me, std::mem_fn(&locator::node::host_id));
|
||||
if (base_it == base_endpoints.end()) {
|
||||
// This node is not a base replica of this key, so we return empty
|
||||
// FIXME: This case shouldn't happen, and if it happens, a view update
|
||||
// would be lost.
|
||||
++cf_stats.total_view_updates_on_wrong_node;
|
||||
vlogger.warn("Could not find {} in base_endpoints={}", me,
|
||||
base_endpoints | std::views::transform(std::mem_fn(&locator::node::host_id)));
|
||||
return {};
|
||||
}
|
||||
size_t idx = base_it - base_endpoints.begin();
|
||||
return {.natural_endpoint = view_endpoints[idx].get().host_id()};
|
||||
}
|
||||
|
||||
static std::optional<locator::host_id> get_unpaired_view_endpoint(
|
||||
std::vector<std::reference_wrapper<const locator::node>> base_nodes,
|
||||
std::vector<std::reference_wrapper<const locator::node>> view_nodes,
|
||||
replica::cf_stats& cf_stats) {
|
||||
std::unordered_set<locator::endpoint_dc_rack> base_dc_racks;
|
||||
for (auto&& base_node : base_nodes) {
|
||||
if (base_dc_racks.contains(base_node.get().dc_rack())) {
|
||||
// We can't do rack-aware pairing if there are multiple replicas in the same rack.
|
||||
++cf_stats.total_view_updates_failed_pairing;
|
||||
vlogger.warn("Can't perform base-view pairing in this topology. There are multiple base table replicas in the same dc/rack({}/{}):",
|
||||
base_node.get().dc(), base_node.get().rack());
|
||||
return std::nullopt;
|
||||
}
|
||||
base_dc_racks.insert(base_node.get().dc_rack());
|
||||
}
|
||||
|
||||
std::unordered_set<locator::endpoint_dc_rack> paired_view_dc_racks;
|
||||
std::unordered_map<locator::endpoint_dc_rack, locator::host_id> unpaired_view_dc_rack_replicas;
|
||||
for (auto&& view_node : view_nodes) {
|
||||
if (paired_view_dc_racks.contains(view_node.get().dc_rack()) || unpaired_view_dc_rack_replicas.contains(view_node.get().dc_rack())) {
|
||||
// We can't do rack-aware pairing if there are multiple replicas in the same rack.
|
||||
++cf_stats.total_view_updates_failed_pairing;
|
||||
vlogger.warn("Can't perform base-view pairing in this topology. There are multiple view table replicas in the same dc/rack({}/{}):",
|
||||
view_node.get().dc(), view_node.get().rack());
|
||||
return std::nullopt;
|
||||
}
|
||||
// Track unpaired replicas in both sets
|
||||
if (base_dc_racks.contains(view_node.get().dc_rack())) {
|
||||
paired_view_dc_racks.insert(view_node.get().dc_rack());
|
||||
} else {
|
||||
unpaired_view_dc_rack_replicas.insert({view_node.get().dc_rack(), view_node.get().host_id()});
|
||||
}
|
||||
}
|
||||
|
||||
if (unpaired_view_dc_rack_replicas.size() > 0) {
|
||||
// There are view replicas that can't be paired with any base replica
|
||||
// This can happen as a result of an RF change when the view replica finishes streaming
|
||||
// before the base replica.
|
||||
// Because of this, a view replica might not get paired with any base replica, so we need
|
||||
// to send an additional update to it.
|
||||
++cf_stats.total_view_updates_due_to_replica_count_mismatch;
|
||||
auto extra_replica = unpaired_view_dc_rack_replicas.begin()->second;
|
||||
unpaired_view_dc_rack_replicas.erase(unpaired_view_dc_rack_replicas.begin());
|
||||
if (unpaired_view_dc_rack_replicas.size() > 0) {
|
||||
// We only expect one extra replica to appear due to an RF change. If there's more, that's an error,
|
||||
// but we'll still perform updates to the paired and last replicas to minimize degradation.
|
||||
vlogger.warn("There are too many view endpoints for base-view pairing. View updates may get lost on view_endpoints={}",
|
||||
unpaired_view_dc_rack_replicas | std::views::values);
|
||||
}
|
||||
return extra_replica;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Calculate the node ("natural endpoint") to which this node should send
|
||||
// a view update.
|
||||
//
|
||||
@@ -1865,19 +1766,29 @@ static std::optional<locator::host_id> get_unpaired_view_endpoint(
|
||||
// of this function is to find, assuming that this node is one of the base
|
||||
// replicas for a given partition, the paired view replica.
|
||||
//
|
||||
// When using vnodes, we have an optimization called "self-pairing" - if a single
|
||||
// node is both a base replica and a view replica for a write, the pairing is
|
||||
// modified so that this node sends the update to itself and this node is removed
|
||||
// from the lists of nodes paired by index. This self-pairing optimization can
|
||||
// cause the pairing to change after view ranges are moved between nodes.
|
||||
// In the past, we used an optimization called "self-pairing" that if a single
|
||||
// node was both a base replica and a view replica for a write, the pairing is
|
||||
// modified so that this node would send the update to itself. This self-
|
||||
// pairing optimization could cause the pairing to change after view ranges
|
||||
// are moved between nodes, so currently we only use it if
|
||||
// use_legacy_self_pairing is set to true. When using tablets - where range
|
||||
// movements are common - it is strongly recommended to set it to false.
|
||||
//
|
||||
// If the keyspace's replication strategy is a NetworkTopologyStrategy,
|
||||
// we pair only nodes in the same datacenter.
|
||||
//
|
||||
// If the table uses tablets, then pairing is rack-aware. In this case, in each
|
||||
// rack where we have a base replica there is also one replica of each view tablet.
|
||||
// Therefore, the base replicas are naturally paired with the view replicas that
|
||||
// are in the same rack.
|
||||
// When use_legacy_self_pairing is enabled, if one of the base replicas
|
||||
// also happens to be a view replica, it is paired with itself
|
||||
// (with the other nodes paired by order in the list
|
||||
// after taking this node out).
|
||||
//
|
||||
// If the table uses tablets and the replication strategy is NetworkTopologyStrategy
|
||||
// and the replication factor in the node's datacenter is a multiple of the number
|
||||
// of racks in the datacenter, then pairing is rack-aware. In this case,
|
||||
// all racks have the same number of replicas, and those are never migrated
|
||||
// outside their racks. Therefore, the base replicas are naturally paired with the
|
||||
// view replicas that are in the same rack, based on the ordinal position.
|
||||
// Note that typically, there is a single replica per rack and pairing is trivial.
|
||||
//
|
||||
// If the assumption that the given base token belongs to this replica
|
||||
// does not hold, we return an empty optional.
|
||||
@@ -1905,12 +1816,19 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
const locator::abstract_replication_strategy& replication_strategy,
|
||||
const dht::token& base_token,
|
||||
const dht::token& view_token,
|
||||
bool use_tablets,
|
||||
bool use_legacy_self_pairing,
|
||||
bool use_tablets_rack_aware_view_pairing,
|
||||
replica::cf_stats& cf_stats) {
|
||||
auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
|
||||
auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
|
||||
auto& my_location = topology.get_location(me);
|
||||
auto& my_datacenter = my_location.dc;
|
||||
auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);
|
||||
auto rack_aware_pairing = use_tablets_rack_aware_view_pairing && network_topology;
|
||||
bool simple_rack_aware_pairing = false;
|
||||
using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
|
||||
node_vector orig_base_endpoints, orig_view_endpoints;
|
||||
node_vector base_endpoints, view_endpoints;
|
||||
|
||||
auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
|
||||
if (auto* np = topology.find_node(ep)) {
|
||||
@@ -1921,7 +1839,6 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
|
||||
// We need to use get_replicas() for pairing to be stable in case base or view tablet
|
||||
// is rebuilding a replica which has left the ring. get_natural_endpoints() filters such replicas.
|
||||
using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
|
||||
auto base_nodes = base_erm->get_replicas(base_token) | std::views::transform([&] (const locator::host_id& ep) -> const locator::node& {
|
||||
return resolve(topology, ep, false);
|
||||
}) | std::ranges::to<node_vector>();
|
||||
@@ -1945,43 +1862,231 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
// note that the recursive call will not recurse again because leaving_base is in base_nodes.
|
||||
auto leaving_base = it->get().host_id();
|
||||
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
|
||||
view_token, use_tablets, cf_stats);
|
||||
view_token, use_legacy_self_pairing, use_tablets_rack_aware_view_pairing, cf_stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!use_tablets) {
|
||||
return get_view_natural_endpoint_vnodes(
|
||||
me,
|
||||
base_nodes,
|
||||
view_nodes,
|
||||
my_location,
|
||||
network_topology,
|
||||
cf_stats);
|
||||
std::function<bool(const locator::node&)> is_candidate;
|
||||
if (network_topology) {
|
||||
is_candidate = [&] (const locator::node& node) { return node.dc() == my_datacenter; };
|
||||
} else {
|
||||
is_candidate = [&] (const locator::node&) { return true; };
|
||||
}
|
||||
auto process_candidate = [&] (node_vector& nodes, std::reference_wrapper<const locator::node> node) {
|
||||
if (is_candidate(node)) {
|
||||
nodes.emplace_back(node);
|
||||
}
|
||||
};
|
||||
|
||||
for (auto&& base_node : base_nodes) {
|
||||
process_candidate(base_endpoints, base_node);
|
||||
}
|
||||
|
||||
std::optional<locator::host_id> paired_replica;
|
||||
for (auto&& view_node : view_nodes) {
|
||||
if (view_node.get().dc_rack() == my_location) {
|
||||
paired_replica = view_node.get().host_id();
|
||||
break;
|
||||
if (use_legacy_self_pairing) {
|
||||
for (auto&& view_node : view_nodes) {
|
||||
auto it = std::ranges::find(base_endpoints, view_node.get().host_id(), std::mem_fn(&locator::node::host_id));
|
||||
// If this base replica is also one of the view replicas, we use
|
||||
// ourselves as the view replica.
|
||||
// We don't return an extra endpoint, as it's only needed when
|
||||
// using tablets (so !use_legacy_self_pairing)
|
||||
if (view_node.get().host_id() == me && it != base_endpoints.end()) {
|
||||
return {.natural_endpoint = me};
|
||||
}
|
||||
|
||||
// We have to remove any endpoint which is shared between the base
|
||||
// and the view, as it will select itself and throw off the counts
|
||||
// otherwise.
|
||||
if (it != base_endpoints.end()) {
|
||||
base_endpoints.erase(it);
|
||||
} else if (is_candidate(view_node)) {
|
||||
view_endpoints.push_back(view_node);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto&& view_node : view_nodes) {
|
||||
process_candidate(view_endpoints, view_node);
|
||||
}
|
||||
}
|
||||
if (paired_replica && base_nodes.size() == view_nodes.size()) {
|
||||
// We don't need to find any extra replicas, so we can return early
|
||||
return {.natural_endpoint = paired_replica};
|
||||
|
||||
// Try optimizing for simple rack-aware pairing
|
||||
// If the numbers of base and view replica differ, that means an RF change is taking place
|
||||
// and we can't use simple rack-aware pairing.
|
||||
if (rack_aware_pairing && base_endpoints.size() == view_endpoints.size()) {
|
||||
auto dc_rf = network_topology->get_replication_factor(my_datacenter);
|
||||
const auto& racks = topology.get_datacenter_rack_nodes().at(my_datacenter);
|
||||
// Simple rack-aware pairing is possible when the datacenter replication factor
|
||||
// is a multiple of the number of racks in the datacenter.
|
||||
if (dc_rf % racks.size() == 0) {
|
||||
simple_rack_aware_pairing = true;
|
||||
size_t rack_rf = dc_rf / racks.size();
|
||||
// If any rack doesn't have enough nodes to satisfy the per-rack rf
|
||||
// simple rack-aware pairing is disabled.
|
||||
for (const auto& [rack, nodes] : racks) {
|
||||
if (nodes.size() < rack_rf) {
|
||||
simple_rack_aware_pairing = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (dc_rf != base_endpoints.size()) {
|
||||
// If the datacenter replication factor is not equal to the number of base replicas,
|
||||
// we're in progress of a RF change and we can't use simple rack-aware pairing.
|
||||
simple_rack_aware_pairing = false;
|
||||
}
|
||||
if (simple_rack_aware_pairing) {
|
||||
std::erase_if(base_endpoints, [&] (const locator::node& node) { return node.dc_rack() != my_location; });
|
||||
std::erase_if(view_endpoints, [&] (const locator::node& node) { return node.dc_rack() != my_location; });
|
||||
}
|
||||
}
|
||||
if (!paired_replica) {
|
||||
// We couldn't find any view replica in our rack
|
||||
|
||||
orig_base_endpoints = base_endpoints;
|
||||
orig_view_endpoints = view_endpoints;
|
||||
|
||||
// For the complex rack_aware_pairing case, nodes are already filtered by datacenter
|
||||
// Use best-match, for the minimum number of base and view replicas in each rack,
|
||||
// and ordinal match for the rest.
|
||||
std::optional<std::reference_wrapper<const locator::node>> paired_replica;
|
||||
if (rack_aware_pairing && !simple_rack_aware_pairing) {
|
||||
struct indexed_replica {
|
||||
size_t idx;
|
||||
std::reference_wrapper<const locator::node> node;
|
||||
};
|
||||
std::unordered_map<sstring, std::vector<indexed_replica>> base_racks, view_racks;
|
||||
|
||||
// First, index all replicas by rack
|
||||
auto index_replica_set = [] (std::unordered_map<sstring, std::vector<indexed_replica>>& racks, const node_vector& replicas) {
|
||||
size_t idx = 0;
|
||||
for (const auto& r: replicas) {
|
||||
racks[r.get().rack()].emplace_back(idx++, r);
|
||||
}
|
||||
};
|
||||
index_replica_set(base_racks, base_endpoints);
|
||||
index_replica_set(view_racks, view_endpoints);
|
||||
|
||||
// Try optimistically pairing `me` first
|
||||
const auto& my_base_replicas = base_racks[my_location.rack];
|
||||
auto base_it = std::ranges::find(my_base_replicas, me, [] (const indexed_replica& ir) { return ir.node.get().host_id(); });
|
||||
if (base_it == my_base_replicas.end()) {
|
||||
return {};
|
||||
}
|
||||
const auto& my_view_replicas = view_racks[my_location.rack];
|
||||
size_t idx = base_it - my_base_replicas.begin();
|
||||
if (idx < my_view_replicas.size()) {
|
||||
if (orig_view_endpoints.size() <= orig_base_endpoints.size()) {
|
||||
return {.natural_endpoint = my_view_replicas[idx].node.get().host_id()};
|
||||
} else {
|
||||
// If the number of view replicas is larger than the number of base replicas,
|
||||
// we need to find the unpaired view replica, so we can't return yet.
|
||||
paired_replica = my_view_replicas[idx].node;
|
||||
}
|
||||
}
|
||||
|
||||
// Collect all unpaired base and view replicas,
|
||||
// where the number of replicas in the base rack is different than the respective view rack
|
||||
std::vector<indexed_replica> unpaired_base_replicas, unpaired_view_replicas;
|
||||
for (const auto& [rack, base_replicas] : base_racks) {
|
||||
const auto& view_replicas = view_racks[rack];
|
||||
for (auto i = view_replicas.size(); i < base_replicas.size(); ++i) {
|
||||
unpaired_base_replicas.emplace_back(base_replicas[i]);
|
||||
}
|
||||
}
|
||||
for (const auto& [rack, view_replicas] : view_racks) {
|
||||
const auto& base_replicas = base_racks[rack];
|
||||
for (auto i = base_replicas.size(); i < view_replicas.size(); ++i) {
|
||||
unpaired_view_replicas.emplace_back(view_replicas[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by the original ordinality, and copy the sorted results
|
||||
// back into {base,view}_endpoints, for backward compatible processing below.
|
||||
std::ranges::sort(unpaired_base_replicas, std::less(), std::mem_fn(&indexed_replica::idx));
|
||||
base_endpoints.clear();
|
||||
std::ranges::transform(unpaired_base_replicas, std::back_inserter(base_endpoints), std::mem_fn(&indexed_replica::node));
|
||||
|
||||
std::ranges::sort(unpaired_view_replicas, std::less(), std::mem_fn(&indexed_replica::idx));
|
||||
view_endpoints.clear();
|
||||
std::ranges::transform(unpaired_view_replicas, std::back_inserter(view_endpoints), std::mem_fn(&indexed_replica::node));
|
||||
}
|
||||
|
||||
auto base_it = std::ranges::find(base_endpoints, me, std::mem_fn(&locator::node::host_id));
|
||||
if (!paired_replica && base_it == base_endpoints.end()) {
|
||||
// This node is not a base replica of this key, so we return empty
|
||||
// FIXME: This case shouldn't happen, and if it happens, a view update
|
||||
// would be lost.
|
||||
++cf_stats.total_view_updates_on_wrong_node;
|
||||
vlogger.warn("Could not find {} in base_endpoints={}", me,
|
||||
orig_base_endpoints | std::views::transform(std::mem_fn(&locator::node::host_id)));
|
||||
return {};
|
||||
}
|
||||
size_t idx = base_it - base_endpoints.begin();
|
||||
std::optional<std::reference_wrapper<const locator::node>> no_pairing_replica;
|
||||
if (!paired_replica && idx >= view_endpoints.size()) {
|
||||
// There are fewer view replicas than base replicas
|
||||
// FIXME: This might still happen when reducing replication factor with tablets,
|
||||
// see https://github.com/scylladb/scylladb/issues/21492
|
||||
++cf_stats.total_view_updates_failed_pairing;
|
||||
vlogger.warn("Could not find a view replica in the same rack as base replica {} for base_endpoints={} view_endpoints={}",
|
||||
me,
|
||||
base_nodes | std::views::transform(std::mem_fn(&locator::node::host_id)),
|
||||
view_nodes | std::views::transform(std::mem_fn(&locator::node::host_id)));
|
||||
vlogger.warn("Could not pair {}: rack_aware={} base_endpoints={} view_endpoints={}", me,
|
||||
rack_aware_pairing ? (simple_rack_aware_pairing ? "simple" : "complex") : "none",
|
||||
orig_base_endpoints | std::views::transform(std::mem_fn(&locator::node::host_id)),
|
||||
orig_view_endpoints | std::views::transform(std::mem_fn(&locator::node::host_id)));
|
||||
return {};
|
||||
} else if (base_endpoints.size() < view_endpoints.size()) {
|
||||
// There are fewer base replicas than view replicas.
|
||||
// This can happen as a result of an RF change when the view replica finishes streaming
|
||||
// before the base replica.
|
||||
// Because of this, a view replica might not get paired with any base replica, so we need
|
||||
// to send an additional update to it.
|
||||
++cf_stats.total_view_updates_due_to_replica_count_mismatch;
|
||||
no_pairing_replica = view_endpoints.back();
|
||||
if (base_endpoints.size() < view_endpoints.size() - 1) {
|
||||
// We only expect one extra replica to appear due to an RF change. If there's more, that's an error,
|
||||
// but we'll still perform updates to the paired and last replicas to minimize degradation.
|
||||
vlogger.warn("There are too many view endpoints for base-view pairing. View updates may get lost on view_endpoints={}",
|
||||
std::span(view_endpoints.begin() + base_endpoints.size(), view_endpoints.end() - 1) | std::views::transform(std::mem_fn(&locator::node::host_id)));
|
||||
}
|
||||
}
|
||||
std::optional<locator::host_id> no_pairing_replica = get_unpaired_view_endpoint(base_nodes, view_nodes, cf_stats);
|
||||
return {.natural_endpoint = paired_replica,
|
||||
.endpoint_with_no_pairing = no_pairing_replica};
|
||||
|
||||
if (!paired_replica) {
|
||||
paired_replica = view_endpoints[idx];
|
||||
}
|
||||
if (!no_pairing_replica && base_nodes.size() < view_nodes.size()) {
|
||||
// This can happen when the view replica with no pairing is in another DC.
|
||||
// We need to send an update to it if there are no base replicas in that DC yet,
|
||||
// as it won't receive updates otherwise.
|
||||
std::unordered_set<sstring> dcs_with_base_replicas;
|
||||
for (const auto& base_node : base_nodes) {
|
||||
dcs_with_base_replicas.insert(base_node.get().dc());
|
||||
}
|
||||
for (const auto& view_node : view_nodes) {
|
||||
if (!dcs_with_base_replicas.contains(view_node.get().dc())) {
|
||||
++cf_stats.total_view_updates_due_to_replica_count_mismatch;
|
||||
no_pairing_replica = view_node;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// https://github.com/scylladb/scylladb/issues/19439
|
||||
// With tablets, a node being replaced might transition to "left" state
|
||||
// but still be kept as a replica.
|
||||
// As of writing this hints are not prepared to handle nodes that are left
|
||||
// but are still replicas. Therefore, there is no other sensible option
|
||||
// right now but to give up attempt to send the update or write a hint
|
||||
// to the paired, permanently down replica.
|
||||
// We use the same workaround for the extra replica.
|
||||
auto return_host_id_if_not_left = [] (const auto& replica) -> std::optional<locator::host_id> {
|
||||
if (!replica) {
|
||||
return std::nullopt;
|
||||
}
|
||||
const auto& node = replica->get();
|
||||
if (!node.left()) {
|
||||
return node.host_id();
|
||||
} else {
|
||||
return std::nullopt;
|
||||
}
|
||||
};
|
||||
return {.natural_endpoint = return_host_id_if_not_left(paired_replica),
|
||||
.endpoint_with_no_pairing = return_host_id_if_not_left(no_pairing_replica)};
|
||||
}
|
||||
|
||||
static future<> apply_to_remote_endpoints(service::storage_proxy& proxy, locator::effective_replication_map_ptr ermp,
|
||||
@@ -2041,6 +2146,12 @@ future<> view_update_generator::mutate_MV(
|
||||
{
|
||||
auto& ks = _db.find_keyspace(base->ks_name());
|
||||
auto& replication = ks.get_replication_strategy();
|
||||
// We set legacy self-pairing for old vnode-based tables (for backward
|
||||
// compatibility), and unset it for tablets - where range movements
|
||||
// are more frequent and backward compatibility is less important.
|
||||
// TODO: Maybe allow users to set use_legacy_self_pairing explicitly
|
||||
// on a view, like we have the synchronous_updates_flag.
|
||||
bool use_legacy_self_pairing = !ks.uses_tablets();
|
||||
std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
|
||||
auto get_erm = [&] (table_id id) {
|
||||
auto it = erms.find(id);
|
||||
@@ -2053,6 +2164,10 @@ future<> view_update_generator::mutate_MV(
|
||||
for (const auto& mut : view_updates) {
|
||||
(void)get_erm(mut.s->id());
|
||||
}
|
||||
// Enable rack-aware view updates pairing for tablets
|
||||
// when the cluster feature is enabled so that all replicas agree
|
||||
// on the pairing algorithm.
|
||||
bool use_tablets_rack_aware_view_pairing = _db.features().tablet_rack_aware_view_pairing && ks.uses_tablets();
|
||||
auto me = base_ermp->get_topology().my_host_id();
|
||||
static constexpr size_t max_concurrent_updates = 128;
|
||||
co_await utils::get_local_injector().inject("delay_before_get_view_natural_endpoint", 8000ms);
|
||||
@@ -2060,7 +2175,7 @@ future<> view_update_generator::mutate_MV(
|
||||
auto view_token = dht::get_token(*mut.s, mut.fm.key());
|
||||
auto view_ermp = erms.at(mut.s->id());
|
||||
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
|
||||
ks.uses_tablets(), cf_stats);
|
||||
use_legacy_self_pairing, use_tablets_rack_aware_view_pairing, cf_stats);
|
||||
auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
|
||||
auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
|
||||
if (no_pairing_endpoint) {
|
||||
|
||||
@@ -305,7 +305,8 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
const locator::abstract_replication_strategy& replication_strategy,
|
||||
const dht::token& base_token,
|
||||
const dht::token& view_token,
|
||||
bool use_tablets,
|
||||
bool use_legacy_self_pairing,
|
||||
bool use_tablets_basic_rack_aware_view_pairing,
|
||||
replica::cf_stats& cf_stats);
|
||||
|
||||
/// Verify that the provided keyspace is eligible for storing materialized views.
|
||||
|
||||
2
dist/common/sysconfig/scylla-node-exporter
vendored
2
dist/common/sysconfig/scylla-node-exporter
vendored
@@ -1 +1 @@
|
||||
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
|
||||
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
|
||||
|
||||
@@ -71,7 +71,7 @@ Use "Bash on Ubuntu on Windows" for the same tools and capabilities as on Linux
|
||||
|
||||
### Building the Docs
|
||||
|
||||
1. Run `make preview` in the `docs/` directory to build the documentation.
|
||||
1. Run `make preview` to build the documentation.
|
||||
1. Preview the built documentation locally at http://127.0.0.1:5500/.
|
||||
|
||||
### Cleanup
|
||||
|
||||
@@ -41,8 +41,6 @@ class MetricsProcessor:
|
||||
# Get metrics from the file
|
||||
try:
|
||||
metrics_file = metrics.get_metrics_from_file(relative_path, "scylla_", metrics_info, strict=strict)
|
||||
except SystemExit:
|
||||
pass
|
||||
finally:
|
||||
os.chdir(old_cwd)
|
||||
if metrics_file:
|
||||
|
||||
156
docs/dev/clustering-range-to-position-range-migration.md
Normal file
156
docs/dev/clustering-range-to-position-range-migration.md
Normal file
@@ -0,0 +1,156 @@
|
||||
# Clustering Range to Position Range Migration
|
||||
|
||||
## Background
|
||||
|
||||
The `clustering_range` type (alias for `interval<clustering_key_prefix>`) has known issues with operations like `intersection()` and `deoverlap()` that can return incorrect results due to the complexity of comparing clustering key prefixes with different inclusiveness on bounds.
|
||||
|
||||
See issues:
|
||||
- #22817 - `interval<clustering_key_prefix>::deoverlap` can return incorrect results
|
||||
- #21604 - Problems with clustering range operations
|
||||
- #8157 - `interval<clustering_key_prefix_view>::intersection` can return incorrect results
|
||||
|
||||
The `position_range` class was introduced as a safer alternative that represents clustering ranges as a pair of `position_in_partition` objects, avoiding the problematic interval semantics.
|
||||
|
||||
## Migration Strategy
|
||||
|
||||
### 1. Feature Flag
|
||||
|
||||
A new `gms::feature` called `"POSITION_RANGE"` has been added to `gms/feature_service.hh`. This feature gates the use of position_range in RPC interfaces to ensure backward compatibility during rolling upgrades.
|
||||
|
||||
### 2. IDL Support
|
||||
|
||||
The `position_range` class has been added to `idl/position_in_partition.idl.hh` to support serialization in RPC verbs.
|
||||
|
||||
### 3. Internal Code Migration
|
||||
|
||||
Internal code should be migrated to use `position_range` instead of `clustering_range` wherever possible. This migration should be done incrementally:
|
||||
|
||||
#### Priority Areas
|
||||
|
||||
1. **Functions with known problematic operations**:
|
||||
- Any code using `clustering_range::intersection()`
|
||||
- Any code using `clustering_range::deoverlap()`
|
||||
- See marked locations in:
|
||||
- `db/view/view.cc` (lines 1687-1713)
|
||||
- `cql3/statements/cas_request.cc` (line 90-91)
|
||||
|
||||
2. **Internal data structures**:
|
||||
- Readers and iterators that track position ranges
|
||||
- Cache structures
|
||||
- Query processing internals
|
||||
|
||||
3. **Utility functions**:
|
||||
- Helper functions that operate on ranges
|
||||
- Range manipulation and transformation functions
|
||||
|
||||
#### Conversion Utilities
|
||||
|
||||
Existing converters:
|
||||
- `position_range::from_range(const query::clustering_range&)` - Convert clustering_range to position_range
|
||||
- `position_range_to_clustering_range(const position_range&, const schema&)` - Convert position_range to clustering_range (returns optional)
|
||||
|
||||
The `clustering_interval_set` class already demonstrates best practices - it uses `position_range` internally and provides conversion methods to/from `clustering_row_ranges`.
|
||||
|
||||
Helper utilities in `query/position_range_utils.hh`:
|
||||
- `clustering_row_ranges_to_position_ranges()` - Batch convert clustering ranges to position ranges
|
||||
- `position_ranges_to_clustering_row_ranges()` - Batch convert position ranges to clustering ranges
|
||||
- `deoverlap_clustering_row_ranges()` - Safely deoverlap ranges using clustering_interval_set
|
||||
- `intersect_clustering_row_ranges()` - Safely intersect ranges using clustering_interval_set
|
||||
|
||||
#### Migration Pattern
|
||||
|
||||
```cpp
|
||||
// OLD CODE (problematic):
|
||||
void process_ranges(const query::clustering_row_ranges& ranges) {
|
||||
auto deoverlapped = query::clustering_range::deoverlap(ranges, cmp);
|
||||
// ... use deoverlapped ranges
|
||||
}
|
||||
|
||||
// NEW CODE (using position_range):
|
||||
void process_ranges(const schema& s, const query::clustering_row_ranges& ranges) {
|
||||
clustering_interval_set interval_set(s, ranges);
|
||||
// interval_set handles deoverlapping correctly internally
|
||||
for (const position_range& r : interval_set) {
|
||||
// ... use position ranges
|
||||
}
|
||||
// Convert back if needed for compatibility
|
||||
auto result_ranges = interval_set.to_clustering_row_ranges();
|
||||
}
|
||||
```
|
||||
|
||||
### 4. RPC Interface Migration
|
||||
|
||||
RPC interfaces must maintain backward compatibility. The strategy is:
|
||||
|
||||
1. Keep existing RPC verbs that use `clustering_range` (in IDL: `std::vector<interval<clustering_key_prefix>>`)
|
||||
2. Add new RPC verbs that use `position_range`
|
||||
3. Use the new verbs when `feature_service.position_range` is enabled
|
||||
|
||||
#### Example RPC Migration
|
||||
|
||||
In `idl/storage_proxy.idl.hh`:
|
||||
|
||||
```cpp
|
||||
// Existing verb (keep for compatibility)
|
||||
verb [[with_client_info, with_timeout]] read_data (
|
||||
query::read_command cmd [[ref]],
|
||||
::compat::wrapping_partition_range pr,
|
||||
...
|
||||
) -> query::result [[lw_shared_ptr]], ...;
|
||||
|
||||
// New verb using position_range (to be added)
|
||||
verb [[with_client_info, with_timeout]] read_data_v2 (
|
||||
query::read_command_v2 cmd [[ref]],
|
||||
::compat::wrapping_partition_range pr,
|
||||
...
|
||||
) -> query::result [[lw_shared_ptr]], ...;
|
||||
```
|
||||
|
||||
Where `read_command_v2` would use a `partition_slice_v2` that contains position ranges instead of clustering ranges.
|
||||
|
||||
#### Feature-Gated RPC Selection
|
||||
|
||||
```cpp
|
||||
future<query::result> storage_proxy::query_data(...) {
|
||||
if (_features.position_range) {
|
||||
return rpc_verb_read_data_v2(...);
|
||||
} else {
|
||||
return rpc_verb_read_data(...);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 5. Testing
|
||||
|
||||
Tests should verify:
|
||||
1. Correct conversion between clustering_range and position_range
|
||||
2. Correct behavior of position_range operations
|
||||
3. RPC compatibility with both old and new verbs
|
||||
4. Feature flag behavior during rolling upgrades
|
||||
|
||||
### 6. Known Limitations
|
||||
|
||||
- Not all clustering_range uses can be eliminated - some external interfaces may require them
|
||||
- The conversion from position_range to clustering_range can return `nullopt` for empty ranges
|
||||
- Performance implications should be measured for hot paths
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
- [x] Add `position_range` feature flag to `gms/feature_service.hh`
|
||||
- [x] Add `position_range` IDL definition to `idl/position_in_partition.idl.hh`
|
||||
- [ ] Create new RPC verbs using position_range
|
||||
- [ ] Add feature-gated RPC selection logic
|
||||
- [ ] Migrate high-priority problematic code paths:
|
||||
- [ ] Fix intersection in `db/view/view.cc:1687`
|
||||
- [ ] Fix deoverlap in `db/view/view.cc:1712`
|
||||
- [ ] Fix deoverlap in `cql3/statements/cas_request.cc:90`
|
||||
- [ ] Migrate internal data structures systematically
|
||||
- [ ] Add comprehensive tests
|
||||
- [ ] Performance benchmarking
|
||||
- [ ] Documentation updates
|
||||
|
||||
## References
|
||||
|
||||
- `mutation/position_in_partition.hh` - position_range definition
|
||||
- `keys/clustering_interval_set.hh` - Example of correct position_range usage
|
||||
- Issues: #22817, #21604, #8157
|
||||
271
docs/dev/position-range-rpc-migration.md
Normal file
271
docs/dev/position-range-rpc-migration.md
Normal file
@@ -0,0 +1,271 @@
|
||||
# Position Range RPC Migration Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This document outlines the plan for migrating RPC interfaces from `clustering_range` to `position_range` in a backward-compatible manner using feature flags.
|
||||
|
||||
## Background
|
||||
|
||||
The current RPC interfaces use `clustering_range` (defined as `interval<clustering_key_prefix>`) in structures like `partition_slice` and `read_command`. To enable the use of `position_range` internally while maintaining backward compatibility, we need to:
|
||||
|
||||
1. Create new RPC message types that use `position_range`
|
||||
2. Add new RPC verbs that accept these new types
|
||||
3. Feature-gate the use of these new verbs based on cluster capabilities
|
||||
|
||||
## Feature Flag
|
||||
|
||||
A new feature flag `position_range` has been added to `gms::feature_service`:
|
||||
|
||||
```cpp
|
||||
gms::feature position_range { *this, "POSITION_RANGE"sv };
|
||||
```
|
||||
|
||||
This feature will be enabled when all nodes in the cluster support the new RPC verbs.
|
||||
|
||||
## IDL Changes
|
||||
|
||||
### Already Added
|
||||
|
||||
The `position_range` class has been added to `idl/position_in_partition.idl.hh`:
|
||||
|
||||
```idl
|
||||
class position_range {
|
||||
position_in_partition start();
|
||||
position_in_partition end();
|
||||
};
|
||||
```
|
||||
|
||||
### To Be Added
|
||||
|
||||
New IDL types need to be created for RPC migration:
|
||||
|
||||
#### 1. partition_slice_v2 (in `idl/read_command.idl.hh`)
|
||||
|
||||
```idl
|
||||
namespace query {
|
||||
|
||||
class partition_slice_v2 {
|
||||
std::vector<position_range> default_row_ranges();
|
||||
utils::small_vector<uint32_t, 8> static_columns;
|
||||
utils::small_vector<uint32_t, 8> regular_columns;
|
||||
query::partition_slice::option_set options;
|
||||
std::unique_ptr<query::specific_ranges> get_specific_ranges();
|
||||
cql_serialization_format cql_format();
|
||||
uint32_t partition_row_limit_low_bits();
|
||||
uint32_t partition_row_limit_high_bits();
|
||||
};
|
||||
|
||||
class read_command_v2 {
|
||||
table_id cf_id;
|
||||
table_schema_version schema_version;
|
||||
query::partition_slice_v2 slice;
|
||||
uint32_t row_limit_low_bits;
|
||||
std::chrono::time_point<gc_clock, gc_clock::duration> timestamp;
|
||||
std::optional<tracing::trace_info> trace_info;
|
||||
uint32_t partition_limit;
|
||||
query_id query_uuid;
|
||||
query::is_first_page is_first_page;
|
||||
std::optional<query::max_result_size> max_result_size;
|
||||
uint32_t row_limit_high_bits;
|
||||
uint64_t tombstone_limit;
|
||||
};
|
||||
|
||||
}
|
||||
```
|
||||
|
||||
#### 2. New RPC Verbs (in `idl/storage_proxy.idl.hh`)
|
||||
|
||||
```idl
|
||||
// New verbs using position_range (to be used when position_range feature is enabled)
|
||||
verb [[with_client_info, with_timeout]] read_data_v2 (
|
||||
query::read_command_v2 cmd [[ref]],
|
||||
::compat::wrapping_partition_range pr,
|
||||
query::digest_algorithm digest,
|
||||
db::per_partition_rate_limit::info rate_limit_info,
|
||||
service::fencing_token fence
|
||||
) -> query::result [[lw_shared_ptr]],
|
||||
cache_temperature,
|
||||
replica::exception_variant;
|
||||
|
||||
verb [[with_client_info, with_timeout]] read_mutation_data_v2 (
|
||||
query::read_command_v2 cmd [[ref]],
|
||||
::compat::wrapping_partition_range pr,
|
||||
service::fencing_token fence
|
||||
) -> reconcilable_result [[lw_shared_ptr]],
|
||||
cache_temperature,
|
||||
replica::exception_variant;
|
||||
|
||||
verb [[with_client_info, with_timeout]] read_digest_v2 (
|
||||
query::read_command_v2 cmd [[ref]],
|
||||
::compat::wrapping_partition_range pr,
|
||||
query::digest_algorithm digest,
|
||||
db::per_partition_rate_limit::info rate_limit_info,
|
||||
service::fencing_token fence
|
||||
) -> query::result_digest,
|
||||
api::timestamp_type,
|
||||
cache_temperature,
|
||||
replica::exception_variant,
|
||||
std::optional<full_position>;
|
||||
```
|
||||
|
||||
## Implementation Changes
|
||||
|
||||
### 1. C++ Type Definitions
|
||||
|
||||
Create C++ implementations for the new IDL types:
|
||||
|
||||
```cpp
|
||||
// In query/query-request.hh or a new header
|
||||
namespace query {
|
||||
|
||||
class partition_slice_v2 {
|
||||
std::vector<position_range> _row_ranges;
|
||||
// ... other members same as partition_slice
|
||||
|
||||
public:
|
||||
// Constructors
|
||||
partition_slice_v2(std::vector<position_range> row_ranges, ...);
|
||||
|
||||
// Conversion methods
|
||||
static partition_slice_v2 from_legacy(const partition_slice& legacy);
|
||||
partition_slice to_legacy(const schema& s) const;
|
||||
|
||||
// Accessors
|
||||
const std::vector<position_range>& row_ranges() const { return _row_ranges; }
|
||||
};
|
||||
|
||||
class read_command_v2 {
|
||||
partition_slice_v2 slice;
|
||||
// ... other members same as read_command
|
||||
|
||||
public:
|
||||
// Constructors
|
||||
read_command_v2(...);
|
||||
|
||||
// Conversion methods
|
||||
static read_command_v2 from_legacy(const read_command& legacy);
|
||||
read_command to_legacy(const schema& s) const;
|
||||
};
|
||||
|
||||
}
|
||||
```
|
||||
|
||||
### 2. RPC Handler Implementation
|
||||
|
||||
In `service/storage_proxy.cc`, add handlers for the new verbs:
|
||||
|
||||
```cpp
|
||||
future<rpc::tuple<query::result_lw_shared_ptr, cache_temperature, replica::exception_variant>>
|
||||
storage_proxy::read_data_v2_handler(
|
||||
query::read_command_v2&& cmd,
|
||||
compat::wrapping_partition_range&& pr,
|
||||
query::digest_algorithm da,
|
||||
db::per_partition_rate_limit::info rate_limit_info,
|
||||
service::fencing_token fence) {
|
||||
|
||||
// Convert to legacy format if needed internally
|
||||
// Or better: refactor internal implementation to work with position_range
|
||||
auto legacy_cmd = cmd.to_legacy(*get_schema(cmd.cf_id));
|
||||
|
||||
// Call existing implementation
|
||||
return read_data_handler(std::move(legacy_cmd), std::move(pr), da, rate_limit_info, fence);
|
||||
}
|
||||
```
|
||||
|
||||
### 3. RPC Client Selection
|
||||
|
||||
In code that invokes RPCs (e.g., `storage_proxy::query_result`), add feature detection:
|
||||
|
||||
```cpp
|
||||
future<query::result> storage_proxy::query_data(...) {
|
||||
if (_features.position_range) {
|
||||
// Use new verb with position_range
|
||||
auto cmd_v2 = read_command_v2::from_legacy(cmd);
|
||||
return rpc_verb_read_data_v2(std::move(cmd_v2), ...);
|
||||
} else {
|
||||
// Use legacy verb with clustering_range
|
||||
return rpc_verb_read_data(std::move(cmd), ...);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Migration Strategy
|
||||
|
||||
### Phase 1: Foundation (Complete)
|
||||
- [x] Add `position_range` feature flag
|
||||
- [x] Add `position_range` IDL definition
|
||||
- [x] Fix critical clustering_range bugs using clustering_interval_set
|
||||
|
||||
### Phase 2: RPC Infrastructure (To Do)
|
||||
- [ ] Add `partition_slice_v2` IDL definition
|
||||
- [ ] Add `read_command_v2` IDL definition
|
||||
- [ ] Add new RPC verbs (`read_data_v2`, etc.)
|
||||
- [ ] Implement conversion methods between v1 and v2 types
|
||||
- [ ] Add RPC handlers for new verbs
|
||||
|
||||
### Phase 3: Client Migration (To Do)
|
||||
- [ ] Update RPC clients to check feature flag
|
||||
- [ ] Add logic to select appropriate verb based on feature availability
|
||||
- [ ] Test backward compatibility during rolling upgrades
|
||||
|
||||
### Phase 4: Internal Refactoring (To Do)
|
||||
- [ ] Gradually refactor internal implementations to use position_range natively
|
||||
- [ ] Remove conversion overhead once both versions are established
|
||||
- [ ] Update documentation and examples
|
||||
|
||||
### Phase 5: Deprecation (Future)
|
||||
- [ ] Once all production clusters are upgraded, consider deprecating v1 verbs
|
||||
- [ ] Remove legacy code after sufficient time has passed
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Tests
|
||||
- Test conversion between partition_slice and partition_slice_v2
|
||||
- Test conversion between read_command and read_command_v2
|
||||
- Verify that converted types produce equivalent results
|
||||
|
||||
### Integration Tests
|
||||
- Test RPC calls using both old and new verbs
|
||||
- Verify feature flag behavior during rolling upgrades
|
||||
- Test mixed-version clusters
|
||||
|
||||
### Backward Compatibility Tests
|
||||
- Ensure old clients can still communicate with new servers
|
||||
- Ensure new clients fall back to old verbs when feature is disabled
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
1. **Conversion Overhead**: During the transition period, conversions between v1 and v2 types add overhead. This should be measured and minimized.
|
||||
|
||||
2. **Memory Usage**: position_range may have different memory characteristics than clustering_range. Monitor memory usage after migration.
|
||||
|
||||
3. **Serialization Size**: Compare wire format sizes to ensure no significant increase in network traffic.
|
||||
|
||||
## Risks and Mitigation
|
||||
|
||||
### Risk: Conversion Bugs
|
||||
**Mitigation**: Comprehensive unit tests for all conversion paths, particularly edge cases like empty ranges and open-ended ranges.
|
||||
|
||||
### Risk: Feature Flag Synchronization
|
||||
**Mitigation**: Use standard Scylla feature propagation mechanisms. Ensure feature is only enabled when all nodes support it.
|
||||
|
||||
### Risk: Performance Regression
|
||||
**Mitigation**: Performance benchmarks comparing old and new implementations. Have rollback plan if issues are discovered.
|
||||
|
||||
## Alternative Approaches Considered
|
||||
|
||||
### 1. Direct Migration Without Feature Flag
|
||||
**Rejected**: Too risky for rolling upgrades. Would require all-at-once cluster upgrade.
|
||||
|
||||
### 2. Transparent Conversion in IDL Layer
|
||||
**Rejected**: Would hide the distinction between old and new formats, making debugging harder.
|
||||
|
||||
### 3. Maintain Both Forever
|
||||
**Rejected**: Increases maintenance burden without clear benefit once migration is complete.
|
||||
|
||||
## References
|
||||
|
||||
- Main migration guide: `docs/dev/clustering-range-to-position-range-migration.md`
|
||||
- Issues: #22817, #21604, #8157
|
||||
- Feature service: `gms/feature_service.hh`
|
||||
- IDL definitions: `idl/position_in_partition.idl.hh`, `idl/read_command.idl.hh`
|
||||
@@ -29,6 +29,9 @@ A CDC generation consists of:
|
||||
|
||||
This is the mapping used to decide on which stream IDs to use when making writes, as explained in the :doc:`./cdc-streams` document. It is a global property of the cluster: it doesn't depend on the table you're making writes to.
|
||||
|
||||
.. caution::
|
||||
The tables mentioned in the following sections: ``system_distributed.cdc_generation_timestamps`` and ``system_distributed.cdc_streams_descriptions_v2`` have been introduced in ScyllaDB 4.4. It is highly recommended to upgrade to 4.4 for efficient CDC usage. The last section explains how to run the below examples in ScyllaDB 4.3.
|
||||
|
||||
When CDC generations change
|
||||
---------------------------
|
||||
|
||||
|
||||
@@ -28,8 +28,7 @@ Incremental Repair is only supported for tables that use the tablets architectur
|
||||
Incremental Repair Modes
|
||||
------------------------
|
||||
|
||||
Incremental is currently disabled by default. You can control its behavior for a given repair operation using the ``incremental_mode`` parameter.
|
||||
This is useful for enabling incremental repair, or in situations where you might need to force a full data validation.
|
||||
While incremental repair is the default and recommended mode, you can control its behavior for a given repair operation using the ``incremental_mode`` parameter. This is useful for situations where you might need to force a full data validation.
|
||||
|
||||
The available modes are:
|
||||
|
||||
|
||||
@@ -53,13 +53,13 @@ ScyllaDB nodetool cluster repair command supports the following options:
|
||||
|
||||
nodetool cluster repair --tablet-tokens 1,10474535988
|
||||
|
||||
- ``--incremental-mode`` specifies the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to 'disabled'.
|
||||
- ``--incremental-mode`` specifies the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental.
|
||||
|
||||
For example:
|
||||
|
||||
::
|
||||
|
||||
nodetool cluster repair --incremental-mode disabled
|
||||
nodetool cluster repair --incremental-mode regular
|
||||
|
||||
- ``keyspace`` executes a repair on a specific keyspace. The default is all keyspaces.
|
||||
|
||||
|
||||
@@ -110,6 +110,7 @@ To display the log classes (output changes with each version so your display may
|
||||
keys
|
||||
keyspace_utils
|
||||
large_data
|
||||
legacy_schema_migrator
|
||||
lister
|
||||
load_balancer
|
||||
load_broadcaster
|
||||
|
||||
@@ -64,12 +64,13 @@ ADMIN Logs service level operations: create, alter, drop, attach, detach, l
|
||||
auditing.
|
||||
========= =========================================================================================
|
||||
|
||||
Note that enabling audit may negatively impact performance and audit-to-table may consume extra storage. That's especially true when auditing DML and QUERY categories, which generate a high volume of audit messages.
|
||||
Note that audit for every DML or QUERY might impact performance and consume a lot of storage.
|
||||
|
||||
Configuring Audit Storage
|
||||
---------------------------
|
||||
|
||||
Auditing messages can be sent to :ref:`Syslog <auditing-syslog-storage>` or stored in a Scylla :ref:`table <auditing-table-storage>` or both.
|
||||
Auditing messages can be sent to :ref:`Syslog <auditing-syslog-storage>` or stored in a Scylla :ref:`table <auditing-table-storage>`.
|
||||
Currently, auditing messages can only be saved to one location at a time. You cannot log into both a table and the Syslog.
|
||||
|
||||
.. _auditing-syslog-storage:
|
||||
|
||||
@@ -192,23 +193,6 @@ For example:
|
||||
2018-03-18 00:00:00+0000 | 10.143.2.108 | 3429b1a5-2a94-11e8-8f4e-000000000001 | DDL | ONE | False | nba | DROP TABLE nba.team_roster ; | 127.0.0.1 | team_roster | Scylla |
|
||||
(1 row)
|
||||
|
||||
.. _auditing-table-and-syslog-storage:
|
||||
|
||||
Storing Audit Messages in a Table and Syslog Simultaneously
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
**Procedure**
|
||||
|
||||
#. Follow both procedures from above, and set the ``audit`` parameter in the ``scylla.yaml`` file to both ``syslog`` and ``table``. You need to restart scylla only once.
|
||||
|
||||
To have both syslog and table you need to specify both backends separated by a comma:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
audit: "syslog,table"
|
||||
|
||||
|
||||
|
||||
Handling Audit Failures
|
||||
---------------------------
|
||||
|
||||
|
||||
8
docs/poetry.lock
generated
8
docs/poetry.lock
generated
@@ -1018,14 +1018,14 @@ sphinx-markdown-tables = "0.0.17"
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-scylladb-theme"
|
||||
version = "1.8.10"
|
||||
version = "1.8.9"
|
||||
description = "A Sphinx Theme for ScyllaDB documentation projects"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx_scylladb_theme-1.8.10-py3-none-any.whl", hash = "sha256:8b930f33bec7308ccaa92698ebb5ad85059bcbf93a463f92917aeaf473fce632"},
|
||||
{file = "sphinx_scylladb_theme-1.8.10.tar.gz", hash = "sha256:8a78a9b692d9a946be2c4a64aa472fd82204cc8ea0b1ee7f60de6db35b356326"},
|
||||
{file = "sphinx_scylladb_theme-1.8.9-py3-none-any.whl", hash = "sha256:f8649a7753a29494fd2b417d1cb855035dddb9ebd498ea033fd73f5f9338271e"},
|
||||
{file = "sphinx_scylladb_theme-1.8.9.tar.gz", hash = "sha256:ab7cda4c10a0d067c5c3a45f7b1f68cb8ebefe135a0be0738bfa282a344769b6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -1603,4 +1603,4 @@ files = [
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "0ae673106f45d3465cbdabbf511e165ca44feadd34d7753f2e68093afaa95c79"
|
||||
content-hash = "74912627a3f424290ed7889451c0bdb1a862ab85b1d07c85f4f3b8c34f32a020"
|
||||
|
||||
@@ -9,7 +9,7 @@ package-mode = false
|
||||
python = "^3.10"
|
||||
pygments = "^2.18.0"
|
||||
redirects_cli ="^0.1.3"
|
||||
sphinx-scylladb-theme = "^1.8.10"
|
||||
sphinx-scylladb-theme = "^1.8.9"
|
||||
sphinx-sitemap = "^2.6.0"
|
||||
sphinx-autobuild = "^2024.4.19"
|
||||
Sphinx = "^7.3.7"
|
||||
|
||||
@@ -176,6 +176,8 @@ public:
|
||||
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
|
||||
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
|
||||
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
|
||||
// Enable position_range in RPC interfaces instead of clustering_range
|
||||
gms::feature position_range { *this, "POSITION_RANGE"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
@@ -26,3 +26,8 @@ class position_in_partition {
|
||||
bound_weight get_bound_weight();
|
||||
std::optional<clustering_key_prefix> get_clustering_key_prefix();
|
||||
};
|
||||
|
||||
class position_range {
|
||||
position_in_partition start();
|
||||
position_in_partition end();
|
||||
};
|
||||
|
||||
@@ -129,6 +129,6 @@ struct direct_fd_ping_reply {
|
||||
std::variant<std::monostate, service::wrong_destination, service::group_liveness_info> result;
|
||||
};
|
||||
|
||||
verb [[with_client_info, with_timeout, cancellable]] direct_fd_ping (raft::server_id dst_id) -> service::direct_fd_ping_reply;
|
||||
verb [[with_client_info, cancellable]] direct_fd_ping (raft::server_id dst_id) -> service::direct_fd_ping_reply;
|
||||
|
||||
} // namespace service
|
||||
|
||||
@@ -38,7 +38,6 @@ debian_base_packages=(
|
||||
python3-aiohttp
|
||||
python3-pyparsing
|
||||
python3-colorama
|
||||
python3-dev
|
||||
python3-tabulate
|
||||
python3-pytest
|
||||
python3-pytest-asyncio
|
||||
@@ -66,7 +65,6 @@ debian_base_packages=(
|
||||
git-lfs
|
||||
e2fsprogs
|
||||
fuse3
|
||||
libev-dev # for python driver
|
||||
)
|
||||
|
||||
fedora_packages=(
|
||||
@@ -92,7 +90,6 @@ fedora_packages=(
|
||||
patchelf
|
||||
python3
|
||||
python3-aiohttp
|
||||
python3-devel
|
||||
python3-pip
|
||||
python3-file-magic
|
||||
python3-colorama
|
||||
@@ -157,8 +154,6 @@ fedora_packages=(
|
||||
https://github.com/scylladb/cassandra-stress/releases/download/v3.18.1/cassandra-stress-java21-3.18.1-1.noarch.rpm
|
||||
elfutils
|
||||
jq
|
||||
|
||||
libev-devel # for python driver
|
||||
)
|
||||
|
||||
fedora_python3_packages=(
|
||||
|
||||
@@ -15,22 +15,3 @@ with the Apache License (version 2) and ScyllaDB-Source-Available-1.0.
|
||||
They contain the following tag:
|
||||
|
||||
SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
|
||||
### `musl libc` files
|
||||
|
||||
`licenses/musl-license.txt` is obtained from:
|
||||
https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
|
||||
|
||||
`utils/crypt_sha512.cc` is obtained from:
|
||||
https://git.musl-libc.org/cgit/musl/tree/src/crypt/crypt_sha512.c
|
||||
|
||||
Both files are obtained from git.musl-libc.org.
|
||||
Import commit:
|
||||
commit 1b76ff0767d01df72f692806ee5adee13c67ef88
|
||||
Author: Alex Rønne Petersen <alex@alexrp.com>
|
||||
Date: Sun Oct 12 05:35:19 2025 +0200
|
||||
|
||||
s390x: shuffle register usage in __tls_get_offset to avoid r0 as address
|
||||
|
||||
musl as a whole is licensed under the standard MIT license included in
|
||||
`licenses/musl-license.txt`.
|
||||
|
||||
@@ -1,193 +0,0 @@
|
||||
musl as a whole is licensed under the following standard MIT license:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
Copyright © 2005-2020 Rich Felker, et al.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
----------------------------------------------------------------------
|
||||
|
||||
Authors/contributors include:
|
||||
|
||||
A. Wilcox
|
||||
Ada Worcester
|
||||
Alex Dowad
|
||||
Alex Suykov
|
||||
Alexander Monakov
|
||||
Andre McCurdy
|
||||
Andrew Kelley
|
||||
Anthony G. Basile
|
||||
Aric Belsito
|
||||
Arvid Picciani
|
||||
Bartosz Brachaczek
|
||||
Benjamin Peterson
|
||||
Bobby Bingham
|
||||
Boris Brezillon
|
||||
Brent Cook
|
||||
Chris Spiegel
|
||||
Clément Vasseur
|
||||
Daniel Micay
|
||||
Daniel Sabogal
|
||||
Daurnimator
|
||||
David Carlier
|
||||
David Edelsohn
|
||||
Denys Vlasenko
|
||||
Dmitry Ivanov
|
||||
Dmitry V. Levin
|
||||
Drew DeVault
|
||||
Emil Renner Berthing
|
||||
Fangrui Song
|
||||
Felix Fietkau
|
||||
Felix Janda
|
||||
Gianluca Anzolin
|
||||
Hauke Mehrtens
|
||||
He X
|
||||
Hiltjo Posthuma
|
||||
Isaac Dunham
|
||||
Jaydeep Patil
|
||||
Jens Gustedt
|
||||
Jeremy Huntwork
|
||||
Jo-Philipp Wich
|
||||
Joakim Sindholt
|
||||
John Spencer
|
||||
Julien Ramseier
|
||||
Justin Cormack
|
||||
Kaarle Ritvanen
|
||||
Khem Raj
|
||||
Kylie McClain
|
||||
Leah Neukirchen
|
||||
Luca Barbato
|
||||
Luka Perkov
|
||||
Lynn Ochs
|
||||
M Farkas-Dyck (Strake)
|
||||
Mahesh Bodapati
|
||||
Markus Wichmann
|
||||
Masanori Ogino
|
||||
Michael Clark
|
||||
Michael Forney
|
||||
Mikhail Kremnyov
|
||||
Natanael Copa
|
||||
Nicholas J. Kain
|
||||
orc
|
||||
Pascal Cuoq
|
||||
Patrick Oppenlander
|
||||
Petr Hosek
|
||||
Petr Skocik
|
||||
Pierre Carrier
|
||||
Reini Urban
|
||||
Rich Felker
|
||||
Richard Pennington
|
||||
Ryan Fairfax
|
||||
Samuel Holland
|
||||
Segev Finer
|
||||
Shiz
|
||||
sin
|
||||
Solar Designer
|
||||
Stefan Kristiansson
|
||||
Stefan O'Rear
|
||||
Szabolcs Nagy
|
||||
Timo Teräs
|
||||
Trutz Behn
|
||||
Will Dietz
|
||||
William Haddon
|
||||
William Pitcock
|
||||
|
||||
Portions of this software are derived from third-party works licensed
|
||||
under terms compatible with the above MIT license:
|
||||
|
||||
The TRE regular expression implementation (src/regex/reg* and
|
||||
src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed
|
||||
under a 2-clause BSD license (license text in the source files). The
|
||||
included version has been heavily modified by Rich Felker in 2012, in
|
||||
the interests of size, simplicity, and namespace cleanliness.
|
||||
|
||||
Much of the math library code (src/math/* and src/complex/*) is
|
||||
Copyright © 1993,2004 Sun Microsystems or
|
||||
Copyright © 2003-2011 David Schultz or
|
||||
Copyright © 2003-2009 Steven G. Kargl or
|
||||
Copyright © 2003-2009 Bruce D. Evans or
|
||||
Copyright © 2008 Stephen L. Moshier or
|
||||
Copyright © 2017-2018 Arm Limited
|
||||
and labelled as such in comments in the individual source files. All
|
||||
have been licensed under extremely permissive terms.
|
||||
|
||||
The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008
|
||||
The Android Open Source Project and is licensed under a two-clause BSD
|
||||
license. It was taken from Bionic libc, used on Android.
|
||||
|
||||
The AArch64 memcpy and memset code (src/string/aarch64/*) are
|
||||
Copyright © 1999-2019, Arm Limited.
|
||||
|
||||
The implementation of DES for crypt (src/crypt/crypt_des.c) is
|
||||
Copyright © 1994 David Burren. It is licensed under a BSD license.
|
||||
|
||||
The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was
|
||||
originally written by Solar Designer and placed into the public
|
||||
domain. The code also comes with a fallback permissive license for use
|
||||
in jurisdictions that may not recognize the public domain.
|
||||
|
||||
The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011
|
||||
Lynn Ochs and is licensed under an MIT-style license.
|
||||
|
||||
The x86_64 port was written by Nicholas J. Kain and is licensed under
|
||||
the standard MIT terms.
|
||||
|
||||
The mips and microblaze ports were originally written by Richard
|
||||
Pennington for use in the ellcc project. The original code was adapted
|
||||
by Rich Felker for build system and code conventions during upstream
|
||||
integration. It is licensed under the standard MIT terms.
|
||||
|
||||
The mips64 port was contributed by Imagination Technologies and is
|
||||
licensed under the standard MIT terms.
|
||||
|
||||
The powerpc port was also originally written by Richard Pennington,
|
||||
and later supplemented and integrated by John Spencer. It is licensed
|
||||
under the standard MIT terms.
|
||||
|
||||
All other files which have no copyright comments are original works
|
||||
produced specifically for use as part of this library, written either
|
||||
by Rich Felker, the main author of the library, or by one or more
|
||||
contibutors listed above. Details on authorship of individual files
|
||||
can be found in the git version control history of the project. The
|
||||
omission of copyright and license comments in each file is in the
|
||||
interest of source tree size.
|
||||
|
||||
In addition, permission is hereby granted for all public header files
|
||||
(include/* and arch/*/bits/*) and crt files intended to be linked into
|
||||
applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit
|
||||
the copyright notice and permission notice otherwise required by the
|
||||
license, and to use these files without any requirement of
|
||||
attribution. These files include substantial contributions from:
|
||||
|
||||
Bobby Bingham
|
||||
John Spencer
|
||||
Nicholas J. Kain
|
||||
Rich Felker
|
||||
Richard Pennington
|
||||
Stefan Kristiansson
|
||||
Szabolcs Nagy
|
||||
|
||||
all of whom have explicitly granted such permission.
|
||||
|
||||
This file previously contained text expressing a belief that most of
|
||||
the files covered by the above exception were sufficiently trivial not
|
||||
to be subject to copyright, resulting in confusion over whether it
|
||||
negated the permissions granted in the license. In the spirit of
|
||||
permissive licensing, and of not having licensing issues being an
|
||||
obstacle to adoption, that text has been removed.
|
||||
@@ -200,10 +200,7 @@ enum class tablet_repair_incremental_mode : uint8_t {
|
||||
disabled,
|
||||
};
|
||||
|
||||
// FIXME: Incremental repair is disabled by default due to
|
||||
// https://github.com/scylladb/scylladb/issues/26041 and
|
||||
// https://github.com/scylladb/scylladb/issues/27414
|
||||
constexpr tablet_repair_incremental_mode default_tablet_repair_incremental_mode{tablet_repair_incremental_mode::disabled};
|
||||
constexpr tablet_repair_incremental_mode default_tablet_repair_incremental_mode{tablet_repair_incremental_mode::incremental};
|
||||
|
||||
sstring tablet_repair_incremental_mode_to_string(tablet_repair_incremental_mode);
|
||||
tablet_repair_incremental_mode tablet_repair_incremental_mode_from_string(const sstring&);
|
||||
|
||||
@@ -235,6 +235,9 @@ public:
|
||||
const topology& get_topology() const;
|
||||
void debug_show() const;
|
||||
|
||||
/** Return the unique host ID for an end-point. */
|
||||
host_id get_host_id(inet_address endpoint) const;
|
||||
|
||||
/** @return a copy of the endpoint-to-id map for read-only operations */
|
||||
std::unordered_set<host_id> get_host_ids() const;
|
||||
|
||||
|
||||
16
main.cc
16
main.cc
@@ -39,6 +39,7 @@
|
||||
#include "api/api_init.hh"
|
||||
#include "db/config.hh"
|
||||
#include "db/extensions.hh"
|
||||
#include "db/legacy_schema_migrator.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/tablet_allocator.hh"
|
||||
@@ -748,6 +749,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
// inherit Seastar's CPU affinity masks. We want this thread to be free
|
||||
// to migrate between CPUs; we think that's what makes the most sense.
|
||||
auto rpc_dict_training_worker = utils::alien_worker(startlog, 19, "rpc-dict");
|
||||
// niceness=10 is ~10% of normal process time
|
||||
auto hashing_worker = utils::alien_worker(startlog, 10, "pwd-hash");
|
||||
|
||||
return app.run(ac, av, [&] () -> future<int> {
|
||||
|
||||
@@ -777,7 +780,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
return seastar::async([&app, cfg, ext, &disk_space_monitor_shard0, &cm, &sstm, &db, &qp, &bm, &proxy, &mapreduce_service, &mm, &mm_notifier, &ctx, &opts, &dirs,
|
||||
&prometheus_server, &cf_cache_hitrate_calculator, &load_meter, &feature_service, &gossiper, &snitch,
|
||||
&token_metadata, &erm_factory, &snapshot_ctl, &messaging, &sst_dir_semaphore, &raft_gr, &service_memory_limiter,
|
||||
&repair, &sst_loader, &auth_cache, &ss, &lifecycle_notifier, &stream_manager, &task_manager, &rpc_dict_training_worker, &vector_store_client] {
|
||||
&repair, &sst_loader, &auth_cache, &ss, &lifecycle_notifier, &stream_manager, &task_manager, &rpc_dict_training_worker,
|
||||
&hashing_worker, &vector_store_client] {
|
||||
try {
|
||||
if (opts.contains("relabel-config-file") && !opts["relabel-config-file"].as<sstring>().empty()) {
|
||||
// calling update_relabel_config_from_file can cause an exception that would stop startup
|
||||
@@ -1637,7 +1641,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
fd.start(
|
||||
std::ref(fd_pinger), std::ref(fd_clock),
|
||||
service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count(),
|
||||
service::direct_fd_clock::base::duration{std::chrono::milliseconds{cfg->direct_failure_detector_ping_timeout_in_ms()}}.count(), dbcfg.gossip_scheduling_group).get();
|
||||
service::direct_fd_clock::base::duration{std::chrono::milliseconds{cfg->direct_failure_detector_ping_timeout_in_ms()}}.count()).get();
|
||||
|
||||
auto stop_fd = defer_verbose_shutdown("direct_failure_detector", [] {
|
||||
fd.stop().get();
|
||||
@@ -1847,6 +1851,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
group0_client.init().get();
|
||||
|
||||
checkpoint(stop_signal, "initializing system schema");
|
||||
// schema migration, if needed, is also done on shard 0
|
||||
db::legacy_schema_migrator::migrate(proxy, db, sys_ks, qp.local()).get();
|
||||
db::schema_tables::save_system_schema(qp.local()).get();
|
||||
db::schema_tables::recalculate_schema_version(sys_ks, proxy, feature_service.local()).get();
|
||||
|
||||
@@ -2057,7 +2063,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
maintenance_auth_config.authenticator_java_name = sstring{auth::allow_all_authenticator_name};
|
||||
maintenance_auth_config.role_manager_java_name = sstring{auth::maintenance_socket_role_manager_name};
|
||||
|
||||
maintenance_auth_service.start(perm_cache_config, std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), maintenance_auth_config, maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
|
||||
maintenance_auth_service.start(perm_cache_config, std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), maintenance_auth_config, maintenance_socket_enabled::yes, std::ref(auth_cache), std::ref(hashing_worker)).get();
|
||||
|
||||
cql_maintenance_server_ctl.emplace(maintenance_auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, maintenance_cql_sg_stats_key, maintenance_socket_enabled::yes, dbcfg.statement_scheduling_group);
|
||||
|
||||
@@ -2333,7 +2339,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
auth_config.authenticator_java_name = qualified_authenticator_name;
|
||||
auth_config.role_manager_java_name = qualified_role_manager_name;
|
||||
|
||||
auth_service.start(std::move(perm_cache_config), std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), auth_config, maintenance_socket_enabled::no, std::ref(auth_cache)).get();
|
||||
auth_service.start(std::move(perm_cache_config), std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), auth_config, maintenance_socket_enabled::no, std::ref(auth_cache), std::ref(hashing_worker)).get();
|
||||
|
||||
std::any stop_auth_service;
|
||||
// Has to be called after node joined the cluster (join_cluster())
|
||||
@@ -2377,7 +2383,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
|
||||
checkpoint(stop_signal, "starting batchlog manager");
|
||||
db::batchlog_manager_config bm_cfg;
|
||||
bm_cfg.replay_timeout = cfg->write_request_timeout_in_ms() * 1ms * 2;
|
||||
bm_cfg.write_request_timeout = cfg->write_request_timeout_in_ms() * 1ms;
|
||||
bm_cfg.replay_rate = cfg->batchlog_replay_throttle_in_kb() * 1000;
|
||||
bm_cfg.delay = std::chrono::milliseconds(cfg->ring_delay_ms());
|
||||
bm_cfg.replay_cleanup_after_replays = cfg->batchlog_replay_cleanup_after_replays();
|
||||
|
||||
@@ -686,7 +686,6 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
|
||||
case messaging_verb::RAFT_MODIFY_CONFIG:
|
||||
case messaging_verb::RAFT_PULL_SNAPSHOT:
|
||||
case messaging_verb::NOTIFY_BANNED:
|
||||
case messaging_verb::DIRECT_FD_PING:
|
||||
// See comment above `TOPOLOGY_INDEPENDENT_IDX`.
|
||||
// DO NOT put any 'hot' (e.g. data path) verbs in this group,
|
||||
// only verbs which are 'rare' and 'cheap'.
|
||||
@@ -748,6 +747,7 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
|
||||
case messaging_verb::PAXOS_ACCEPT:
|
||||
case messaging_verb::PAXOS_LEARN:
|
||||
case messaging_verb::PAXOS_PRUNE:
|
||||
case messaging_verb::DIRECT_FD_PING:
|
||||
return 2;
|
||||
case messaging_verb::MUTATION_DONE:
|
||||
case messaging_verb::MUTATION_FAILED:
|
||||
|
||||
@@ -575,15 +575,10 @@ utils::coroutine partition_entry::apply_to_incomplete(const schema& s,
|
||||
}
|
||||
res.row.set_range_tombstone(cur.range_tombstone_for_row() + src_cur.range_tombstone());
|
||||
|
||||
if (need_preempt()) {
|
||||
lb = position_in_partition(cur.position());
|
||||
++tracker.get_stats().rows_covered_by_range_tombstones_from_memtable;
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
// FIXME: Compact the row
|
||||
++tracker.get_stats().rows_covered_by_range_tombstones_from_memtable;
|
||||
cur.next();
|
||||
// FIXME: preempt
|
||||
}
|
||||
}
|
||||
{
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:3cbe2dd05945f8fb76ebce2ea70864063d2b282c4d5080af1f290ead43321ab3
|
||||
size 6444732
|
||||
oid sha256:80a47fe93866989aaf7e949168fcd308e95841e78c976a61f9eac20bfdd34d96
|
||||
size 6448960
|
||||
|
||||
70
query/position_range_utils.hh
Normal file
70
query/position_range_utils.hh
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "mutation/position_in_partition.hh"
|
||||
#include "query/query-request.hh"
|
||||
#include "keys/clustering_interval_set.hh"
|
||||
|
||||
namespace query {
|
||||
|
||||
/// Helper utilities for working with position_range and migrating from clustering_range.
|
||||
///
|
||||
/// These utilities support the gradual migration from clustering_range (interval<clustering_key_prefix>)
|
||||
/// to position_range. See docs/dev/clustering-range-to-position-range-migration.md for details.
|
||||
|
||||
/// Convert a vector of clustering_ranges to a vector of position_ranges
|
||||
inline std::vector<position_range> clustering_row_ranges_to_position_ranges(const clustering_row_ranges& ranges) {
|
||||
std::vector<position_range> result;
|
||||
result.reserve(ranges.size());
|
||||
for (const auto& r : ranges) {
|
||||
result.emplace_back(position_range::from_range(r));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Convert a vector of position_ranges to a vector of clustering_ranges
|
||||
/// Note: Empty position ranges (those that don't contain any keys) are skipped
|
||||
inline clustering_row_ranges position_ranges_to_clustering_row_ranges(const std::vector<position_range>& ranges, const schema& s) {
|
||||
clustering_row_ranges result;
|
||||
result.reserve(ranges.size());
|
||||
for (const auto& r : ranges) {
|
||||
if (auto cr = position_range_to_clustering_range(r, s)) {
|
||||
result.emplace_back(std::move(*cr));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Deoverlap clustering_row_ranges correctly using clustering_interval_set.
|
||||
/// This avoids the known bugs with clustering_range::deoverlap (see scylladb#22817, #21604, #8157).
|
||||
inline clustering_row_ranges deoverlap_clustering_row_ranges(const schema& s, const clustering_row_ranges& ranges) {
|
||||
clustering_interval_set interval_set(s, ranges);
|
||||
return interval_set.to_clustering_row_ranges();
|
||||
}
|
||||
|
||||
/// Intersect two clustering_row_ranges correctly using clustering_interval_set.
|
||||
/// This avoids the known bugs with clustering_range::intersection (see scylladb#22817, #21604, #8157).
|
||||
inline clustering_row_ranges intersect_clustering_row_ranges(const schema& s,
|
||||
const clustering_row_ranges& ranges1,
|
||||
const clustering_row_ranges& ranges2) {
|
||||
clustering_interval_set set1(s, ranges1);
|
||||
clustering_interval_set set2(s, ranges2);
|
||||
|
||||
clustering_interval_set result;
|
||||
for (const auto& r : set1) {
|
||||
if (set2.overlaps(s, r)) {
|
||||
result.add(s, r);
|
||||
}
|
||||
}
|
||||
|
||||
return result.to_clustering_row_ranges();
|
||||
}
|
||||
|
||||
} // namespace query
|
||||
@@ -551,13 +551,9 @@ void repair_writer_impl::create_writer(lw_shared_ptr<repair_writer> w) {
|
||||
}
|
||||
replica::table& t = _db.local().find_column_family(_schema->id());
|
||||
rlogger.debug("repair_writer: keyspace={}, table={}, estimated_partitions={}", w->schema()->ks_name(), w->schema()->cf_name(), w->get_estimated_partitions());
|
||||
// #17384 don't use off-strategy for repair (etc) if using tablets. sstables generated will
|
||||
// be single token range and can just be added to normal sstable set as is, eventually
|
||||
// handled by normal compaction.
|
||||
auto off_str = t.uses_tablets() ? sstables::offstrategy(false) : is_offstrategy_supported(_reason);
|
||||
auto sharder = get_sharder_helper(t, *(w->schema()), _topo_guard);
|
||||
_writer_done = mutation_writer::distribute_reader_and_consume_on_shards(_schema, sharder.sharder, std::move(_queue_reader),
|
||||
streaming::make_streaming_consumer(sstables::repair_origin, _db, _view_builder, _view_building_worker, w->get_estimated_partitions(), _reason, off_str,
|
||||
streaming::make_streaming_consumer(sstables::repair_origin, _db, _view_builder, _view_building_worker, w->get_estimated_partitions(), _reason, is_offstrategy_supported(_reason),
|
||||
_topo_guard, _repaired_at, w->get_sstable_list_to_mark_as_repaired()),
|
||||
t.stream_in_progress()).then([w] (uint64_t partitions) {
|
||||
rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
|
||||
|
||||
@@ -297,17 +297,17 @@ public:
|
||||
|
||||
const dht::token_range& token_range() const noexcept;
|
||||
|
||||
size_t memtable_count() const;
|
||||
size_t memtable_count() const noexcept;
|
||||
|
||||
const compaction_group_ptr& main_compaction_group() const noexcept;
|
||||
const std::vector<compaction_group_ptr>& split_ready_compaction_groups() const;
|
||||
compaction_group_ptr& select_compaction_group(locator::tablet_range_side) noexcept;
|
||||
|
||||
uint64_t live_disk_space_used() const;
|
||||
uint64_t live_disk_space_used() const noexcept;
|
||||
|
||||
void for_each_compaction_group(std::function<void(const compaction_group_ptr&)> action) const;
|
||||
utils::small_vector<compaction_group_ptr, 3> compaction_groups();
|
||||
utils::small_vector<const_compaction_group_ptr, 3> compaction_groups() const;
|
||||
void for_each_compaction_group(std::function<void(const compaction_group_ptr&)> action) const noexcept;
|
||||
utils::small_vector<compaction_group_ptr, 3> compaction_groups() noexcept;
|
||||
utils::small_vector<const_compaction_group_ptr, 3> compaction_groups() const noexcept;
|
||||
|
||||
utils::small_vector<compaction_group_ptr, 3> split_unready_groups() const;
|
||||
bool split_unready_groups_are_empty() const;
|
||||
@@ -430,7 +430,7 @@ public:
|
||||
virtual storage_group& storage_group_for_token(dht::token) const = 0;
|
||||
virtual utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const = 0;
|
||||
|
||||
virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const = 0;
|
||||
virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const noexcept = 0;
|
||||
virtual bool all_storage_groups_split() = 0;
|
||||
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
|
||||
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
|
||||
|
||||
@@ -96,9 +96,6 @@ public:
|
||||
virtual const secondary_index::secondary_index_manager& get_index_manager(data_dictionary::table t) const override {
|
||||
return const_cast<replica::table&>(unwrap(t)).get_index_manager();
|
||||
}
|
||||
virtual db_clock::time_point get_truncation_time(data_dictionary::table t) const override {
|
||||
return const_cast<replica::table&>(unwrap(t)).get_truncation_time();
|
||||
}
|
||||
virtual lw_shared_ptr<keyspace_metadata> get_keyspace_metadata(data_dictionary::keyspace ks) const override {
|
||||
return unwrap(ks).metadata();
|
||||
}
|
||||
|
||||
@@ -1133,7 +1133,7 @@ public:
|
||||
|
||||
// The tablet filter is used to not double account migrating tablets, so it's important that
|
||||
// only one of pending or leaving replica is accounted based on current migration stage.
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const;
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const noexcept;
|
||||
|
||||
const db::view::stats& get_view_stats() const {
|
||||
return _view_stats;
|
||||
|
||||
@@ -234,12 +234,18 @@ distributed_loader::get_sstables_from_upload_dir(sharded<replica::database>& db,
|
||||
}
|
||||
|
||||
future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
|
||||
distributed_loader::get_sstables_from_object_store(sharded<replica::database>& db, sstring ks, sstring cf, std::vector<sstring> sstables, sstring endpoint, sstring type, sstring bucket, sstring prefix, sstables::sstable_open_config cfg, std::function<seastar::abort_source*()> get_abort_src) {
|
||||
return get_sstables_from(db, ks, cf, cfg, [bucket, endpoint, type, prefix, sstables=std::move(sstables), &get_abort_src] (auto& global_table, auto& directory) {
|
||||
distributed_loader::get_sstables_from_object_store(sharded<replica::database>& db, sstring ks, sstring cf, std::vector<sstring> sstables, sstring endpoint, sstring bucket, sstring prefix, sstables::sstable_open_config cfg, std::function<seastar::abort_source*()> get_abort_src) {
|
||||
return get_sstables_from(db, ks, cf, cfg, [bucket, endpoint, prefix, sstables=std::move(sstables), &get_abort_src, &db] (auto& global_table, auto& directory) {
|
||||
return directory.start(global_table.as_sharded_parameter(),
|
||||
sharded_parameter([bucket, endpoint, type, prefix, &get_abort_src] {
|
||||
sharded_parameter([bucket, endpoint, prefix, &get_abort_src, &db] {
|
||||
auto eps = db.local().get_config().object_storage_endpoints()
|
||||
| std::views::filter([&endpoint](auto& ep) { return ep.key() == endpoint; })
|
||||
;
|
||||
if (eps.empty()) {
|
||||
throw std::invalid_argument(fmt::format("Undefined endpoint {}", endpoint));
|
||||
}
|
||||
seastar::abort_source* as = get_abort_src ? get_abort_src() : nullptr;
|
||||
auto opts = data_dictionary::make_object_storage_options(endpoint, type, bucket, prefix, as);
|
||||
auto opts = data_dictionary::make_object_storage_options(endpoint, eps.front().type(), bucket, prefix, as);
|
||||
return make_lw_shared<const data_dictionary::storage_options>(std::move(opts));
|
||||
}),
|
||||
sstables,
|
||||
|
||||
@@ -92,7 +92,7 @@ public:
|
||||
static future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
|
||||
get_sstables_from_upload_dir(sharded<replica::database>& db, sstring ks, sstring cf, sstables::sstable_open_config cfg);
|
||||
static future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
|
||||
get_sstables_from_object_store(sharded<replica::database>& db, sstring ks, sstring cf, std::vector<sstring> sstables, sstring endpoint, sstring type, sstring bucket, sstring prefix, sstables::sstable_open_config cfg, std::function<seastar::abort_source*()> = {});
|
||||
get_sstables_from_object_store(sharded<replica::database>& db, sstring ks, sstring cf, std::vector<sstring> sstables, sstring endpoint, sstring bucket, sstring prefix, sstables::sstable_open_config cfg, std::function<seastar::abort_source*()> = {});
|
||||
static future<> process_upload_dir(sharded<replica::database>& db, sharded<db::view::view_builder>& vb, sharded<db::view::view_building_worker>& vbw, sstring ks_name, sstring cf_name, bool skip_cleanup, bool skip_reshape);
|
||||
};
|
||||
|
||||
|
||||
@@ -215,7 +215,7 @@ private:
|
||||
output_ck_raw_values.emplace_back(bytes{});
|
||||
}
|
||||
}
|
||||
if (pos.region() != partition_region::clustered) {
|
||||
if (underlying_ck_raw_values.empty()) {
|
||||
output_ck_raw_values.push_back(bytes{});
|
||||
} else {
|
||||
output_ck_raw_values.push_back(data_value(static_cast<int8_t>(pos.get_bound_weight())).serialize_nonnull());
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
#include <seastar/coroutine/as_future.hh>
|
||||
#include <seastar/util/closeable.hh>
|
||||
#include <seastar/util/defer.hh>
|
||||
#include <seastar/json/json_elements.hh>
|
||||
|
||||
#include "dht/decorated_key.hh"
|
||||
#include "replica/database.hh"
|
||||
@@ -709,7 +708,7 @@ public:
|
||||
return *_single_sg;
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const override {
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const noexcept override {
|
||||
return locator::combined_load_stats{
|
||||
.table_ls = locator::table_load_stats{
|
||||
.size_in_bytes = _single_sg->live_disk_space_used(),
|
||||
@@ -875,7 +874,7 @@ public:
|
||||
return storage_group_for_id(storage_group_of(token).first);
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const override;
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const noexcept override;
|
||||
bool all_storage_groups_split() override;
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override;
|
||||
@@ -923,7 +922,7 @@ compaction_group_ptr& storage_group::select_compaction_group(locator::tablet_ran
|
||||
return _main_cg;
|
||||
}
|
||||
|
||||
void storage_group::for_each_compaction_group(std::function<void(const compaction_group_ptr&)> action) const {
|
||||
void storage_group::for_each_compaction_group(std::function<void(const compaction_group_ptr&)> action) const noexcept {
|
||||
action(_main_cg);
|
||||
for (auto& cg : _merging_groups) {
|
||||
action(cg);
|
||||
@@ -933,7 +932,7 @@ void storage_group::for_each_compaction_group(std::function<void(const compactio
|
||||
}
|
||||
}
|
||||
|
||||
utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups() {
|
||||
utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups() noexcept {
|
||||
utils::small_vector<compaction_group_ptr, 3> cgs;
|
||||
for_each_compaction_group([&cgs] (const compaction_group_ptr& cg) {
|
||||
cgs.push_back(cg);
|
||||
@@ -941,7 +940,7 @@ utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups()
|
||||
return cgs;
|
||||
}
|
||||
|
||||
utils::small_vector<const_compaction_group_ptr, 3> storage_group::compaction_groups() const {
|
||||
utils::small_vector<const_compaction_group_ptr, 3> storage_group::compaction_groups() const noexcept {
|
||||
utils::small_vector<const_compaction_group_ptr, 3> cgs;
|
||||
for_each_compaction_group([&cgs] (const compaction_group_ptr& cg) {
|
||||
cgs.push_back(cg);
|
||||
@@ -1891,7 +1890,7 @@ sstables::file_size_stats compaction_group::live_disk_space_used_full_stats() co
|
||||
return _main_sstables->get_file_size_stats() + _maintenance_sstables->get_file_size_stats();
|
||||
}
|
||||
|
||||
uint64_t storage_group::live_disk_space_used() const {
|
||||
uint64_t storage_group::live_disk_space_used() const noexcept {
|
||||
auto cgs = const_cast<storage_group&>(*this).compaction_groups();
|
||||
return std::ranges::fold_left(cgs | std::views::transform(std::mem_fn(&compaction_group::live_disk_space_used)), uint64_t(0), std::plus{});
|
||||
}
|
||||
@@ -2814,7 +2813,7 @@ void table::on_flush_timer() {
|
||||
});
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const noexcept {
|
||||
locator::table_load_stats table_stats;
|
||||
table_stats.split_ready_seq_number = _split_ready_seq_number;
|
||||
|
||||
@@ -2837,7 +2836,7 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std:
|
||||
};
|
||||
}
|
||||
|
||||
locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const noexcept {
|
||||
return _sg_manager->table_load_stats(std::move(tablet_filter));
|
||||
}
|
||||
|
||||
@@ -3199,35 +3198,23 @@ db::replay_position table::highest_flushed_replay_position() const {
|
||||
return _highest_flushed_rp;
|
||||
}
|
||||
|
||||
struct manifest_json : public json::json_base {
|
||||
json::json_chunked_list<sstring> files;
|
||||
|
||||
manifest_json() {
|
||||
register_params();
|
||||
}
|
||||
manifest_json(manifest_json&& e) {
|
||||
register_params();
|
||||
files = std::move(e.files);
|
||||
}
|
||||
manifest_json& operator=(manifest_json&& e) {
|
||||
files = std::move(e.files);
|
||||
return *this;
|
||||
}
|
||||
private:
|
||||
void register_params() {
|
||||
add(&files, "files");
|
||||
}
|
||||
};
|
||||
|
||||
future<>
|
||||
table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets) {
|
||||
manifest_json manifest;
|
||||
std::ostringstream ss;
|
||||
int n = 0;
|
||||
ss << "{" << std::endl << "\t\"files\" : [ ";
|
||||
for (const auto& fsp : file_sets) {
|
||||
for (auto& rf : *fsp) {
|
||||
manifest.files.push(std::move(rf));
|
||||
for (const auto& rf : *fsp) {
|
||||
if (n++ > 0) {
|
||||
ss << ", ";
|
||||
}
|
||||
ss << "\"" << rf << "\"";
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
auto streamer = json::stream_object(std::move(manifest));
|
||||
ss << " ]" << std::endl << "}" << std::endl;
|
||||
|
||||
auto json = ss.str();
|
||||
auto jsonfile = jsondir + "/manifest.json";
|
||||
|
||||
tlogger.debug("Storing manifest {}", jsonfile);
|
||||
@@ -3237,10 +3224,12 @@ table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets)
|
||||
auto out = co_await make_file_output_stream(std::move(f));
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
co_await streamer(std::move(out));
|
||||
co_await out.write(json.c_str(), json.size());
|
||||
co_await out.flush();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await out.close();
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
@@ -3464,7 +3453,7 @@ size_t compaction_group::memtable_count() const noexcept {
|
||||
return _memtables->size();
|
||||
}
|
||||
|
||||
size_t storage_group::memtable_count() const {
|
||||
size_t storage_group::memtable_count() const noexcept {
|
||||
return std::ranges::fold_left(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::memtable_count)), size_t(0), std::plus{});
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ format_match = re.compile(r'\s*(?:seastar::)?format\(\s*"([^"]+)"\s*,\s*(.*)\s*'
|
||||
def handle_error(message, strict=True, verbose_mode=False):
|
||||
if strict:
|
||||
print(f"[ERROR] {message}")
|
||||
exit(1)
|
||||
exit(-1)
|
||||
elif verbose_mode:
|
||||
print(f"[WARNING] {message}")
|
||||
|
||||
@@ -180,11 +180,12 @@ def get_metrics_from_file(file_name, prefix, metrics_information, verb=None, str
|
||||
groups = {}
|
||||
if clean_name in metrics_information:
|
||||
if (isinstance(metrics_information[clean_name], str) and metrics_information[clean_name] == "skip") or "skip" in metrics_information[clean_name]:
|
||||
return {}
|
||||
exit(0)
|
||||
param_mapping = metrics_information[clean_name]["params"] if clean_name in metrics_information and "params" in metrics_information[clean_name] else {}
|
||||
groups = metrics_information[clean_name]["groups"] if clean_name in metrics_information and "groups" in metrics_information[clean_name] else {}
|
||||
|
||||
metrics = {}
|
||||
multi_line = False
|
||||
names = undefined
|
||||
typ = undefined
|
||||
line_number = 0;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"cdc/log.cc":
|
||||
params:
|
||||
cdc_group_name: "cdc"
|
||||
cdc_group_name: cdc
|
||||
part_name;suffix: [["static_row", "total"],["clustering_row", "total"], ["map", "total"], ["set", "total"], ["list", "total"], ["udt", "total"], ["range_tombstone", "total"],["partition_delete", "total"],["row_delete", "total"], ["static_row", "failed"],["clustering_row", "failed"], ["map", "failed"], ["set", "failed"], ["list", "failed"], ["udt", "failed"], ["range_tombstone", "failed"],["partition_delete", "failed"],["row_delete", "failed"]]
|
||||
kind: ["total", "failed"]
|
||||
"db/commitlog/commitlog.cc":
|
||||
@@ -9,7 +9,7 @@
|
||||
"cfg.max_active_flushes": "cfg.max_active_flushes"
|
||||
"cql3/query_processor.cc":
|
||||
groups:
|
||||
"80": "query_processor"
|
||||
"80": query_processor
|
||||
"replica/dirty_memory_manager.cc":
|
||||
params:
|
||||
namestr: ["regular", "system"]
|
||||
@@ -19,11 +19,10 @@
|
||||
"replica/database.cc":
|
||||
params:
|
||||
"_dirty_memory_manager.throttle_threshold()": "throttle threshold"
|
||||
"seastar/apps/metrics_tester/metrics_tester.cc": "skip"
|
||||
"seastar/tests/unit/metrics_test.cc": "skip"
|
||||
"seastar/tests/unit/metrics_tester.cc": "skip"
|
||||
"seastar/tests/unit/prometheus_http_test.cc": "skip"
|
||||
"seastar/tests/unit/prometheus_text_test.cc": "skip"
|
||||
"seastar/apps/metrics_tester/metrics_tester.cc": skip
|
||||
"seastar/tests/unit/metrics_test.cc": skip
|
||||
"seastar/tests/unit/metrics_tester.cc": skip
|
||||
"seastar/tests/unit/prometheus_http_test.cc": skip
|
||||
"service/storage_proxy.cc":
|
||||
params:
|
||||
COORDINATOR_STATS_CATEGORY: "storage_proxy_coordinator"
|
||||
@@ -33,25 +32,25 @@
|
||||
_short_description_prefix: ["total_write_attempts", "write_errors", "background_replica_writes_failed", "read_repair_write_attempts"]
|
||||
_long_description_prefix: ["total number of write requests", "number of write requests that failed", "background_replica_writes_failed", "number of write operations in a read repair context"]
|
||||
_category: "storage_proxy_coordinator"
|
||||
"thrift/server.cc": "skip"
|
||||
"thrift/server.cc": skip
|
||||
"tracing/tracing.cc":
|
||||
params:
|
||||
"max_pending_trace_records + write_event_records_threshold": "max_pending_trace_records + write_event_records_threshold"
|
||||
"transport/server.cc":
|
||||
groups:
|
||||
"200": "transport"
|
||||
"200": transport
|
||||
params:
|
||||
"_config.max_request_size": "max_request_size"
|
||||
"seastar/src/net/dpdk.cc": "skip"
|
||||
"seastar/src/net/dpdk.cc": skip
|
||||
"db/hints/manager.cc":
|
||||
params:
|
||||
"group_name": ["hints_for_views_manager", "hints_manager"]
|
||||
"seastar/src/core/execution_stage.cc":
|
||||
groups:
|
||||
"100": "execution_stages"
|
||||
"100": execution_stages
|
||||
"seastar/src/core/fair_queue.cc":
|
||||
groups:
|
||||
"300": "io_queue"
|
||||
"300": io_queue
|
||||
"seastar/src/net/net.cc":
|
||||
params:
|
||||
_stats_plugin_name: ["stats_plugin_name"]
|
||||
|
||||
@@ -38,9 +38,8 @@ for required in jq curl; do
|
||||
fi
|
||||
done
|
||||
|
||||
FORCE=0
|
||||
ALLOW_SUBMODULE=0
|
||||
ALLOW_UNSTABLE=0
|
||||
ALLOW_ANY_BRANCH=0
|
||||
|
||||
function print_usage {
|
||||
cat << EOF
|
||||
@@ -61,18 +60,12 @@ Options:
|
||||
-h
|
||||
Print this help message and exit.
|
||||
|
||||
--allow-submodule
|
||||
Allow a PR to update a submudule
|
||||
|
||||
--allow-unstable
|
||||
--force
|
||||
Do not check current branch to be next*
|
||||
Do not check jenkins job status
|
||||
|
||||
--allow-any-branch
|
||||
Merge PR even if target branch is not next
|
||||
|
||||
--force
|
||||
Sets all above --allow-* options
|
||||
|
||||
--allow-submodule
|
||||
Allow a PR to update a submudule
|
||||
EOF
|
||||
}
|
||||
|
||||
@@ -80,23 +73,13 @@ while [[ $# -gt 0 ]]
|
||||
do
|
||||
case $1 in
|
||||
"--force"|"-f")
|
||||
ALLOW_UNSTABLE=1
|
||||
ALLOW_SUBMODULE=1
|
||||
ALLOW_ANY_BRANCH=1
|
||||
FORCE=1
|
||||
shift 1
|
||||
;;
|
||||
--allow-submodule)
|
||||
ALLOW_SUBMODULE=1
|
||||
shift
|
||||
;;
|
||||
--allow-unstable)
|
||||
ALLOW_UNSTABLE=1
|
||||
shift
|
||||
;;
|
||||
--allow-any-branch)
|
||||
ALLOW_ANY_BRANCH=1
|
||||
shift
|
||||
;;
|
||||
+([0-9]))
|
||||
PR_NUM=$1
|
||||
shift 1
|
||||
@@ -164,7 +147,7 @@ check_jenkins_job_status() {
|
||||
fi
|
||||
}
|
||||
|
||||
if [[ $ALLOW_UNSTABLE -eq 0 ]]; then
|
||||
if [[ $FORCE -eq 0 ]]; then
|
||||
check_jenkins_job_status
|
||||
fi
|
||||
|
||||
@@ -196,19 +179,17 @@ echo -n "Fetching full name of author $PR_LOGIN... "
|
||||
USER_NAME=$(curl -s "https://api.github.com/users/$PR_LOGIN" | jq -r .name)
|
||||
echo "$USER_NAME"
|
||||
|
||||
if [[ $ALLOW_ANY_BRANCH -eq 0 ]]; then
|
||||
BASE_BRANCH=$(jq -r .base.ref <<< $PR_DATA)
|
||||
CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||
TARGET_BASE="unknown"
|
||||
if [[ ${BASE_BRANCH} == master ]]; then
|
||||
TARGET_BASE="next"
|
||||
elif [[ ${BASE_BRANCH} == branch-* ]]; then
|
||||
TARGET_BASE=${BASE_BRANCH//branch/next}
|
||||
fi
|
||||
if [[ "${CURRENT_BRANCH}" != "${TARGET_BASE}" ]]; then
|
||||
echo "Merging into wrong next, want ${TARGET_BASE}, have ${CURRENT_BRANCH}. Use --allow-any-branch or --force to skip this check"
|
||||
exit 1
|
||||
fi
|
||||
BASE_BRANCH=$(jq -r .base.ref <<< $PR_DATA)
|
||||
CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||
TARGET_BASE="unknown"
|
||||
if [[ ${BASE_BRANCH} == master ]]; then
|
||||
TARGET_BASE="next"
|
||||
elif [[ ${BASE_BRANCH} == branch-* ]]; then
|
||||
TARGET_BASE=${BASE_BRANCH//branch/next}
|
||||
fi
|
||||
if [[ "${CURRENT_BRANCH}" != "${TARGET_BASE}" ]]; then
|
||||
echo "Merging into wrong next, want ${TARGET_BASE}, have ${CURRENT_BRANCH}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
git fetch "$REMOTE" pull/$PR_NUM/head
|
||||
|
||||
@@ -818,9 +818,6 @@ class std_list:
|
||||
self._node = node_header['_M_next']
|
||||
self._end = node_header['_M_next']['_M_prev']
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
if self._node == self._end:
|
||||
raise StopIteration()
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "seastar/core/scheduling.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include <unordered_set>
|
||||
|
||||
@@ -18,7 +17,6 @@
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/util/defer.hh>
|
||||
#include <seastar/coroutine/switch_to.hh>
|
||||
|
||||
#include "utils/log.hh"
|
||||
|
||||
@@ -120,7 +118,7 @@ struct failure_detector::impl {
|
||||
|
||||
// Fetches endpoint updates from _endpoint_queue and performs the add/remove operation.
|
||||
// Runs on shard 0 only.
|
||||
future<> update_endpoint_fiber(seastar::scheduling_group sg);
|
||||
future<> update_endpoint_fiber();
|
||||
future<> _update_endpoint_fiber = make_ready_future<>();
|
||||
|
||||
// Workers running on this shard.
|
||||
@@ -142,7 +140,7 @@ struct failure_detector::impl {
|
||||
// The unregistering process requires cross-shard operations which we perform on this fiber.
|
||||
future<> _destroy_subscriptions = make_ready_future<>();
|
||||
|
||||
impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period, clock::interval_t ping_timeout, seastar::scheduling_group sg);
|
||||
impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period, clock::interval_t ping_timeout);
|
||||
~impl();
|
||||
|
||||
// Inform update_endpoint_fiber() about an added/removed endpoint.
|
||||
@@ -179,19 +177,19 @@ struct failure_detector::impl {
|
||||
};
|
||||
|
||||
failure_detector::failure_detector(
|
||||
pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout, seastar::scheduling_group sg)
|
||||
: _impl(std::make_unique<impl>(*this, pinger, clock, ping_period, ping_timeout, sg))
|
||||
pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
|
||||
: _impl(std::make_unique<impl>(*this, pinger, clock, ping_period, ping_timeout))
|
||||
{}
|
||||
|
||||
failure_detector::impl::impl(
|
||||
failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout, seastar::scheduling_group sg)
|
||||
failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
|
||||
: _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period), _ping_timeout(ping_timeout) {
|
||||
if (this_shard_id() != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
_num_workers.resize(smp::count, 0);
|
||||
_update_endpoint_fiber = update_endpoint_fiber(sg);
|
||||
_update_endpoint_fiber = update_endpoint_fiber();
|
||||
}
|
||||
|
||||
void failure_detector::impl::send_update_endpoint(pinger::endpoint_id ep, endpoint_update update) {
|
||||
@@ -207,9 +205,9 @@ void failure_detector::impl::send_update_endpoint(pinger::endpoint_id ep, endpoi
|
||||
_endpoint_changed.signal();
|
||||
}
|
||||
|
||||
future<> failure_detector::impl::update_endpoint_fiber(seastar::scheduling_group sg) {
|
||||
future<> failure_detector::impl::update_endpoint_fiber() {
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
co_await coroutine::switch_to(sg);
|
||||
|
||||
while (true) {
|
||||
co_await _endpoint_changed.wait([this] { return !_endpoint_updates.empty(); });
|
||||
|
||||
@@ -482,7 +480,7 @@ static future<bool> ping_with_timeout(pinger::endpoint_id id, clock::timepoint_t
|
||||
}
|
||||
});
|
||||
|
||||
auto f = pinger.ping(id, timeout, timeout_as, c);
|
||||
auto f = pinger.ping(id, timeout_as);
|
||||
auto sleep_and_abort = [] (clock::timepoint_t timeout, abort_source& timeout_as, clock& c) -> future<> {
|
||||
co_await c.sleep_until(timeout, timeout_as).then_wrapped([&timeout_as] (auto&& f) {
|
||||
// Avoid throwing if sleep was aborted.
|
||||
|
||||
@@ -19,6 +19,26 @@ class abort_source;
|
||||
|
||||
namespace direct_failure_detector {
|
||||
|
||||
class pinger {
|
||||
public:
|
||||
// Opaque endpoint ID.
|
||||
// A specific implementation of `pinger` maps those IDs to 'real' addresses.
|
||||
using endpoint_id = utils::UUID;
|
||||
|
||||
// Send a message to `ep` and wait until it responds.
|
||||
// The wait can be aborted using `as`.
|
||||
// Abort should be signalized with `abort_requested_exception`.
|
||||
//
|
||||
// If the ping fails in an expected way (e.g. the endpoint is down and refuses to connect),
|
||||
// returns `false`. If it succeeds, returns `true`.
|
||||
virtual future<bool> ping(endpoint_id ep, abort_source& as) = 0;
|
||||
|
||||
protected:
|
||||
// The `pinger` object must not be destroyed through the `pinger` interface.
|
||||
// `failure_detector` does not take ownership of `pinger`, only a non-owning reference.
|
||||
~pinger() = default;
|
||||
};
|
||||
|
||||
// A clock that uses abstract units to measure time.
|
||||
// The implementation is responsible for periodically advancing the clock.
|
||||
//
|
||||
@@ -40,33 +60,12 @@ public:
|
||||
// Aborts should be signalized using `seastar::sleep_aborted`.
|
||||
virtual future<> sleep_until(timepoint_t tp, abort_source& as) = 0;
|
||||
|
||||
virtual std::chrono::milliseconds to_milliseconds(timepoint_t tp) const = 0;
|
||||
protected:
|
||||
// The `clock` object must not be destroyed through the `clock` interface.
|
||||
// `failure_detector` does not take ownership of `clock`, only a non-owning reference.
|
||||
~clock() = default;
|
||||
};
|
||||
|
||||
class pinger {
|
||||
public:
|
||||
// Opaque endpoint ID.
|
||||
// A specific implementation of `pinger` maps those IDs to 'real' addresses.
|
||||
using endpoint_id = utils::UUID;
|
||||
|
||||
// Send a message to `ep` and wait until it responds.
|
||||
// The wait can be aborted using `as`.
|
||||
// Abort should be signalized with `abort_requested_exception`.
|
||||
//
|
||||
// If the ping fails in an expected way (e.g. the endpoint is down and refuses to connect),
|
||||
// returns `false`. If it succeeds, returns `true`.
|
||||
virtual future<bool> ping(endpoint_id ep, clock::timepoint_t timeout, abort_source& as, clock& c) = 0;
|
||||
|
||||
protected:
|
||||
// The `pinger` object must not be destroyed through the `pinger` interface.
|
||||
// `failure_detector` does not take ownership of `pinger`, only a non-owning reference.
|
||||
~pinger() = default;
|
||||
};
|
||||
|
||||
class listener {
|
||||
public:
|
||||
// Called when an endpoint in the detected set (added by `failure_detector::add_endpoint`) responds to a ping
|
||||
@@ -128,10 +127,7 @@ public:
|
||||
|
||||
// Duration after which a ping is aborted, so that next ping can be started
|
||||
// (pings are sent sequentially).
|
||||
clock::interval_t ping_timeout,
|
||||
|
||||
// Scheduling group used for fibers inside the failure detector.
|
||||
seastar::scheduling_group sg
|
||||
clock::interval_t ping_timeout
|
||||
);
|
||||
|
||||
~failure_detector();
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
#include "utils/error_injection.hh"
|
||||
#include "seastar/core/shared_future.hh"
|
||||
|
||||
#include <chrono>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/core/when_all.hh>
|
||||
#include <seastar/core/sleep.hh>
|
||||
@@ -203,11 +202,8 @@ void raft_group_registry::init_rpc_verbs() {
|
||||
});
|
||||
|
||||
ser::raft_rpc_verbs::register_direct_fd_ping(&_ms,
|
||||
[this] (const rpc::client_info&, rpc::opt_time_point timeout, raft::server_id dst) -> future<direct_fd_ping_reply> {
|
||||
|
||||
if (timeout && *timeout <= netw::messaging_service::clock_type::now()) {
|
||||
throw timed_out_error{};
|
||||
}
|
||||
[this] (const rpc::client_info&, raft::server_id dst) -> future<direct_fd_ping_reply> {
|
||||
// XXX: update address map here as well?
|
||||
|
||||
if (_my_id != dst) {
|
||||
return make_ready_future<direct_fd_ping_reply>(direct_fd_ping_reply {
|
||||
@@ -217,10 +213,19 @@ void raft_group_registry::init_rpc_verbs() {
|
||||
});
|
||||
}
|
||||
|
||||
return make_ready_future<direct_fd_ping_reply>(direct_fd_ping_reply {
|
||||
.result = service::group_liveness_info{
|
||||
.group0_alive = _group0_is_alive,
|
||||
return container().invoke_on(0, [] (raft_group_registry& me) -> future<direct_fd_ping_reply> {
|
||||
bool group0_alive = false;
|
||||
if (me._group0_id) {
|
||||
auto* group0_server = me.find_server(*me._group0_id);
|
||||
if (group0_server && group0_server->is_alive()) {
|
||||
group0_alive = true;
|
||||
}
|
||||
}
|
||||
co_return direct_fd_ping_reply {
|
||||
.result = service::group_liveness_info{
|
||||
.group0_alive = group0_alive,
|
||||
}
|
||||
};
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -375,12 +380,6 @@ future<> raft_group_registry::start_server_for_group(raft_server_for_group new_g
|
||||
co_await server.abort();
|
||||
std::rethrow_exception(ex);
|
||||
}
|
||||
|
||||
if (gid == _group0_id) {
|
||||
co_await container().invoke_on_all([] (raft_group_registry& rg) {
|
||||
rg._group0_is_alive = true;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
future<> raft_group_registry::abort_server(raft::group_id gid, sstring reason) {
|
||||
@@ -390,18 +389,14 @@ future<> raft_group_registry::abort_server(raft::group_id gid, sstring reason) {
|
||||
if (const auto it = _servers.find(gid); it != _servers.end()) {
|
||||
auto& [gid, s] = *it;
|
||||
if (!s.aborted) {
|
||||
if (gid == _group0_id) {
|
||||
co_await container().invoke_on_all([] (raft_group_registry& rg) {
|
||||
rg._group0_is_alive = false;
|
||||
});
|
||||
}
|
||||
s.aborted = s.server->abort(std::move(reason))
|
||||
.handle_exception([gid] (std::exception_ptr ex) {
|
||||
rslog.warn("Failed to abort raft group server {}: {}", gid, ex);
|
||||
});
|
||||
}
|
||||
co_await s.aborted->get_future();
|
||||
return s.aborted->get_future();
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
unsigned raft_group_registry::shard_for_group(const raft::group_id& gid) const {
|
||||
@@ -522,13 +517,11 @@ future<> raft_server_with_timeouts::read_barrier(seastar::abort_source* as, std:
|
||||
}, "read_barrier", as, timeout);
|
||||
}
|
||||
|
||||
future<bool> direct_fd_pinger::ping(direct_failure_detector::pinger::endpoint_id id, direct_failure_detector::clock::timepoint_t timeout, abort_source& as, direct_failure_detector::clock& c) {
|
||||
future<bool> direct_fd_pinger::ping(direct_failure_detector::pinger::endpoint_id id, abort_source& as) {
|
||||
auto dst_id = raft::server_id{id};
|
||||
|
||||
try {
|
||||
std::chrono::milliseconds timeout_ms = c.to_milliseconds(timeout);
|
||||
netw::messaging_service::clock_type::time_point deadline = netw::messaging_service::clock_type::now() + timeout_ms;
|
||||
auto reply = co_await ser::raft_rpc_verbs::send_direct_fd_ping(&_ms, locator::host_id{id}, deadline, as, dst_id);
|
||||
auto reply = co_await ser::raft_rpc_verbs::send_direct_fd_ping(&_ms, locator::host_id{id}, as, dst_id);
|
||||
if (auto* wrong_dst = std::get_if<wrong_destination>(&reply.result)) {
|
||||
// FIXME: after moving to host_id based verbs we will not get `wrong_destination`
|
||||
// any more since the connection will fail
|
||||
@@ -561,11 +554,4 @@ future<> direct_fd_clock::sleep_until(direct_failure_detector::clock::timepoint_
|
||||
return sleep_abortable(t - n, as);
|
||||
}
|
||||
|
||||
std::chrono::milliseconds direct_fd_clock::to_milliseconds(direct_failure_detector::clock::timepoint_t tp) const {
|
||||
auto t = base::time_point{base::duration{tp}};
|
||||
auto n = base::now();
|
||||
return std::chrono::duration_cast<std::chrono::milliseconds>(t - n);
|
||||
}
|
||||
|
||||
|
||||
} // end of namespace service
|
||||
|
||||
@@ -127,7 +127,6 @@ private:
|
||||
// My Raft ID. Shared between different Raft groups.
|
||||
raft::server_id _my_id;
|
||||
|
||||
bool _group0_is_alive = false;
|
||||
public:
|
||||
raft_group_registry(raft::server_id my_id, netw::messaging_service& ms,
|
||||
direct_failure_detector::failure_detector& fd);
|
||||
@@ -182,9 +181,6 @@ public:
|
||||
unsigned shard_for_group(const raft::group_id& gid) const;
|
||||
shared_ptr<raft::failure_detector> failure_detector();
|
||||
direct_failure_detector::failure_detector& direct_fd() { return _direct_fd; }
|
||||
bool is_group0_alive() const {
|
||||
return _group0_is_alive;
|
||||
}
|
||||
};
|
||||
|
||||
// Implementation of `direct_failure_detector::pinger` which uses DIRECT_FD_PING verb for pinging.
|
||||
@@ -202,7 +198,7 @@ public:
|
||||
direct_fd_pinger(const direct_fd_pinger&) = delete;
|
||||
direct_fd_pinger(direct_fd_pinger&&) = delete;
|
||||
|
||||
future<bool> ping(direct_failure_detector::pinger::endpoint_id id, direct_failure_detector::clock::timepoint_t timeout, abort_source& as, direct_failure_detector::clock& c) override;
|
||||
future<bool> ping(direct_failure_detector::pinger::endpoint_id id, abort_source& as) override;
|
||||
};
|
||||
|
||||
// XXX: find a better place to put this?
|
||||
@@ -211,7 +207,6 @@ struct direct_fd_clock : public direct_failure_detector::clock {
|
||||
|
||||
direct_failure_detector::clock::timepoint_t now() noexcept override;
|
||||
future<> sleep_until(direct_failure_detector::clock::timepoint_t tp, abort_source& as) override;
|
||||
std::chrono::milliseconds to_milliseconds(direct_failure_detector::clock::timepoint_t tp) const override;
|
||||
};
|
||||
|
||||
} // end of namespace service
|
||||
|
||||
@@ -36,7 +36,6 @@
|
||||
#include <seastar/core/future-util.hh>
|
||||
#include "db/read_repair_decision.hh"
|
||||
#include "db/config.hh"
|
||||
#include "db/batchlog.hh"
|
||||
#include "db/batchlog_manager.hh"
|
||||
#include "db/hints/manager.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
@@ -4282,13 +4281,12 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
|
||||
coordinator_mutate_options _options;
|
||||
|
||||
const utils::UUID _batch_uuid;
|
||||
const db_clock::time_point _batch_write_time;
|
||||
const host_id_vector_replica_set _batchlog_endpoints;
|
||||
|
||||
public:
|
||||
context(storage_proxy & p, utils::chunked_vector<mutation>&& mutations, lw_shared_ptr<cdc::operation_result_tracker>&& cdc_tracker, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, coordinator_mutate_options options)
|
||||
: _p(p)
|
||||
, _schema(_p.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2))
|
||||
, _schema(_p.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG))
|
||||
, _ermp(_p.local_db().find_column_family(_schema->id()).get_effective_replication_map())
|
||||
, _mutations(std::move(mutations))
|
||||
, _cdc_tracker(std::move(cdc_tracker))
|
||||
@@ -4299,7 +4297,6 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
|
||||
, _permit(std::move(permit))
|
||||
, _options(std::move(options))
|
||||
, _batch_uuid(utils::UUID_gen::get_time_UUID())
|
||||
, _batch_write_time(db_clock::now())
|
||||
, _batchlog_endpoints(
|
||||
[this]() -> host_id_vector_replica_set {
|
||||
auto local_addr = _p.my_host_id(*_ermp);
|
||||
@@ -4337,14 +4334,17 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
|
||||
}));
|
||||
}
|
||||
future<result<>> sync_write_to_batchlog() {
|
||||
auto m = db::get_batchlog_mutation_for(_schema, _mutations, netw::messaging_service::current_version, _batch_write_time, _batch_uuid);
|
||||
auto m = _p.do_get_batchlog_mutation_for(_schema, _mutations, _batch_uuid, netw::messaging_service::current_version, db_clock::now());
|
||||
tracing::trace(_trace_state, "Sending a batchlog write mutation");
|
||||
return send_batchlog_mutation(std::move(m));
|
||||
};
|
||||
future<> async_remove_from_batchlog() {
|
||||
// delete batch
|
||||
utils::get_local_injector().inject("storage_proxy_fail_remove_from_batchlog", [] { throw std::runtime_error("Error injection: failing remove from batchlog"); });
|
||||
auto m = db::get_batchlog_delete_mutation(_schema, netw::messaging_service::current_version, _batch_write_time, _batch_uuid);
|
||||
auto key = partition_key::from_exploded(*_schema, {uuid_type->decompose(_batch_uuid)});
|
||||
auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
|
||||
mutation m(_schema, key);
|
||||
m.partition().apply_delete(*_schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
|
||||
|
||||
tracing::trace(_trace_state, "Sending a batchlog remove mutation");
|
||||
return send_batchlog_mutation(std::move(m), db::consistency_level::ANY).then_wrapped([] (future<result<>> f) {
|
||||
@@ -4363,7 +4363,6 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
|
||||
return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state, _permit, db::allow_per_partition_rate_limit::no, _options).then(utils::result_wrap([this] (unique_response_handler_vector ids) {
|
||||
return sync_write_to_batchlog().then(utils::result_wrap([this, ids = std::move(ids)] () mutable {
|
||||
tracing::trace(_trace_state, "Sending batch mutations");
|
||||
utils::get_local_injector().inject("storage_proxy_fail_send_batch", [] { throw std::runtime_error("Error injection: failing to send batch"); });
|
||||
_p.register_cdc_operation_result_tracker(ids, _cdc_tracker);
|
||||
return _p.mutate_begin(std::move(ids), _cl, _trace_state, _timeout);
|
||||
})).then(utils::result_wrap([this] {
|
||||
@@ -4399,6 +4398,33 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
|
||||
}).then_wrapped(std::move(cleanup));
|
||||
}
|
||||
|
||||
mutation storage_proxy::get_batchlog_mutation_for(const utils::chunked_vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now) {
|
||||
auto schema = local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG);
|
||||
return do_get_batchlog_mutation_for(std::move(schema), mutations, id, version, now);
|
||||
}
|
||||
|
||||
mutation storage_proxy::do_get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now) {
|
||||
auto key = partition_key::from_singular(*schema, id);
|
||||
auto timestamp = api::new_timestamp();
|
||||
auto data = [&mutations] {
|
||||
utils::chunked_vector<canonical_mutation> fm(mutations.begin(), mutations.end());
|
||||
bytes_ostream out;
|
||||
for (auto& m : fm) {
|
||||
ser::serialize(out, m);
|
||||
}
|
||||
return std::move(out).to_managed_bytes();
|
||||
}();
|
||||
|
||||
mutation m(schema, key);
|
||||
m.set_cell(clustering_key_prefix::make_empty(), to_bytes("version"), version, timestamp);
|
||||
m.set_cell(clustering_key_prefix::make_empty(), to_bytes("written_at"), now, timestamp);
|
||||
// Avoid going through data_value and therefore `bytes`, as it can be large (#24809).
|
||||
auto cdef_data = schema->get_column_definition(to_bytes("data"));
|
||||
m.set_cell(clustering_key_prefix::make_empty(), *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));
|
||||
|
||||
return m;
|
||||
}
|
||||
|
||||
template<typename Range>
|
||||
bool storage_proxy::cannot_hint(const Range& targets, db::write_type type) const {
|
||||
// if hints are disabled we "can always hint" since there's going to be no hint generated in this case
|
||||
@@ -4502,14 +4528,14 @@ future<> storage_proxy::send_hint_to_all_replicas(frozen_mutation_and_schema fm_
|
||||
}
|
||||
|
||||
future<> storage_proxy::send_batchlog_replay_to_all_replicas(utils::chunked_vector<mutation> mutations, clock_type::time_point timeout) {
|
||||
utils::get_local_injector().inject("storage_proxy_fail_replay_batch", [] { throw std::runtime_error("Error injection: failing to send batch"); });
|
||||
if (utils::get_local_injector().is_enabled("batch_replay_throw")) {
|
||||
throw std::runtime_error("Skipping batch replay due to batch_replay_throw injection");
|
||||
}
|
||||
|
||||
utils::chunked_vector<batchlog_replay_mutation> ms = mutations | std::views::transform([] (auto&& m) {
|
||||
return batchlog_replay_mutation(std::move(m));
|
||||
}) | std::ranges::to<utils::chunked_vector<batchlog_replay_mutation>>();
|
||||
|
||||
utils::get_local_injector().inject("storage_proxy_fail_replay_batch", [] { throw std::runtime_error("Error injection: failing to send batch"); });
|
||||
|
||||
return mutate_internal(std::move(ms), db::consistency_level::EACH_QUORUM, nullptr, empty_service_permit(), timeout, db::write_type::BATCH)
|
||||
.then(utils::result_into_future<result<>>);
|
||||
}
|
||||
@@ -6662,11 +6688,10 @@ storage_proxy::do_query_with_paxos(schema_ptr s,
|
||||
}
|
||||
};
|
||||
|
||||
auto request = std::make_unique<read_cas_request>();
|
||||
auto* request_ptr = request.get();
|
||||
auto request = seastar::make_shared<read_cas_request>();
|
||||
|
||||
return cas(std::move(s), std::move(cas_shard), *request_ptr, cmd, std::move(partition_ranges), std::move(query_options),
|
||||
cl, db::consistency_level::ANY, timeout, cas_timeout, false).then([request = std::move(request)] (bool is_applied) mutable {
|
||||
return cas(std::move(s), std::move(cas_shard), request, cmd, std::move(partition_ranges), std::move(query_options),
|
||||
cl, db::consistency_level::ANY, timeout, cas_timeout, false).then([request] (bool is_applied) mutable {
|
||||
return make_ready_future<coordinator_query_result>(std::move(request->res));
|
||||
});
|
||||
}
|
||||
@@ -6729,13 +6754,11 @@ static mutation_write_failure_exception read_failure_to_write(read_failure_excep
|
||||
* NOTE: `cmd` argument can be nullptr, in which case it's guaranteed that this function would not perform
|
||||
* any reads of committed values (in case user of the function is not interested in them).
|
||||
*
|
||||
* NOTE: The `request` object must be guaranteed to be alive until the returned future is resolved.
|
||||
*
|
||||
* WARNING: the function must be called on a shard that owns the key cas() operates on.
|
||||
* The cas_shard must be created *before* selecting the shard, to protect against
|
||||
* concurrent tablet migrations.
|
||||
*/
|
||||
future<bool> storage_proxy::cas(schema_ptr schema, cas_shard cas_shard, cas_request& request, lw_shared_ptr<query::read_command> cmd,
|
||||
future<bool> storage_proxy::cas(schema_ptr schema, cas_shard cas_shard, shared_ptr<cas_request> request, lw_shared_ptr<query::read_command> cmd,
|
||||
dht::partition_range_vector partition_ranges, storage_proxy::coordinator_query_options query_options,
|
||||
db::consistency_level cl_for_paxos, db::consistency_level cl_for_learn,
|
||||
clock_type::time_point write_timeout, clock_type::time_point cas_timeout, bool write, cdc::per_request_options cdc_opts) {
|
||||
@@ -6836,7 +6859,7 @@ future<bool> storage_proxy::cas(schema_ptr schema, cas_shard cas_shard, cas_requ
|
||||
qr = std::move(cqr.query_result);
|
||||
}
|
||||
|
||||
auto mutation = request.apply(std::move(qr), cmd->slice, utils::UUID_gen::micros_timestamp(ballot), cdc_opts);
|
||||
auto mutation = request->apply(std::move(qr), cmd->slice, utils::UUID_gen::micros_timestamp(ballot), cdc_opts);
|
||||
condition_met = true;
|
||||
if (!mutation) {
|
||||
if (write) {
|
||||
|
||||
@@ -683,6 +683,7 @@ private:
|
||||
fencing_token caller_token, locator::host_id caller_id,
|
||||
Func&& write_func);
|
||||
|
||||
mutation do_get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now);
|
||||
future<> drain_on_shutdown();
|
||||
public:
|
||||
void update_fence_version(locator::token_metadata::version_t fence_version);
|
||||
@@ -828,11 +829,13 @@ public:
|
||||
clock_type::time_point timeout,
|
||||
tracing::trace_state_ptr trace_state = nullptr);
|
||||
|
||||
future<bool> cas(schema_ptr schema, cas_shard cas_shard, cas_request& request, lw_shared_ptr<query::read_command> cmd,
|
||||
future<bool> cas(schema_ptr schema, cas_shard cas_shard, shared_ptr<cas_request> request, lw_shared_ptr<query::read_command> cmd,
|
||||
dht::partition_range_vector partition_ranges, coordinator_query_options query_options,
|
||||
db::consistency_level cl_for_paxos, db::consistency_level cl_for_learn,
|
||||
clock_type::time_point write_timeout, clock_type::time_point cas_timeout, bool write = true, cdc::per_request_options cdc_opts = {});
|
||||
|
||||
mutation get_batchlog_mutation_for(const utils::chunked_vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now);
|
||||
|
||||
future<> stop();
|
||||
future<> start_hints_manager();
|
||||
void allow_replaying_hints() noexcept;
|
||||
|
||||
@@ -7531,7 +7531,7 @@ future<join_node_response_result> storage_service::join_node_response_handler(jo
|
||||
&& _join_node_response_done.failed()) {
|
||||
// The topology coordinator accepted the node that was rejected before or failed while handling
|
||||
// the response. Inform the coordinator about it so it moves the node to the left state.
|
||||
co_await coroutine::return_exception_ptr(_join_node_response_done.get_shared_future().get_exception());
|
||||
throw _join_node_response_done.get_shared_future().get_exception();
|
||||
}
|
||||
|
||||
co_return join_node_response_result{};
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user