Compare commits
199 Commits
copilot/ad...copilot/fi
| SHA1 |
|---|
| 3f31d8a5b8 |
| f9d6e36f02 |
| f40dd06156 |
| 9213a163cb |
| d9593732b1 |
| 48cf84064c |
| a12165761e |
| 7dc04b033c |
| 654ac9099b |
| ff1b212319 |
| 4e7ec9333f |
| 357f91de52 |
| b5c85d08bb |
| 3aaab5d5a3 |
| 605f71d074 |
| 086c6992f5 |
| b6afacfc1e |
| 0ed3452721 |
| c3c0991428 |
| 563e5ddd62 |
| 796205678f |
| 902d70d6b2 |
| ce2a403f18 |
| d4be9a058c |
| 44c605e59c |
| da5cc13e97 |
| b54a9f4613 |
| d35ce81ff1 |
| b76af2d07f |
| 48a28c24c5 |
| 59c87025d1 |
| 1d5f60baac |
| 37e3dacf33 |
| 97b7c03709 |
| 54edb44b20 |
| c85671ce51 |
| 9b968dc72c |
| e366030a92 |
| 32afcdbaf0 |
| d6c14de380 |
| ab4896dc70 |
| 287c9eea65 |
| 4160ae94c1 |
| cc273e867d |
| 68c7236acb |
| f4555be8a5 |
| 943350fd35 |
| 9cde93e3da |
| 86cd0a4dce |
| 9bbdd487b4 |
| 2138ab6b0e |
| 90a6aa8057 |
| 384bffb8da |
| 1f6e3301e7 |
| 765a7e9868 |
| 3c376d1b64 |
| 584f4e467e |
| 4c7c5f4af7 |
| 5dcdaa6f66 |
| ff5c7bd960 |
| 64a65cac55 |
| 9f97c376f1 |
| fe9581f54c |
| fb8cbf1615 |
| 24d69b4005 |
| eb04af5020 |
| b911a643fd |
| 1263e1de54 |
| bcd1758911 |
| 868ac42a8b |
| 005807ebb8 |
| 273f664496 |
| b2c2a99741 |
| ca62effdd2 |
| 9f10aebc66 |
| 3702e982b9 |
| e569a04785 |
| 39cec4ae45 |
| 9cb766f929 |
| 468b800e89 |
| f2b0489d8c |
| 853811be90 |
| d4b77c422f |
| 13eca61d41 |
| 724dc1e582 |
| 5541f75405 |
| 08974e1d50 |
| 6d853c8f11 |
| eb5e9f728c |
| d6ef5967ef |
| 19a7d8e248 |
| 296d7b8595 |
| 76174d1f7a |
| 85db7b1caf |
| b0643f8959 |
| e8b0f8faa9 |
| 58456455e3 |
| c40b3ba4b3 |
| 39711920eb |
| e96863be0c |
| cede4f66af |
| 38a1b1032a |
| dab74471cc |
| 3003669c96 |
| 77dcad9484 |
| c8d2f89de7 |
| 18e1dbd42e |
| c32e9e1b54 |
| da51a30780 |
| 73090c0d27 |
| 38e14d9cd5 |
| 5c6813ccd0 |
| 6f79fcf4d5 |
| 939fcc0603 |
| d589e68642 |
| 64a075533b |
| 3c4546d839 |
| 66bd3dc22c |
| 4488a4fb06 |
| 825d81dde2 |
| 0cc5208f8e |
| f89bb68fe2 |
| 03408b185e |
| ce8db6e19e |
| 3f11a5ed8c |
| 22f22d183f |
| d51b1fea94 |
| 3cf1225ae6 |
| 74ecedfb5c |
| a0734b8605 |
| 45ad93a52c |
| 2ca926f669 |
| ad3cf2c174 |
| 5d761373c2 |
| e5fbe3d217 |
| a9cf7d08da |
| a084094c18 |
| 104de44a8d |
| 1cabc8d9b0 |
| dc7944ce5c |
| 6ee0f1f3a7 |
| 9563d87f74 |
| 366ecef1b9 |
| 8ed36702ae |
| 53b71018e8 |
| 0d68512b1f |
| fd81333181 |
| dedc8bdf71 |
| f83c4ffc68 |
| 4a85ea8eb2 |
| ed8d127457 |
| d8e299dbb2 |
| 05b9cafb57 |
| 7b9428d8d7 |
| 11f6a25d44 |
| 4d5f7a57ea |
| 64e099f03b |
| 656ce27e7f |
| 5b78e1cebe |
| 65b364d94a |
| c049992a93 |
| 35fd603acd |
| 39bfad48cc |
| 0602afc085 |
| 10975bf65c |
| 8bf09ac6f7 |
| 991c0f6e6d |
| 76b21d7a5a |
| 3856c9d376 |
| 5a0fddc9ee |
| 9cb776dee8 |
| d55044b696 |
| 2ec3303edd |
| 0fede18447 |
| 675eb3be98 |
| c853197281 |
| 9868341c73 |
| e6dee8aab5 |
| 78ab31118e |
| cb1679d299 |
| 604e5b6727 |
| 8f9f92728e |
| 88bb203c9c |
| 1f6918be3f |
| 79d0f93693 |
| 218916e7c2 |
| 004ba32fa5 |
| 1895d85ed2 |
| 346e0f64e2 |
| 1cfce430f1 |
| 0398bc0056 |
| 66ac66178b |
| a32e8091a9 |
| 8c2f60f111 |
| 4f6aeb7b6b |
| ffdc8d49c7 |
| e3fde8087a |
| c922256616 |
| b18b052d26 |
.github/copilot-instructions.md (86 lines added, vendored, new file)

@@ -0,0 +1,86 @@
# ScyllaDB Development Instructions

## Project Context

High-performance distributed NoSQL database. Core values: performance, correctness, readability.

## Build System

### Modern Build (configure.py + ninja)

```bash
# Configure (run once per mode, or when switching modes)
./configure.py --mode=<mode>  # mode: dev, debug, release, sanitize

# Build everything
ninja <mode>-build  # e.g., ninja dev-build

# Build Scylla binary only (sufficient for Python integration tests)
ninja build/<mode>/scylla

# Build specific test
ninja build/<mode>/test/boost/<test_name>
```
## Running Tests

### C++ Unit Tests

```bash
# Run all tests in a file
./test.py --mode=<mode> test/<suite>/<test_name>.cc

# Run a single test case from a file
./test.py --mode=<mode> test/<suite>/<test_name>.cc::<test_case_name>

# Examples
./test.py --mode=dev test/boost/memtable_test.cc
./test.py --mode=dev test/raft/raft_server_test.cc::test_check_abort_on_client_api
```
**Important:**

- Use full path with `.cc` extension (e.g., `test/boost/test_name.cc`, not `boost/test_name`)
- To run a single test case, append `::<test_case_name>` to the file path
- If you encounter permission issues with cgroup metric gathering, add `--no-gather-metrics` flag

**Rebuilding Tests:**

- test.py does NOT automatically rebuild when test source files are modified
- Many tests are part of composite binaries (e.g., `combined_tests` in test/boost contains multiple test files)
- To find which binary contains a test, check `configure.py` in the repository root (primary source) or `test/<suite>/CMakeLists.txt`
- To rebuild a specific test binary: `ninja build/<mode>/test/<suite>/<binary_name>`
- Examples:
  - `ninja build/dev/test/boost/combined_tests` (contains group0_voter_calculator_test.cc and others)
  - `ninja build/dev/test/raft/replication_test` (standalone Raft test)
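One quick way to do that lookup is to search the build definitions for the test's source name and then rebuild exactly the owning binary; the test below is just the example from the list above:

```bash
# Find which binary a test source belongs to, then rebuild only that binary
grep -n "group0_voter_calculator_test" configure.py test/boost/CMakeLists.txt
ninja build/dev/test/boost/combined_tests
```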
### Python Integration Tests

```bash
# Only requires Scylla binary (full build usually not needed)
ninja build/<mode>/scylla

# Run all tests in a file
./test.py --mode=<mode> <test_path>

# Run a single test case from a file
./test.py --mode=<mode> <test_path>::<test_function_name>

# Examples
./test.py --mode=dev alternator/
./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator

# Optional flags
./test.py --mode=dev cluster/test_raft_no_quorum -v          # Verbose output
./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5  # Repeat test 5 times
```
**Important:**

- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
- To run a single test case, append `::<test_function_name>` to the file path
- Add `-v` for verbose output
- Add `--repeat <num>` to repeat a test multiple times
- After modifying C++ source files, only rebuild the Scylla binary for Python tests - building the entire repository is unnecessary
## Code Philosophy

- Performance matters in hot paths (data read/write, inner loops)
- Self-documenting code through clear naming
- Comments explain "why", not "what"
- Prefer standard library over custom implementations
- Strive for simplicity and clarity, add complexity only when clearly justified
- Question requests: don't blindly implement requests - evaluate trade-offs, identify issues, and suggest better alternatives when appropriate
- Consider different approaches, weigh pros and cons, and recommend the best fit for the specific context
.github/instructions/cpp.instructions.md (115 lines added, vendored, new file)

@@ -0,0 +1,115 @@
---
applyTo: "**/*.{cc,hh}"
---

# C++ Guidelines

**Important:** Always match the style and conventions of existing code in the file and directory.
## Memory Management

- Prefer stack allocation whenever possible
- Use `std::unique_ptr` by default for dynamic allocations
- `new`/`delete` are forbidden (use RAII)
- Use `seastar::lw_shared_ptr` or `seastar::shared_ptr` for shared ownership within the same shard
- Use `seastar::foreign_ptr` for cross-shard sharing
- Avoid `std::shared_ptr` except when interfacing with external C++ APIs
- Avoid raw pointers except for non-owning references or C API interop
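A minimal sketch of the ownership choices above, assuming Seastar's `shared_ptr.hh` and `sharded.hh` headers; the `config` struct is hypothetical:

```cpp
#include <memory>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/sharded.hh>

struct config { int shards; };  // hypothetical payload

// Default for dynamic allocation: unique ownership, never raw new/delete.
std::unique_ptr<config> make_owned_config() {
    return std::make_unique<config>(config{1});
}

// Shared within one shard only: lw_shared_ptr avoids atomic refcounting.
seastar::lw_shared_ptr<config> make_shard_local_config() {
    return seastar::make_lw_shared<config>(config{4});
}

// Wrap before handing a shard-local object to another shard; the wrapper
// routes destruction back to the owning shard.
seastar::foreign_ptr<seastar::lw_shared_ptr<config>> share_across_shards(
        seastar::lw_shared_ptr<config> c) {
    return seastar::make_foreign(std::move(c));
}
```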
## Seastar Asynchronous Programming

- Use `seastar::future<T>` for all async operations
- Prefer coroutines (`co_await`, `co_return`) over `.then()` chains for readability
- Coroutines are preferred over `seastar::do_with()` for managing temporary state
- In hot paths where futures are ready, continuations may be more efficient than coroutines
- Chain futures with `.then()`, don't block with `.get()` (unless in `seastar::thread` context)
- All I/O must be asynchronous (no blocking calls)
- Use `seastar::gate` for shutdown coordination
- Use `seastar::semaphore` for resource limiting (not `std::mutex`)
- Break long loops with `maybe_yield()` to avoid reactor stalls
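A minimal sketch combining several of these rules (gate, semaphore, yielding), assuming standard Seastar headers; the function and item type are hypothetical:

```cpp
#include <vector>
#include <seastar/core/coroutine.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/coroutine/maybe_yield.hh>

seastar::future<> process_all(std::vector<int> items,
                              seastar::gate& g,
                              seastar::semaphore& limit) {
    auto holder = g.hold();  // keep shutdown waiting until this coroutine finishes
    for (auto item : items) {
        // Bound concurrency with a semaphore instead of std::mutex.
        auto units = co_await seastar::get_units(limit, 1);
        co_await seastar::coroutine::maybe_yield();  // avoid reactor stalls in long loops
        (void)item;  // per-item asynchronous work would go here
    }
}
```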
## Coroutines

```cpp
seastar::future<T> func() {
    auto result = co_await async_operation();
    co_return result;
}
```
## Error Handling

- Throw exceptions for errors (futures propagate them automatically)
- In data path: avoid exceptions, use `std::expected` (or `boost::outcome`) instead
- Use standard exceptions (`std::runtime_error`, `std::invalid_argument`)
- Database-specific: throw appropriate schema/query exceptions
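A minimal sketch of the data-path rule, assuming a C++23 standard library for `std::expected`; the error enum and function are hypothetical:

```cpp
#include <expected>
#include <string>
#include <string_view>

enum class parse_error { empty_key, bad_length };

// No exceptions on the hot path: the failure travels in the return value.
std::expected<std::string, parse_error> parse_key(std::string_view raw) {
    if (raw.empty()) {
        return std::unexpected(parse_error::empty_key);
    }
    if (raw.size() > 65535) {
        return std::unexpected(parse_error::bad_length);
    }
    return std::string(raw);
}
```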
## Performance

- Pass large objects by `const&` or `&&` (move semantics)
- Use `std::string_view` for non-owning string references
- Avoid copies: prefer move semantics
- Use `utils::chunked_vector` instead of `std::vector` for large allocations (>128KB)
- Minimize dynamic allocations in hot paths
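A sketch of the large-allocation rule, assuming `utils/chunked_vector.hh` and the `reserve`/`push_back` interface used elsewhere in this tree:

```cpp
#include <cstddef>
#include "utils/chunked_vector.hh"

utils::chunked_vector<int> collect(std::size_t n) {
    utils::chunked_vector<int> v;
    // Capacity grows in fixed-size chunks, so a large collection never
    // requires one huge (>128KB) contiguous buffer the way std::vector does.
    v.reserve(n);
    for (std::size_t i = 0; i < n; ++i) {
        v.push_back(int(i));
    }
    return v;
}
```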
## Database-Specific Types

- Use `schema_ptr` for schema references
- Use `mutation` and `mutation_partition` for data modifications
- Use `partition_key` and `clustering_key` for keys
- Use `api::timestamp_type` for database timestamps
- Use `gc_clock` for garbage collection timing
## Style

- C++23 standard (prefer modern features, especially coroutines)
- Use `auto` when type is obvious from RHS
- Avoid `auto` when it obscures the type
- Use range-based for loops: `for (const auto& item : container)`
- Use standard algorithms when they clearly simplify code (e.g., replacing 10-line loops)
- Avoid chaining multiple algorithms if a straightforward loop is clearer
- Mark functions and variables `const` whenever possible
- Use scoped enums: `enum class` (not unscoped `enum`)
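A small sketch of several of these points together; the names are illustrative:

```cpp
#include <cstddef>
#include <string>
#include <vector>

enum class consistency { one, quorum, all };    // scoped enum, not unscoped

std::size_t total_size(const std::vector<std::string>& names) {
    std::size_t total = 0;
    for (const auto& name : names) {            // range-based for loop
        total += name.size();                   // braced even as a single statement
    }
    return total;
}
```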
## Headers

- Use `#pragma once`
- Include order: own header, C++ std, Seastar, Boost, project headers
- Forward declare when possible
- Never `using namespace` in headers (exception: `using namespace seastar` is globally available via `seastarx.hh`)
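A sketch of the include order for a hypothetical `foo/bar.cc`:

```cpp
#include "foo/bar.hh"                   // own header first

#include <vector>                       // C++ standard library

#include <seastar/core/future.hh>       // Seastar

#include <boost/range/adaptor/map.hpp>  // Boost

#include "utils/log.hh"                 // project headers last (path assumed)
```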
## Documentation

- Public APIs require clear documentation
- Implementation details should be self-evident from code
- Use `///` or Doxygen `/** */` for public documentation, `//` for implementation notes - follow the existing style
## Naming

- `snake_case` for most identifiers (classes, functions, variables, namespaces)
- Template parameters: `CamelCase` (e.g., `template<typename ValueType>`)
- Member variables: prefix with `_` (e.g., `int _count;`)
- Structs (value-only): no `_` prefix on members
- Constants and `constexpr`: `snake_case` (e.g., `static constexpr int max_size = 100;`)
- Files: `.hh` for headers, `.cc` for source
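A compact sketch of these conventions; all identifiers are hypothetical:

```cpp
static constexpr int max_size = 100;        // constants: snake_case

template <typename ValueType>               // template parameters: CamelCase
class ring_buffer {                         // classes: snake_case
    int _count = 0;                         // member variables: leading underscore
public:
    int count() const { return _count; }    // functions: snake_case, const where possible
};

struct endpoint {                           // value-only struct: no underscore prefix
    int port;
};
```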
## Formatting

- 4 spaces indentation, never tabs
- Opening braces on same line as control structure (except namespaces)
- Space after keywords: `if (`, `while (`, `return `
- Whitespace around operators matches precedence: `*a + *b` not `* a+* b`
- Line length: keep reasonable (<160 chars), use continuation lines with double indent if needed
- Brace all nested scopes, even single statements
- Minimal patches: only format code you modify, never reformat entire files
## Logging

- Use structured logging with appropriate levels: DEBUG, INFO, WARN, ERROR
- Include context in log messages (e.g., request IDs)
- Never log sensitive data (credentials, PII)
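A short sketch following the `logging::logger` usage visible elsewhere in this tree; the include path, logger name, and message are assumptions:

```cpp
#include "utils/log.hh"  // assumed location of logging::logger

static logging::logger tlogger("table-loader");

void report_load(const sstring& table, int partitions) {
    // INFO-level message carrying context (the table name); no credentials or PII.
    tlogger.info("loaded {} partitions from table {}", partitions, table);
}
```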
## Forbidden

- `malloc`/`free`
- `printf` family (use logging or fmt)
- Raw pointers for ownership
- `using namespace` in headers
- Blocking operations: `std::sleep`, `std::read`, `std::mutex` (use Seastar equivalents)
- `std::atomic` (reserved for very special circumstances only)
- Macros (use `inline`, `constexpr`, or templates instead)
## Testing

When modifying existing code, follow TDD: create/update the test first, then implement.

- Examine existing tests for style and structure
- Use the Boost.Test framework
- Use `SEASTAR_THREAD_TEST_CASE` for Seastar asynchronous tests
- Aim for high code coverage, especially for new features and bug fixes
- Maintain bisectability: all tests must pass in every commit. Mark failing tests with `BOOST_FAIL()` or similar, then fix in a subsequent commit
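A minimal sketch of a Seastar asynchronous test, assuming the `seastar/testing/thread_test_case.hh` harness used by the Boost-based suites; the test name and assertion are illustrative:

```cpp
#include <seastar/testing/thread_test_case.hh>

// Runs inside a seastar::thread, so futures may be waited on with .get().
SEASTAR_THREAD_TEST_CASE(test_roundtrip) {
    auto value = 42;
    BOOST_REQUIRE_EQUAL(value, 42);
}
```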
.github/instructions/python.instructions.md (51 lines added, vendored, new file)

@@ -0,0 +1,51 @@
---
applyTo: "**/*.py"
---

# Python Guidelines

**Important:** Match existing code style. Some directories (like `test/cqlpy` and `test/alternator`) prefer simplicity over type hints and docstrings.

## Style

- Follow PEP 8
- Use type hints for function signatures (unless directory style omits them)
- Use f-strings for formatting
- Line length: 160 characters max
- 4 spaces for indentation
## Imports

Order: standard library, third-party, local imports

```python
import os
import sys

import pytest
from cassandra.cluster import Cluster

from test.utils import setup_keyspace
```

Never use `from module import *`
## Documentation

All public functions/classes need docstrings (unless the current directory conventions omit them):

```python
def my_function(arg1: str, arg2: int) -> bool:
    """
    Brief summary of function purpose.

    Args:
        arg1: Description of first argument.
        arg2: Description of second argument.

    Returns:
        Description of return value.
    """
    pass
```
## Testing Best Practices

- Maintain bisectability: all tests must pass in every commit
- Mark currently-failing tests with `@pytest.mark.xfail`, unmark when fixed
- Use descriptive names that convey intent
- Docstrings/comments should explain what the test verifies and why, whether it reproduces a specific issue, and how it fits into the larger test suite
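A minimal sketch of the xfail rule; the test body and reason string are illustrative:

```python
import pytest


@pytest.mark.xfail(reason="reproduces a known issue; the fix lands in a later commit")
def test_known_regression():
    # Stand-in assertion for the real reproducer.
    assert 1 + 1 == 3
```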
.github/workflows/docs-validate-metrics.yml (34 lines added, vendored, new file)

@@ -0,0 +1,34 @@
```yaml
name: Docs / Validate metrics

on:
  pull_request:
    branches:
      - master
      - enterprise
    paths:
      - '**/*.cc'
      - 'scripts/metrics-config.yml'
      - 'scripts/get_description.py'
      - 'docs/_ext/scylladb_metrics.py'

jobs:
  validate-metrics:
    runs-on: ubuntu-latest
    name: Check metrics documentation coverage

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install PyYAML

      - name: Validate metrics
        run: python3 scripts/get_description.py --validate -c scripts/metrics-config.yml
```
```diff
@@ -116,6 +116,7 @@ list(APPEND absl_cxx_flags
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   list(APPEND ABSL_GCC_FLAGS ${absl_cxx_flags})
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+  list(APPEND absl_cxx_flags "-Wno-deprecated-builtins")
   list(APPEND ABSL_LLVM_FLAGS ${absl_cxx_flags})
 endif()
 set(ABSL_DEFAULT_LINKOPTS
@@ -163,7 +164,45 @@ file(MAKE_DIRECTORY "${scylla_gen_build_dir}")
 include(add_version_library)
 generate_scylla_version()
 
+option(Scylla_USE_PRECOMPILED_HEADER "Use precompiled header for Scylla" ON)
+add_library(scylla-precompiled-header STATIC exported_templates.cc)
+target_link_libraries(scylla-precompiled-header PRIVATE
+  absl::headers
+  absl::btree
+  absl::hash
+  absl::raw_hash_set
+  Seastar::seastar
+  Snappy::snappy
+  systemd
+  ZLIB::ZLIB
+  lz4::lz4_static
+  zstd::zstd_static)
+if (Scylla_USE_PRECOMPILED_HEADER)
+  set(Scylla_USE_PRECOMPILED_HEADER_USE ON)
+  find_program(DISTCC_EXEC NAMES distcc OPTIONAL)
+  if (DISTCC_EXEC)
+    if(DEFINED ENV{DISTCC_HOSTS})
+      set(Scylla_USE_PRECOMPILED_HEADER_USE OFF)
+      message(STATUS "Disabling precompiled header usage because distcc exists and DISTCC_HOSTS is set, assuming you're using distributed compilation.")
+    else()
+      file(REAL_PATH "~/.distcc/hosts" DIST_CC_HOSTS_PATH EXPAND_TILDE)
+      if (EXISTS ${DIST_CC_HOSTS_PATH})
+        set(Scylla_USE_PRECOMPILED_HEADER_USE OFF)
+        message(STATUS "Disabling precompiled header usage because distcc and ~/.distcc/hosts exists, assuming you're using distributed compilation.")
+      endif()
+    endif()
+  endif()
+  if (Scylla_USE_PRECOMPILED_HEADER_USE)
+    message(STATUS "Using precompiled header for Scylla - remember to add `sloppiness = pch_defines,time_macros` to ccache.conf, if you're using ccache.")
+    target_precompile_headers(scylla-precompiled-header PRIVATE "stdafx.hh")
+    target_compile_definitions(scylla-precompiled-header PRIVATE SCYLLA_USE_PRECOMPILED_HEADER)
+  endif()
+else()
+  set(Scylla_USE_PRECOMPILED_HEADER_USE OFF)
+endif()
+
 add_library(scylla-main STATIC)
 
 target_sources(scylla-main
   PRIVATE
     absl-flat_hash_map.cc
@@ -208,6 +247,7 @@ target_link_libraries(scylla-main
     ZLIB::ZLIB
     lz4::lz4_static
     zstd::zstd_static
+    scylla-precompiled-header
 )
 
 option(Scylla_CHECK_HEADERS
```
```diff
@@ -34,5 +34,8 @@ target_link_libraries(alternator
     idl
     absl::headers)
 
+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(alternator REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers alternator
   GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
```
```diff
@@ -888,7 +888,7 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
 
     schema_ptr schema = get_table(_proxy, request);
     get_stats_from_schema(_proxy, *schema)->api_operations.describe_table++;
     tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());
 
     rjson::value table_description = co_await fill_table_description(schema, table_status::active, _proxy, client_state, trace_state, permit);
     rjson::value response = rjson::empty_object();
@@ -989,7 +989,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
     std::string table_name = get_table_name(request);
 
     std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
     tracing::add_table_name(trace_state, keyspace_name, table_name);
+    tracing::add_alternator_table_name(trace_state, table_name);
+    auto& p = _proxy.container();
 
     schema_ptr schema = get_table(_proxy, request);
@@ -1008,8 +1008,8 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
         throw api_error::resource_not_found(fmt::format("Requested resource not found: Table: {} not found", table_name));
     }
 
-    auto m = co_await service::prepare_column_family_drop_announcement(_proxy, keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
-    auto m2 = co_await service::prepare_keyspace_drop_announcement(_proxy, keyspace_name, group0_guard.write_timestamp());
+    auto m = co_await service::prepare_column_family_drop_announcement(p.local(), keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
+    auto m2 = co_await service::prepare_keyspace_drop_announcement(p.local(), keyspace_name, group0_guard.write_timestamp());
 
     std::move(m2.begin(), m2.end(), std::back_inserter(m));
@@ -1583,7 +1583,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
     std::unordered_set<std::string> unused_attribute_definitions =
         validate_attribute_definitions("", *attribute_definitions);
 
     tracing::add_table_name(trace_state, keyspace_name, table_name);
+    tracing::add_alternator_table_name(trace_state, table_name);
 
     schema_builder builder(keyspace_name, table_name);
     auto [hash_key, range_key] = parse_key_schema(request, "");
@@ -1865,10 +1865,10 @@ future<executor::request_return_type> executor::create_table(client_state& clien
     _stats.api_operations.create_table++;
     elogger.trace("Creating table {}", request);
 
-    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
+    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
             (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
         const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
-        co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, _stats, std::move(tablets_mode));
+        co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, e.local()._stats, std::move(tablets_mode));
     });
 }
@@ -1930,7 +1930,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
 
     schema_ptr tab = get_table(p.local(), request);
 
     tracing::add_table_name(gt, tab->ks_name(), tab->cf_name());
+    tracing::add_alternator_table_name(gt, tab->cf_name());
 
     // the ugly but harmless conversion to string_view here is because
     // Seastar's sstring is missing a find(std::string_view) :-()
@@ -2624,14 +2624,14 @@ std::optional<service::cas_shard> rmw_operation::shard_for_execute(bool needs_re
 // Build the return value from the different RMW operations (UpdateItem,
 // PutItem, DeleteItem). All these return nothing by default, but can
 // optionally return Attributes if requested via the ReturnValues option.
-static future<executor::request_return_type> rmw_operation_return(rjson::value&& attributes, const consumed_capacity_counter& consumed_capacity, uint64_t& metric) {
+static executor::request_return_type rmw_operation_return(rjson::value&& attributes, const consumed_capacity_counter& consumed_capacity, uint64_t& metric) {
     rjson::value ret = rjson::empty_object();
     consumed_capacity.add_consumed_capacity_to_response_if_needed(ret);
     metric += consumed_capacity.get_consumed_capacity_units();
     if (!attributes.IsNull()) {
         rjson::add(ret, "Attributes", std::move(attributes));
     }
-    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+    return rjson::print(std::move(ret));
 }
 
 static future<std::unique_ptr<rjson::value>> get_previous_item(
@@ -2697,7 +2697,10 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
         stats& global_stats,
         stats& per_table_stats,
         uint64_t& wcu_total) {
-    auto cdc_opts = cdc::per_request_options{};
+    auto cdc_opts = cdc::per_request_options{
+        .alternator = true,
+        .alternator_streams_increased_compatibility = schema()->cdc_options().enabled() && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
+    };
     if (needs_read_before_write) {
         if (_write_isolation == write_isolation::FORBID_RMW) {
             throw api_error::validation("Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
@@ -2742,7 +2745,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
         if (!is_applied) {
             return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("The conditional request failed", std::move(_return_attributes)));
         }
-        return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
+        return make_ready_future<executor::request_return_type>(rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total));
     });
 }
@@ -2856,7 +2859,7 @@ future<executor::request_return_type> executor::put_item(client_state& client_st
     elogger.trace("put_item {}", request);
 
     auto op = make_shared<put_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
     tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
+    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
     const bool needs_read_before_write = op->needs_read_before_write();
 
     co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
@@ -2960,7 +2963,7 @@ future<executor::request_return_type> executor::delete_item(client_state& client
 
     auto op = make_shared<delete_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
     lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
     tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
+    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
     const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();
 
     co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
@@ -3054,6 +3057,9 @@ static future<> cas_write(service::storage_proxy& proxy, schema_ptr schema, serv
     auto timeout = executor::default_timeout();
     auto op = seastar::make_shared<put_or_delete_item_cas_request>(schema, std::move(mutation_builders));
+    auto cdc_opts = cdc::per_request_options{
+        .alternator = true,
+        .alternator_streams_increased_compatibility =
+            schema->cdc_options().enabled() && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
+    };
     return proxy.cas(schema, std::move(cas_shard), op, nullptr, to_partition_ranges(dk),
         {timeout, std::move(permit), client_state, trace_state},
@@ -3104,8 +3110,10 @@ static future<> do_batch_write(service::storage_proxy& proxy,
         utils::chunked_vector<mutation> mutations;
         mutations.reserve(mutation_builders.size());
         api::timestamp_type now = api::new_timestamp();
+        bool any_cdc_enabled = false;
         for (auto& b : mutation_builders) {
             mutations.push_back(b.second.build(b.first, now));
+            any_cdc_enabled |= b.first->cdc_options().enabled();
         }
         return proxy.mutate(std::move(mutations),
             db::consistency_level::LOCAL_QUORUM,
@@ -3114,7 +3122,10 @@ static future<> do_batch_write(service::storage_proxy& proxy,
             std::move(permit),
             db::allow_per_partition_rate_limit::yes,
             false,
-            cdc::per_request_options{});
+            cdc::per_request_options{
+                .alternator = true,
+                .alternator_streams_increased_compatibility = any_cdc_enabled && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
+            });
     } else {
         // Do the write via LWT:
         // Multiple mutations may be destined for the same partition, adding
@@ -3204,7 +3215,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
         per_table_stats->api_operations.batch_write_item++;
         per_table_stats->api_operations.batch_write_item_batch_total += it->value.Size();
         per_table_stats->api_operations.batch_write_item_histogram.add(it->value.Size());
         tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+        tracing::add_alternator_table_name(trace_state, schema->cf_name());
 
         std::unordered_set<primary_key, primary_key_hash, primary_key_equal> used_keys(
             1, primary_key_hash{schema}, primary_key_equal{schema});
@@ -4464,7 +4475,7 @@ future<executor::request_return_type> executor::update_item(client_state& client
     elogger.trace("update_item {}", request);
 
     auto op = make_shared<update_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
     tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
+    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
     const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();
 
     co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
@@ -4545,7 +4556,7 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
     schema_ptr schema = get_table(_proxy, request);
     lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *schema);
     per_table_stats->api_operations.get_item++;
     tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());
 
     rjson::value& query_key = request["Key"];
     db::consistency_level cl = get_read_consistency(request);
@@ -4694,7 +4705,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
     uint batch_size = 0;
     for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
         table_requests rs(get_table_from_batch_request(_proxy, it));
         tracing::add_table_name(trace_state, sstring(executor::KEYSPACE_NAME_PREFIX) + rs.schema->cf_name(), rs.schema->cf_name());
+        tracing::add_alternator_table_name(trace_state, rs.schema->cf_name());
         rs.cl = get_read_consistency(it->value);
         std::unordered_set<std::string> used_attribute_names;
         rs.attrs_to_get = ::make_shared<const std::optional<attrs_to_get>>(calculate_attrs_to_get(it->value, *_parsed_expression_cache, used_attribute_names));
@@ -5130,13 +5141,15 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
     }
     auto pos = paging_state.get_position_in_partition();
     if (pos.has_key()) {
-        auto exploded_ck = pos.key().explode();
-        auto exploded_ck_it = exploded_ck.begin();
-        for (const column_definition& cdef : schema.clustering_key_columns()) {
-            rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
-            rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-            rjson::add_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_ck_it, cdef));
-            ++exploded_ck_it;
+        // Alternator itself allows at most one column in clustering key, but
+        // user can use Alternator api to access system tables which might have
+        // multiple clustering key columns. So we need to handle that case here.
+        auto cdef_it = schema.clustering_key_columns().begin();
+        for(const auto &exploded_ck : pos.key().explode()) {
+            rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef_it->name_as_text()), rjson::empty_object());
+            rjson::value& key_entry = last_evaluated_key[cdef_it->name_as_text()];
+            rjson::add_with_string_name(key_entry, type_to_string(cdef_it->type), json_key_column_value(exploded_ck, *cdef_it));
+            ++cdef_it;
         }
     }
     // To avoid possible conflicts (and thus having to reserve these names) we
@@ -5296,6 +5309,7 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
     elogger.trace("Scanning {}", request);
 
     auto [schema, table_type] = get_table_or_view(_proxy, request);
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());
     get_stats_from_schema(_proxy, *schema)->api_operations.scan++;
     auto segment = get_int_attribute(request, "Segment");
     auto total_segments = get_int_attribute(request, "TotalSegments");
@@ -5775,7 +5789,7 @@ future<executor::request_return_type> executor::query(client_state& client_state
 
     auto [schema, table_type] = get_table_or_view(_proxy, request);
     get_stats_from_schema(_proxy, *schema)->api_operations.query++;
     tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());
 
     rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
     db::consistency_level cl = get_read_consistency(request);
```
```diff
@@ -282,15 +282,23 @@ std::string type_to_string(data_type type) {
     return it->second;
 }
 
-bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
+std::optional<bytes> try_get_key_column_value(const rjson::value& item, const column_definition& column) {
     std::string column_name = column.name_as_text();
     const rjson::value* key_typed_value = rjson::find(item, column_name);
     if (!key_typed_value) {
-        throw api_error::validation(fmt::format("Key column {} not found", column_name));
+        return std::nullopt;
     }
     return get_key_from_typed_value(*key_typed_value, column);
 }
 
+bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
+    auto value = try_get_key_column_value(item, column);
+    if (!value) {
+        throw api_error::validation(fmt::format("Key column {} not found", column.name_as_text()));
+    }
+    return std::move(*value);
+}
+
 // Parses the JSON encoding for a key value, which is a map with a single
 // entry whose key is the type and the value is the encoded value.
 // If this type does not match the desired "type_str", an api_error::validation
@@ -380,20 +388,38 @@ clustering_key ck_from_json(const rjson::value& item, schema_ptr schema) {
         return clustering_key::make_empty();
     }
     std::vector<bytes> raw_ck;
-    // FIXME: this is a loop, but we really allow only one clustering key column.
+    // Note: it's possible to get more than one clustering column here, as
+    // Alternator can be used to read scylla internal tables.
     for (const column_definition& cdef : schema->clustering_key_columns()) {
-        bytes raw_value = get_key_column_value(item, cdef);
+        auto raw_value = get_key_column_value(item, cdef);
         raw_ck.push_back(std::move(raw_value));
     }
 
     return clustering_key::from_exploded(raw_ck);
 }
 
-position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema) {
-    auto ck = ck_from_json(item, schema);
-    if (is_alternator_keyspace(schema->ks_name())) {
-        return position_in_partition::for_key(std::move(ck));
+clustering_key_prefix ck_prefix_from_json(const rjson::value& item, schema_ptr schema) {
+    if (schema->clustering_key_size() == 0) {
+        return clustering_key_prefix::make_empty();
     }
+    std::vector<bytes> raw_ck;
+    for (const column_definition& cdef : schema->clustering_key_columns()) {
+        auto raw_value = try_get_key_column_value(item, cdef);
+        if (!raw_value) {
+            break;
+        }
+        raw_ck.push_back(std::move(*raw_value));
+    }
+
+    return clustering_key_prefix::from_exploded(raw_ck);
+}
+
+position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema) {
+    const bool is_alternator_ks = is_alternator_keyspace(schema->ks_name());
+    if (is_alternator_ks) {
+        return position_in_partition::for_key(ck_from_json(item, schema));
+    }
 
     const auto region_item = rjson::find(item, scylla_paging_region);
     const auto weight_item = rjson::find(item, scylla_paging_weight);
     if (bool(region_item) != bool(weight_item)) {
@@ -413,8 +439,9 @@ position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema)
     } else {
         throw std::runtime_error(fmt::format("Invalid value for weight: {}", weight_view));
     }
-    return position_in_partition(region, weight, region == partition_region::clustered ? std::optional(std::move(ck)) : std::nullopt);
+    return position_in_partition(region, weight, region == partition_region::clustered ? std::optional(ck_prefix_from_json(item, schema)) : std::nullopt);
     }
     auto ck = ck_from_json(item, schema);
     if (ck.is_empty()) {
         return position_in_partition::for_partition_start();
     }
```
```diff
@@ -13,6 +13,7 @@
 #include <seastar/http/function_handlers.hh>
 #include <seastar/http/short_streams.hh>
 #include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/maybe_yield.hh>
 #include <seastar/util/defer.hh>
 #include <seastar/util/short_streams.hh>
 #include "seastarx.hh"
@@ -32,6 +33,7 @@
 #include "utils/aws_sigv4.hh"
 #include "client_data.hh"
 #include "utils/updateable_value.hh"
+#include <zlib.h>
 
 static logging::logger slogger("alternator-server");
 
@@ -551,6 +553,106 @@ read_entire_stream(input_stream<char>& inp, size_t length_limit) {
     co_return ret;
 }
 
+// safe_gzip_zstream is an exception-safe wrapper for zlib's z_stream.
+// The "z_stream" struct is used by zlib to hold state while decompressing a
+// stream of data. It allocates memory which must be freed with inflateEnd(),
+// which the destructor of this class does.
+class safe_gzip_zstream {
+    z_stream _zs;
+public:
+    safe_gzip_zstream() {
+        memset(&_zs, 0, sizeof(_zs));
+        // The strange 16 + MAX_WBITS tells zlib to expect and decode
+        // a gzip header, not a zlib header.
+        if (inflateInit2(&_zs, 16 + MAX_WBITS) != Z_OK) {
+            // Should only happen if memory allocation fails
+            throw std::bad_alloc();
+        }
+    }
+    ~safe_gzip_zstream() {
+        inflateEnd(&_zs);
+    }
+    z_stream* operator->() {
+        return &_zs;
+    }
+    z_stream* get() {
+        return &_zs;
+    }
+    void reset() {
+        inflateReset(&_zs);
+    }
+};
+
+// ungzip() takes a chunked_content with a gzip-compressed request body,
+// uncompresses it, and returns the uncompressed content as a chunked_content.
+// If the uncompressed content exceeds length_limit, an error is thrown.
+static future<chunked_content>
+ungzip(chunked_content&& compressed_body, size_t length_limit) {
+    chunked_content ret;
+    // output_buf can be any size - when uncompressing input_buf, it doesn't
+    // need to fit in a single output_buf, we'll use multiple output_buf for
+    // a single input_buf if needed.
+    constexpr size_t OUTPUT_BUF_SIZE = 4096;
+    temporary_buffer<char> output_buf;
+    safe_gzip_zstream strm;
+    bool complete_stream = false; // empty input is not a valid gzip
+    size_t total_out_bytes = 0;
+    for (const temporary_buffer<char>& input_buf : compressed_body) {
+        if (input_buf.empty()) {
+            continue;
+        }
+        complete_stream = false;
+        strm->next_in = (Bytef*) input_buf.get();
+        strm->avail_in = (uInt) input_buf.size();
+        do {
+            co_await coroutine::maybe_yield();
+            if (output_buf.empty()) {
+                output_buf = temporary_buffer<char>(OUTPUT_BUF_SIZE);
+            }
+            strm->next_out = (Bytef*) output_buf.get();
+            strm->avail_out = OUTPUT_BUF_SIZE;
+            int e = inflate(strm.get(), Z_NO_FLUSH);
+            size_t out_bytes = OUTPUT_BUF_SIZE - strm->avail_out;
+            if (out_bytes > 0) {
+                // If output_buf is nearly full, we save it as-is in ret. But
+                // if it only has little data, better copy to a small buffer.
+                if (out_bytes > OUTPUT_BUF_SIZE/2) {
+                    ret.push_back(std::move(output_buf).prefix(out_bytes));
+                    // output_buf is now empty. if this loop finds more input,
+                    // we'll allocate a new output buffer.
+                } else {
+                    ret.push_back(temporary_buffer<char>(output_buf.get(), out_bytes));
+                }
+                total_out_bytes += out_bytes;
+                if (total_out_bytes > length_limit) {
+                    throw api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", length_limit));
+                }
+            }
+            if (e == Z_STREAM_END) {
+                // There may be more input after the first gzip stream - in
+                // either this input_buf or the next one. The additional input
+                // should be a second concatenated gzip. We need to allow that
+                // by resetting the gzip stream and continuing the input loop
+                // until there's no more input.
+                strm.reset();
+                if (strm->avail_in == 0) {
+                    complete_stream = true;
+                    break;
+                }
+            } else if (e != Z_OK && e != Z_BUF_ERROR) {
+                // DynamoDB returns an InternalServerError when given a bad
+                // gzip request body. See test test_broken_gzip_content
+                throw api_error::internal("Error during gzip decompression of request body");
+            }
+        } while (strm->avail_in > 0 || strm->avail_out == 0);
+    }
+    if (!complete_stream) {
+        // The gzip stream was not properly finished with Z_STREAM_END
+        throw api_error::internal("Truncated gzip in request body");
+    }
+    co_return ret;
+}
+
 future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request> req) {
     _executor._stats.total_operations++;
     sstring target = req->get_header("X-Amz-Target");
@@ -588,6 +690,21 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
         units.return_units(mem_estimate - new_mem_estimate);
     }
     auto username = co_await verify_signature(*req, content);
+    // If the request is compressed, uncompress it now, after we checked
+    // the signature (the signature is computed on the compressed content).
+    // We apply the request_content_length_limit again to the uncompressed
+    // content - we don't want to allow a tiny compressed request to
+    // expand to a huge uncompressed request.
+    sstring content_encoding = req->get_header("Content-Encoding");
+    if (content_encoding == "gzip") {
+        content = co_await ungzip(std::move(content), request_content_length_limit);
+    } else if (!content_encoding.empty()) {
+        // DynamoDB returns a 500 error for unsupported Content-Encoding.
+        // I'm not sure if this is the best error code, but let's do it too.
+        // See the test test_garbage_content_encoding confirming this case.
+        co_return api_error::internal("Unsupported Content-Encoding");
+    }
 
     // As long as the system_clients_entry object is alive, this request will
     // be visible in the "system.clients" virtual table. When requested, this
     // entry will be formatted by server::ongoing_request::make_client_data().
```
```diff
@@ -106,5 +106,8 @@ target_link_libraries(api
     wasmtime_bindings
     absl::headers)
 
+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(api REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers api
   GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
```

```diff
@@ -66,6 +66,13 @@ static future<json::json_return_type> get_cf_stats(sharded<replica::database>&
     }, std::plus<int64_t>());
 }
 
+static future<json::json_return_type> get_cf_stats(sharded<replica::database>& db,
+        std::function<int64_t(const replica::column_family_stats&)> f) {
+    return map_reduce_cf(db, int64_t(0), [f](const replica::column_family& cf) {
+        return f(cf.get_stats());
+    }, std::plus<int64_t>());
+}
+
 static future<json::json_return_type> for_tables_on_all_shards(sharded<replica::database>& db, std::vector<table_info> tables, std::function<future<>(replica::table&)> set) {
     return do_with(std::move(tables), [&db, set] (const std::vector<table_info>& tables) {
         return db.invoke_on_all([&tables, set] (replica::database& db) {
@@ -1066,10 +1073,14 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
     });
 
     ss::get_load.set(r, [&db] (std::unique_ptr<http::request> req) {
-        return get_cf_stats(db, &replica::column_family_stats::live_disk_space_used);
+        return get_cf_stats(db, [](const replica::column_family_stats& stats) {
+            return stats.live_disk_space_used.on_disk;
+        });
     });
     ss::get_metrics_load.set(r, [&db] (std::unique_ptr<http::request> req) {
-        return get_cf_stats(db, &replica::column_family_stats::live_disk_space_used);
+        return get_cf_stats(db, [](const replica::column_family_stats& stats) {
+            return stats.live_disk_space_used.on_disk;
+        });
    });
 
     ss::get_keyspaces.set(r, [&db] (const_req req) {
```
```diff
@@ -17,4 +17,7 @@ target_link_libraries(scylla_audit
     PRIVATE
         cql3)
 
+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(scylla_audit REUSE_FROM scylla-precompiled-header)
+endif()
 add_whole_archive(audit scylla_audit)
```
```diff
@@ -44,5 +44,8 @@ target_link_libraries(scylla_auth
 
 add_whole_archive(auth scylla_auth)
 
+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(scylla_auth REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers scylla_auth
   GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
```

```diff
@@ -15,6 +15,7 @@
 #include <cmath>
 
 #include "seastarx.hh"
+#include "backlog_controller_fwd.hh"
 
 // Simple proportional controller to adjust shares for processes for which a backlog can be clearly
 // defined.
@@ -128,11 +129,21 @@ public:
     static constexpr unsigned normalization_factor = 30;
     static constexpr float disable_backlog = std::numeric_limits<double>::infinity();
     static constexpr float backlog_disabled(float backlog) { return std::isinf(backlog); }
-    compaction_controller(backlog_controller::scheduling_group sg, float static_shares, std::chrono::milliseconds interval, std::function<float()> current_backlog)
+    static inline const std::vector<backlog_controller::control_point> default_control_points = {
+        backlog_controller::control_point{0.0, 50}, {1.5, 100}, {normalization_factor, default_compaction_maximum_shares}};
+    compaction_controller(backlog_controller::scheduling_group sg, float static_shares, std::optional<float> max_shares,
+            std::chrono::milliseconds interval, std::function<float()> current_backlog)
         : backlog_controller(std::move(sg), std::move(interval),
-            std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
+            default_control_points,
             std::move(current_backlog),
             static_shares
         )
-    {}
+    {
+        if (max_shares) {
+            set_max_shares(*max_shares);
+        }
+    }
+
+    // Updates the maximum output value for control points.
+    void set_max_shares(float max_shares);
 };
```
backlog_controller_fwd.hh (13 lines added, new file)

@@ -0,0 +1,13 @@

```cpp
/*
 * Copyright (C) 2025-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

#pragma once

#include <cstdint>

static constexpr uint64_t default_compaction_maximum_shares = 1000;
```

```diff
@@ -17,5 +17,8 @@ target_link_libraries(cdc
     PRIVATE
         replica)
 
+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(cdc REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers cdc
   GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
```
|
||||
79
cdc/log.cc
79
cdc/log.cc
@@ -25,6 +25,7 @@
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/topology.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "db/config.hh"
|
||||
#include "db/schema_tables.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
#include "schema/schema.hh"
|
||||
@@ -586,11 +587,9 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
|
||||
return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
|
||||
}
|
||||
|
||||
static schema_ptr create_log_schema(const schema& s, const replica::database& db,
|
||||
const keyspace_metadata& ksm, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old)
|
||||
static void set_default_properties_log_table(schema_builder& b, const schema& s,
|
||||
const replica::database& db, const keyspace_metadata& ksm)
|
||||
{
|
||||
schema_builder b(s.ks_name(), log_name(s.cf_name()));
|
||||
b.with_partitioner(cdc::cdc_partitioner::classname);
|
||||
b.set_compaction_strategy(compaction::compaction_strategy_type::time_window);
|
||||
b.set_comment(fmt::format("CDC log for {}.{}", s.ks_name(), s.cf_name()));
|
||||
auto ttl_seconds = s.cdc_options().ttl();
|
||||
@@ -616,13 +615,22 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
|
||||
std::to_string(std::max(1, window_seconds / 2))},
|
||||
});
|
||||
}
|
||||
b.set_caching_options(caching_options::get_disabled_caching_options());
|
||||
|
||||
auto rs = generate_replication_strategy(ksm, db.get_token_metadata().get_topology());
|
||||
auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, db.get_token_metadata(), false));
|
||||
b.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));
|
||||
}
|
||||
|
||||
static void add_columns_to_cdc_log(schema_builder& b, const schema& s,
|
||||
const api::timestamp_type timestamp, const schema_ptr old)
|
||||
{
|
||||
b.with_column(log_meta_column_name_bytes("stream_id"), bytes_type, column_kind::partition_key);
|
||||
b.with_column(log_meta_column_name_bytes("time"), timeuuid_type, column_kind::clustering_key);
|
||||
b.with_column(log_meta_column_name_bytes("batch_seq_no"), int32_type, column_kind::clustering_key);
|
||||
b.with_column(log_meta_column_name_bytes("operation"), data_type_for<operation_native_type>());
|
||||
b.with_column(log_meta_column_name_bytes("ttl"), long_type);
|
||||
b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
|
||||
b.set_caching_options(caching_options::get_disabled_caching_options());
|
||||
|
||||
auto validate_new_column = [&] (const sstring& name) {
|
||||
// When dropping a column from a CDC log table, we set the drop timestamp to be
|
||||
@@ -692,15 +700,28 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
|
||||
add_columns(s.clustering_key_columns());
|
||||
add_columns(s.static_columns(), true);
|
||||
add_columns(s.regular_columns(), true);
|
||||
}
|
||||
|
||||
static schema_ptr create_log_schema(const schema& s, const replica::database& db,
|
||||
const keyspace_metadata& ksm, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old)
|
||||
{
|
||||
schema_builder b(s.ks_name(), log_name(s.cf_name()));
|
||||
|
||||
b.with_partitioner(cdc::cdc_partitioner::classname);
|
||||
|
||||
if (old) {
|
||||
// If the user reattaches the log table, do not change its properties.
|
||||
b.set_properties(old->get_properties());
|
||||
} else {
|
||||
set_default_properties_log_table(b, s, db, ksm);
|
||||
}
|
||||
|
||||
add_columns_to_cdc_log(b, s, timestamp, old);
|
||||
|
||||
if (uuid) {
|
||||
b.set_uuid(*uuid);
|
||||
}
|
||||
|
||||
auto rs = generate_replication_strategy(ksm, db.get_token_metadata().get_topology());
|
||||
auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, db.get_token_metadata()));
|
||||
b.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));
|
||||
|
||||
/**
|
||||
* #10473 - if we are redefining the log table, we need to ensure any dropped
|
||||
* columns are registered in "dropped_columns" table, otherwise clients will not
|
||||
@@ -931,9 +952,6 @@ static managed_bytes merge(const abstract_type& type, const managed_bytes_opt& p
    throw std::runtime_error(format("cdc merge: unknown type {}", type.name()));
}

using cell_map = std::unordered_map<const column_definition*, managed_bytes_opt>;
using row_states_map = std::unordered_map<clustering_key, cell_map, clustering_key::hashing, clustering_key::equality>;

static managed_bytes_opt get_col_from_row_state(const cell_map* state, const column_definition& cdef) {
    if (state) {
        if (auto it = state->find(&cdef); it != state->end()) {
@@ -943,7 +961,12 @@ static managed_bytes_opt get_col_from_row_state(const cell_map* state, const col
    return std::nullopt;
}

static cell_map* get_row_state(row_states_map& row_states, const clustering_key& ck) {
cell_map* get_row_state(row_states_map& row_states, const clustering_key& ck) {
    auto it = row_states.find(ck);
    return it == row_states.end() ? nullptr : &it->second;
}

const cell_map* get_row_state(const row_states_map& row_states, const clustering_key& ck) {
    auto it = row_states.find(ck);
    return it == row_states.end() ? nullptr : &it->second;
}
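The change above drops `static` and adds a const overload so that split.cc can call these lookups through the new declarations in cdc/log.hh. The shape is the standard const/non-const accessor pair over an unordered map; a minimal self-contained sketch (stand-in names, not Scylla types):

// --- illustrative sketch (not part of the diff) ---
#include <string>
#include <unordered_map>

using state_map = std::unordered_map<std::string, int>;

// Mutable lookup, for callers that update the cached row state in place.
int* find_state(state_map& m, const std::string& key) {
    auto it = m.find(key);
    return it == m.end() ? nullptr : &it->second;
}

// Read-only lookup, for const contexts such as an accessor that exposes
// the whole map as a const reference.
const int* find_state(const state_map& m, const std::string& key) {
    auto it = m.find(key);
    return it == m.end() ? nullptr : &it->second;
}
// --- end sketch ---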
@@ -1413,6 +1436,8 @@ struct process_change_visitor {
    row_states_map& _clustering_row_states;
    cell_map& _static_row_state;

    const bool _is_update = false;

    const bool _generate_delta_values = true;

    void static_row_cells(auto&& visit_row_cells) {
@@ -1436,12 +1461,13 @@ struct process_change_visitor {

    struct clustering_row_cells_visitor : public process_row_visitor {
        operation _cdc_op = operation::update;
        operation _marker_op = operation::insert;

        using process_row_visitor::process_row_visitor;

        void marker(const row_marker& rm) {
            _ttl_column = get_ttl(rm);
            _cdc_op = operation::insert;
            _cdc_op = _marker_op;
        }
    };

@@ -1449,6 +1475,9 @@ struct process_change_visitor {
            log_ck, _touched_parts, _builder,
            _enable_updating_state, &ckey, get_row_state(_clustering_row_states, ckey),
            _clustering_row_states, _generate_delta_values);
        if (_is_update && _request_options.alternator) {
            v._marker_op = operation::update;
        }
        visit_row_cells(v);

        if (_enable_updating_state) {
@@ -1602,6 +1631,11 @@ private:

    row_states_map _clustering_row_states;
    cell_map _static_row_state;
    // True if the mutated row existed before applying the mutation. In other
    // words, if the preimage is enabled and it isn't empty (otherwise, we
    // assume that the row is non-existent). Used for Alternator Streams (see
    // #6918).
    bool _is_update = false;

    const bool _uses_tablets;

@@ -1728,6 +1762,7 @@ public:
            ._enable_updating_state = _enable_updating_state,
            ._clustering_row_states = _clustering_row_states,
            ._static_row_state = _static_row_state,
            ._is_update = _is_update,
            ._generate_delta_values = generate_delta_values(_builder->base_schema())
        };
        cdc::inspect_mutation(m, v);
@@ -1738,6 +1773,10 @@ public:
        _builder->end_record();
    }

    const row_states_map& clustering_row_states() const override {
        return _clustering_row_states;
    }

    // Takes and returns generated cdc log mutations and associated statistics about parts touched during transformer's lifetime.
    // The `transformer` object on which this method was called should not be used anymore.
    std::tuple<utils::chunked_vector<mutation>, stats::part_type_set> finish() && {
@@ -1861,6 +1900,7 @@ public:
                _static_row_state[&c] = std::move(*maybe_cell_view);
            }
        }
        _is_update = true;
    }

    if (static_only) {
@@ -1948,6 +1988,7 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
        return make_ready_future<>();
    }

    const bool alternator_increased_compatibility = options.alternator && options.alternator_streams_increased_compatibility;
    transformer trans(_ctxt, s, m.decorated_key(), options);

    auto f = make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>(nullptr);
@@ -1955,7 +1996,7 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
        // Preimage has been fetched by upper layers.
        tracing::trace(tr_state, "CDC: Using a prefetched preimage");
        f = make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>(options.preimage);
    } else if (s->cdc_options().preimage() || s->cdc_options().postimage()) {
    } else if (s->cdc_options().preimage() || s->cdc_options().postimage() || alternator_increased_compatibility) {
        // Note: further improvement here would be to coalesce the pre-image selects into one
        // if a batch contains several modifications to the same table. Otoh, batch is rare(?)
        // so this is premature.
@@ -1972,7 +2013,7 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
        tracing::trace(tr_state, "CDC: Preimage not enabled for the table, not querying current value of {}", m.decorated_key());
    }

    return f.then([trans = std::move(trans), &mutations, idx, tr_state, &details] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
    return f.then([alternator_increased_compatibility, trans = std::move(trans), &mutations, idx, tr_state, &details, &options] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
        auto& m = mutations[idx];
        auto& s = m.schema();

@@ -1987,13 +2028,13 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
        details.had_preimage |= preimage;
        details.had_postimage |= postimage;
        tracing::trace(tr_state, "CDC: Generating log mutations for {}", m.decorated_key());
        if (should_split(m)) {
        if (should_split(m, options)) {
            tracing::trace(tr_state, "CDC: Splitting {}", m.decorated_key());
            details.was_split = true;
            process_changes_with_splitting(m, trans, preimage, postimage);
            process_changes_with_splitting(m, trans, preimage, postimage, alternator_increased_compatibility);
        } else {
            tracing::trace(tr_state, "CDC: No need to split {}", m.decorated_key());
            process_changes_without_splitting(m, trans, preimage, postimage);
            process_changes_without_splitting(m, trans, preimage, postimage, alternator_increased_compatibility);
        }
        auto [log_mut, touched_parts] = std::move(trans).finish();
        const int generated_count = log_mut.size();
cdc/log.hh: 14 changes
@@ -52,6 +52,9 @@ class database;

namespace cdc {

using cell_map = std::unordered_map<const column_definition*, managed_bytes_opt>;
using row_states_map = std::unordered_map<clustering_key, cell_map, clustering_key::hashing, clustering_key::equality>;
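Worth noting: `row_states_map` keys on `clustering_key` with explicit `hashing`/`equality` functor types, which in Scylla are stateful (they hold a reference to the schema). A self-contained analogue of that construction pattern, with stand-in types:

// --- illustrative sketch (not part of the diff) ---
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>

// Stand-ins for clustering_key::hashing / clustering_key::equality; the real
// functors carry a schema reference, modeled here with a simple salt.
struct key_hash {
    std::size_t salt;
    std::size_t operator()(const std::string& k) const {
        return std::hash<std::string>{}(k) ^ salt;
    }
};
struct key_eq {
    bool operator()(const std::string& a, const std::string& b) const { return a == b; }
};

using row_states = std::unordered_map<std::string, int, key_hash, key_eq>;

int main() {
    // Stateful functors must be passed to the constructor explicitly.
    row_states states{8, key_hash{42}, key_eq{}};
    states["pk"] = 1;
    return 0;
}
// --- end sketch ---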

// cdc log table operation
enum class operation : int8_t {
    // note: these values will eventually be read by a third party, probably not privy to this
@@ -73,6 +76,14 @@ struct per_request_options {
    // Scylla. Currently, only TTL expiration implementation for Alternator
    // uses this.
    const bool is_system_originated = false;
    // True if this mutation was emitted by Alternator.
    const bool alternator = false;
    // Sacrifice performance for the sake of better compatibility with DynamoDB
    // Streams. It's important for correctness that the
    // alternator_streams_increased_compatibility config flag be read once per
    // request, because it's live-updateable. As a result, the flag may change
    // between reads.
    const bool alternator_streams_increased_compatibility = false;
};
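The read-once rule in that comment is a general pattern: sample the live-updateable flag exactly once, when the request's options are built, so a concurrent config change cannot flip behavior halfway through a request. A simplified sketch (Scylla uses its own updateable_value machinery, not std::atomic; names here are stand-ins):

// --- illustrative sketch (not part of the diff) ---
#include <atomic>

struct request_options {
    const bool increased_compatibility;
};

request_options make_request_options(const std::atomic<bool>& live_flag) {
    // one load at request start; every later decision reads this snapshot
    return request_options{live_flag.load(std::memory_order_relaxed)};
}
// --- end sketch ---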

struct operation_result_tracker;
@@ -142,4 +153,7 @@ bool is_cdc_metacolumn_name(const sstring& name);

utils::UUID generate_timeuuid(api::timestamp_type t);

cell_map* get_row_state(row_states_map& row_states, const clustering_key& ck);
const cell_map* get_row_state(const row_states_map& row_states, const clustering_key& ck);

} // namespace cdc
cdc/split.cc: 163 changes
@@ -6,15 +6,28 @@
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

#include "bytes.hh"
#include "bytes_fwd.hh"
#include "mutation/atomic_cell.hh"
#include "mutation/atomic_cell_or_collection.hh"
#include "mutation/collection_mutation.hh"
#include "mutation/mutation.hh"
#include "mutation/tombstone.hh"
#include "schema/schema.hh"

#include "seastar/core/sstring.hh"
#include "types/concrete_types.hh"
#include "types/types.hh"
#include "types/user.hh"

#include "split.hh"
#include "log.hh"
#include "change_visitor.hh"
#include "utils/managed_bytes.hh"
#include <string_view>
#include <unordered_map>

extern logging::logger cdc_log;

struct atomic_column_update {
    column_id id;
@@ -490,6 +503,8 @@ struct should_split_visitor {
    // Otherwise we store the change's ttl.
    std::optional<gc_clock::duration> _ttl = std::nullopt;

    virtual ~should_split_visitor() = default;

    inline bool finished() const { return _result; }
    inline void stop() { _result = true; }

@@ -512,7 +527,7 @@ struct should_split_visitor {

    void collection_tombstone(const tombstone& t) { visit(t.timestamp + 1); }

    void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
    virtual void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
        if (_had_row_marker) {
            // nonatomic updates cannot be expressed with an INSERT.
            return stop();
@@ -522,7 +537,7 @@ struct should_split_visitor {
    void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
    void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }

    void marker(const row_marker& rm) {
    virtual void marker(const row_marker& rm) {
        _had_row_marker = true;
        visit(rm.timestamp(), get_ttl(rm));
    }
@@ -563,7 +578,29 @@ struct should_split_visitor {
    }
};

bool should_split(const mutation& m) {
// This is the same as the above, but it doesn't split a row marker away from
// an update. As a result, updates that create an item appear as a single log
// row.
class alternator_should_split_visitor : public should_split_visitor {
public:
    ~alternator_should_split_visitor() override = default;

    void live_collection_cell(bytes_view, const atomic_cell_view& cell) override {
        visit(cell.timestamp());
    }

    void marker(const row_marker& rm) override {
        visit(rm.timestamp());
    }
};

bool should_split(const mutation& m, const per_request_options& options) {
    if (options.alternator) {
        alternator_should_split_visitor v;
        cdc::inspect_mutation(m, v);
        return v._result || v._ts == api::missing_timestamp;
    }

    should_split_visitor v;

    cdc::inspect_mutation(m, v);
@@ -573,8 +610,109 @@ bool should_split(const mutation& m) {
    || v._ts == api::missing_timestamp;
}
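A toy model of the override relationship, heavily reduced from should_split_visitor: the base visitor treats a row marker next to a nonatomic (collection) update as a reason to split, because that combination cannot be expressed as a single INSERT, while the Alternator variant only folds timestamps in, so an item-creating update stays one log row. All names below are illustrative:

// --- illustrative sketch (not part of the diff) ---
#include <cstdint>

struct base_visitor {
    bool result = false;         // true -> mutation must be split
    bool had_row_marker = false;
    int64_t ts = 0;

    virtual ~base_visitor() = default;

    void visit(int64_t t) {
        if (ts != 0 && ts != t) { result = true; } // multiple timestamps: split
        ts = t;
    }
    virtual void marker(int64_t t) { had_row_marker = true; visit(t); }
    virtual void collection_cell(int64_t t) {
        if (had_row_marker) { result = true; return; } // not expressible as one INSERT
        visit(t);
    }
};

struct alternator_visitor : base_visitor {
    void marker(int64_t t) override { visit(t); }          // no marker flag
    void collection_cell(int64_t t) override { visit(t); } // no forced split
};
// --- end sketch ---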

// Returns true if the row state and the atomic and nonatomic entries represent
// an equivalent item.
static bool entries_match_row_state(const schema_ptr& base_schema, const cell_map& row_state, const std::vector<atomic_column_update>& atomic_entries,
        std::vector<nonatomic_column_update>& nonatomic_entries) {
    for (const auto& update : atomic_entries) {
        const column_definition& cdef = base_schema->column_at(column_kind::regular_column, update.id);
        const auto it = row_state.find(&cdef);
        if (it == row_state.end()) {
            return false;
        }
        if (to_managed_bytes_opt(update.cell.value().linearize()) != it->second) {
            return false;
        }
    }
    if (nonatomic_entries.empty()) {
        return true;
    }

    for (const auto& update : nonatomic_entries) {
        const column_definition& cdef = base_schema->column_at(column_kind::regular_column, update.id);
        const auto it = row_state.find(&cdef);
        if (it == row_state.end()) {
            return false;
        }

        // The only collection used by Alternator is a non-frozen map.
        auto current_raw_map = cdef.type->deserialize(*it->second);
        map_type_impl::native_type current_values = value_cast<map_type_impl::native_type>(current_raw_map);

        if (current_values.size() != update.cells.size()) {
            return false;
        }

        std::unordered_map<sstring_view, bytes> current_values_map;
        for (const auto& entry : current_values) {
            const auto attr_name = std::string_view(value_cast<sstring>(entry.first));
            current_values_map[attr_name] = value_cast<bytes>(entry.second);
        }

        for (const auto& [key, value] : update.cells) {
            const auto key_str = to_string_view(key);
            if (!value.is_live()) {
                if (current_values_map.contains(key_str)) {
                    return false;
                }
            } else if (current_values_map[key_str] != value.value().linearize()) {
                return false;
            }
        }
    }
    return true;
}
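Stripped of Scylla's cell machinery, the comparison above is a no-op write detector: a write is skippable when every written attribute already holds the same value and every deletion targets an attribute that is already absent. A self-contained sketch of that core logic:

// --- illustrative sketch (not part of the diff) ---
#include <optional>
#include <string>
#include <unordered_map>

using attrs = std::unordered_map<std::string, std::string>;
// nullopt models a cell tombstone (attribute deletion).
using written_attrs = std::unordered_map<std::string, std::optional<std::string>>;

bool write_is_noop(const attrs& current, const written_attrs& written) {
    for (const auto& [name, value] : written) {
        auto it = current.find(name);
        if (!value) {
            if (it != current.end()) return false; // deletes an existing attribute
        } else if (it == current.end() || it->second != *value) {
            return false;                          // value would actually change
        }
    }
    return true;
}
// --- end sketch ---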

bool should_skip(batch& changes, const mutation& base_mutation, change_processor& processor) {
    const schema_ptr& base_schema = base_mutation.schema();
    // Alternator doesn't use static updates and clustered range deletions.
    if (!changes.static_updates.empty() || !changes.clustered_range_deletions.empty()) {
        return false;
    }

    for (clustered_row_insert& u : changes.clustered_inserts) {
        const cell_map* row_state = get_row_state(processor.clustering_row_states(), u.key);
        if (!row_state) {
            return false;
        }
        if (!entries_match_row_state(base_schema, *row_state, u.atomic_entries, u.nonatomic_entries)) {
            return false;
        }
    }

    for (clustered_row_update& u : changes.clustered_updates) {
        const cell_map* row_state = get_row_state(processor.clustering_row_states(), u.key);
        if (!row_state) {
            return false;
        }
        if (!entries_match_row_state(base_schema, *row_state, u.atomic_entries, u.nonatomic_entries)) {
            return false;
        }
    }

    // Skip only if the row being deleted does not exist (i.e. the deletion is a no-op).
    for (const auto& row_deletion : changes.clustered_row_deletions) {
        if (processor.clustering_row_states().contains(row_deletion.key)) {
            return false;
        }
    }

    // Don't skip if the item exists.
    //
    // Increased DynamoDB Streams compatibility guarantees that single-item
    // operations will read the item and store it in the clustering row states.
    // If it is not found there, we may skip CDC. This is safe as long as the
    // assumptions of this operation's write isolation are not violated.
    if (changes.partition_deletions && processor.clustering_row_states().contains(clustering_key::make_empty())) {
        return false;
    }

    cdc_log.trace("Skipping CDC log for mutation {}", base_mutation);
    return true;
}

void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
        bool enable_preimage, bool enable_postimage) {
        bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
    const auto base_schema = base_mutation.schema();
    auto changes = extract_changes(base_mutation);
    auto pk = base_mutation.key();
@@ -586,9 +724,6 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
    const auto last_timestamp = changes.rbegin()->first;

    for (auto& [change_ts, btch] : changes) {
        const bool is_last = change_ts == last_timestamp;
        processor.begin_timestamp(change_ts, is_last);

        clustered_column_set affected_clustered_columns_per_row{clustering_key::less_compare(*base_schema)};
        one_kind_column_set affected_static_columns{base_schema->static_columns_count()};

@@ -597,6 +732,12 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
            affected_clustered_columns_per_row = btch.get_affected_clustered_columns_per_row(*base_mutation.schema());
        }

        if (alternator_strict_compatibility && should_skip(btch, base_mutation, processor)) {
            continue;
        }

        const bool is_last = change_ts == last_timestamp;
        processor.begin_timestamp(change_ts, is_last);
        if (enable_preimage) {
            if (affected_static_columns.count() > 0) {
                processor.produce_preimage(nullptr, affected_static_columns);
@@ -684,7 +825,13 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
}

void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
        bool enable_preimage, bool enable_postimage) {
        bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
    if (alternator_strict_compatibility) {
        auto changes = extract_changes(base_mutation);
        if (should_skip(changes.begin()->second, base_mutation, processor)) {
            return;
        }
    }
    auto ts = find_timestamp(base_mutation);
    processor.begin_timestamp(ts, true);

@@ -9,6 +9,7 @@
#pragma once

#include <boost/dynamic_bitset.hpp> // IWYU pragma: keep
#include "cdc/log.hh"
#include "replica/database_fwd.hh"
#include "mutation/timestamp.hh"

@@ -65,12 +66,14 @@ public:
    // Tells processor we have reached end of record - last part
    // of a given timestamp batch
    virtual void end_record() = 0;

    virtual const row_states_map& clustering_row_states() const = 0;
};

bool should_split(const mutation& base_mutation);
bool should_split(const mutation& base_mutation, const per_request_options& options);
void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
        bool enable_preimage, bool enable_postimage);
        bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility);
void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
        bool enable_preimage, bool enable_postimage);
        bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility);

}
@@ -21,5 +21,8 @@ target_link_libraries(compaction
    mutation_writer
    replica)

if (Scylla_USE_PRECOMPILED_HEADER_USE)
    target_precompile_headers(compaction REUSE_FROM scylla-precompiled-header)
endif()
check_headers(check-headers compaction
    GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)

@@ -867,8 +867,8 @@ auto fmt::formatter<compaction::compaction_task_executor>::format(const compacti

namespace compaction {

inline compaction_controller make_compaction_controller(const compaction_manager::scheduling_group& csg, uint64_t static_shares, std::function<double()> fn) {
    return compaction_controller(csg, static_shares, 250ms, std::move(fn));
inline compaction_controller make_compaction_controller(const compaction_manager::scheduling_group& csg, uint64_t static_shares, std::optional<float> max_shares, std::function<double()> fn) {
    return compaction_controller(csg, static_shares, max_shares, 250ms, std::move(fn));
}
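The presumed effect of the new max_shares parameter is to clamp whatever the backlog controller computes to an operator-supplied ceiling; the actual clamping lives inside compaction_controller, so this only sketches the intent:

// --- illustrative sketch (not part of the diff) ---
#include <algorithm>
#include <optional>

float apply_share_cap(float computed_shares, std::optional<float> max_shares) {
    // no cap configured -> pass the controller output through unchanged
    return max_shares ? std::min(computed_shares, *max_shares) : computed_shares;
}
// --- end sketch ---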

compaction::compaction_state::~compaction_state() {
@@ -1014,7 +1014,7 @@ compaction_manager::compaction_manager(config cfg, abort_source& as, tasks::task
    , _sys_ks("compaction_manager::system_keyspace")
    , _cfg(std::move(cfg))
    , _compaction_submission_timer(compaction_sg(), compaction_submission_callback())
    , _compaction_controller(make_compaction_controller(compaction_sg(), static_shares(), [this] () -> float {
    , _compaction_controller(make_compaction_controller(compaction_sg(), static_shares(), _cfg.max_shares.get(), [this] () -> float {
        _last_backlog = backlog();
        auto b = _last_backlog / available_memory();
        // This means we are using an unimplemented strategy
@@ -1033,6 +1033,10 @@ compaction_manager::compaction_manager(config cfg, abort_source& as, tasks::task
    , _throughput_updater(serialized_action([this] { return update_throughput(throughput_mbs()); }))
    , _update_compaction_static_shares_action([this] { return update_static_shares(static_shares()); })
    , _compaction_static_shares_observer(_cfg.static_shares.observe(_update_compaction_static_shares_action.make_observer()))
    , _compaction_max_shares_observer(_cfg.max_shares.observe([this] (const float& max_shares) {
        cmlog.info("Updating max shares to {}", max_shares);
        _compaction_controller.set_max_shares(max_shares);
    }))
    , _strategy_control(std::make_unique<strategy_control>(*this))
    , _tombstone_gc_state(_shared_tombstone_gc_state) {
    tm.register_module(_task_manager_module->get_name(), _task_manager_module);
@@ -1051,11 +1055,12 @@ compaction_manager::compaction_manager(tasks::task_manager& tm)
    , _sys_ks("compaction_manager::system_keyspace")
    , _cfg(config{ .available_memory = 1 })
    , _compaction_submission_timer(compaction_sg(), compaction_submission_callback())
    , _compaction_controller(make_compaction_controller(compaction_sg(), 1, [] () -> float { return 1.0; }))
    , _compaction_controller(make_compaction_controller(compaction_sg(), 1, std::nullopt, [] () -> float { return 1.0; }))
    , _backlog_manager(_compaction_controller)
    , _throughput_updater(serialized_action([this] { return update_throughput(throughput_mbs()); }))
    , _update_compaction_static_shares_action([] { return make_ready_future<>(); })
    , _compaction_static_shares_observer(_cfg.static_shares.observe(_update_compaction_static_shares_action.make_observer()))
    , _compaction_max_shares_observer(_cfg.max_shares.observe([] (const float& max_shares) {}))
    , _strategy_control(std::make_unique<strategy_control>(*this))
    , _tombstone_gc_state(_shared_tombstone_gc_state) {
    tm.register_module(_task_manager_module->get_name(), _task_manager_module);
@@ -80,6 +80,7 @@ public:
    scheduling_group maintenance_sched_group;
    size_t available_memory = 0;
    utils::updateable_value<float> static_shares = utils::updateable_value<float>(0);
    utils::updateable_value<float> max_shares = utils::updateable_value<float>(0);
    utils::updateable_value<uint32_t> throughput_mb_per_sec = utils::updateable_value<uint32_t>(0);
    std::chrono::seconds flush_all_tables_before_major = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::days(1));
};
@@ -159,6 +160,7 @@ private:
    std::optional<utils::observer<uint32_t>> _throughput_option_observer;
    serialized_action _update_compaction_static_shares_action;
    utils::observer<float> _compaction_static_shares_observer;
    utils::observer<float> _compaction_max_shares_observer;
    uint64_t _validation_errors = 0;

    class strategy_control;
@@ -291,6 +293,10 @@ public:
        return _cfg.static_shares.get();
    }

    float max_shares() const noexcept {
        return _cfg.max_shares.get();
    }

    uint32_t throughput_mbs() const noexcept {
        return _cfg.throughput_mb_per_sec.get();
    }
@@ -227,7 +227,7 @@ future<> run_table_tasks(replica::database& db, std::vector<table_tasks_info> ta
    // Tables will be kept in descending order.
    std::ranges::sort(table_tasks, std::greater<>(), [&] (const table_tasks_info& tti) {
        try {
            return db.find_column_family(tti.ti.id).get_stats().live_disk_space_used;
            return db.find_column_family(tti.ti.id).get_stats().live_disk_space_used.on_disk;
        } catch (const replica::no_such_column_family& e) {
            return int64_t(-1);
        }
@@ -281,7 +281,7 @@ future<> run_keyspace_tasks(replica::database& db, std::vector<keyspace_tasks_in
        try {
            return std::accumulate(kti.table_infos.begin(), kti.table_infos.end(), int64_t(0), [&] (int64_t sum, const table_info& t) {
                try {
                    sum += db.find_column_family(t.id).get_stats().live_disk_space_used;
                    sum += db.find_column_family(t.id).get_stats().live_disk_space_used.on_disk;
                } catch (const replica::no_such_column_family&) {
                    // ignore
                }
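The .on_disk selection suggests live_disk_space_used grew from a plain integer into a composite statistic, so callers must now pick the component they mean. The field names below are guesses, only meant to show the assumed shape:

// --- illustrative sketch (not part of the diff) ---
#include <cstdint>

struct disk_space_stat {
    int64_t on_disk = 0;   // bytes currently occupying disk
    int64_t total = 0;     // e.g. including space not yet flushed or reclaimed
};
// --- end sketch ---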
@@ -888,9 +888,18 @@ rf_rack_valid_keyspaces: false
#
# Vector Store options
#
# A comma-separated list of URIs for the vector store using DNS name. Only HTTP schema is supported. Port number is mandatory.
# Default is empty, which means that the vector store is not used.
# HTTP and HTTPS schemes are supported. Port number is mandatory.
# If both `vector_store_primary_uri` and `vector_store_secondary_uri` are unset or empty, vector search is disabled.
#
# A comma-separated list of primary vector store node URIs. These nodes are preferred for vector search operations.
# vector_store_primary_uri: http://vector-store.dns.name:{port}
#
# A comma-separated list of secondary vector store node URIs. These nodes are used as a fallback when all primary nodes are unavailable, and are typically located in a different availability zone for high availability.
# vector_store_secondary_uri: http://vector-store.dns.name:{port}
#
# Options for encrypted connections to the vector store. These options are used for HTTPS URIs in vector_store_primary_uri and vector_store_secondary_uri.
# vector_store_encryption_options:
# truststore: <not set, use system trust>

#
# io-streaming rate limiting
configure.py: 96 changes
@@ -646,6 +646,28 @@ vector_search_tests = set([
    'test/vector_search/client_test'
])

vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
vector_search_validator_deps = set([
    'test/vector_search_validator/build-validator',
    'test/vector_search_validator/Cargo.toml',
    'test/vector_search_validator/crates/validator/Cargo.toml',
    'test/vector_search_validator/crates/validator/src/main.rs',
    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
])

vector_store_bin = 'vector-search-validator/bin/vector-store'
vector_store_deps = set([
    'test/vector_search_validator/build-env',
    'test/vector_search_validator/build-vector-store',
])

vector_search_validator_bins = set([
    vector_search_validator_bin,
    vector_store_bin,
])
wasms = set([
    'wasm/return_input.wat',
    'wasm/test_complex_null_values.wat',
@@ -679,7 +701,7 @@ other = set([
    'iotune',
])

all_artifacts = apps | cpp_apps | tests | other | wasms
all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins

arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -763,6 +785,7 @@ arg_parser.add_argument('--use-cmake', action=argparse.BooleanOptionalAction, de
arg_parser.add_argument('--coverage', action = 'store_true', help = 'Compile scylla with coverage instrumentation')
arg_parser.add_argument('--build-dir', action='store', default='build',
                        help='Build directory path')
arg_parser.add_argument('--disable-precompiled-header', action='store_true', default=False, help='Disable precompiled header for scylla binary')
arg_parser.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
args = arg_parser.parse_args()
if args.help:
@@ -1268,7 +1291,8 @@ scylla_core = (['message/messaging_service.cc',
    'vector_search/vector_store_client.cc',
    'vector_search/dns.cc',
    'vector_search/client.cc',
    'vector_search/clients.cc'
    'vector_search/clients.cc',
    'vector_search/truststore.cc'
] + [Antlr3Grammar('cql3/Cql.g')] \
    + scylla_raft_core
)
@@ -1579,6 +1603,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/query_processor_test.cc',
    'test/boost/reader_concurrency_semaphore_test.cc',
    'test/boost/repair_test.cc',
    'test/boost/replicator_test.cc',
    'test/boost/restrictions_test.cc',
    'test/boost/role_manager_test.cc',
    'test/boost/row_cache_test.cc',
@@ -2185,7 +2210,15 @@ if os.path.exists(kmipc_lib):
    user_cflags += f' -I{kmipc_dir}/include -DHAVE_KMIP'

def get_extra_cxxflags(mode, mode_config, cxx, debuginfo):
    cxxflags = []
    cxxflags = [
        # we need this flag for correct precompiled header handling in connection with ccache (or similar)
        # `git` tools don't preserve timestamps, so when using ccache it might be possible to add pch to ccache
        # and then later (after for example rebase) get `stdafx.hh` with different timestamp, but the same content.
        # this will tell ccache to bring pch from its cache. Later on clang will check if timestamps match and complain.
        # Adding `-fpch-validate-input-files-content` tells clang to check content of stdafx.hh if timestamps don't match.
        # The flag seems to be present in gcc as well.
        "" if args.disable_precompiled_header else '-fpch-validate-input-files-content'
    ]

    optimization_level = mode_config['optimization-level']
    cxxflags.append(f'-O{optimization_level}')
@@ -2250,6 +2283,7 @@ def write_build_file(f,
        scylla_version,
        scylla_release,
        args):
    use_precompiled_header = not args.disable_precompiled_header
    warnings = get_warning_options(args.cxx)
    rustc_target = pick_rustc_target('wasm32-wasi', 'wasm32-wasip1')
    f.write(textwrap.dedent('''\
@@ -2356,7 +2390,10 @@ def write_build_file(f,

    for mode in build_modes:
        modeval = modes[mode]

        seastar_lib_ext = 'so' if modeval['build_seastar_shared_libs'] else 'a'
        seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
        seastar_testing_dep = f'$builddir/{mode}/seastar/libseastar_testing.{seastar_lib_ext}'
        abseil_dep = ' '.join(f'$builddir/{mode}/abseil/{lib}' for lib in abseil_libs)
        fmt_lib = 'fmt'
        f.write(textwrap.dedent('''\
            cxx_ld_flags_{mode} = {cxx_ld_flags}
@@ -2369,6 +2406,14 @@ def write_build_file(f,
                command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in
                description = CXX $out
                depfile = $out.d
            rule cxx_build_precompiled_header.{mode}
                command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in -Winvalid-pch -fpch-instantiate-templates -Xclang -emit-pch -DSCYLLA_USE_PRECOMPILED_HEADER
                description = CXX-PRECOMPILED-HEADER $out
                depfile = $out.d
            rule cxx_with_pch.{mode}
                command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in -Winvalid-pch -Xclang -include-pch -Xclang $builddir/{mode}/stdafx.hh.pch
                description = CXX $out
                depfile = $out.d
            rule link.{mode}
                command = $cxx $ld_flags_{mode} $ldflags -o $out $in $libs $libs_{mode}
                description = LINK $out
@@ -2402,7 +2447,7 @@ def write_build_file(f,
                    $builddir/{mode}/gen/${{stem}}Parser.cpp
                description = ANTLR3 $in
            rule checkhh.{mode}
                command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out $builddir/{mode}/gen/empty.cc
                command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out $builddir/{mode}/gen/empty.cc -USCYLLA_USE_PRECOMPILED_HEADER
                description = CHECKHH $in
                depfile = $out.d
            rule test.{mode}
@@ -2416,10 +2461,11 @@ def write_build_file(f,
                description = RUST_LIB $out
            ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, **modeval))
        f.write(
            'build {mode}-build: phony {artifacts} {wasms}\n'.format(
            'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
                mode=mode,
                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
                wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
                vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
            )
        )
        if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2428,6 +2474,7 @@ def write_build_file(f,
        include_dist_target = f'dist-{mode}' if args.enable_dist is None or args.enable_dist else ''
        f.write(f'build {mode}: phony {include_cxx_target} {include_dist_target}\n')
    compiles = {}
    compiles_with_pch = set()
    swaggers = set()
    serializers = {}
    ragels = {}
@@ -2442,16 +2489,16 @@ def write_build_file(f,
        # object code. And we enable LTO when linking the main Scylla executable, while disable
        # it when linking anything else.

        seastar_lib_ext = 'so' if modeval['build_seastar_shared_libs'] else 'a'
        for binary in sorted(build_artifacts):
            if modeval['is_profile'] and binary != "scylla":
                # Just to avoid clutter in build.ninja
                continue
            profile_dep = modes[mode].get('profile_target', "")

            if binary in other or binary in wasms:
            if binary in other or binary in wasms or binary in vector_search_validator_bins:
                continue
            srcs = deps[binary]
            # 'scylla'
            objs = ['$builddir/' + mode + '/' + src.replace('.cc', '.o')
                    for src in srcs
                    if src.endswith('.cc')]
@@ -2487,9 +2534,6 @@ def write_build_file(f,
                continue

            do_lto = modes[mode]['has_lto'] and binary in lto_binaries
            seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
            seastar_testing_dep = f'$builddir/{mode}/seastar/libseastar_testing.{seastar_lib_ext}'
            abseil_dep = ' '.join(f'$builddir/{mode}/abseil/{lib}' for lib in abseil_libs)
            seastar_testing_libs = f'$seastar_testing_libs_{mode}'

            local_libs = f'$seastar_libs_{mode} $libs'
@@ -2499,6 +2543,7 @@ def write_build_file(f,
                local_libs += ' -flto=thin -ffat-lto-objects'
            else:
                local_libs += ' -fno-lto'
            use_pch = use_precompiled_header and binary == 'scylla'
            if binary in tests:
                if binary in pure_boost_tests:
                    local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
@@ -2527,6 +2572,8 @@ def write_build_file(f,
                if src.endswith('.cc'):
                    obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')
                    compiles[obj] = src
                    if use_pch:
                        compiles_with_pch.add(obj)
                elif src.endswith('.idl.hh'):
                    hh = '$builddir/' + mode + '/gen/' + src.replace('.idl.hh', '.dist.hh')
                    serializers[hh] = src
@@ -2559,10 +2606,11 @@ def write_build_file(f,
        )

        f.write(
            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
                wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
                vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
            )
        )
        f.write(
@@ -2605,7 +2653,9 @@ def write_build_file(f,
            src = compiles[obj]
            seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
            abseil_dep = ' '.join(f'$builddir/{mode}/abseil/{lib}' for lib in abseil_libs)
            f.write(f'build {obj}: cxx.{mode} {src} | {profile_dep} || {seastar_dep} {abseil_dep} {gen_headers_dep}\n')
            pch_dep = f'$builddir/{mode}/stdafx.hh.pch' if obj in compiles_with_pch else ''
            cxx_cmd = 'cxx_with_pch' if obj in compiles_with_pch else 'cxx'
            f.write(f'build {obj}: {cxx_cmd}.{mode} {src} | {profile_dep} {seastar_dep} {abseil_dep} {gen_headers_dep} {pch_dep}\n')
            if src in modeval['per_src_extra_cxxflags']:
                f.write(' cxxflags = {seastar_cflags} $cxxflags $cxxflags_{mode} {extra_cxxflags}\n'.format(mode=mode, extra_cxxflags=modeval["per_src_extra_cxxflags"][src], **modeval))
        for swagger in swaggers:
@@ -2666,6 +2716,8 @@ def write_build_file(f,
            f.write(' target = {lib}\n'.format(**locals()))
            f.write(' profile_dep = {profile_dep}\n'.format(**locals()))

        f.write(f'build $builddir/{mode}/stdafx.hh.pch: cxx_build_precompiled_header.{mode} stdafx.hh | {profile_dep} {seastar_dep} {abseil_dep} {gen_headers_dep} {pch_dep}\n')

        f.write('build $builddir/{mode}/seastar/apps/iotune/iotune: ninja $builddir/{mode}/seastar/build.ninja | $builddir/{mode}/seastar/libseastar.{seastar_lib_ext}\n'
                .format(**locals()))
        f.write(' pool = submodule_pool\n')
@@ -2729,6 +2781,19 @@ def write_build_file(f,
        'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
    )

    f.write(textwrap.dedent(f'''\
        rule build-vector-search-validator
            command = test/vector_search_validator/build-validator $builddir
        rule build-vector-store
            command = test/vector_search_validator/build-vector-store $builddir
        '''))
    f.write(
        'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
    )
    f.write(
        'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
    )
    f.write(textwrap.dedent(f'''\
        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
        build dist-unified: phony dist-unified-tar
@@ -2942,7 +3007,7 @@ def configure_using_cmake(args):
        'CMAKE_DEFAULT_CONFIGS': selected_configs,
        'CMAKE_C_COMPILER': args.cc,
        'CMAKE_CXX_COMPILER': args.cxx,
        'CMAKE_CXX_FLAGS': args.user_cflags,
        'CMAKE_CXX_FLAGS': args.user_cflags + ("" if args.disable_precompiled_header else " -fpch-validate-input-files-content"),
        'CMAKE_EXE_LINKER_FLAGS': args.user_ldflags,
        'CMAKE_EXPORT_COMPILE_COMMANDS': 'ON',
        'Scylla_CHECK_HEADERS': 'ON',
@@ -2951,6 +3016,7 @@ def configure_using_cmake(args):
        'Scylla_TEST_REPEAT': args.test_repeat,
        'Scylla_ENABLE_LTO': 'ON' if args.lto else 'OFF',
        'Scylla_WITH_DEBUG_INFO' : 'ON' if args.debuginfo else 'OFF',
        'Scylla_USE_PRECOMPILED_HEADER': 'OFF' if args.disable_precompiled_header else 'ON',
    }
    if args.date_stamp:
        settings['Scylla_DATE_STAMP'] = args.date_stamp
@@ -138,5 +138,8 @@ target_link_libraries(cql3
    lang
    transport)

if (Scylla_USE_PRECOMPILED_HEADER_USE)
    target_precompile_headers(cql3 REUSE_FROM scylla-precompiled-header)
endif()
check_headers(check-headers cql3
    GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
@@ -1560,6 +1560,10 @@ serviceLevelOrRoleName returns [sstring name]
    | t=QUOTED_NAME { $name = sstring($t.text); }
    | k=unreserved_keyword { $name = k;
          std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
    // The literal `default` will not be parsed by any of the previous
    // rules, so we need to cover it manually. Needed by CREATE SERVICE
    // LEVEL and ATTACH SERVICE LEVEL.
    | t=K_DEFAULT { $name = sstring("default"); }
    | QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
    ;
@@ -37,6 +37,12 @@ future<::shared_ptr<cql_transport::messages::result_message>>
alter_service_level_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &, std::optional<service::group0_guard> guard) const {
    if (_service_level == qos::service_level_controller::default_service_level_name) {
        sstring reason = seastar::format("The default service level, {}, cannot be altered",
                qos::service_level_controller::default_service_level_name);
        throw exceptions::invalid_request_exception(std::move(reason));
    }

    service::group0_batch mc{std::move(guard)};
    validate_shares_option(qp, _slo);
    qos::service_level& sl = state.get_service_level_controller().get_service_level(_service_level);
@@ -422,7 +422,14 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
            throw exceptions::invalid_request_exception(format("The synchronous_updates option is only applicable to materialized views, not to base tables"));
        }

        _properties->apply_to_builder(cfm, std::move(schema_extensions), db, keyspace());
        if (is_cdc_log_table) {
            auto gc_opts = _properties->get_tombstone_gc_options(schema_extensions);
            if (gc_opts && gc_opts->mode() == tombstone_gc_mode::repair) {
                throw exceptions::invalid_request_exception("The 'repair' mode for tombstone_gc is not allowed on CDC log tables.");
            }
        }

        _properties->apply_to_builder(cfm, std::move(schema_extensions), db, keyspace(), !is_cdc_log_table);
    }
    break;
@@ -55,8 +55,29 @@ view_ptr alter_view_statement::prepare_view(data_dictionary::database db) const
    auto schema_extensions = _properties->make_schema_extensions(db.extensions());
    _properties->validate(db, keyspace(), schema_extensions);

    bool is_colocated = [&] {
        if (!db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
            return false;
        }
        auto base_schema = db.find_schema(schema->view_info()->base_id());
        if (!base_schema) {
            return false;
        }
        return std::ranges::equal(
            schema->partition_key_columns(),
            base_schema->partition_key_columns(),
            [](const column_definition& a, const column_definition& b) { return a.name() == b.name(); });
    }();

    if (is_colocated) {
        auto gc_opts = _properties->get_tombstone_gc_options(schema_extensions);
        if (gc_opts && gc_opts->mode() == tombstone_gc_mode::repair) {
            throw exceptions::invalid_request_exception("The 'repair' mode for tombstone_gc is not allowed on co-located materialized view tables.");
        }
    }

    auto builder = schema_builder(schema);
    _properties->apply_to_builder(builder, std::move(schema_extensions), db, keyspace());
    _properties->apply_to_builder(builder, std::move(schema_extensions), db, keyspace(), !is_colocated);

    if (builder.get_gc_grace_seconds() == 0) {
        throw exceptions::invalid_request_exception(
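The colocation test above (tablets in use, and the view's partition key equals the base table's, column for column) recurs in ALTER VIEW, CREATE INDEX and CREATE MATERIALIZED VIEW below. Reduced to name comparison over simplified types, the shared predicate is essentially:

// --- illustrative sketch (not part of the diff) ---
#include <algorithm>
#include <string>
#include <vector>

bool is_colocated_view(bool uses_tablets,
                       const std::vector<std::string>& view_pk,
                       const std::vector<std::string>& base_pk) {
    // colocated: same partitioning as the base table, so tombstone_gc 'repair'
    // mode is rejected for such views
    return uses_tablets && std::ranges::equal(view_pk, base_pk);
}
// --- end sketch ---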
@@ -43,6 +43,14 @@ attach_service_level_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &,
        std::optional<service::group0_guard> guard) const {
    if (_service_level == qos::service_level_controller::default_service_level_name) {
        sstring reason = seastar::format("The default service level, {}, cannot be "
                "attached to a role. If you want to detach an attached service level, "
                "use the DETACH SERVICE LEVEL statement",
                qos::service_level_controller::default_service_level_name);
        throw exceptions::invalid_request_exception(std::move(reason));
    }

    auto sli = co_await state.get_service_level_controller().get_distributed_service_level(_service_level);
    if (sli.empty()) {
        throw qos::nonexistant_service_level_exception(_service_level);
@@ -293,7 +293,7 @@ std::optional<db::tablet_options::map_type> cf_prop_defs::get_tablet_options() c
    return std::nullopt;
}

void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const {
void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name, bool supports_repair) const {
    if (has_property(KW_COMMENT)) {
        builder.set_comment(get_string(KW_COMMENT, ""));
    }
@@ -379,7 +379,7 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
    }
    // Set default tombstone_gc mode.
    if (!schema_extensions.contains(tombstone_gc_extension::NAME)) {
        auto ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, ks_name));
        auto ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, ks_name, supports_repair));
        schema_extensions.emplace(tombstone_gc_extension::NAME, std::move(ext));
    }
    builder.set_extensions(std::move(schema_extensions));
@@ -110,7 +110,7 @@ public:
    bool get_synchronous_updates_flag() const;
    std::optional<db::tablet_options::map_type> get_tablet_options() const;

    void apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const;
    void apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name, bool supports_repair) const;
    void validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const;
};
@@ -201,7 +201,14 @@ view_ptr create_index_statement::create_view_for_index(const schema_ptr schema,
        "";
    builder.with_view_info(schema, false, where_clause);

    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, schema->ks_name()));
    bool is_colocated = [&] {
        if (!db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
            return false;
        }
        return im.local();
    }();

    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, schema->ks_name(), !is_colocated));
    builder.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));

    // A local secondary index should be backed by a *synchronous* view,
@@ -292,7 +299,7 @@ std::vector<::shared_ptr<index_target>> create_index_statement::validate_while_e
        throw exceptions::invalid_request_exception(format("Non-supported custom class \'{}\' provided", *(_properties->custom_class)));
    }
    auto custom_index = (*custom_index_factory)();
    custom_index->validate(*schema, *_properties, targets, db.features());
    custom_index->validate(*schema, *_properties, targets, db.features(), db);
    _properties->index_version = custom_index->index_version(*schema);
}
@@ -45,6 +45,12 @@ create_service_level_statement::execute(query_processor& qp,
        throw exceptions::invalid_request_exception("Names starting with '$' are reserved for internal tenants. Use a different name.");
    }

    if (_service_level == qos::service_level_controller::default_service_level_name) {
        sstring reason = seastar::format("The default service level, {}, already exists "
                "and cannot be created", qos::service_level_controller::default_service_level_name);
        throw exceptions::invalid_request_exception(std::move(reason));
    }

    service::group0_batch mc{std::move(guard)};
    validate_shares_option(qp, _slo);
@@ -128,7 +128,7 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
        builder.set_compressor_params(db.get_config().sstable_compression_user_table_options());
    }

    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace());
    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace(), true);
}

void create_table_statement::add_column_metadata_from_aliases(schema_builder& builder, std::vector<bytes> aliases, const std::vector<data_type>& types, column_kind kind) const
@@ -373,7 +373,30 @@ std::pair<view_ptr, cql3::cql_warnings_vec> create_view_statement::prepare_view(
            db::view::create_virtual_column(builder, def->name(), def->type);
        }
    }
    _properties.properties()->apply_to_builder(builder, std::move(schema_extensions), db, keyspace());

    bool is_colocated = [&] {
        if (!db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
            return false;
        }
        if (target_partition_keys.size() != schema->partition_key_columns().size()) {
            return false;
        }
        for (size_t i = 0; i < target_partition_keys.size(); ++i) {
            if (target_partition_keys[i] != &schema->partition_key_columns()[i]) {
                return false;
            }
        }
        return true;
    }();

    if (is_colocated) {
        auto gc_opts = _properties.properties()->get_tombstone_gc_options(schema_extensions);
        if (gc_opts && gc_opts->mode() == tombstone_gc_mode::repair) {
            throw exceptions::invalid_request_exception("The 'repair' mode for tombstone_gc is not allowed on co-located materialized view tables.");
        }
    }

    _properties.properties()->apply_to_builder(builder, std::move(schema_extensions), db, keyspace(), !is_colocated);

    if (builder.default_time_to_live().count() > 0) {
        throw exceptions::invalid_request_exception(
@@ -34,6 +34,11 @@ drop_service_level_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &,
        std::optional<service::group0_guard> guard) const {
    if (_service_level == qos::service_level_controller::default_service_level_name) {
        sstring reason = seastar::format("The default service level, {}, cannot be dropped",
                qos::service_level_controller::default_service_level_name);
        throw exceptions::invalid_request_exception(std::move(reason));
    }
    service::group0_batch mc{std::move(guard)};
    auto& sl = state.get_service_level_controller();
    co_await sl.drop_distributed_service_level(_service_level, _if_exists, mc);
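The same "default service level is immutable" guard now appears in ALTER, ATTACH, CREATE and DROP SERVICE LEVEL. A hypothetical shared helper (not present in the diff; names are assumptions) could express it once:

// --- illustrative sketch (not part of the diff) ---
#include <stdexcept>
#include <string>

void reject_default_service_level(const std::string& name,
                                  const std::string& default_name,
                                  const std::string& action) {
    if (name == default_name) {
        // action is the verb for the statement, e.g. "altered" or "dropped"
        throw std::invalid_argument(
            "The default service level, " + default_name + ", cannot be " + action);
    }
}
// --- end sketch ---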
@@ -8,6 +8,7 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

#include "seastar/core/format.hh"
#include "seastar/core/sstring.hh"
#include "utils/assert.hh"
#include "cql3/statements/ks_prop_defs.hh"
@@ -113,6 +114,17 @@ static locator::replication_strategy_config_options prepare_options(
        return options;
    }

    if (uses_tablets) {
        for (const auto& opt: old_options) {
            if (opt.first == ks_prop_defs::REPLICATION_FACTOR_KEY) {
                on_internal_error(logger, format("prepare_options: old_options contains invalid key '{}'", ks_prop_defs::REPLICATION_FACTOR_KEY));
            }
            if (!options.contains(opt.first)) {
                throw exceptions::configuration_exception(fmt::format("Attempted to implicitly drop replicas in datacenter {}. If this is the desired behavior, set replication factor to 0 in {} explicitly.", opt.first, opt.first));
            }
        }
    }

    // For users' convenience, expand the 'replication_factor' option into a replication factor for each DC.
    // If the user simply switches from another strategy without providing any options,
    // but the other strategy used the 'replication_factor' option, it will also be expanded.
@@ -2031,14 +2031,16 @@ future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table
            fmt::format("Use of ANN OF in an ORDER BY clause requires a LIMIT that is not greater than {}. LIMIT was {}", max_ann_query_limit, limit)));
    }

    auto as = abort_source();
    auto pkeys = co_await qp.vector_store_client().ann(_schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), limit, as);
    auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
    auto aoe = abort_on_expiry(timeout);
    auto pkeys = co_await qp.vector_store_client().ann(
        _schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), limit, aoe.abort_source());
    if (!pkeys.has_value()) {
        co_await coroutine::return_exception(
            exceptions::invalid_request_exception(std::visit(vector_search::vector_store_client::ann_error_visitor{}, pkeys.error())));
    }

    co_return co_await query_base_table(qp, state, options, pkeys.value());
    co_return co_await query_base_table(qp, state, options, pkeys.value(), timeout);
});

auto page_size = options.get_page_size();
@@ -2073,10 +2075,10 @@ std::vector<float> vector_indexed_table_select_statement::get_ann_ordering_vecto
    return util::to_vector<float>(values);
}

future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(
        query_processor& qp, service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys) const {
future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(query_processor& qp,
        service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys,
        lowres_clock::time_point timeout) const {
    auto command = prepare_command_for_base_query(qp, state, options);
    auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);

    // For tables without clustering columns, we can optimize by querying
    // partition ranges instead of individual primary keys, since the
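The pattern behind this change: compute the request deadline once, drive the ANN call's abort source from it, then hand the same deadline to the base-table query so both stages share one budget instead of each computing a fresh timeout. A simplified sketch of the deadline-sharing idea (types are stand-ins, not Seastar's):

// --- illustrative sketch (not part of the diff) ---
#include <chrono>

struct deadline_budget {
    std::chrono::steady_clock::time_point deadline;

    // how much of the shared budget is left for the next stage
    std::chrono::milliseconds remaining() const {
        auto left = std::chrono::duration_cast<std::chrono::milliseconds>(
            deadline - std::chrono::steady_clock::now());
        return left.count() > 0 ? left : std::chrono::milliseconds{0};
    }
};
// --- end sketch ---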
@@ -389,8 +389,8 @@ private:

    std::vector<float> get_ann_ordering_vector(const query_options& options) const;

    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(
            query_processor& qp, service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys) const;
    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(query_processor& qp, service::query_state& state,
            const query_options& options, const std::vector<vector_search::primary_key>& pkeys, lowres_clock::time_point timeout) const;

    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(query_processor& qp, service::query_state& state,
            const query_options& options, lw_shared_ptr<query::read_command> command, lowres_clock::time_point timeout,
@@ -12,5 +12,8 @@ target_link_libraries(data_dictionary
    Seastar::seastar
    xxHash::xxhash)

if (Scylla_USE_PRECOMPILED_HEADER_USE)
    target_precompile_headers(data_dictionary REUSE_FROM scylla-precompiled-header)
endif()
check_headers(check-headers data_dictionary
    GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)

@@ -60,5 +60,8 @@ target_link_libraries(db
    data_dictionary
    cql3)

if (Scylla_USE_PRECOMPILED_HEADER_USE)
    target_precompile_headers(db REUSE_FROM scylla-precompiled-header)
endif()
check_headers(check-headers db
    GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
@@ -3461,12 +3461,15 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
        clogger.debug("Read {} bytes of data ({}, {})", size, pos, rem);

        while (rem < size) {
+           const auto initial_size = initial.size_bytes();

            if (eof) {
-               auto reason = fmt::format("unexpected EOF, rem={}, size={}", rem, size);
+               auto reason = fmt::format("unexpected EOF, pos={}, rem={}, size={}, alignment={}, initial_size={}",
+                       pos, rem, size, alignment, initial_size);
                throw segment_truncation(std::move(reason), block_boundry);
            }

-           auto block_size = alignment - initial.size_bytes();
+           auto block_size = alignment - initial_size;
            // using a stream is perhaps not 100% effective, but we need to
            // potentially address data in pages smaller than the current
            // disk/fs we are reading from can handle (but please no).

@@ -3474,8 +3477,9 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
            if (tmp.size_bytes() == 0) {
                eof = true;
-               auto reason = fmt::format("read 0 bytes, while tried to read {} bytes. rem={}, size={}",
-                       block_size, rem, size);
+               auto reason = fmt::format("read 0 bytes while trying to read {} bytes. "
+                       "pos={}, rem={}, size={}, alignment={}, initial_size={}",
+                       block_size, pos, rem, size, alignment, initial_size);
                throw segment_truncation(std::move(reason), block_boundry);
            }

@@ -3511,13 +3515,13 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
            auto checksum = crc.checksum();

            if (check != checksum) {
-               auto reason = fmt::format("checksums do not match: {:x} vs. {:x}. rem={}, size={}",
-                       check, checksum, rem, size);
+               auto reason = fmt::format("checksums do not match: {:x} vs. {:x}. pos={}, rem={}, size={}, alignment={}, initial_size={}",
+                       check, checksum, pos, rem, size, alignment, initial_size);
                throw segment_data_corruption_error(std::move(reason), alignment);
            }
            if (id != this->id) {
-               auto reason = fmt::format("IDs do not match: {} vs. {}. rem={}, size={}",
-                       id, this->id, rem, size);
+               auto reason = fmt::format("IDs do not match: {} vs. {}. pos={}, rem={}, size={}, alignment={}, initial_size={}",
+                       id, this->id, pos, rem, size, alignment, initial_size);
                throw segment_truncation(std::move(reason), pos + rem);
            }
        }

@@ -3626,6 +3630,10 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
            auto old = pos;
            pos = next_pos(off);
            clogger.trace("Pos {} -> {} ({})", old, pos, off);
+           // #24346 check eof status whenever we move file pos.
+           if (pos >= file_size) {
+               eof = true;
+           }
        }

        future<> read_entry() {
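The `#24346` hunk above encodes an invariant worth stating once: the `eof` flag must be refreshed every time the file position moves, or a later short read gets misreported. A self-contained sketch of that invariant with illustrative names (not the commitlog types):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>

// Illustrative reader state: refresh eof on every position change so
// the next read attempt can fail fast with a precise message.
struct reader_state {
    uint64_t pos = 0;
    uint64_t file_size = 0;
    bool eof = false;

    void advance_to(uint64_t next) {
        pos = next;
        if (pos >= file_size) {  // the fix: re-check eof on every move
            eof = true;
        }
    }

    void expect_more(uint64_t rem, uint64_t size) const {
        if (eof) {
            throw std::runtime_error("unexpected EOF, pos=" + std::to_string(pos)
                + ", rem=" + std::to_string(rem) + ", size=" + std::to_string(size));
        }
    }
};
```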
db/config.cc
@@ -36,6 +36,7 @@
 #include "sstables/compressor.hh"
 #include "utils/log.hh"
 #include "service/tablet_allocator_fwd.hh"
 #include "backlog_controller_fwd.hh"
 #include "utils/config_file_impl.hh"
+#include "exceptions/exceptions.hh"
 #include <seastar/core/metrics_api.hh>

@@ -630,6 +631,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
    , compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
        "If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
    , compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
        "Set the maximum shares of regular compaction to the specified value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+   , compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
+       "If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
    , compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,

@@ -1035,8 +1038,9 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Controls whether traffic between nodes is compressed. The valid values are:\n"
        "* all: All traffic is compressed.\n"
        "* dc : Traffic between data centers is compressed.\n"
+       "* rack : Traffic between racks is compressed.\n"
        "* none : No compression.",
-       {"all", "dc", "none"})
+       {"all", "dc", "rack", "none"})
    , internode_compression_zstd_max_cpu_fraction(this, "internode_compression_zstd_max_cpu_fraction", liveness::LiveUpdate, value_status::Used, 0.000,
        "ZSTD compression of RPC will consume at most this fraction of each internode_compression_zstd_quota_refresh_period_ms time slice.\n"
        "If you wish to try out zstd for RPC compression, 0.05 is a reasonable starting point.")

@@ -1429,6 +1433,11 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , alternator_warn_authorization(this, "alternator_warn_authorization", liveness::LiveUpdate, value_status::Used, false, "Count and log warnings about failed authentication or authorization")
    , alternator_write_isolation(this, "alternator_write_isolation", value_status::Used, "", "Default write isolation policy for Alternator.")
    , alternator_streams_time_window_s(this, "alternator_streams_time_window_s", value_status::Used, 10, "CDC query confidence window for alternator streams.")
+   , alternator_streams_increased_compatibility(this, "alternator_streams_increased_compatibility", liveness::LiveUpdate, value_status::Used, false,
+       "Increases compatibility with DynamoDB Streams at the cost of performance. "
+       "If enabled, Alternator compares the existing item with the new one during "
+       "data-modifying operations to determine which event type should be emitted. "
+       "This penalty is incurred only for tables with Alternator Streams enabled.")
    , alternator_timeout_in_ms(this, "alternator_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 10000,
        "The server-side timeout for completing Alternator API requests.")
    , alternator_ttl_period_in_seconds(this, "alternator_ttl_period_in_seconds", value_status::Used,

@@ -1450,7 +1459,13 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , alternator_max_expression_cache_entries_per_shard(this, "alternator_max_expression_cache_entries_per_shard", liveness::LiveUpdate, value_status::Used, 2000, "Maximum number of cached parsed request expressions, per shard.")
    , alternator_max_users_query_size_in_trace_output(this, "alternator_max_users_query_size_in_trace_output", liveness::LiveUpdate, value_status::Used, uint64_t(4096),
        "Maximum size of the user's command in trace output (`alternator_op` entry). Larger traces will be truncated and have a `<truncated>` message appended, which doesn't count toward the maximum limit.")
-   , vector_store_primary_uri(this, "vector_store_primary_uri", liveness::LiveUpdate, value_status::Used, "", "A comma-separated list of vector store node URIs. If not set, vector search is disabled.")
+   , vector_store_primary_uri(
+       this, "vector_store_primary_uri", liveness::LiveUpdate, value_status::Used, "", "A comma-separated list of primary vector store node URIs. These nodes are preferred for vector search operations.")
+   , vector_store_secondary_uri(this, "vector_store_secondary_uri", liveness::LiveUpdate, value_status::Used, "",
+       "A comma-separated list of secondary vector store node URIs. These nodes are used as a fallback when all primary nodes are unavailable, and are typically located in a different availability zone for high availability.")
+   , vector_store_encryption_options(this, "vector_store_encryption_options", value_status::Used, {},
+       "Options for encrypted connections to the vector store. These options are used for HTTPS URIs in vector_store_primary_uri and vector_store_secondary_uri. The available options are:\n"
+       "* truststore: (Default: <not set. use system truststore>) Location of the truststore containing the trusted certificate for authenticating remote servers.")
    , abort_on_ebadf(this, "abort_on_ebadf", value_status::Used, true, "Abort the server on incorrect file descriptor access. Throws exception when disabled.")
    , sanitizer_report_backtrace(this, "sanitizer_report_backtrace", value_status::Used, false,
        "In debug mode, report log-structured allocator sanitizer violations with a backtrace. Slow.")
@@ -189,6 +189,7 @@ public:
    named_value<bool> auto_adjust_flush_quota;
    named_value<float> memtable_flush_static_shares;
    named_value<float> compaction_static_shares;
    named_value<float> compaction_max_shares;
+   named_value<bool> compaction_enforce_min_threshold;
    named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
    named_value<sstring> cluster_name;

@@ -461,6 +462,7 @@ public:
    named_value<bool> alternator_warn_authorization;
    named_value<sstring> alternator_write_isolation;
    named_value<uint32_t> alternator_streams_time_window_s;
+   named_value<bool> alternator_streams_increased_compatibility;
    named_value<uint32_t> alternator_timeout_in_ms;
    named_value<double> alternator_ttl_period_in_seconds;
    named_value<sstring> alternator_describe_endpoints;

@@ -470,6 +472,8 @@ public:
    named_value<uint64_t> alternator_max_users_query_size_in_trace_output;

    named_value<sstring> vector_store_primary_uri;
+   named_value<sstring> vector_store_secondary_uri;
+   named_value<string_map> vector_store_encryption_options;

    named_value<bool> abort_on_ebadf;
@@ -766,9 +766,6 @@ schema_ptr system_keyspace::size_estimates() {
        "partitions larger than specified threshold"
    );
    builder.set_gc_grace_seconds(0);
-   // FIXME re-enable caching for this and the other two
-   // system.large_* tables once
-   // https://github.com/scylladb/scylla/issues/3288 is fixed
    builder.set_caching_options(caching_options::get_disabled_caching_options());
    builder.with_hash_version();
    return builder.build(schema_builder::compact_storage::no);

@@ -1667,7 +1664,7 @@ schema_ptr system_keyspace::view_building_tasks() {
        .with_column("key", utf8_type, column_kind::partition_key)
        .with_column("id", timeuuid_type, column_kind::clustering_key)
        .with_column("type", utf8_type)
-       .with_column("state", utf8_type)
+       .with_column("aborted", boolean_type)
        .with_column("base_id", uuid_type)
        .with_column("view_id", uuid_type)
        .with_column("last_token", long_type)

@@ -3062,14 +3059,14 @@ future<mutation> system_keyspace::make_remove_view_build_status_on_host_mutation
static constexpr auto VIEW_BUILDING_KEY = "view_building";

future<db::view::building_tasks> system_keyspace::get_view_building_tasks() {
-   static const sstring query = format("SELECT id, type, state, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}'", NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
+   static const sstring query = format("SELECT id, type, aborted, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}'", NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
    using namespace db::view;

    building_tasks tasks;
    co_await _qp.query_internal(query, [&] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
        auto id = row.get_as<utils::UUID>("id");
        auto type = task_type_from_string(row.get_as<sstring>("type"));
-       auto state = task_state_from_string(row.get_as<sstring>("state"));
+       auto aborted = row.get_as<bool>("aborted");
        auto base_id = table_id(row.get_as<utils::UUID>("base_id"));
        auto view_id = row.get_opt<utils::UUID>("view_id").transform([] (const utils::UUID& uuid) { return table_id(uuid); });
        auto last_token = dht::token::from_int64(row.get_as<int64_t>("last_token"));

@@ -3077,7 +3074,7 @@ future<db::view::building_tasks> system_keyspace::get_view_building_tasks() {
        auto shard = unsigned(row.get_as<int32_t>("shard"));

        locator::tablet_replica replica{host_id, shard};
-       view_building_task task{id, type, state, base_id, view_id, replica, last_token};
+       view_building_task task{id, type, aborted, base_id, view_id, replica, last_token};

        switch (type) {
        case db::view::view_building_task::task_type::build_range:

@@ -3096,7 +3093,7 @@ future<db::view::building_tasks> system_keyspace::get_view_building_tasks() {
}

future<mutation> system_keyspace::make_view_building_task_mutation(api::timestamp_type ts, const db::view::view_building_task& task) {
-   static const sstring stmt = format("INSERT INTO {}.{}(key, id, type, state, base_id, view_id, last_token, host_id, shard) VALUES ('{}', ?, ?, ?, ?, ?, ?, ?, ?)", NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
+   static const sstring stmt = format("INSERT INTO {}.{}(key, id, type, aborted, base_id, view_id, last_token, host_id, shard) VALUES ('{}', ?, ?, ?, ?, ?, ?, ?, ?)", NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
    using namespace db::view;

    data_value_or_unset view_id = unset_value{};

@@ -3107,7 +3104,7 @@ future<mutation> system_keyspace::make_view_building_task_mutation(api::timestam
        view_id = data_value(task.view_id->uuid());
    }
    auto muts = co_await _qp.get_mutations_internal(stmt, internal_system_query_state(), ts, {
-       task.id, task_type_to_sstring(task.type), task_state_to_sstring(task.state),
+       task.id, task_type_to_sstring(task.type), task.aborted,
        task.base_id.uuid(), view_id, dht::token::to_int64(task.last_token),
        task.replica.host.uuid(), int32_t(task.replica.shard)
    });

@@ -3117,18 +3114,6 @@ future<mutation> system_keyspace::make_view_building_task_mutation(api::timestam
    co_return std::move(muts[0]);
}

-future<mutation> system_keyspace::make_update_view_building_task_state_mutation(api::timestamp_type ts, utils::UUID id, db::view::view_building_task::task_state state) {
-   static const sstring stmt = format("UPDATE {}.{} SET state = ? WHERE key = '{}' AND id = ?", NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
-
-   auto muts = co_await _qp.get_mutations_internal(stmt, internal_system_query_state(), ts, {
-       task_state_to_sstring(state), id
-   });
-   if (muts.size() != 1) {
-       on_internal_error(slogger, fmt::format("expected 1 mutation got {}", muts.size()));
-   }
-   co_return std::move(muts[0]);
-}
-
future<mutation> system_keyspace::make_remove_view_building_task_mutation(api::timestamp_type ts, utils::UUID id) {
    static const sstring stmt = format("DELETE FROM {}.{} WHERE key = '{}' AND id = ?", NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
@@ -576,7 +576,6 @@ public:
    // system.view_building_tasks
    future<db::view::building_tasks> get_view_building_tasks();
    future<mutation> make_view_building_task_mutation(api::timestamp_type ts, const db::view::view_building_task& task);
-   future<mutation> make_update_view_building_task_state_mutation(api::timestamp_type ts, utils::UUID id, db::view::view_building_task::task_state state);
    future<mutation> make_remove_view_building_task_mutation(api::timestamp_type ts, utils::UUID id);

    // system.scylla_local, view_building_processing_base key
@@ -104,6 +104,8 @@ future<> view_building_coordinator::run() {
        _vb_sm.event.broadcast();
    });

+   auto finished_tasks_gc_fiber = finished_task_gc_fiber();
+
    while (!_as.abort_requested()) {
        co_await utils::get_local_injector().inject("view_building_coordinator_pause_main_loop", utils::wait_for_message(std::chrono::minutes(2)));
        if (utils::get_local_injector().enter("view_building_coordinator_skip_main_loop")) {

@@ -121,12 +123,7 @@ future<> view_building_coordinator::run() {
                continue;
            }

-           auto started_new_work = co_await work_on_view_building(std::move(*guard_opt));
-           if (started_new_work) {
-               // If any tasks were started, do another iteration, so the coordinator can attach itself to the tasks (via RPC)
-               vbc_logger.debug("view building coordinator started new tasks, do next iteration without waiting for event");
-               continue;
-           }
+           co_await work_on_view_building(std::move(*guard_opt));
            co_await await_event();
        } catch (...) {
            handle_coordinator_error(std::current_exception());

@@ -142,6 +139,66 @@ future<> view_building_coordinator::run() {
            }
        }
    }

+   co_await std::move(finished_tasks_gc_fiber);
}

+future<> view_building_coordinator::finished_task_gc_fiber() {
+   static auto task_gc_interval = 200ms;
+
+   while (!_as.abort_requested()) {
+       try {
+           co_await clean_finished_tasks();
+           co_await sleep_abortable(task_gc_interval, _as);
+       } catch (abort_requested_exception&) {
+           vbc_logger.debug("view_building_coordinator::finished_task_gc_fiber got abort_requested_exception");
+       } catch (service::group0_concurrent_modification&) {
+           vbc_logger.info("view_building_coordinator::finished_task_gc_fiber got group0_concurrent_modification");
+       } catch (raft::request_aborted&) {
+           vbc_logger.debug("view_building_coordinator::finished_task_gc_fiber got raft::request_aborted");
+       } catch (service::term_changed_error&) {
+           vbc_logger.debug("view_building_coordinator::finished_task_gc_fiber noticed term change {} -> {}", _term, _raft.get_current_term());
+       } catch (raft::commit_status_unknown&) {
+           vbc_logger.warn("view_building_coordinator::finished_task_gc_fiber got raft::commit_status_unknown");
+       } catch (...) {
+           vbc_logger.error("view_building_coordinator::finished_task_gc_fiber got error: {}", std::current_exception());
+       }
+   }
+}
+
+future<> view_building_coordinator::clean_finished_tasks() {
+   // Avoid acquiring a group0 operation if there are no tasks.
+   if (_finished_tasks.empty()) {
+       co_return;
+   }
+
+   auto guard = co_await start_operation();
+   auto lock = co_await get_unique_lock(_mutex);
+
+   if (!_vb_sm.building_state.currently_processed_base_table || std::ranges::all_of(_finished_tasks, [] (auto& e) { return e.second.empty(); })) {
+       co_return;
+   }
+
+   view_building_task_mutation_builder builder(guard.write_timestamp());
+   for (auto& [replica, tasks]: _finished_tasks) {
+       for (auto& task_id: tasks) {
+           // The task might have been aborted in the meantime. In that case we cannot remove it, because it is needed to create a new task.
+           //
+           // TODO: When we're aborting a view building task (for instance due to tablet migration),
+           // we can check whether we have already finished it (i.e. whether it is in `_finished_tasks`).
+           // If so, we can just remove it instead of aborting it.
+           auto task_opt = _vb_sm.building_state.get_task(*_vb_sm.building_state.currently_processed_base_table, replica, task_id);
+           if (task_opt && !task_opt->get().aborted) {
+               builder.del_task(task_id);
+               vbc_logger.debug("Removing finished task with ID: {}", task_id);
+           }
+       }
+   }
+
+   co_await commit_mutations(std::move(guard), {builder.build()}, "remove finished view building tasks");
+   for (auto& [_, tasks_set]: _finished_tasks) {
+       tasks_set.clear();
+   }
+}
+
future<std::optional<service::group0_guard>> view_building_coordinator::update_state(service::group0_guard guard) {
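The GC fiber and `clean_finished_tasks()` above follow a common Seastar pattern: `run()` starts the fiber eagerly, keeps its future, and joins it on exit, while the fiber itself loops on `sleep_abortable` and treats abort as a clean shutdown path. A generic sketch of that lifecycle, with illustrative names:

```cpp
#include <seastar/core/abort_source.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/core/sleep.hh>

using namespace std::chrono_literals;

seastar::future<> gc_fiber(seastar::abort_source& as) {
    while (!as.abort_requested()) {
        try {
            // one cleanup round would go here, then sleep until the next one
            co_await seastar::sleep_abortable(200ms, as);
        } catch (const seastar::sleep_aborted&) {
            // shutdown requested: fall through to the loop condition
        }
    }
}

seastar::future<> run(seastar::abort_source& as) {
    auto gc = gc_fiber(as);   // starts running immediately
    // ... main coordinator loop would go here ...
    co_await std::move(gc);   // join the fiber so shutdown never leaks it
}
```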
@@ -301,18 +358,16 @@ future<> view_building_coordinator::update_views_statuses(const service::group0_
    }
}

-future<bool> view_building_coordinator::work_on_view_building(service::group0_guard guard) {
+future<> view_building_coordinator::work_on_view_building(service::group0_guard guard) {
    if (!_vb_sm.building_state.currently_processed_base_table) {
        vbc_logger.debug("No base table is selected, nothing to do.");
-       co_return false;
+       co_return;
    }

    utils::chunked_vector<mutation> muts;
-   std::unordered_set<locator::tablet_replica> _remote_work_keys_to_erase;
+   // Acquire a unique lock on `_finished_tasks` to ensure each replica has its own entry in it
+   // and to select tasks for them.
+   auto lock = co_await get_unique_lock(_mutex);
    for (auto& replica: get_replicas_with_tasks()) {
        // Check whether the coordinator already waits for the remote work on the replica to be finished.
        // If so, check whether the work is done and remove the shared_future; skip this replica otherwise.
-       bool skip_work_on_this_replica = false;
        if (_remote_work.contains(replica)) {
            if (!_remote_work[replica].available()) {
                vbc_logger.debug("Replica {} is still doing work", replica);

@@ -320,21 +375,7 @@ future<> view_building_coordinator::work_on_view_building(service::group0_guard
            }

            auto remote_results_opt = co_await _remote_work[replica].get_future();
-           if (remote_results_opt) {
-               auto results_muts = co_await update_state_after_work_is_done(guard, replica, std::move(*remote_results_opt));
-               muts.insert(muts.end(), std::make_move_iterator(results_muts.begin()), std::make_move_iterator(results_muts.end()));
-               // If the replica successfully finished its work, we need to commit the mutations generated above before selecting the next task
-               skip_work_on_this_replica = !results_muts.empty();
-           }
-
-           // If there were no mutations for this replica, we can just remove the entry from the `_remote_work` map
-           // and start new work in the same iteration.
-           // Otherwise, the entry needs to be removed after the mutations are committed successfully.
-           if (skip_work_on_this_replica) {
-               _remote_work_keys_to_erase.insert(replica);
-           } else {
-               _remote_work.erase(replica);
-           }
+           _remote_work.erase(replica);
        }

        const bool ignore_gossiper = utils::get_local_injector().enter("view_building_coordinator_ignore_gossiper");

@@ -343,31 +384,16 @@ future<> view_building_coordinator::work_on_view_building(service::group0_guard
            continue;
        }

-       if (skip_work_on_this_replica) {
-           continue;
+       if (!_finished_tasks.contains(replica)) {
+           _finished_tasks.insert({replica, {}});
        }

-       if (auto already_started_ids = _vb_sm.building_state.get_started_tasks(*_vb_sm.building_state.currently_processed_base_table, replica); !already_started_ids.empty()) {
-           // If the replica has any task in `STARTED` state, attach the coordinator to the work.
-           attach_to_started_tasks(replica, std::move(already_started_ids));
-       } else if (auto todo_ids = select_tasks_for_replica(replica); !todo_ids.empty()) {
-           // If the replica has no started tasks and there are tasks to do, mark them as started.
-           // The coordinator will attach itself to the work in the next iteration.
-           auto new_mutations = co_await start_tasks(guard, std::move(todo_ids));
-           muts.insert(muts.end(), std::make_move_iterator(new_mutations.begin()), std::make_move_iterator(new_mutations.end()));
+       if (auto todo_ids = select_tasks_for_replica(replica); !todo_ids.empty()) {
+           start_remote_worker(replica, std::move(todo_ids));
        } else {
            vbc_logger.debug("Nothing to do for replica {}", replica);
        }
    }

-   if (!muts.empty()) {
-       co_await commit_mutations(std::move(guard), std::move(muts), "start view building tasks");
-       for (auto& key: _remote_work_keys_to_erase) {
-           _remote_work.erase(key);
-       }
-       co_return true;
-   }
-   co_return false;
}

std::set<locator::tablet_replica> view_building_coordinator::get_replicas_with_tasks() {

@@ -390,7 +416,7 @@ std::vector<utils::UUID> view_building_coordinator::select_tasks_for_replica(loc
    // Select only building tasks and return their ids
    auto filter_building_tasks = [] (const std::vector<view_building_task>& tasks) -> std::vector<utils::UUID> {
        return tasks | std::views::filter([] (const view_building_task& t) {
-           return t.type == view_building_task::task_type::build_range && t.state == view_building_task::task_state::idle;
+           return t.type == view_building_task::task_type::build_range && !t.aborted;
        }) | std::views::transform([] (const view_building_task& t) {
            return t.id;
        }) | std::ranges::to<std::vector>();

@@ -404,7 +430,29 @@ std::vector<utils::UUID> view_building_coordinator::select_tasks_for_replica(loc
    }

    auto& tablet_map = _db.get_token_metadata().tablets().get_tablet_map(*_vb_sm.building_state.currently_processed_base_table);
-   for (auto& [token, tasks]: _vb_sm.building_state.collect_tasks_by_last_token(*_vb_sm.building_state.currently_processed_base_table, replica)) {
+   auto tasks_by_last_token = _vb_sm.building_state.collect_tasks_by_last_token(*_vb_sm.building_state.currently_processed_base_table, replica);
+
+   // Remove tasks recorded in `_finished_tasks` from `tasks_by_last_token`
+   auto it = tasks_by_last_token.begin();
+   while (it != tasks_by_last_token.end()) {
+       auto task_it = it->second.begin();
+       while (task_it != it->second.end()) {
+           if (_finished_tasks.at(replica).contains(task_it->id)) {
+               task_it = it->second.erase(task_it);
+           } else {
+               ++task_it;
+           }
+       }
+
+       // Remove the entry from `tasks_by_last_token` if its vector is empty
+       if (it->second.empty()) {
+           it = tasks_by_last_token.erase(it);
+       } else {
+           ++it;
+       }
+   }
+
+   for (auto& [token, tasks]: tasks_by_last_token) {
        auto tid = tablet_map.get_tablet_id(token);
        if (tablet_map.get_tablet_transition_info(tid)) {
            vbc_logger.debug("Tablet {} on replica {} is in transition.", tid, replica);
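The nested erase loops added above are correct but verbose; the same filtering can be expressed with C++20 `std::erase_if`, which handles iterator invalidation internally. A self-contained sketch (illustrative types, not the diff's):

```cpp
#include <map>
#include <unordered_set>
#include <vector>

struct task { int id; };

// Drop finished task ids from every bucket, then drop drained buckets.
void prune(std::map<long, std::vector<task>>& tasks_by_last_token,
           const std::unordered_set<int>& finished) {
    std::erase_if(tasks_by_last_token, [&] (auto& entry) {
        std::erase_if(entry.second, [&] (const task& t) {
            return finished.contains(t.id);
        });
        return entry.second.empty();  // erase the token entry if now empty
    });
}
```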
@@ -416,7 +464,7 @@ std::vector<utils::UUID> view_building_coordinator::select_tasks_for_replica(loc
        return building_tasks;
    } else {
        return tasks | std::views::filter([] (const view_building_task& t) {
-           return t.state == view_building_task::task_state::idle;
+           return !t.aborted;
        }) | std::views::transform([] (const view_building_task& t) {
            return t.id;
        }) | std::ranges::to<std::vector>();

@@ -426,32 +474,21 @@ std::vector<utils::UUID> view_building_coordinator::select_tasks_for_replica(loc
    return {};
}

-future<utils::chunked_vector<mutation>> view_building_coordinator::start_tasks(const service::group0_guard& guard, std::vector<utils::UUID> tasks) {
-   vbc_logger.info("Starting tasks {}", tasks);
-
-   utils::chunked_vector<mutation> muts;
-   for (auto& t: tasks) {
-       auto mut = co_await _sys_ks.make_update_view_building_task_state_mutation(guard.write_timestamp(), t, view_building_task::task_state::started);
-       muts.push_back(std::move(mut));
-   }
-   co_return muts;
-}
-
-void view_building_coordinator::attach_to_started_tasks(const locator::tablet_replica& replica, std::vector<utils::UUID> tasks) {
+void view_building_coordinator::start_remote_worker(const locator::tablet_replica& replica, std::vector<utils::UUID> tasks) {
    vbc_logger.debug("Attaching to started tasks {} on replica {}", tasks, replica);
-   shared_future<std::optional<remote_work_results>> work = work_on_tasks(replica, std::move(tasks));
+   shared_future<std::optional<std::vector<utils::UUID>>> work = work_on_tasks(replica, std::move(tasks));
    _remote_work.insert({replica, std::move(work)});
}

-future<std::optional<view_building_coordinator::remote_work_results>> view_building_coordinator::work_on_tasks(locator::tablet_replica replica, std::vector<utils::UUID> tasks) {
+future<std::optional<std::vector<utils::UUID>>> view_building_coordinator::work_on_tasks(locator::tablet_replica replica, std::vector<utils::UUID> tasks) {
    constexpr auto backoff_duration = std::chrono::seconds(1);
    static thread_local logger::rate_limit rate_limit{backoff_duration};

-   std::vector<view_task_result> remote_results;
+   std::vector<utils::UUID> remote_results;
    bool rpc_failed = false;

    try {
-       remote_results = co_await ser::view_rpc_verbs::send_work_on_view_building_tasks(&_messaging, replica.host, _as, tasks);
+       remote_results = co_await ser::view_rpc_verbs::send_work_on_view_building_tasks(&_messaging, replica.host, _as, _term, replica.shard, tasks);
    } catch (...) {
        vbc_logger.log(log_level::warn, rate_limit, "Work on tasks {} on replica {} failed with error: {}",
                tasks, replica, std::current_exception());

@@ -464,44 +501,14 @@ future<std::optional<view_building_coordinator::remote_work_results>> view_build
        co_return std::nullopt;
    }

-   if (tasks.size() != remote_results.size()) {
-       on_internal_error(vbc_logger, fmt::format("Number of tasks ({}) and results ({}) do not match for replica {}", tasks.size(), remote_results.size(), replica));
-   }
+   // In `view_building_coordinator::work_on_view_building()` we made sure that
+   // each replica has its own entry in `_finished_tasks`, so now we can just take a shared lock
+   // and insert the ids of finished tasks into this replica's bucket, as there is at most one instance of this method per replica.
+   auto lock = co_await get_shared_lock(_mutex);
+   _finished_tasks.at(replica).insert_range(remote_results);

-   remote_work_results results;
-   for (size_t i = 0; i < tasks.size(); ++i) {
-       results.push_back({tasks[i], remote_results[i]});
-   }
    _vb_sm.event.broadcast();
-   co_return results;
-}
-
-// Mark finished tasks as done (remove them from the table).
-// Retry failed tasks if possible (if the failed task wasn't aborted).
-future<utils::chunked_vector<mutation>> view_building_coordinator::update_state_after_work_is_done(const service::group0_guard& guard, const locator::tablet_replica& replica, view_building_coordinator::remote_work_results results) {
-   vbc_logger.debug("Got results from replica {}: {}", replica, results);
-
-   utils::chunked_vector<mutation> muts;
-   for (auto& result: results) {
-       vbc_logger.info("Task {} was finished with result: {}", result.first, result.second);
-
-       if (!_vb_sm.building_state.currently_processed_base_table) {
-           continue;
-       }
-
-       // A task can be aborted by deleting it or by setting its state to `ABORTED`.
-       // If the task was aborted by changing the state,
-       // we shouldn't remove it here because it might be needed
-       // to generate updates after the tablet operation (migration/resize)
-       // is finished.
-       auto task_opt = _vb_sm.building_state.get_task(*_vb_sm.building_state.currently_processed_base_table, replica, result.first);
-       if (task_opt && task_opt->get().state != view_building_task::task_state::aborted) {
-           // Otherwise, the task was completed successfully and we can remove it.
-           auto delete_mut = co_await _sys_ks.make_remove_view_building_task_mutation(guard.write_timestamp(), result.first);
-           muts.push_back(std::move(delete_mut));
-       }
-   }
-   co_return muts;
+   co_return remote_results;
}

future<> view_building_coordinator::stop() {
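The locking comment in `work_on_tasks()` above describes a reader/writer split over `_finished_tasks`: the selection path mutates the map's shape, so it takes the lock exclusively, while each per-replica RPC fiber only appends to its own bucket and can share the lock. A sketch of that discipline using Seastar's `shared_mutex` (names illustrative):

```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/shared_mutex.hh>

seastar::shared_mutex mutex;

seastar::future<> selection_path() {
    // unique: may create or remove per-replica buckets
    co_await seastar::with_lock(mutex, [] {
        // ensure buckets exist, select tasks for each replica
    });
}

seastar::future<> completion_path() {
    // shared: buckets are disjoint and each has a single writer
    co_await seastar::with_shared(mutex, [] {
        // append finished task ids to this replica's bucket only
    });
}
```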
@@ -531,7 +538,7 @@ void view_building_coordinator::generate_tablet_migration_updates(utils::chunked
    auto create_task_copy_on_pending_replica = [&] (const view_building_task& task) {
        auto new_id = builder.new_id();
        builder.set_type(new_id, task.type)
-              .set_state(new_id, view_building_task::task_state::idle)
+              .set_aborted(new_id, false)
               .set_base_id(new_id, task.base_id)
               .set_last_token(new_id, task.last_token)
               .set_replica(new_id, *trinfo.pending_replica);

@@ -599,7 +606,7 @@ void view_building_coordinator::generate_tablet_resize_updates(utils::chunked_ve
    auto create_task_copy = [&] (const view_building_task& task, dht::token last_token) -> utils::UUID {
        auto new_id = builder.new_id();
        builder.set_type(new_id, task.type)
-              .set_state(new_id, view_building_task::task_state::idle)
+              .set_aborted(new_id, false)
               .set_base_id(new_id, task.base_id)
               .set_last_token(new_id, last_token)
               .set_replica(new_id, task.replica);

@@ -668,7 +675,7 @@ void view_building_coordinator::abort_tasks(utils::chunked_vector<canonical_muta
    auto abort_task_map = [&] (const task_map& task_map) {
        for (auto& [id, _]: task_map) {
            vbc_logger.debug("Aborting task {}", id);
-           builder.set_state(id, view_building_task::task_state::aborted);
+           builder.set_aborted(id, true);
        }
    };

@@ -698,7 +705,7 @@ void abort_view_building_tasks(const view_building_state_machine& vb_sm,
        for (auto& [id, task]: task_map) {
            if (task.last_token == last_token) {
                vbc_logger.debug("Aborting task {}", id);
-               builder.set_state(id, view_building_task::task_state::aborted);
+               builder.set_aborted(id, true);
            }
        }
    };

@@ -714,10 +721,10 @@ void abort_view_building_tasks(const view_building_state_machine& vb_sm,
static void rollback_task_map(view_building_task_mutation_builder& builder, const task_map& task_map) {
    for (auto& [id, task]: task_map) {
-       if (task.state == view_building_task::task_state::aborted) {
+       if (task.aborted) {
            auto new_id = builder.new_id();
            builder.set_type(new_id, task.type)
-                  .set_state(new_id, view_building_task::task_state::idle)
+                  .set_aborted(new_id, false)
                   .set_base_id(new_id, task.base_id)
                   .set_last_token(new_id, task.last_token)
                   .set_replica(new_id, task.replica);
@@ -54,9 +54,9 @@ class view_building_coordinator : public service::endpoint_lifecycle_subscriber
    const raft::term_t _term;
    abort_source& _as;

-   using remote_work_results = std::vector<std::pair<utils::UUID, db::view::view_task_result>>;
-   std::unordered_map<locator::tablet_replica, shared_future<std::optional<remote_work_results>>> _remote_work;
+   std::unordered_map<locator::tablet_replica, shared_future<std::optional<std::vector<utils::UUID>>>> _remote_work;
+   shared_mutex _mutex; // guards the `_finished_tasks` field
+   std::unordered_map<locator::tablet_replica, std::unordered_set<utils::UUID>> _finished_tasks;

public:
    view_building_coordinator(replica::database& db, raft::server& raft, service::raft_group0& group0,

@@ -86,9 +86,11 @@ private:
    future<> commit_mutations(service::group0_guard guard, utils::chunked_vector<mutation> mutations, std::string_view description);
    void handle_coordinator_error(std::exception_ptr eptr);

+   future<> finished_task_gc_fiber();
+   future<> clean_finished_tasks();
+
    future<std::optional<service::group0_guard>> update_state(service::group0_guard guard);
-   // Returns if any new tasks were started
-   future<bool> work_on_view_building(service::group0_guard guard);
+   future<> work_on_view_building(service::group0_guard guard);

    future<> mark_view_build_status_started(const service::group0_guard& guard, table_id view_id, utils::chunked_vector<mutation>& out);
    future<> mark_all_remaining_view_build_statuses_started(const service::group0_guard& guard, table_id base_id, utils::chunked_vector<mutation>& out);

@@ -97,10 +99,8 @@ private:
    std::set<locator::tablet_replica> get_replicas_with_tasks();
    std::vector<utils::UUID> select_tasks_for_replica(locator::tablet_replica replica);

-   future<utils::chunked_vector<mutation>> start_tasks(const service::group0_guard& guard, std::vector<utils::UUID> tasks);
-   void attach_to_started_tasks(const locator::tablet_replica& replica, std::vector<utils::UUID> tasks);
-   future<std::optional<remote_work_results>> work_on_tasks(locator::tablet_replica replica, std::vector<utils::UUID> tasks);
-   future<utils::chunked_vector<mutation>> update_state_after_work_is_done(const service::group0_guard& guard, const locator::tablet_replica& replica, remote_work_results results);
+   void start_remote_worker(const locator::tablet_replica& replica, std::vector<utils::UUID> tasks);
+   future<std::optional<std::vector<utils::UUID>>> work_on_tasks(locator::tablet_replica replica, std::vector<utils::UUID> tasks);
};

void abort_view_building_tasks(const db::view::view_building_state_machine& vb_sm,
@@ -13,10 +13,10 @@ namespace db {

namespace view {

-view_building_task::view_building_task(utils::UUID id, task_type type, task_state state, table_id base_id, std::optional<table_id> view_id, locator::tablet_replica replica, dht::token last_token)
+view_building_task::view_building_task(utils::UUID id, task_type type, bool aborted, table_id base_id, std::optional<table_id> view_id, locator::tablet_replica replica, dht::token last_token)
    : id(id)
    , type(type)
-   , state(state)
+   , aborted(aborted)
    , base_id(base_id)
    , view_id(view_id)
    , replica(replica)

@@ -49,30 +49,6 @@ seastar::sstring task_type_to_sstring(view_building_task::task_type type) {
    }
}

-view_building_task::task_state task_state_from_string(std::string_view str) {
-   if (str == "IDLE") {
-       return view_building_task::task_state::idle;
-   }
-   if (str == "STARTED") {
-       return view_building_task::task_state::started;
-   }
-   if (str == "ABORTED") {
-       return view_building_task::task_state::aborted;
-   }
-   throw std::runtime_error(fmt::format("Unknown view building task state: {}", str));
-}
-
-seastar::sstring task_state_to_sstring(view_building_task::task_state state) {
-   switch (state) {
-   case view_building_task::task_state::idle:
-       return "IDLE";
-   case view_building_task::task_state::started:
-       return "STARTED";
-   case view_building_task::task_state::aborted:
-       return "ABORTED";
-   }
-}
-
std::optional<std::reference_wrapper<const view_building_task>> view_building_state::get_task(table_id base_id, locator::tablet_replica replica, utils::UUID id) const {
    if (!tasks_state.contains(base_id) || !tasks_state.at(base_id).contains(replica)) {
        return {};

@@ -151,46 +127,6 @@ std::map<dht::token, std::vector<view_building_task>> view_building_state::colle
    return tasks;
}

-// Returns all tasks for `_vb_sm.building_state.currently_processed_base_table` and `replica` with `STARTED` state.
-std::vector<utils::UUID> view_building_state::get_started_tasks(table_id base_table_id, locator::tablet_replica replica) const {
-   if (!tasks_state.contains(base_table_id) || !tasks_state.at(base_table_id).contains(replica)) {
-       // No tasks for this replica
-       return {};
-   }
-
-   std::vector<view_building_task> tasks;
-   auto& replica_tasks = tasks_state.at(base_table_id).at(replica);
-   for (auto& [_, view_tasks]: replica_tasks.view_tasks) {
-       for (auto& [_, task]: view_tasks) {
-           if (task.state == view_building_task::task_state::started) {
-               tasks.push_back(task);
-           }
-       }
-   }
-   for (auto& [_, task]: replica_tasks.staging_tasks) {
-       if (task.state == view_building_task::task_state::started) {
-           tasks.push_back(task);
-       }
-   }
-
-   // All collected tasks should have the same: type, base_id and last_token,
-   // so they can be executed in the same view_building_worker::batch.
-#ifdef SEASTAR_DEBUG
-   if (!tasks.empty()) {
-       auto& task = tasks.front();
-       for (auto& t: tasks) {
-           SCYLLA_ASSERT(task.type == t.type);
-           SCYLLA_ASSERT(task.base_id == t.base_id);
-           SCYLLA_ASSERT(task.last_token == t.last_token);
-       }
-   }
-#endif
-
-   return tasks | std::views::transform([] (const view_building_task& t) {
-       return t.id;
-   }) | std::ranges::to<std::vector>();
-}
-
}

}
@@ -39,28 +39,17 @@ struct view_building_task {
        process_staging,
    };

-   // When a task is created, it starts with `IDLE` state.
-   // Then, the view building coordinator will decide to do the task and it will
-   // set the state to `STARTED`.
-   // When a task is finished the entry is removed.
-   //
-   // If a task is in progress when a tablet operation (migration/resize) starts,
-   // the task's state is set to `ABORTED`.
-   enum class task_state {
-       idle,
-       started,
-       aborted,
-   };
-
    utils::UUID id;
    task_type type;
-   task_state state;
+   bool aborted;

    table_id base_id;
    std::optional<table_id> view_id; // nullopt when task_type is `process_staging`
    locator::tablet_replica replica;
    dht::token last_token;

-   view_building_task(utils::UUID id, task_type type, task_state state,
+   view_building_task(utils::UUID id, task_type type, bool aborted,
            table_id base_id, std::optional<table_id> view_id,
            locator::tablet_replica replica, dht::token last_token);
};
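With the `task_state` enum gone, a task's lifecycle is carried by row existence plus the single `aborted` flag. A compact restatement of the new model (illustrative comment-sketch, mirroring the diff):

```cpp
struct task_lifecycle {
    bool aborted = false;  // set when a tablet migration/resize interrupts the task
    // created  -> row exists, aborted == false
    // aborted  -> row exists, aborted == true (kept, so a replacement task
    //             can be generated once the tablet operation completes)
    // finished -> row is deleted by the coordinator's GC fiber
};
```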
@@ -92,7 +81,6 @@ struct view_building_state {
    std::vector<std::reference_wrapper<const view_building_task>> get_tasks_for_host(table_id base_id, locator::host_id host) const;
    std::map<dht::token, std::vector<view_building_task>> collect_tasks_by_last_token(table_id base_table_id) const;
    std::map<dht::token, std::vector<view_building_task>> collect_tasks_by_last_token(table_id base_table_id, const locator::tablet_replica& replica) const;
-   std::vector<utils::UUID> get_started_tasks(table_id base_table_id, locator::tablet_replica replica) const;
};

// Represents global state of tablet-based views.

@@ -113,18 +101,8 @@ struct view_building_state_machine {
    condition_variable event;
};

-struct view_task_result {
-   enum class command_status: uint8_t {
-       success = 0,
-       abort = 1,
-   };
-   db::view::view_task_result::command_status status;
-};
-
view_building_task::task_type task_type_from_string(std::string_view str);
seastar::sstring task_type_to_sstring(view_building_task::task_type type);
-view_building_task::task_state task_state_from_string(std::string_view str);
-seastar::sstring task_state_to_sstring(view_building_task::task_state state);

} // namespace view_building

@@ -136,17 +114,11 @@ template <> struct fmt::formatter<db::view::view_building_task::task_type> : fmt
    }
};

-template <> struct fmt::formatter<db::view::view_building_task::task_state> : fmt::formatter<string_view> {
-   auto format(db::view::view_building_task::task_state state, fmt::format_context& ctx) const {
-       return fmt::format_to(ctx.out(), "{}", db::view::task_state_to_sstring(state));
-   }
-};
-
template <> struct fmt::formatter<db::view::view_building_task> : fmt::formatter<string_view> {
    auto format(db::view::view_building_task task, fmt::format_context& ctx) const {
        auto view_id = task.view_id ? fmt::to_string(*task.view_id) : "nullopt";
-       return fmt::format_to(ctx.out(), "view_building_task{{type: {}, state: {}, base_id: {}, view_id: {}, last_token: {}}}",
-               task.type, task.state, task.base_id, view_id, task.last_token);
+       return fmt::format_to(ctx.out(), "view_building_task{{type: {}, aborted: {}, base_id: {}, view_id: {}, last_token: {}}}",
+               task.type, task.aborted, task.base_id, view_id, task.last_token);
    }
};

@@ -161,18 +133,3 @@ template <> struct fmt::formatter<db::view::replica_tasks> : fmt::formatter<stri
        return fmt::format_to(ctx.out(), "{{view_tasks: {}, staging_tasks: {}}}", replica_tasks.view_tasks, replica_tasks.staging_tasks);
    }
};

-template <> struct fmt::formatter<db::view::view_task_result> : fmt::formatter<string_view> {
-   auto format(db::view::view_task_result result, fmt::format_context& ctx) const {
-       std::string_view res;
-       switch (result.status) {
-       case db::view::view_task_result::command_status::success:
-           res = "success";
-           break;
-       case db::view::view_task_result::command_status::abort:
-           res = "abort";
-           break;
-       }
-       return format_to(ctx.out(), "{}", res);
-   }
-};
@@ -25,8 +25,8 @@ view_building_task_mutation_builder& view_building_task_mutation_builder::set_ty
    _m.set_clustered_cell(get_ck(id), "type", data_value(task_type_to_sstring(type)), _ts);
    return *this;
}
-view_building_task_mutation_builder& view_building_task_mutation_builder::set_state(utils::UUID id, db::view::view_building_task::task_state state) {
-   _m.set_clustered_cell(get_ck(id), "state", data_value(task_state_to_sstring(state)), _ts);
+view_building_task_mutation_builder& view_building_task_mutation_builder::set_aborted(utils::UUID id, bool aborted) {
+   _m.set_clustered_cell(get_ck(id), "aborted", data_value(aborted), _ts);
    return *this;
}
view_building_task_mutation_builder& view_building_task_mutation_builder::set_base_id(utils::UUID id, table_id base_id) {

@@ -32,7 +32,7 @@ public:
    static utils::UUID new_id();

    view_building_task_mutation_builder& set_type(utils::UUID id, db::view::view_building_task::task_type type);
-   view_building_task_mutation_builder& set_state(utils::UUID id, db::view::view_building_task::task_state state);
+   view_building_task_mutation_builder& set_aborted(utils::UUID id, bool aborted);
    view_building_task_mutation_builder& set_base_id(utils::UUID id, table_id base_id);
    view_building_task_mutation_builder& set_view_id(utils::UUID id, table_id view_id);
    view_building_task_mutation_builder& set_last_token(utils::UUID id, dht::token last_token);
@@ -22,6 +22,7 @@
|
||||
#include "replica/database.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
#include "service/raft/raft_group0.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "idl/view.dist.hh"
|
||||
#include "sstables/sstables.hh"
|
||||
@@ -114,11 +115,11 @@ static locator::tablet_id get_sstable_tablet_id(const locator::tablet_map& table
|
||||
return tablet_id;
|
||||
}
|
||||
|
||||
view_building_worker::view_building_worker(replica::database& db, db::system_keyspace& sys_ks, service::migration_notifier& mnotifier, service::raft_group0_client& group0_client, view_update_generator& vug, netw::messaging_service& ms, view_building_state_machine& vbsm)
|
||||
view_building_worker::view_building_worker(replica::database& db, db::system_keyspace& sys_ks, service::migration_notifier& mnotifier, service::raft_group0& group0, view_update_generator& vug, netw::messaging_service& ms, view_building_state_machine& vbsm)
|
||||
: _db(db)
|
||||
, _sys_ks(sys_ks)
|
||||
, _mnotifier(mnotifier)
|
||||
, _group0_client(group0_client)
|
||||
, _group0(group0)
|
||||
, _vug(vug)
|
||||
, _messaging(ms)
|
||||
, _vb_state_machine(vbsm)
|
||||
@@ -145,6 +146,7 @@ future<> view_building_worker::drain() {
|
||||
if (!_as.abort_requested()) {
|
||||
_as.request_abort();
|
||||
}
|
||||
_state._mutex.broken();
|
||||
_staging_sstables_mutex.broken();
|
||||
_sstables_to_register_event.broken();
|
||||
if (this_shard_id() == 0) {
|
||||
@@ -154,8 +156,7 @@ future<> view_building_worker::drain() {
|
||||
co_await std::move(state_observer);
|
||||
co_await _mnotifier.unregister_listener(this);
|
||||
}
|
||||
co_await _state.clear_state();
|
||||
_state.state_updated_cv.broken();
|
||||
co_await _state.clear();
|
||||
co_await uninit_messaging_service();
|
||||
}
|
||||
|
||||
@@ -224,22 +225,22 @@ future<> view_building_worker::create_staging_sstable_tasks() {
|
||||
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
|
||||
auto guard = co_await _group0_client.start_operation(_as);
|
||||
auto guard = co_await _group0.client().start_operation(_as);
|
||||
auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
|
||||
for (auto& [table_id, sst_infos]: _sstables_to_register) {
|
||||
for (auto& sst_info: sst_infos) {
|
||||
view_building_task task {
|
||||
utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, view_building_task::task_state::idle,
|
||||
utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, false,
|
||||
table_id, ::table_id{}, {my_host_id, sst_info.shard}, sst_info.last_token
|
||||
};
|
||||
auto mut = co_await _group0_client.sys_ks().make_view_building_task_mutation(guard.write_timestamp(), task);
|
||||
auto mut = co_await _group0.client().sys_ks().make_view_building_task_mutation(guard.write_timestamp(), task);
|
||||
cmuts.emplace_back(std::move(mut));
|
||||
}
|
||||
}
|
||||
|
||||
vbw_logger.debug("Creating {} process_staging view_building_tasks", cmuts.size());
|
||||
auto cmd = _group0_client.prepare_command(service::write_mutations{std::move(cmuts)}, guard, "create view building tasks");
|
||||
co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _as);
|
||||
auto cmd = _group0.client().prepare_command(service::write_mutations{std::move(cmuts)}, guard, "create view building tasks");
|
||||
co_await _group0.client().add_entry(std::move(cmd), std::move(guard), _as);
|
||||
|
||||
// Move staging sstables from `_sstables_to_register` (on shard0) to `_staging_sstables` on corresponding shards.
|
||||
// Firstly reorgenize `_sstables_to_register` for easier movement.
|
||||
@@ -340,22 +341,16 @@ future<> view_building_worker::run_view_building_state_observer() {
|
||||
|
||||
while (!_as.abort_requested()) {
|
||||
bool sleep = false;
|
||||
_state.some_batch_finished = false;
|
||||
try {
|
||||
vbw_logger.trace("view_building_state_observer() iteration");
|
||||
auto read_apply_mutex_holder = co_await _group0_client.hold_read_apply_mutex(_as);
|
||||
auto read_apply_mutex_holder = co_await _group0.client().hold_read_apply_mutex(_as);
|
||||
|
||||
co_await update_built_views();
|
||||
co_await update_building_state();
|
||||
co_await check_for_aborted_tasks();
|
||||
_as.check();
|
||||
|
||||
read_apply_mutex_holder.return_all();
|
||||
|
||||
// A batch could finished its work while the worker was
|
||||
// updating the state. In that case we should do another iteration.
|
||||
if (!_state.some_batch_finished) {
|
||||
co_await _vb_state_machine.event.wait();
|
||||
}
|
||||
co_await _vb_state_machine.event.wait();
|
||||
} catch (abort_requested_exception&) {
|
||||
} catch (broken_condition_variable&) {
|
||||
} catch (...) {
|
||||
@@ -382,7 +377,7 @@ future<> view_building_worker::update_built_views() {
|
||||
auto schema = _db.find_schema(table_id);
|
||||
return std::make_pair(schema->ks_name(), schema->cf_name());
|
||||
};
|
||||
auto& sys_ks = _group0_client.sys_ks();
|
||||
auto& sys_ks = _group0.client().sys_ks();
|
||||
|
||||
std::set<std::pair<sstring, sstring>> built_views;
|
||||
for (auto& [id, statuses]: _vb_state_machine.views_state.status_map) {
|
||||
@@ -411,22 +406,35 @@ future<> view_building_worker::update_built_views() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> view_building_worker::update_building_state() {
|
||||
co_await _state.update(*this);
|
||||
co_await _state.finish_completed_tasks();
|
||||
_state.state_updated_cv.broadcast();
|
||||
}
|
||||
// Must be executed on shard0
|
||||
future<> view_building_worker::check_for_aborted_tasks() {
|
||||
return container().invoke_on_all([building_state = _vb_state_machine.building_state] (view_building_worker& vbw) -> future<> {
|
||||
auto lock = co_await get_units(vbw._state._mutex, 1, vbw._as);
|
||||
co_await vbw._state.update_processing_base_table(vbw._db, building_state, vbw._as);
|
||||
if (!vbw._state._batch) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
bool view_building_worker::is_shard_free(shard_id shard) {
|
||||
return !std::ranges::any_of(_state.tasks_map, [&shard] (auto& task_entry) {
|
||||
return task_entry.second->replica.shard == shard && task_entry.second->state == view_building_worker::batch_state::in_progress;
|
||||
auto my_host_id = vbw._db.get_token_metadata().get_topology().my_host_id();
|
||||
auto my_replica = locator::tablet_replica{my_host_id, this_shard_id()};
|
||||
auto tasks_map = vbw._state._batch->tasks; // Potentially, we'll remove elements from the map, so we need a copy to iterate over it
|
||||
for (auto& [id, t]: tasks_map) {
|
||||
auto task_opt = building_state.get_task(t.base_id, my_replica, id);
|
||||
if (!task_opt || task_opt->get().aborted) {
|
||||
co_await vbw._state._batch->abort_task(id);
|
||||
}
|
||||
}
|
||||
|
||||
if (vbw._state._batch->tasks.empty()) {
|
||||
co_await vbw._state.clean_up_after_batch();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void view_building_worker::init_messaging_service() {
|
||||
ser::view_rpc_verbs::register_work_on_view_building_tasks(&_messaging, [this] (std::vector<utils::UUID> ids) -> future<std::vector<view_task_result>> {
|
||||
return container().invoke_on(0, [ids = std::move(ids)] (view_building_worker& vbw) mutable -> future<std::vector<view_task_result>> {
|
||||
return vbw.work_on_tasks(std::move(ids));
|
||||
ser::view_rpc_verbs::register_work_on_view_building_tasks(&_messaging, [this] (raft::term_t term, shard_id shard, std::vector<utils::UUID> ids) -> future<std::vector<utils::UUID>> {
|
||||
return container().invoke_on(shard, [term, ids = std::move(ids)] (auto& vbw) mutable -> future<std::vector<utils::UUID>> {
|
||||
return vbw.work_on_tasks(term, std::move(ids));
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -435,236 +443,53 @@ future<> view_building_worker::uninit_messaging_service() {
return ser::view_rpc_verbs::unregister(&_messaging);
}

future<std::vector<view_task_result>> view_building_worker::work_on_tasks(std::vector<utils::UUID> ids) {
vbw_logger.debug("Got request for results of tasks: {}", ids);
auto guard = co_await _group0_client.start_operation(_as, service::raft_timeout{});
auto processing_base_table = _state.processing_base_table;

auto are_tasks_finished = [&] () {
return std::ranges::all_of(ids, [this] (const utils::UUID& id) {
return _state.finished_tasks.contains(id) || _state.aborted_tasks.contains(id);
});
};

auto get_results = [&] () -> std::vector<view_task_result> {
std::vector<view_task_result> results;
for (const auto& id: ids) {
if (_state.finished_tasks.contains(id)) {
results.emplace_back(view_task_result::command_status::success);
} else if (_state.aborted_tasks.contains(id)) {
results.emplace_back(view_task_result::command_status::abort);
} else {
// There is no recorded status for this task (most likely it was aborted and cleaned up). Throw an error,
// so the coordinator will refresh its state and retry without aborted IDs.
throw std::runtime_error(fmt::format("No status for task {}", id));
}
}
return results;
};

if (are_tasks_finished()) {
// If the batch is already finished, we can return the results immediately.
vbw_logger.debug("Batch with tasks {} is already finished, returning results", ids);
co_return get_results();
}

// All of the tasks should be executed in the same batch
// (their statuses are set to started in the same group0 operation).
// If any ID is not present in the `tasks_map`, it means that it was aborted and we should fail this RPC call,
// so the coordinator can retry without aborted IDs.
// That's why we can identify the batch by an arbitrary (.front()) ID from the `ids` vector.
auto id = ids.front();
while (!_state.tasks_map.contains(id) && processing_base_table == _state.processing_base_table) {
vbw_logger.warn("Batch with task {} is not found in tasks map, waiting until worker updates its state", id);
service::release_guard(std::move(guard));
co_await _state.state_updated_cv.wait();
guard = co_await _group0_client.start_operation(_as, service::raft_timeout{});
}

if (processing_base_table != _state.processing_base_table) {
// If the processing base table was changed, we should fail this RPC call because the tasks were aborted.
throw std::runtime_error(fmt::format("Processing base table was changed to {}", _state.processing_base_table));
}

// Validate that none of the IDs was aborted.
for (const auto& tid: ids) {
if (!_state.tasks_map[id]->tasks.contains(tid)) {
vbw_logger.warn("Task {} is not found in the batch", tid);
throw std::runtime_error(fmt::format("Task {} is not found in the batch", tid));
}
}

if (_state.tasks_map[id]->state == view_building_worker::batch_state::idle) {
vbw_logger.debug("Starting batch with tasks {}", _state.tasks_map[id]->tasks);
if (!is_shard_free(_state.tasks_map[id]->replica.shard)) {
throw std::runtime_error(fmt::format("Tried to start view building tasks ({}) on shard {} but the shard is busy", _state.tasks_map[id]->tasks, _state.tasks_map[id]->replica.shard));
}
_state.tasks_map[id]->start();
}

service::release_guard(std::move(guard));
while (!_as.abort_requested()) {
auto read_apply_mutex_holder = co_await _group0_client.hold_read_apply_mutex(_as);

if (are_tasks_finished()) {
co_return get_results();
}

// Check if the batch is still alive
if (!_state.tasks_map.contains(id)) {
throw std::runtime_error(fmt::format("Batch with task {} is not found in tasks map anymore.", id));
}

read_apply_mutex_holder.return_all();
co_await _state.tasks_map[id]->batch_done_cv.wait();
}
throw std::runtime_error("View building worker was aborted");
}

// Validates whether two tasks can be executed in the same batch on the same shard.
static bool validate_can_be_one_batch(const view_building_task& t1, const view_building_task& t2) {
return t1.type == t2.type && t1.base_id == t2.base_id && t1.replica == t2.replica && t1.last_token == t2.last_token;
}

static std::unordered_set<table_id> get_ids_of_all_views(replica::database& db, table_id table_id) {
return db.find_column_family(table_id).views() | std::views::transform([] (view_ptr vptr) {
return vptr->id();
}) | std::ranges::to<std::unordered_set>();
}

future<> view_building_worker::local_state::flush_table(view_building_worker& vbw, table_id table_id) {
// `table_id` should point to the currently processing base table but
// `view_building_worker::local_state::processing_base_table` may not be set to it yet,
// so we need to pass it directly
co_await vbw.container().invoke_on_all([table_id] (view_building_worker& local_vbw) -> future<> {
auto base_cf = local_vbw._db.find_column_family(table_id).shared_from_this();
co_await when_all(base_cf->await_pending_writes(), base_cf->await_pending_streams());
co_await flush_base(base_cf, local_vbw._as);
});

flushed_views = get_ids_of_all_views(vbw._db, table_id);
}

future<> view_building_worker::local_state::update(view_building_worker& vbw) {
const auto& vb_state = vbw._vb_state_machine.building_state;

// Check if the base table to process was changed.
// If so, we clear the state, aborting tasks for the previous base table and starting new ones for the new base table.
if (processing_base_table != vb_state.currently_processed_base_table) {
co_await clear_state();

if (vb_state.currently_processed_base_table) {
// When we start to process a new base table, we need to flush its current data, so we can build the view.
co_await flush_table(vbw, *vb_state.currently_processed_base_table);
}

processing_base_table = vb_state.currently_processed_base_table;
vbw_logger.info("Processing base table was changed to: {}", processing_base_table);
}

if (!processing_base_table) {
vbw_logger.debug("No base table is selected to be processed.");
co_return;
}

std::vector<table_id> new_views;
auto all_view_ids = get_ids_of_all_views(vbw._db, *processing_base_table);
std::ranges::set_difference(all_view_ids, flushed_views, std::back_inserter(new_views));
if (!new_views.empty()) {
// Flush the base table again if any new view was created, so the view building tasks will see up-to-date sstables.
// Otherwise, we may lose mutations created after the previous flush but before the new view was created.
co_await flush_table(vbw, *processing_base_table);
}

auto erm = vbw._db.find_column_family(*processing_base_table).get_effective_replication_map();
auto my_host_id = erm->get_topology().my_host_id();
auto current_tasks_for_this_host = vb_state.get_tasks_for_host(*processing_base_table, my_host_id);

// Scan the view building state, collect alive and new (in STARTED state but not started by this worker) tasks.
std::unordered_map<shard_id, std::vector<view_building_task>> new_tasks;
std::unordered_set<utils::UUID> alive_tasks; // save information about alive tasks to clean up done/aborted ones
for (auto& task_ref: current_tasks_for_this_host) {
auto& task = task_ref.get();
auto id = task.id;

if (task.state != view_building_task::task_state::aborted) {
alive_tasks.insert(id);
}

if (tasks_map.contains(id) || finished_tasks.contains(id)) {
continue;
}
else if (task.state == view_building_task::task_state::started) {
auto shard = task.replica.shard;
if (new_tasks.contains(shard) && !validate_can_be_one_batch(new_tasks[shard].front(), task)) {
// Currently we allow only one batch per shard at a time
on_internal_error(vbw_logger, fmt::format("Got incompatible tasks for the same shard. Task: {}, other: {}", new_tasks[shard].front(), task));
}
new_tasks[shard].push_back(task);
}
co_await coroutine::maybe_yield();
}

auto tasks_map_copy = tasks_map;

// Clear aborted tasks from tasks_map
for (auto it = tasks_map_copy.begin(); it != tasks_map_copy.end();) {
if (!alive_tasks.contains(it->first)) {
vbw_logger.debug("Aborting task {}", it->first);
aborted_tasks.insert(it->first);
co_await it->second->abort_task(it->first);
it = tasks_map_copy.erase(it);
} else {
++it;
}
}

// Create batches for new tasks
for (const auto& [shard, shard_tasks]: new_tasks) {
auto tasks = shard_tasks | std::views::transform([] (const view_building_task& t) {
return std::make_pair(t.id, t);
}) | std::ranges::to<std::unordered_map>();
auto batch = seastar::make_shared<view_building_worker::batch>(vbw.container(), tasks, shard_tasks.front().base_id, shard_tasks.front().replica);

for (auto& [id, _]: tasks) {
tasks_map_copy.insert({id, batch});
}
co_await coroutine::maybe_yield();
}

tasks_map = std::move(tasks_map_copy);
}

future<> view_building_worker::local_state::finish_completed_tasks() {
for (auto it = tasks_map.begin(); it != tasks_map.end();) {
if (it->second->state == view_building_worker::batch_state::idle) {
++it;
} else if (it->second->state == view_building_worker::batch_state::in_progress) {
vbw_logger.debug("Task {} is still in progress", it->first);
++it;
} else {
co_await it->second->work.get_future();
finished_tasks.insert(it->first);
vbw_logger.info("Task {} was completed", it->first);
it->second->batch_done_cv.broadcast();
it = tasks_map.erase(it);
// If `state::processing_base_table` is different from `view_building_state::currently_processed_base_table`,
// clear the state, then save and flush the new base table
future<> view_building_worker::state::update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as) {
if (processing_base_table != building_state.currently_processed_base_table) {
co_await clear();
if (building_state.currently_processed_base_table) {
co_await flush_base_table(db, *building_state.currently_processed_base_table, as);
}
processing_base_table = building_state.currently_processed_base_table;
}
}

future<> view_building_worker::local_state::clear_state() {
for (auto& [_, batch]: tasks_map) {
co_await batch->abort();
// If the `_batch` ptr points to a valid object, co_await its `work` future, save the completed tasks and delete the object
future<> view_building_worker::state::clean_up_after_batch() {
if (_batch) {
co_await std::move(_batch->work);
for (auto& [id, _]: _batch->tasks) {
completed_tasks.insert(id);
}
_batch = nullptr;
}
}

// Flush the base table, set it as the currently processing base table and save which views exist at the time of the flush
future<> view_building_worker::state::flush_base_table(replica::database& db, table_id base_table_id, abort_source& as) {
auto cf = db.find_column_family(base_table_id).shared_from_this();
co_await when_all(cf->await_pending_writes(), cf->await_pending_streams());
co_await flush_base(cf, as);
processing_base_table = base_table_id;
flushed_views = get_ids_of_all_views(db, base_table_id);
}

future<> view_building_worker::state::clear() {
if (_batch) {
_batch->as.request_abort();
co_await std::move(_batch->work);
_batch = nullptr;
}
processing_base_table.reset();
completed_tasks.clear();
flushed_views.clear();
tasks_map.clear();
finished_tasks.clear();
aborted_tasks.clear();
state_updated_cv.broadcast();
some_batch_finished = false;
vbw_logger.debug("View building worker state was cleared.");
}

view_building_worker::batch::batch(sharded<view_building_worker>& vbw, std::unordered_map<utils::UUID, view_building_task> tasks, table_id base_id, locator::tablet_replica replica)
@@ -674,17 +499,12 @@ view_building_worker::batch::batch(sharded<view_building_worker>& vbw, std::unor
, _vbw(vbw) {}

void view_building_worker::batch::start() {
if (this_shard_id() != 0) {
on_internal_error(vbw_logger, "view_building_worker::batch should be started on shard0");
if (this_shard_id() != replica.shard) {
on_internal_error(vbw_logger, "view_building_worker::batch should be started on replica shard");
}

state = batch_state::in_progress;
work = smp::submit_to(replica.shard, [this] () -> future<> {
return do_work();
}).finally([this] () {
state = batch_state::finished;
_vbw.local()._state.some_batch_finished = true;
_vbw.local()._vb_state_machine.event.broadcast();
work = do_work().finally([this] {
promise.set_value();
});
}

@@ -699,10 +519,6 @@ future<> view_building_worker::batch::abort() {
co_await smp::submit_to(replica.shard, [this] () {
as.request_abort();
});

if (work.valid()) {
co_await work.get_future();
}
}

future<> view_building_worker::batch::do_work() {
@@ -896,6 +712,124 @@ void view_building_worker::cleanup_staging_sstables(locator::effective_replicati
_staging_sstables[table_id].erase(first, last);
}

future<view_building_state> view_building_worker::get_latest_view_building_state(raft::term_t term) {
return smp::submit_to(0, [&sharded_vbw = container(), term] () -> future<view_building_state> {
auto& vbw = sharded_vbw.local();
// auto guard = vbw._group0.client().start_operation(vbw._as);

auto& raft_server = vbw._group0.group0_server();
auto group0_holder = vbw._group0.hold_group0_gate();
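// The read barrier guarantees we observe every group0 entry committed before
// this call; the term check then rejects requests from a coordinator whose
// term is no longer current, so a stale coordinator fails fast instead of
// acting on outdated state.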
co_await raft_server.read_barrier(&vbw._as);
if (raft_server.get_current_term() != term) {
throw std::runtime_error(fmt::format("Invalid raft term. Got {} but current term is {}", term, raft_server.get_current_term()));
}

co_return vbw._vb_state_machine.building_state;
});
}

future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_t term, std::vector<utils::UUID> ids) {
auto collect_completed_tasks = [&] {
std::vector<utils::UUID> completed;
for (auto& id: ids) {
if (_state.completed_tasks.contains(id)) {
completed.push_back(id);
}
}
return completed;
};

auto lock = co_await get_units(_state._mutex, 1, _as);
// First, check whether there is any batch that has finished but wasn't cleaned up yet.
if (_state._batch && _state._batch->promise.available()) {
co_await _state.clean_up_after_batch();
}

// Check if the tasks were already completed.
// If only part of the tasks were finished, return the subset and don't execute the remaining tasks.
std::vector<utils::UUID> completed = collect_completed_tasks();
if (!completed.empty()) {
co_return completed;
}
lock.return_all();
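// The lock is released above because the call below hops to shard 0 and
// performs a Raft read barrier, which may take a while; holding _mutex
// across it would block other task RPCs on this shard.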

auto building_state = co_await get_latest_view_building_state(term);

lock = co_await get_units(_state._mutex, 1, _as);
co_await _state.update_processing_base_table(_db, building_state, _as);
// If there is no running batch, create it.
if (!_state._batch) {
if (!_state.processing_base_table) {
throw std::runtime_error("view_building_worker::state::processing_base_table needs to be set to work on view building");
}

auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
auto my_replica = locator::tablet_replica{my_host_id, this_shard_id()};
std::unordered_map<utils::UUID, view_building_task> tasks;
for (auto& id: ids) {
auto task_opt = building_state.get_task(*_state.processing_base_table, my_replica, id);
if (!task_opt) {
throw std::runtime_error(fmt::format("Task {} was not found for base table {} on replica {}", id, *building_state.currently_processed_base_table, my_replica));
}
tasks.insert({id, *task_opt});
}
#ifdef SEASTAR_DEBUG
auto& some_task = tasks.begin()->second;
for (auto& [_, t]: tasks) {
SCYLLA_ASSERT(t.base_id == some_task.base_id);
SCYLLA_ASSERT(t.last_token == some_task.last_token);
SCYLLA_ASSERT(t.replica == some_task.replica);
SCYLLA_ASSERT(t.type == some_task.type);
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
}
#endif

// If any view was added after we did the initial flush, we need to do it again
if (std::ranges::any_of(tasks | std::views::values, [&] (const view_building_task& t) {
return t.view_id && !_state.flushed_views.contains(*t.view_id);
})) {
co_await _state.flush_base_table(_db, *_state.processing_base_table, _as);
}

// Create and start the batch
_state._batch = std::make_unique<batch>(container(), std::move(tasks), *building_state.currently_processed_base_table, my_replica);
_state._batch->start();
}

if (std::ranges::all_of(ids, [&] (auto& id) { return !_state._batch->tasks.contains(id); })) {
throw std::runtime_error(fmt::format(
"None of the tasks requested to work on are executed in the current view building batch. Batch executes: {}, the RPC requested: {}",
_state._batch->tasks | std::views::keys, ids));
}
auto batch_future = _state._batch->promise.get_shared_future();
lock.return_all();

co_await std::move(batch_future);

lock = co_await get_units(_state._mutex, 1, _as);
co_await _state.clean_up_after_batch();
co_return collect_completed_tasks();
}

}

}

@@ -16,6 +16,7 @@
#include <unordered_set>
#include "locator/abstract_replication_strategy.hh"
#include "locator/tablets.hh"
#include "raft/raft.hh"
#include "seastar/core/gate.hh"
#include "db/view/view_building_state.hh"
#include "sstables/shared_sstable.hh"
@@ -31,7 +32,7 @@ class messaging_service;
}

namespace service {
class raft_group0_client;
class raft_group0;
}

namespace db {
@@ -65,27 +66,16 @@ class view_building_worker : public seastar::peering_sharded_service<view_buildi
*
* When the `work` future is finished, it means all tasks in `tasks_ids` are done.
*
* The batch lives on shard 0 exclusively.
* When the batch starts to execute its tasks, it firstly copies all necessary data
* to the designated shard, then the work is done on the local copy of the data only.
* The batch lives exclusively on the shard where it executes its work.
*/

enum class batch_state {
idle,
in_progress,
finished,
};

class batch {
public:
batch_state state = batch_state::idle;
table_id base_id;
locator::tablet_replica replica;
std::unordered_map<utils::UUID, view_building_task> tasks;

shared_future<> work;
condition_variable batch_done_cv;
// The abort source has to be used only on `replica.shard`
shared_promise<> promise;
future<> work = make_ready_future();
abort_source as;

batch(sharded<view_building_worker>& vbw, std::unordered_map<utils::UUID, view_building_task> tasks, table_id base_id, locator::tablet_replica replica);
@@ -101,35 +91,18 @@ class view_building_worker : public seastar::peering_sharded_service<view_buildi

friend class batch;

struct local_state {
struct state {
std::optional<table_id> processing_base_table = std::nullopt;
// Stores ids of views for which the flush was done.
// When a new view is created, we need to flush the base table again,
// as data might be inserted.
std::unordered_set<utils::UUID> completed_tasks;
std::unique_ptr<batch> _batch = nullptr;
std::unordered_set<table_id> flushed_views;
std::unordered_map<utils::UUID, shared_ptr<batch>> tasks_map;

std::unordered_set<utils::UUID> finished_tasks;
std::unordered_set<utils::UUID> aborted_tasks;

bool some_batch_finished = false;
condition_variable state_updated_cv;

// Clears completed/aborted tasks and creates batches (without starting them) for started tasks.
// Returns a map of tasks per shard to execute.
future<> update(view_building_worker& vbw);

future<> finish_completed_tasks();

// The state can be aborted if, for example, a view is dropped; then all its tasks
// are aborted and the coordinator may choose a new base table to process.
// This method aborts all batches as we stop processing the current base table.
future<> clear_state();

// Flush the table with `table_id` on all shards.
// This method should be used only on the currently processing base table and
// it updates the `flushed_views` field.
future<> flush_table(view_building_worker& vbw, table_id table_id);
semaphore _mutex = semaphore(1);
// All of the methods below should be executed while holding a `_mutex` unit!
future<> update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as);
future<> flush_base_table(replica::database& db, table_id base_table_id, abort_source& as);
future<> clean_up_after_batch();
future<> clear();
};

// Wrapper which represents information needed to create
@@ -147,14 +120,14 @@ private:
replica::database& _db;
db::system_keyspace& _sys_ks;
service::migration_notifier& _mnotifier;
service::raft_group0_client& _group0_client;
service::raft_group0& _group0;
view_update_generator& _vug;
netw::messaging_service& _messaging;
view_building_state_machine& _vb_state_machine;
abort_source _as;
named_gate _gate;

local_state _state;
state _state;
std::unordered_set<table_id> _views_in_progress;
future<> _view_building_state_observer = make_ready_future<>();

@@ -166,7 +139,7 @@ private:

public:
view_building_worker(replica::database& db, db::system_keyspace& sys_ks, service::migration_notifier& mnotifier,
service::raft_group0_client& group0_client, view_update_generator& vug, netw::messaging_service& ms,
service::raft_group0& group0, view_update_generator& vug, netw::messaging_service& ms,
view_building_state_machine& vbsm);
future<> init();

@@ -185,10 +158,11 @@ public:
void cleanup_staging_sstables(locator::effective_replication_map_ptr erm, table_id table_id, locator::tablet_id tid);

private:
future<view_building_state> get_latest_view_building_state(raft::term_t term);
future<> check_for_aborted_tasks();

future<> run_view_building_state_observer();
future<> update_built_views();
future<> update_building_state();
bool is_shard_free(shard_id shard);

dht::token_range get_tablet_token_range(table_id table_id, dht::token last_token);
future<> do_build_range(table_id base_id, std::vector<table_id> views_ids, dht::token last_token, abort_source& as);
@@ -202,7 +176,7 @@ private:

void init_messaging_service();
future<> uninit_messaging_service();
future<std::vector<view_task_result>> work_on_tasks(std::vector<utils::UUID> ids);
future<std::vector<utils::UUID>> work_on_tasks(raft::term_t term, std::vector<utils::UUID> ids);
};

}

@@ -483,7 +483,7 @@ public:
});
co_await add_partition(mutation_sink, "load", [this] () -> future<sstring> {
return map_reduce_tables<int64_t>([] (replica::table& tbl) {
return tbl.get_stats().live_disk_space_used;
return tbl.get_stats().live_disk_space_used.on_disk;
}).then([] (int64_t load) {
return format("{}", load);
});
@@ -1158,6 +1158,104 @@ private:
}
};

class tablet_sizes : public group0_virtual_table {
private:
sharded<service::tablet_allocator>& _talloc;
sharded<replica::database>& _db;
public:
tablet_sizes(sharded<service::tablet_allocator>& talloc,
sharded<replica::database>& db,
sharded<service::raft_group_registry>& raft_gr,
sharded<netw::messaging_service>& ms)
: group0_virtual_table(build_schema(), raft_gr, ms)
, _talloc(talloc)
, _db(db)
{ }

future<> execute_on_leader(std::function<void(mutation)> mutation_sink, reader_permit permit) override {
auto stats = _talloc.local().get_load_stats();
while (!stats) {
// Wait for stats to be refreshed by topology coordinator
{
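// Bound the sleep by the reader permit's timeout; awaits_guard marks the
// permit as waiting so the time spent here is accounted as idle waiting
// rather than active reading.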
abort_on_expiry aoe(permit.timeout());
reader_permit::awaits_guard ag(permit);
co_await seastar::sleep_abortable(std::chrono::milliseconds(200), aoe.abort_source());
}
if (!co_await is_leader(permit)) {
co_await redirect_to_leader(std::move(mutation_sink), std::move(permit));
co_return;
}
stats = _talloc.local().get_load_stats();
}

auto tm = _db.local().get_token_metadata_ptr();

auto prepare_replica_sizes = [] (const std::unordered_map<host_id, uint64_t>& replica_sizes) {
map_type_impl::native_type tmp;
for (auto& r: replica_sizes) {
auto replica = r.first.uuid();
int64_t tablet_size = int64_t(r.second);
auto map_element = std::make_pair<data_value, data_value>(data_value(replica), data_value(tablet_size));
tmp.push_back(std::move(map_element));
}
return tmp;
};

auto prepare_missing_replica = [] (const std::unordered_set<host_id>& missing_replicas) {
set_type_impl::native_type tmp;
for (auto& r: missing_replicas) {
tmp.push_back(data_value(r.uuid()));
}
return tmp;
};

auto map_type = map_type_impl::get_instance(uuid_type, long_type, false);
auto set_type = set_type_impl::get_instance(uuid_type, false);
for (auto&& [table, tmap] : tm->tablets().all_tables_ungrouped()) {
mutation m(schema(), make_partition_key(table));
co_await tmap->for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& tinfo) -> future<> {
auto trange = tmap->get_token_range(tid);
int64_t last_token = trange.end()->value().raw();
auto& r = m.partition().clustered_row(*schema(), clustering_key::from_single_value(*schema(), data_value(last_token).serialize_nonnull()));
const range_based_tablet_id rb_tid {table, trange};
std::unordered_map<host_id, uint64_t> replica_sizes;
std::unordered_set<host_id> missing_replicas;
for (auto& replica : tinfo.replicas) {
auto tablet_size_opt = stats->get_tablet_size(replica.host, rb_tid);
if (tablet_size_opt) {
replica_sizes[replica.host] = *tablet_size_opt;
} else {
missing_replicas.insert(replica.host);
}
}
set_cell(r.cells(), "replicas", make_map_value(map_type, prepare_replica_sizes(replica_sizes)));
set_cell(r.cells(), "missing_replicas", make_set_value(set_type, prepare_missing_replica(missing_replicas)));
return make_ready_future<>();
});

mutation_sink(m);
}
}

private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, "tablet_sizes");
return schema_builder(system_keyspace::NAME, "tablet_sizes", std::make_optional(id))
.with_column("table_id", uuid_type, column_kind::partition_key)
.with_column("last_token", long_type, column_kind::clustering_key)
.with_column("replicas", map_type_impl::get_instance(uuid_type, long_type, false))
.with_column("missing_replicas", set_type_impl::get_instance(uuid_type, false))
.with_sharder(1, 0) // shard0-only
.with_hash_version()
.build();
}

dht::decorated_key make_partition_key(table_id table) {
return dht::decorate_key(*_s, partition_key::from_single_value(
*_s, data_value(table.uuid()).serialize_nonnull()));
}
};

class cdc_timestamps_table : public streaming_virtual_table {
private:
replica::database& _db;
@@ -1353,6 +1451,7 @@ future<> initialize_virtual_tables(
co_await add_table(std::make_unique<clients_table>(ss));
co_await add_table(std::make_unique<raft_state_table>(dist_raft_gr));
co_await add_table(std::make_unique<load_per_node>(tablet_allocator, dist_db, dist_raft_gr, ms, dist_gossiper));
co_await add_table(std::make_unique<tablet_sizes>(tablet_allocator, dist_db, dist_raft_gr, ms));
co_await add_table(std::make_unique<cdc_timestamps_table>(db, ss));
co_await add_table(std::make_unique<cdc_streams_table>(db, ss));

@@ -18,6 +18,9 @@ target_link_libraries(scylla_dht
PRIVATE
replica)

if (Scylla_USE_PRECOMPILED_HEADER_USE)
target_precompile_headers(scylla_dht REUSE_FROM scylla-precompiled-header)
endif()
add_whole_archive(dht scylla_dht)

check_headers(check-headers scylla_dht
dist/common/systemd/scylla-server.slice (vendored)
@@ -6,13 +6,7 @@ Before=slices.target
MemoryAccounting=true
IOAccounting=true
CPUAccounting=true
# Systemd deprecated the settings BlockIOWeight and CPUShares, but they are still the ones used in RHEL7.
# Newer systemd wants IOWeight and CPUWeight instead. Luckily both newer and older systemd seem to
# ignore the unwanted option, so it is safest to set both. Using just the old versions would work too but
# seems less future proof. Using just the new versions does not work at all for RHEL7.
BlockIOWeight=1000
IOWeight=1000
MemorySwapMax=0
CPUShares=1000
CPUWeight=1000
docs/README-metrics.md (new file)
@@ -0,0 +1,65 @@
# ScyllaDB metrics docs scripts

The following files extract metrics from C++ source files and generate documentation:

- **`scripts/get_description.py`** - Metrics parser and extractor
- **`scripts/metrics-config.yml`** - Configuration for special cases only
- **`docs/_ext/scylladb_metrics.py`** - Sphinx extension for rendering

## Configuration

The system automatically handles most metrics extraction. You only need configuration in the `metrics-config.yml` file for:

**Complex parameter combinations:**
```yaml
"cdc/log.cc":
params:
part_name;suffix: [["static_row", "total"], ["clustering_row", "failed"]]
kind: ["total", "failed"]
```

**Multiple parameter values:**
```yaml
"service/storage_proxy.cc":
params:
_short_description_prefix: ["total_write_attempts", "write_errors"]
```

**Complex expressions:**
```yaml
"tracing/tracing.cc":
params:
"max_pending_trace_records + write_event_records_threshold": "max_pending_trace_records + write_event_records_threshold"
```

**Group assignments:**
```yaml
"cql3/query_processor.cc":
groups:
"80": query_processor
```

**Skip files:**
```yaml
"seastar/tests/unit/metrics_test.cc": skip
```

## Validation

Use the built-in validation to check all metrics files:

```bash
# Validate all metrics files
python scripts/get_description.py --validate -c scripts/metrics-config.yml

# Validate with verbose output
python scripts/get_description.py --validate -c scripts/metrics-config.yml -v
```

The GitHub workflow `docs-validate-metrics.yml` automatically runs validation on PRs to `master` that modify `.cc` files or metrics configuration.

## Common fixes

- **"Parameter not found"**: Add parameter mapping to config `params` section
- **"Could not resolve param"**: Check parameter name matches C++ code exactly
- **"No group found"**: Add group mapping or verify `add_group()` calls
@@ -27,38 +27,48 @@ class MetricsProcessor:
os.makedirs(output_directory, exist_ok=True)
return output_directory

def _process_single_file(self, file_path, destination_path, metrics_config_path):
def _process_single_file(self, file_path, destination_path, metrics_config_path, strict=False):
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
if self.MARKER in content and not os.path.exists(destination_path):
try:
metrics_file = metrics.get_metrics_from_file(file_path, "scylla", metrics.get_metrics_information(metrics_config_path))
with open(destination_path, 'w+', encoding='utf-8') as f:
json.dump(metrics_file, f, indent=4)
except SystemExit:
LOGGER.info(f'Skipping file: {file_path}')
metrics_info = metrics.get_metrics_information(metrics_config_path)
# Get relative path to the repo root
relative_path = os.path.relpath(file_path, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
old_cwd = os.getcwd()
os.chdir(repo_root)
# Get metrics from the file
try:
metrics_file = metrics.get_metrics_from_file(relative_path, "scylla_", metrics_info, strict=strict)
finally:
os.chdir(old_cwd)
if metrics_file:
with open(destination_path, 'w+', encoding='utf-8') as f:
json.dump(metrics_file, f, indent=4)
LOGGER.info(f'Generated {len(metrics_file)} metrics for {file_path}')
else:
LOGGER.info(f'No metrics generated for {file_path}')
except Exception as error:
# Remove [Errno X] prefix from error message
error_msg = str(error)
if '[Errno' in error_msg:
error_msg = error_msg.split('] ', 1)[1]
LOGGER.info(error_msg)
LOGGER.info(f'Error processing {file_path}: {str(error)}')

def _process_metrics_files(self, repo_dir, output_directory, metrics_config_path):
def _process_metrics_files(self, repo_dir, output_directory, metrics_config_path, strict=False):
for root, _, files in os.walk(repo_dir):
for file in files:
if file.endswith(".cc"):
file_path = os.path.join(root, file)
file_name = os.path.splitext(file)[0] + ".json"
destination_path = os.path.join(output_directory, file_name)
self._process_single_file(file_path, destination_path, metrics_config_path)
self._process_single_file(file_path, destination_path, metrics_config_path, strict)

def run(self, app, exception=None):
repo_dir = os.path.abspath(os.path.join(app.srcdir, ".."))
metrics_config_path = os.path.join(repo_dir, app.config.scylladb_metrics_config_path)
output_directory = self._create_output_directory(app, app.config.scylladb_metrics_directory)

self._process_metrics_files(repo_dir, output_directory, metrics_config_path)
strict_mode = getattr(app.config, 'scylladb_metrics_strict_mode', False) or False

self._process_metrics_files(repo_dir, output_directory, metrics_config_path, strict_mode)


class MetricsTemplateDirective(DataTemplateJSON):
@@ -163,7 +173,7 @@ class MetricsDirective(Directive):
output = []
try:
relative_path_from_current_rst = self._get_relative_path(metrics_directory, app, docname)
files = os.listdir(metrics_directory)
files = sorted(os.listdir(metrics_directory))
for _, file in enumerate(files):
output.extend(self._process_file(file, relative_path_from_current_rst))
except Exception as error:
@@ -174,6 +184,7 @@ def setup(app):
app.add_config_value("scylladb_metrics_directory", default="_data/metrics", rebuild="html")
app.add_config_value("scylladb_metrics_config_path", default='scripts/metrics-config.yml', rebuild="html")
app.add_config_value('scylladb_metrics_option_template', default='metrics_option.tmpl', rebuild='html', types=[str])
app.add_config_value('scylladb_metrics_strict_mode', default=None, rebuild='html', types=[bool])
app.connect("builder-inited", MetricsProcessor().run)
app.add_object_type(
'metrics_option',
@@ -183,7 +194,7 @@ def setup(app):
app.add_directive("metrics_option", MetricsOption)
app.add_directive("scylladb_metrics", MetricsDirective)

return {
"version": "0.1",
"parallel_read_safe": True,
@@ -29,6 +29,7 @@ def readable_desc_rst(description):

cleaned_line = cleaned_line.lstrip()
cleaned_line = cleaned_line.replace('"', '')
cleaned_line = cleaned_line.replace('`', '\\`')

if cleaned_line != '':
cleaned_line = indent + cleaned_line
@@ -428,3 +428,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
that can be used to achieve consistent reads on global (multi-region) tables.
This table option was added as a preview to DynamoDB in December 2024.
<https://github.com/scylladb/scylladb/issues/21852>

* Alternator does not support multi-attribute (composite) keys in GSIs.
This feature was added to DynamoDB in November 2025.
<https://github.com/scylladb/scylladb/issues/27182>
@@ -76,7 +76,7 @@ author = u"ScyllaDB Project Contributors"

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'lib', 'lib64','**/_common/*', 'README.md', 'index.md', '.git', '.github', '_utils', 'rst_include', 'venv', 'dev', '_data/**']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'lib', 'lib64','**/_common/*', 'README.md', 'README-metrics.md', 'index.md', '.git', '.github', '_utils', 'rst_include', 'venv', 'dev', '_data/**']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
@@ -79,35 +79,6 @@ and to the TRUNCATE data definition query.

In addition, the timeout parameter can be applied to SELECT queries as well.

After [enabling object storage support](../operating-scylla/admin.rst#admin-keyspace-storage-options), configure your endpoints by
following these [instructions](../operating-scylla/admin.rst#object-storage-configuration).

Now you can configure your object storage when creating a keyspace:

```cql
CREATE KEYSPACE with STORAGE = { 'type': 'S3', 'endpoint': '$endpoint_name', 'bucket': '$bucket' }
```

**Example**

```cql
CREATE KEYSPACE ks
WITH REPLICATION = { 'class' : 'NetworkTopologyStrategy', 'replication_factor' : 3 }
AND STORAGE = { 'type' : 'S3', 'bucket' : '/tmp/b1', 'endpoint' : 'localhost' } ;
```

Storage options can be inspected by checking the new system schema table: `system_schema.scylla_keyspaces`:

```cql
cassandra@cqlsh> select * from system_schema.scylla_keyspaces;

keyspace_name | storage_options | storage_type
---------------+------------------------------------------------+--------------
ksx | {'bucket': '/tmp/xx', 'endpoint': 'localhost'} | S3
```

## PRUNE MATERIALIZED VIEW statements

A special statement is dedicated for pruning ghost rows from materialized views.
@@ -312,14 +312,6 @@ Please use :ref:`Per-table tablet options <cql-per-table-tablet-options>` instea

See :doc:`Data Distribution with Tablets </architecture/tablets>` for more information about tablets.

Keyspace storage options :label-caution:`Experimental`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

By default, SSTables of a keyspace are stored locally.
As an alternative, you can configure your keyspace to be stored
on Amazon S3 or another S3-compatible object store.
See :ref:`Keyspace storage options <admin-keyspace-storage-options>` for details.

.. _consistency-option:

Keyspace ``consistency`` options :label-caution:`Experimental`
@@ -637,7 +629,7 @@ Some examples of primary key definition are:
key), and ``c`` is the clustering column.

.. note:: A *null* value is not allowed as any partition-key or clustering-key column. A Null value is *not* the same as an empty string.
.. note:: A *null* value is not allowed as any partition-key or clustering-key column. A *null* value is *not* the same as an empty string.

.. _partition-key:
@@ -67,9 +67,9 @@ Please refer to the :ref:`update parameters <update-parameters>` section for mor

.. code-block:: none

movie | director | main_actor | year
----------+---------------+------------+------
Serenity | Joseph Whedon | Unknown | null
movie | director | main_actor
----------+---------------+------------
Serenity | Joseph Whedon | Unknown

``INSERT`` is not required to assign all columns, so if two
@@ -80,7 +80,7 @@ columns effects of both statements are preserved:

INSERT INTO NerdMovies (movie, director, main_actor)
VALUES ('Serenity', 'Joss Whedon', 'Nathan Fillion');
INSERT INTO NerdMovies (movie, director, main_actor, year)
INSERT INTO NerdMovies (movie, director, year)
VALUES ('Serenity', 'Josseph Hill Whedon', 2005);
SELECT * FROM NerdMovies WHERE movie = 'Serenity'
@@ -28,6 +28,7 @@ Scylla uses the following directory structure to store all its SSTables, for exa
│ │ │ ├── ...
│ │ │ └── mc-1-big-TOC.txt
│ │ ├── staging
│ │ ├── quarantine
│ │ └── upload
│ └── cf-7ec943202fc611e9a130000000000000
│ ├── snapshots
@@ -36,6 +37,7 @@ Scylla uses the following directory structure to store all its SSTables, for exa
│ │ ├── ks-cf-ka-3-TOC.txt
│ │ └── manifest.json
│ ├── staging
│ ├── quarantine
│ └── upload
├── system
│ ├── schema_columnfamilies-45f5b36024bc3f83a3631034ea4fa697
@@ -167,6 +169,21 @@ The per-table directory may contain several sub-directories, as listed below:
Used for ingesting external SSTables into Scylla on startup.

* Quarantine directory (`quarantine`)
A sub-directory holding SSTables that have been quarantined, typically due to
validation failures or corruption detected during scrub operations.

Quarantined SSTables are isolated to prevent them from being read or used by the
database. They can be inspected manually for debugging purposes or removed using
the `drop_quarantined_sstables` API operation.

The scrub operation can be configured to handle quarantined SSTables using the
`quarantine_mode` parameter with the following options:
- `INCLUDE`: Process both regular and quarantined SSTables (default)
- `EXCLUDE`: Skip quarantined SSTables during scrub
- `ONLY`: Process only quarantined SSTables

* Temporary SSTable directory (`<generation>.sstable`)
A directory created when writing new SSTables.
@@ -375,6 +375,30 @@ Columns:
* `tablets_allocated` - Number of tablet replicas on the node. Migrating tablets are accounted as if migration already finished.
* `tablets_allocated_per_shard` - `tablets_allocated` divided by shard count on the node.

## system.tablet_sizes

Contains information about the current tablet disk sizes. The table can contain incomplete data, in which case `missing_replicas`
will contain the host IDs of replicas for which the tablet size is not known.
It can be queried on any node, but the data comes from the group0 leader.
Reads wait for the group0 leader to be elected and for load balancer stats to become available.

Schema:
```cql
CREATE TABLE system.tablet_sizes (
table_id uuid,
last_token bigint,
missing_replicas frozen<set<uuid>>,
replicas frozen<map<uuid, bigint>>,
PRIMARY KEY (table_id, last_token)
);
```

Columns:
* `table_id` - The table ID of the table for which tablet sizes are reported.
* `last_token` - The last token owned by the tablet.
* `missing_replicas` - Set of host IDs for replicas for which a tablet size was not found.
* `replicas` - A map of replica host IDs to the disk size of the tablet replica, in bytes.

## system.protocol_servers

The list of all the client-facing data-plane protocol servers and listen addresses (if running).
@@ -125,6 +125,12 @@ Note that there is no guaranteed way to know when the tracing of a particular se
* `request`: a short string describing the current query, like "Execute CQL3 query"
* `started_at`: a timestamp taken when the tracing session began

##### Alternator specific
Alternator commands will add the following information to traces:
* `alternator_op` key in `parameters` map - operation type, for example `CreateTable`
* `table` key in `parameters` map - the table used in the given session, if there was exactly one
* `table[0]`, `table[1]`, ... in `parameters` map - the tables used in the given session, if there was more than one. Names are sorted before inserting and do not repeat

### Slow queries logging
#### The motivation
In real-life installations, one of the most important parameters of the system is often the longest response time. Naturally, the shorter it is, the better. Therefore, capturing requests that take a long time and understanding why they took so long is a critical and challenging task.
@@ -21,13 +21,6 @@ There are 2 types of tasks:
- `process_staging` - process (generate view updates and move to base directory)
all staging sstables associated with the tablet replica of the base table

The state of an alive task can be either:
- IDLE
- STARTED
- ABORTED

If the task doesn't exist in the state, this means it was finished or aborted.

View building tasks are created when:
- `build_range` tasks:
- a view/index is created
@@ -38,12 +31,29 @@ View building tasks are created when:
- `process_staging` tasks:
- a staging sstable was registered to `view_building_worker`

A task might be aborted in two ways: by deleting it or by setting its state to `ABORTED`.
If a view/keyspace is dropped, then its tasks are aborted by deleting them as they are no longer needed.
On the other hand, at the beginning of a tablet operation (migration/resize/RF change), relevant view building tasks are aborted using the `ABORTED` state.
This intermediate state is needed to create new tasks at the end of the operation, or in case of failure and rollback (aborted tasks are also deleted then).
### Lifetime of a task

The view building coordinator starts a task only if its tablet is not in transition (`tablet_map.get_tablet_transition_info(tid) == nullptr`).
The group0 state stores only tasks that haven't been completed yet or were aborted but haven't been cleaned up yet.

When a task is created, it is stored in the group0 state (`system.view_building_tasks`) to be processed in the future.
Then at some point, the view building coordinator will decide to process it by sending a [`work_on_view_building_tasks` RPC](#rpc) to a worker.
Unless the task was aborted, the worker will eventually reply that the task was finished. After the coordinator gets the response from the worker,
it temporarily saves the list of IDs of finished tasks and removes those tasks from the group0 state (permanently marking them as finished) in 200ms intervals. (*)
This batching of removals is done in order to reduce the number of generated group0 operations.
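
A minimal sketch of that batching, under assumed names (`finished_task_batcher` and `remove_from_group0` are illustrative, not the actual coordinator API): finished IDs are buffered in memory and flushed at most once per interval, so many RPC completions collapse into a single group0 command.
```c++
// Illustrative sketch only; the real logic lives in the view building coordinator.
class finished_task_batcher {
    std::unordered_set<utils::UUID> _pending; // IDs reported finished but not yet removed from group0
    static constexpr auto flush_interval = std::chrono::milliseconds(200);
public:
    void mark_finished(utils::UUID id) { _pending.insert(id); }

    // Runs until aborted; each round removes the buffered IDs with a single group0 operation.
    future<> flush_loop(abort_source& as, std::function<future<>(std::unordered_set<utils::UUID>)> remove_from_group0) {
        while (!as.abort_requested()) {
            co_await seastar::sleep_abortable(flush_interval, as); // throws sleep_aborted on shutdown
            if (!_pending.empty()) {
                co_await remove_from_group0(std::exchange(_pending, {}));
            }
        }
    }
};
```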

On the other hand, view building tasks can also be aborted for 2 main reasons:
- a keyspace/view was dropped
- tablet operations (see [tablet operations section](#tablet-operations))
In the first case we simply delete the relevant view building tasks as they are no longer needed.
But if a task needs to be aborted due to a tablet operation, we first set the `aborted` flag to true. We need to do this because we need the task information
to create new adjusted tasks (if the operation succeeded) or to roll them back (if the operation failed).
Once a task is aborted by setting the flag, this cannot be revoked, so rolling back a task means creating its duplicate and removing the original task.

(*) - Because there is a time gap between when the coordinator learns that a task is finished (from the RPC response) and when the task is marked as completed,
it is possible that the coordinator may lose this information (e.g. due to a Raft leader change). But each view building worker keeps track of finished tasks locally,
so when a new coordinator sends an RPC with the same view building tasks, the worker will immediately respond that those tasks were completed.
In the worst case, when both the coordinator and worker nodes are restarted, we can completely lose that information and will have to redo the work.
However, view building tasks are idempotent.

View building task struct:
```c++
@@ -53,14 +63,9 @@ struct view_building_task {
process_staging,
};

enum class task_state {
idle,
started,
aborted
};
utils::UUID id;
task_type type;
task_state state;
bool aborted;

table_id base_id;
table_id view_id; // is default value when `type == task_type::process_staging`
@@ -73,13 +78,25 @@ State machine:

```mermaid
stateDiagram-v2
[*] --> IDLE
IDLE --> [*]: aborted due to drop view
IDLE --> STARTED: vbc chooses to work on the task
STARTED --> ABORTED: aborted due to tablet operation
STARTED --> [*]: done or aborted due to drop view
IDLE --> ABORTED: aborted due to tablet operation
ABORTED --> [*]
state "the task is alive" as NORMAL
state "aborted flag is set to true" as ABORTED

[*] --> NORMAL: task is created
NORMAL --> work_on_view_building_tasks: view building coordinator sends an RPC
work_on_view_building_tasks --> [*]: the coordinator gets a response from the RPC call if the task wasn't aborted in the meantime
NORMAL --> [*]: aborted due to keyspace/view drop
NORMAL --> ABORTED: aborted due to tablet operation
ABORTED --> [*]: new adjusted task is created
ABORTED --> [*]: the task is rolled back with a new ID

state work_on_view_building_tasks {
state "view building worker is executing the task" as EXECUTE
state "the worker saves information locally that the task was finished" as DONE

[*] --> EXECUTE
EXECUTE --> DONE
DONE --> [*]
}
```

## Schema
@@ -90,7 +107,7 @@ CREATE TABLE system.view_building_tasks (
key text,
id timeuuid,
type text,
state text,
aborted boolean,
base_id uuid,
view_id uuid, -- NULL for "process_staging" tasks
base_tablet_id bigint,
@@ -104,11 +121,6 @@ The view building coordinator stores currently processing base table in `system.
under the `view_building_processing_base` key.
The entry is managed by group0.

The coordinator also updates view build statuses in `system.view_build_status_v2`.
When it selects a new base table to process, it marks the build statuses for all of the base table's views on all hosts as `STARTED`.
When there are no more tasks for some view (for all hosts!) and there are no `process_staging` tasks for the base table,
then the view is marked as `SUCCESS` on all hosts.

Once the view is built, an entry in `system.built_views` is created. Before the view building coordinator,
this table was a node-local one. But now the table is partially managed by group0,
meaning that all entries from tablet-based keyspaces are managed by group0 and
@@ -117,14 +129,14 @@ entries from vnode-based keyspaces are still node local.
## View building worker

The view building worker is a node-local service responsible for executing view building tasks.
It observes the view building state machine and executes the tasks once they enter the `STARTED` state.
It handles the [work on view building tasks RPC](#rpc) and responds to the coordinator once the tasks are finished.
The worker also observes the group0 state to notice when tasks are aborted (by deleting them or by setting the aborted flag).

The worker groups multiple view building tasks into a batch and it can execute only one batch per shard
(it's the coordinator's responsibility to schedule only one batch per tablet replica).

Tasks can be in one batch only if they have the same (see the sketch below):
- type
- state (obviously it has to be STARTED)
- base_id
- tablet replica
- tablet id
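
As a sketch, this grouping rule boils down to a simple predicate over task fields; it mirrors the worker's `validate_can_be_one_batch()` (note that in the code the tablet is identified by its `last_token`):
```c++
// Mirrors validate_can_be_one_batch() in the worker: two STARTED tasks may
// share a batch only if all of these fields match.
static bool can_share_batch(const view_building_task& t1, const view_building_task& t2) {
    return t1.type == t2.type
        && t1.base_id == t2.base_id
        && t1.replica == t2.replica        // same tablet replica (host + shard)
        && t1.last_token == t2.last_token; // same tablet
}
```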

@@ -132,24 +144,16 @@ Tasks can be in one batch only if they have the same:
### RPC

The view building worker doesn't mark tasks as finished (it doesn't perform group0 operations, with one exception).
Instead, it saves the IDs of finished and aborted tasks, and it is the coordinator who asks the worker
Instead, it saves the IDs of finished tasks, and it is the coordinator who asks the worker
what the result of some tasks is, using the following RPC call:

```c++
struct task_result {
enum class command_status: uint8_t {
success,
abort,
};
service::view_build_coordinator::command_status status;
};

verb [[cancellable]] work_on_view_building_tasks(std::vector<utils::UUID> tasks_ids) -> std::vector<service::view_building::view_task_result>
verb [[cancellable]] work_on_view_building_tasks(raft::term_t term, shard_id shard, std::vector<utils::UUID> tasks_ids) -> std::vector<utils::UUID>
```

The worker registers a handler for the RPC, which:
- attaches to the tasks and waits for the result
- returns the result when the tasks are finished/aborted
- returns the result when the tasks are finished
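
A condensed sketch of that handler flow under the new verb (the helpers `collect_completed()` and `run_batch_for()` are hypothetical names; the actual implementation is `view_building_worker::work_on_tasks()`):
```c++
// Hypothetical condensation of the worker-side handler.
future<std::vector<utils::UUID>> handle(raft::term_t term, std::vector<utils::UUID> ids) {
    if (auto done = collect_completed(ids); !done.empty()) {
        co_return done; // a retried RPC is answered from local memory
    }
    // Read barrier on shard 0 plus a term check: a stale coordinator fails fast.
    auto state = co_await get_latest_view_building_state(term);
    co_await run_batch_for(state, ids); // execute all requested tasks as one batch on this shard
    co_return collect_completed(ids);
}
```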

## Tablet operations

docs/features/incremental-repair.rst (new file)
@@ -0,0 +1,52 @@
.. _incremental-repair:

Incremental Repair
==================

ScyllaDB's standard repair process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.

The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.

How It Works
------------

ScyllaDB keeps track of the repair status of its data files (SSTables). When new data is written or existing data is modified, it is considered "unrepaired." When you run an incremental repair, the process works as follows:

1. ScyllaDB identifies and selects only the SSTables containing unrepaired data.
2. It then synchronizes this data across the replica nodes.
3. Once the data is successfully synchronized, the corresponding SSTables are marked as "repaired."

Subsequent incremental repairs will skip these marked SSTables, focusing only on new data that has arrived since. To ensure data integrity, ScyllaDB's compaction process handles repaired and unrepaired SSTables separately.

This approach is highly efficient because it allows entire SSTables to be skipped, avoiding the overhead of reading and processing unchanged data.
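Conceptually, the SSTable selection in step 1 behaves like the following minimal sketch (``is_repaired`` is a hypothetical accessor used only for illustration; the actual implementation differs):

.. code-block:: cpp

   #include <vector>

   // Hypothetical sketch: incremental repair only considers SSTables
   // that are not yet marked as repaired.
   std::vector<const sstable*> select_for_incremental_repair(
           const std::vector<const sstable*>& all_sstables) {
       std::vector<const sstable*> unrepaired;
       for (const auto* sst : all_sstables) {
           if (!sst->is_repaired()) {  // hypothetical accessor
               unrepaired.push_back(sst);
           }
       }
       return unrepaired;
   }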

Prerequisites
-------------

Incremental Repair is only supported for tables that use the tablets architecture. It is not available for legacy vnode-based tables.

Incremental Repair Modes
------------------------

While incremental repair is the default and recommended mode, you can control its behavior for a given repair operation using the ``incremental_mode`` parameter. This is useful for situations where you might need to force a full data validation.

The available modes are:

* ``incremental``: Performs a standard incremental repair. It processes only unrepaired data and skips SSTables that are already marked as repaired. The repair status is updated after the operation.
* ``full``: Forces the repair to process **all** SSTables, including those that have been previously repaired. This is useful when a complete data validation is required. The repair status is updated upon completion.
* ``disabled``: Completely disables the incremental repair logic for the current operation. The repair behaves like a classic, non-incremental repair, and it does not read or update any incremental repair status markers.

The ``incremental_mode`` parameter can be specified using ``nodetool cluster repair``, e.g., ``nodetool cluster repair --incremental-mode incremental``. It can also be specified with the REST API, e.g., ``curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"``.
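The same two invocations, taken from the description above, as ready-to-run commands:

.. code-block:: console

   nodetool cluster repair --incremental-mode incremental
   curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"
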
Benefits of Incremental Repair
------------------------------

* **Faster Repairs:** By targeting only new or changed data, repair operations complete in a fraction of the time.
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.

Notes
-----

With the incremental repair feature, the repaired and unrepaired SSTables are compacted separately. After incremental repair, unrepaired SSTables become repaired SSTables, allowing them to be compacted together. A shorter repair interval is therefore recommended to mitigate potential space amplification resulting from these separate compactions.
@@ -16,6 +16,7 @@ This document highlights ScyllaDB's key data modeling features.
   Workload Attributes </features/workload-attributes>
   Workload Prioritization </features/workload-prioritization>
   Backup and Restore </features/backup-and-restore>
   Incremental Repair </features/incremental-repair/>

.. panel-box::
  :title: ScyllaDB Features
@@ -39,3 +40,6 @@ This document highlights ScyllaDB's key data modeling features.
    specify how ScyllaDB will handle requests depending on the workload.
  * :doc:`Backup and Restore </features/backup-and-restore>` allows you to create
    backups of your data and restore it when needed.
  * :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
    efficient and lightweight approach to maintaining data consistency by
    repairing only the data that has changed since the last repair.
@@ -208,8 +208,8 @@ Pick a zone where Haswell CPUs are found. Local SSD performance offers, accordin
An image with the NVMe disk interface is recommended.
(`More info <https://cloud.google.com/compute/docs/disks/local-ssd>`_)

Recommended instance types are `z3-highmem-highlssd and z3-highmem-standardlssd <https://cloud.google.com/compute/docs/storage-optimized-machines#z3_machine_types>`_,
`n1-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines>`_, and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_machines>`_.
Recommended instance types are `z3-highmem-highlssd, z3-highmem-standardlssd <https://cloud.google.com/compute/docs/storage-optimized-machines#z3_machine_types>`_,
and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_series>`_.

.. list-table::
@@ -274,38 +274,9 @@ Recommended instances types are `z3-highmem-highlssd and z3-highmem-standardlssd
     - 1,406
     - 36,000

.. list-table::
   :widths: 30 20 20 30
   :header-rows: 1

   * - Model
     - vCPU
     - Mem (GB)
     - Storage (GB)
   * - n1-highmem-2
     - 2
     - 13
     - 375
   * - n1-highmem-4
     - 4
     - 26
     - 750
   * - n1-highmem-8
     - 8
     - 52
     - 1,500
   * - n1-highmem-16
     - 16
     - 104
     - 3,000
   * - n1-highmem-32
     - 32
     - 208
     - 6,000
   * - n1-highmem-64
     - 64
     - 416
     - 9,000
* Storage: Each Z3 instance supports up to 36,000 GiB (~36 TiB) of local
  Titanium SSD (or up to 72,000 GiB on bare-metal).
* Z3 Processor: Sapphire Rapids

.. list-table::
   :widths: 30 20 20 30
@@ -347,9 +318,15 @@ Recommended instances types are `z3-highmem-highlssd and z3-highmem-standardlssd
     - 80
     - 640
     - 9,000
   * - n2-highmem-96
     - 96
     - 768
     - 9,000

Storage: each instance can support maximum of 24 local SSD of 375 GB partitions each for a total of `9 TB per instance <https://cloud.google.com/compute/docs/disks>`_
* Storage: Each instance can support a maximum of 24 local SSD partitions of
  375 GB each, for a total of `9 TB per instance <https://cloud.google.com/compute/docs/disks>`_.
* Processors: Ice Lake (the default for larger machine types) and Cascade Lake
  (the default for machine types up to 80 vCPUs).

.. _system-requirements-azure:

@@ -46,7 +46,8 @@ Install ScyllaDB

.. code-block:: console

   sudo gpg --homedir /tmp --no-default-keyring --keyring /etc/apt/keyrings/scylladb.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys a43e06657bac99e3
   sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys a43e06657bac99e3
   sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor a43e06657bac99e3 | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg

.. code-block:: console
   :substitutions:
@@ -72,23 +73,26 @@ Install ScyllaDB

.. code-block:: console

   apt-get install scylla{,-server,-tools,-tools-core,-kernel-conf,-node-exporter,-conf,-python3}=<your patch version>
   apt-get install scylla{,-server,-kernel-conf,-node-exporter,-conf,-python3}=<your patch version>

The following example shows installing ScyllaDB 5.2.3.
The following example shows installing ScyllaDB 2025.3.1.

.. code-block:: console
   :class: hide-copy-button

   apt-cache madison scylla
   scylla | 5.2.3-0.20230608.ea08d409f155-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-5.2 stable/main amd64 Packages
   scylla | 5.2.2-0.20230521.9dd70a58c3f9-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-5.2 stable/main amd64 Packages
   scylla | 5.2.1-0.20230508.f1c45553bc29-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-5.2 stable/main amd64 Packages
   scylla | 5.2.0-0.20230427.429b696bbc1b-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-5.2 stable/main amd64 Packages
   scylla | 2025.3.4-0.20251116.898f193ef677-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-2025.3 stable/main arm64 Packages
   scylla | 2025.3.3-0.20251029.0e6381f14db2-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-2025.3 stable/main arm64 Packages
   scylla | 2025.3.2-0.20251010.295ed0e9e158-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-2025.3 stable/main arm64 Packages
   scylla | 2025.3.1-0.20250907.2bbf3cf669bb-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-2025.3 stable/main arm64 Packages
   scylla | 2025.3.0-0.20250827.d9e492a90c2e-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-2025.3 stable/main arm64 Packages
   scylla | 2025.3.0~rc2-0.20250730.7164f11b997d-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-2025.3 stable/main arm64 Packages
   scylla | 2025.3.0~rc1-0.20250710.f3297824e397-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-2025.3 stable/main arm64 Packages
   scylla | 2025.3.0~rc0-0.20250701.e64bb3819ca7-1 | https://downloads.scylladb.com/downloads/scylla/deb/debian-ubuntu/scylladb-2025.3 stable/main arm64 Packages

.. code-block:: console
   :class: hide-copy-button

   apt-get install scylla{,-server,-tools,-tools-core,-kernel-conf,-node-exporter,-conf,-python3}=5.2.3-0.20230608.ea08d409f155-1
   apt-get install scylla{,-server,-kernel-conf,-node-exporter,-conf,-python3,-cqlsh}=2025.3.1-0.20250907.2bbf3cf669bb-1

#. (Ubuntu only) Set Java 11.
@@ -28,8 +28,16 @@ The command syntax is as follows:

   scylla sstable <operation> <path to SStable>

You can specify more than one SSTable.

You can specify more than one SSTable. Additionally, the path to SSTable can point to an S3 fully qualified path in the form of s3://bucket-name/prefix/of/your/sstable/sstable-TOC.txt. To use this feature, you need to have AWS credentials set up in your environment. For more information, see :ref:`Configuring AWS S3 access <aws-s3-configuration>`. Additionally, you must ensure the tool is able to load the correct Scylla YAML file, which can be done using the --scylla-yaml-file parameter or by placing the YAML file in one of the default locations the tool checks.
Additionally, the path to SSTable can point to an S3 fully qualified path in
the form of s3://bucket-name/prefix/of/your/sstable/sstable-TOC.txt. To use
this feature, you need to have AWS credentials set up in your environment.
For more information, see :ref:`Configuring Object Storage <object-storage-configuration>`.

Additionally, you must ensure the tool is able to load the correct scylla.yaml
file, which can be done using the ``--scylla-yaml-file`` parameter or by
placing the YAML file in one of the default locations the tool checks.
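Putting this together, an invocation against an SSTable stored in S3 could look like the following (the bucket and prefix are placeholders, and ``dump-schema`` is just one of the available operations):

.. code-block:: console

   scylla sstable dump-schema --scylla-yaml-file /etc/scylla/scylla.yaml s3://bucket-name/prefix/of/your/sstable/sstable-TOC.txt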

.. _scylla-sstable-schema:

@@ -547,6 +555,17 @@ The content is dumped in JSON, using the following schema:
       "above_threshold": Uint
   }

dump-schema
^^^^^^^^^^^

Dump the schema of the table or sstable in CQL ``DESCRIBE TABLE`` format.

Uses the regular `schema load <schema_>`_ mechanism to obtain the schema.
With certain schema sources, the schema can be obtained without any sstables passed to the tool.

Important note: the dumped schema will always be a ``CREATE TABLE`` statement, even if the table is in fact a materialized view or an index.
This schema is enough to understand and parse the sstable data, but it may not be enough to recreate the table or write new sstables for it.

.. _scylla-sstable-validate-operation:

validate
@@ -97,17 +97,16 @@ The :code:`scylla-server` file contains configuration related to starting up the

.. _object-storage-configuration:

Configuring Object Storage :label-caution:`Experimental`
--------------------------------------------------------------
Configuring Object Storage
----------------------------------

Scylla has the ability to communicate directly with S3-compatible storage. This
feature enables various functionalities, but requires proper configuration of
storage endpoints.
ScyllaDB can communicate directly with S3-compatible object storage. Before
using features that rely on object storage, you must first enable and
configure access to the storage endpoints.

To enable S3-compatible storage features, you need to describe the endpoints
where SSTable files can be stored. This is done using a YAML configuration file.

The relevant ``scylla.yaml`` section should follow this format:
Storage endpoints define where data can be stored and are specified in
the ``scylla.yaml`` configuration file. The relevant section of ``scylla.yaml``
should follow this format:

.. code-block:: yaml

@@ -118,6 +117,18 @@ The relevant ``scylla.yaml`` section should follow this format:
       aws_region: <region_name> # optional, e.g. us-east-1
       iam_role_arn: <iam_role> # optional

Example:

.. code:: yaml

   object_storage_endpoints:
     - name: s3.us-east-1.amazonaws.com
       port: 443
       https: true
       aws_region: us-east-1
       iam_role_arn: arn:aws:iam::123456789012:instance-profile/my-instance-instance-profile

The ``aws_region`` option can also be specified using
the ``AWS_DEFAULT_REGION`` environment variable.

@@ -129,7 +140,7 @@ the following environment variables:
- ``AWS_SECRET_ACCESS_KEY``
- ``AWS_SESSION_TOKEN``

The Scylla S3 client will first attempt to access credentials from environment variables.
The ScyllaDB S3 client will first attempt to access credentials from environment variables.
If it fails to obtain credentials, it will then try to retrieve them from the
AWS Security Token Service (STS) or the EC2 Instance Metadata Service.

@@ -139,64 +150,6 @@ AWS Security Token Service (STS) or the EC2 Instance Metadata Service.
- When set, these values are used by the S3 client to sign requests.
- If not set, requests are sent unsigned, which may not be accepted by all servers.

.. _aws-s3-configuration:

Configuring AWS S3 access
============================

You can define endpoint details in the ``scylla.yaml`` file. For example:

.. code:: yaml

   object_storage_endpoints:
     - name: s3.us-east-1.amazonaws.com
       port: 443
       https: true
       aws_region: us-east-1

Local/Development Environment
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In a local or development environment, you usually need to set authentication
tokens in environment variables to ensure the client works properly.
For instance:

.. code-block:: shell

   export AWS_ACCESS_KEY_ID=EXAMPLE_ACCESS_KEY_ID
   export AWS_SECRET_ACCESS_KEY=EXAMPLE_SECRET_ACCESS_KEY

Additionally, you may include an ``aws_session_token``, although this is not typically necessary for local or development environments:

.. code-block:: shell

   export AWS_ACCESS_KEY_ID=EXAMPLE_ACCESS_KEY_ID
   export AWS_SECRET_ACCESS_KEY=EXAMPLE_SECRET_ACCESS_KEY
   export AWS_SESSION_TOKEN=EXAMPLE_TEMPORARY_SESSION_TOKEN

Important Note
^^^^^^^^^^^^^^^

The examples above are intended for development or local environments.
You should *never* use this approach in production. The Scylla S3 client
will first attempt to access credentials from the file or environment
variables. If it fails to obtain credentials, it will then try to
retrieve them from the AWS Security Token Service (STS) or the EC2
Instance Metadata Service.

For the EC2 Instance Metadata Service to function correctly, no
additional configuration is required. However, STS requires the IAM Role
ARN to be defined in the ``scylla.yaml`` file, as shown below:

.. code:: yaml

   object_storage_endpoints:
     - name: s3.us-east-1.amazonaws.com
       port: 443
       https: true
       aws_region: us-east-1
       iam_role_arn: arn:aws:iam::123456789012:instance-profile/my-instance-instance-profile

.. _admin-compression:

Compression
@@ -309,33 +262,6 @@ ScyllaDB uses experimental flags to expose non-production-ready features safely.
In recent ScyllaDB versions, these features are controlled by the ``experimental_features`` list in scylla.yaml, allowing one to choose which experimental features to enable.
Use ``scylla --help`` to get the list of experimental features.

.. _admin-keyspace-storage-options:

Keyspace storage options
------------------------

..
   This section must be moved to Data Definition > CREATE KEYSPACE
   when support for object storage is GA.

By default, SStables of a keyspace are stored in a local directory.
As an alternative, you can configure your keyspace to be stored
on Amazon S3 or another S3-compatible object store.

Support for object storage is experimental and must be explicitly
enabled in the ``scylla.yaml`` configuration file by specifying
the ``keyspace-storage-options`` option:

.. code-block:: yaml

   experimental_features:
     - keyspace-storage-options

Before creating keyspaces with object storage, you also need to
:ref:`configure <object-storage-configuration>` the object storage
credentials and endpoint.
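For illustration only, creating a keyspace that stores its SSTables on object storage could look roughly like the following sketch. The exact ``storage`` option names here are an assumption (the feature is experimental and the syntax may differ between versions):

.. code-block:: cql

   -- Hypothetical sketch; the 'storage' option names are assumptions.
   CREATE KEYSPACE mykeyspace
     WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3 }
     AND storage = { 'type' : 'S3', 'bucket' : 'my-bucket', 'endpoint' : 's3.us-east-1.amazonaws.com' };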

.. _admin-views-with-tablets:

Views with Tablets

@@ -4,8 +4,8 @@ Nodetool scrub
NAME
....

**scrub** - Help identify and fix corrupted SSTables.
Remove faulty data, eliminate tombstoned rows that have surpassed the table's gc_grace period, and fix out-of-order rows and partitions.
**scrub** - Help identify and fix corrupted SSTables. Not all kinds of corruption can be skipped or fixed by scrub.
Remove faulty data, eliminate tombstoned rows that have surpassed the table's gc_grace period, and fix out-of-order rows and partitions.

SYNOPSIS
@@ -19,9 +19,12 @@ SYNOPSIS
          [(-ns | --no-snapshot)]
          [(-s | --skip-corrupted)]
          [(-m <scrub_mode> | --mode <scrub_mode>)]
          [(-q <quarantine_mode> | --quarantine-mode <quarantine_mode>)]
          [--drop-unfixable-sstables]
          [--] <keyspace> [<table...>]

Supported scrub modes: ABORT, SKIP, SEGREGATE, VALIDATE
Supported quarantine modes: INCLUDE, EXCLUDE, ONLY

OPTIONS
.......
@@ -35,6 +38,10 @@ Parameter Descriptio
                                                                     (Deprecated, use '--mode' instead. default false)
-------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------
-m <scrub_mode> / --mode <scrub_mode>                                How to handle corrupt data (one of: ABORT|SKIP|SEGREGATE|VALIDATE, default ABORT; overrides '--skip-corrupted')
-------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------
-q <quarantine_mode> / --quarantine-mode <quarantine_mode>           How to handle quarantined SSTables (one of: INCLUDE|EXCLUDE|ONLY, default INCLUDE)
-------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------
--drop-unfixable-sstables                                            Drop unfixable SSTables instead of aborting the entire scrub (only valid with --mode=SEGREGATE)
==================================================================== ==================================================================================================================

``--`` This option can be used to separate command-line options from the list of arguments (useful when arguments might be mistaken for command-line options).
@@ -52,6 +59,8 @@ Scrub mode Descriptio
ABORT                                                                Abort scrubbing when the first validation error occurs. (default).
-------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------
SKIP                                                                 Skip corrupted rows or partitions. (equivalent to the legacy --skip-corrupted option).
                                                                     **Warning**: This mode can cause data loss by removing invalid data portions or entire
                                                                     SSTables if severely corrupted (e.g., digest mismatch detected).
-------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------
SEGREGATE                                                            Sort out-of-order rows or partitions by segregating them into additional SSTables.
-------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------
@@ -59,6 +68,19 @@ VALIDATE Read-only
By default, corrupt SSTables are moved into a "quarantine" subdirectory so they will not be subject to compaction.
==================================================================== ==================================================================================================================

QUARANTINE MODES
................

==================================================================== ==================================================================================================================
Quarantine mode                                                      Description
==================================================================== ==================================================================================================================
INCLUDE                                                              Process both regular and quarantined SSTables (default).
-------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------
EXCLUDE                                                              Process only regular (non-quarantined) SSTables.
-------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------
ONLY                                                                 Process only quarantined SSTables.
==================================================================== ==================================================================================================================

Examples
........

@@ -67,7 +89,7 @@ Scrub **all** tables in a keyspace (mykeyspace)
.. code-block:: shell

   > nodetool scrub mykeyspace


Scrub **a** specific table (mytable) in a keyspace (mykeyspace)

@@ -87,4 +109,65 @@ Scrub **a** specific table (mytable) in a keyspace (mykeyspace) in VALIDATE mode

   > nodetool scrub -m VALIDATE --no-snapshot mykeyspace mytable

Scrub **a** specific table (mytable) in a keyspace (mykeyspace) in SEGREGATE mode, dropping unfixable SSTables

.. code-block:: shell

   > nodetool scrub -m SEGREGATE --drop-unfixable-sstables mykeyspace mytable

Procedures for Removing Bad SSTables
.....................................

Method 1: Quarantine and Drop
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

**Step 1**: Run scrub in VALIDATE mode to identify and quarantine corrupted SSTables:

.. code-block:: shell

   > nodetool scrub -m VALIDATE keyspace_name table_name

This will move corrupted SSTables to a ``quarantine`` directory.
The ``quarantine`` directory is a sub-directory of the table's respective data directory.

**Step 2** (Optional): Preserve quarantined SSTables for analysis:

Before permanently dropping the corrupted SSTables, consider copying some or all of them aside,
somewhere outside of the ScyllaDB data directory, so they are preserved for later investigation by the ScyllaDB R&D team,
to determine the root cause of the corruption.

.. code-block:: shell

   # Copy quarantined SSTables to a backup location for analysis
   > cp -r /path/to/data/keyspace_name/table_dir/quarantine /path/to/backup/location/

**Step 3**: Drop the quarantined SSTables using :doc:`dropquarantinedsstables </operating-scylla/nodetool-commands/dropquarantinedsstables>`:

.. code-block:: shell

   > nodetool dropquarantinedsstables keyspace_name table_name

This permanently removes the quarantined SSTables from the specified table.

Method 2: Segregate with Drop Unfixable Flag
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This approach attempts to fix what can be fixed and automatically drops SSTables that cannot be fixed.

.. note::
   This method should be used for the subset of corruption issues where SEGREGATE mode can actually help: where corruption manifests at least partly in reordered partitions or rows.

**Step 1**: Run scrub in SEGREGATE mode with the ``--drop-unfixable-sstables`` flag:

.. code-block:: shell

   > nodetool scrub -m SEGREGATE --drop-unfixable-sstables keyspace_name table_name

This will:

- Attempt to segregate and fix out-of-order data where possible
- Remove faulty data
- Automatically drop SSTables that cannot be fixed
- Create new properly ordered SSTables from the recoverable data

.. include:: /rst_include/apache-copyrights.rst

@@ -4,25 +4,58 @@ Update Topology Strategy From Simple to Network
The following procedure specifies how to update the replication strategy from ``SimpleStrategy`` to ``NetworkTopologyStrategy``.
Note that ``SimpleStrategy`` is **not** recommended for production usage, and it is strongly advised to create new clusters with ``NetworkTopologyStrategy``.

In case you are using ``SimpleStrategy``, there are two alternatives:
In case you are using ``SimpleStrategy``, there are two possible procedures:

* Nodes are all on the same rack (can be updated without downtime)
* Nodes are on different racks (requires full shutdown)
* **Procedure with no downtime**: This procedure can be used if any of the following conditions are met:

To check which node is on which rack, use ``nodetool status``
  * All nodes are on the same rack

Note that if the Replication Factor (RF) of the relevant Keyspace is equal to the number of nodes, regardless of the number of racks, for example, RF=3, nodes=3, you can use the first procedure without downtime.
  * Regardless of the number of racks, the Replication Factor (RF) of the ``SimpleStrategy`` keyspace is not changing, and the RF is also equal to the number of cluster nodes. For example, if the RF of a keyspace is 3 and the cluster has 3 nodes, and if the RF is not changing, then the number of racks does not matter.

All nodes are on the same rack
------------------------------
* **Procedure with downtime**: This procedure should be used if any of the following conditions are met:

  * Cluster nodes are on multiple racks, and the Replication Factor (RF) of the ``SimpleStrategy`` keyspace is not changing, and the RF is **not** equal to the number of nodes in the cluster.

  * Cluster nodes are on multiple racks, and the Replication Factor (RF) of the ``SimpleStrategy`` keyspace is changing.

To check which node is on which rack, use ``nodetool status`` (or ``sctool status`` from the Scylla Manager node).

Check keyspaces
---------------

Check if the following keyspaces use ``SimpleStrategy``:
* Any user-created keyspace.

  To check the replication strategy of a specific user-created keyspace:

  .. code-block:: cql

     DESCRIBE KEYSPACE mykeyspace;

  Or to check all user-created keyspaces:

  .. code-block:: cql

     DESCRIBE SCHEMA;

* Certain system keyspaces, including ``system_distributed``, ``system_traces``, ``system_audit``.

  To check the replication strategy of the above system keyspaces:

  .. code-block:: cql

     DESCRIBE KEYSPACE system_distributed;
     DESCRIBE KEYSPACE system_traces;
     DESCRIBE KEYSPACE system_audit;

If any of the above keyspaces uses ``SimpleStrategy``, use the relevant procedure on this page to switch it to ``NetworkTopologyStrategy``.

Procedure with no downtime
--------------------------

Alter each keyspace's replication to use ``class : NetworkTopologyStrategy``.

Alter the following:

* Keyspaces created by the user.
* System keyspaces: ``system_distributed``, ``system_traces``.

For example:

Before
@@ -30,27 +63,26 @@ Before

.. code-block:: cql

   DESCRIBE KEYSPACE mykeyspace;
   CREATE KEYSPACE mykespace WITH replication = { 'class' : 'SimpleStrategy', 'replication_factor': '3'};
   CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'SimpleStrategy', 'replication_factor': '3'};

ALTER Command

.. code-block:: cql

   ALTER KEYSPACE mykespace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};
   ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};

After

.. code-block:: cql

   DESCRIBE KEYSPACE mykeyspace;
   CREATE KEYSPACE mykespace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};
   CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};

To complete the process, you need to :doc:`change the snitch </operating-scylla/procedures/config-change/switch-snitch/>`, edit the
``cassandra-rackdc.properties`` file, and set the preferred data-center name.


Nodes are on different racks
----------------------------
Procedure with downtime
-----------------------

This is a more complex scenario, as the new strategy may select different replicas depending on whether the nodes are on different racks.
To fix that, you will need a **full shutdown of the cluster**.

1241 docs/poetry.lock generated
File diff suppressed because it is too large
@@ -9,11 +9,11 @@ package-mode = false
python = "^3.10"
pygments = "^2.18.0"
redirects_cli ="^0.1.3"
sphinx-scylladb-theme = "^1.8.8"
sphinx-scylladb-theme = "^1.8.9"
sphinx-sitemap = "^2.6.0"
sphinx-autobuild = "^2024.4.19"
Sphinx = "^7.3.7"
sphinx-multiversion-scylla = "^0.3.1"
sphinx-multiversion-scylla = "^0.3.4"
sphinxcontrib-datatemplates = "^0.9.2"
sphinx-scylladb-markdown = "^0.1.2"
sphinx_collapse ="^0.1.3"

@@ -4,7 +4,7 @@ Upgrade ScyllaDB

.. toctree::

   ScyllaDB 2025.2 to ScyllaDB 2025.3 <upgrade-guide-from-2025.2-to-2025.3/index>
   ScyllaDB 2025.x to ScyllaDB 2025.4 <upgrade-guide-from-2025.x-to-2025.4/index>
   ScyllaDB Image <ami-upgrade>

@@ -1,13 +0,0 @@
==========================================================
Upgrade - ScyllaDB 2025.2 to ScyllaDB 2025.3
==========================================================

.. toctree::
   :maxdepth: 2
   :hidden:

   Upgrade ScyllaDB <upgrade-guide-from-2025.2-to-2025.3>
   Metrics Update <metric-update-2025.2-to-2025.3>

* :doc:`Upgrade from ScyllaDB 2025.2.x to ScyllaDB 2025.3.y <upgrade-guide-from-2025.2-to-2025.3>`
* :doc:`Metrics Update Between 2025.2 and 2025.3 <metric-update-2025.2-to-2025.3>`
@@ -1,95 +0,0 @@
.. |SRC_VERSION| replace:: 2025.2
.. |NEW_VERSION| replace:: 2025.3

================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================

.. toctree::
   :maxdepth: 2
   :hidden:

ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.

New Metrics
------------

The following metrics are new in ScyllaDB |NEW_VERSION| compared to |SRC_VERSION|.

Alternator Per-table Metrics
===================================

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric
     - Description
   * - scylla_alternator_table_batch_item_count
     - The total number of items processed across all batches.
   * - scylla_alternator_table_batch_item_count_histogram
     - A histogram of the number of items in a batch request.
   * - scylla_alternator_table_filtered_rows_dropped_total
     - The number of rows read and dropped during filtering operations.
   * - scylla_alternator_table_filtered_rows_matched_total
     - The number of rows read and matched during filtering operations.
   * - scylla_alternator_table_filtered_rows_read_total
     - The number of rows read during filtering operations.
   * - scylla_alternator_table_op_latency
     - A latency histogram of an operation via Alternator API.
   * - scylla_alternator_table_op_latency_summary
     - A latency summary of an operation via Alternator API.
   * - scylla_alternator_table_operation
     - The number of operations via Alternator API.
   * - scylla_alternator_table_rcu_total
     - The total number of consumed read units.
   * - scylla_alternator_table_reads_before_write
     - The number of performed read-before-write operations.
   * - scylla_alternator_table_requests_blocked_memory
     - Counts the number of requests blocked due to memory pressure.
   * - scylla_alternator_table_requests_shed
     - Counts the number of requests shed due to overload.
   * - scylla_alternator_table_shard_bounce_for_lwt
     - The number of writes that had to be bounced from this shard because of LWT requirements.
   * - scylla_alternator_table_total_operations
     - The number of total operations via Alternator API.
   * - scylla_alternator_table_unsupported_operations
     - The number of unsupported operations via Alternator API.
   * - scylla_alternator_table_wcu_total
     - The total number of consumed write units.
   * - scylla_alternator_table_write_using_lwt
     - The number of writes that used LWT.

Other Metrics
===============

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric
     - Description
   * - scylla_batchlog_manager_total_write_replay_attempts
     - Counts write operations issued in a batchlog replay flow.
       A high value of this metric indicates that there is a long batch replay list.
   * - scylla_corrupt_data_entries_reported
     - Counts the number of corrupt data instances reported to the corrupt data handler.
       A non-zero value indicates that the database suffered data corruption.
   * - scylla_memory_oversized_allocs
     - The total count of oversized memory allocations.
   * - scylla_reactor_internal_errors
     - The total number of internal errors (subset of cpp_exceptions) that usually
       indicate a malfunction in the code.
   * - scylla_stall_detector_io_threaded_fallbacks
     - The total number of io-threaded-fallbacks operations.

Removed Metrics
---------------------

The following metrics have been removed in 2025.3:

* scylla_cql_authorized_prepared_statements_cache_evictions
* scylla_lsa_large_objects_total_space_bytes
* scylla_lsa_small_objects_total_space_bytes
* scylla_lsa_small_objects_used_space_bytes

@@ -0,0 +1,13 @@
==========================================================
Upgrade - ScyllaDB 2025.x to ScyllaDB 2025.4
==========================================================

.. toctree::
   :maxdepth: 2
   :hidden:

   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2025.4>
   Metrics Update <metric-update-2025.x-to-2025.4>

* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2025.4 <upgrade-guide-from-2025.x-to-2025.4>`
* :doc:`Metrics Update Between 2025.x and 2025.4 <metric-update-2025.x-to-2025.4>`
@@ -0,0 +1,68 @@
.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2025.4
.. |PRECEDING_VERSION| replace:: 2025.3

================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================

.. toctree::
   :maxdepth: 2
   :hidden:

ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.

New Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric
     - Description
   * - scylla_database_total_view_updates_due_to_replica_count_mismatch
     - The total number of view updates for which there were more view replicas
       than base replicas and we had to generate an extra view update because
       the additional view replica wouldn't get paired with any base replica.
       It should only increase during the Replication Factor (RF) change. It
       should stop increasing shortly after finishing the RF change.
   * - scylla_database_total_writes_rejected_due_to_out_of_space_prevention
     - Counts write operations that were rejected due to disabled user tables
       writes.
   * - scylla_index_query_latencies
     - Index query latencies.
   * - scylla_reactor_aio_retries
     - The total number of IOCB-s re-submitted via thread-pool.
   * - scylla_reactor_io_threaded_fallbacks
     - The total number of io-threaded-fallbacks operations.
   * - scylla_repair_inc_sst_read_bytes
     - The total number of bytes read from SStables for incremental repair
       on this shard.
   * - scylla_repair_inc_sst_skipped_bytes
     - The total number of bytes skipped from SStables for incremental repair
       on this shard.
   * - scylla_repair_tablet_time_ms
     - The time spent on tablet repair on this shard (in milliseconds).
   * - scylla_s3_downloads_blocked_on_memory
     - Counts the number of times the S3 client downloads were delayed due to
       insufficient memory availability.
   * - scylla_s3_memory_usage
     - The total number of bytes consumed by the S3 client.
   * - scylla_s3_total_read_prefetch_bytes
     - The total number of bytes requested from the object store.
   * - scylla_storage_proxy_replica_fenced_out_requests
     - The number of requests that resulted in a stale_topology_exception.
   * - scylla_vector_store_dns_refreshes
     - The number of DNS refreshes.

New and Updated Metrics in Previous 2025.x Releases
-------------------------------------------------------

* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_

@@ -1,13 +1,13 @@
.. |SCYLLA_NAME| replace:: ScyllaDB

.. |SRC_VERSION| replace:: 2025.2
.. |NEW_VERSION| replace:: 2025.3
.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2025.4

.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure

.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.2 to 2025.3
.. _SCYLLA_METRICS: ../metric-update-2025.2-to-2025.3
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2025.4
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2025.4

=======================================================================================
Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
@@ -27,9 +27,9 @@ Before You Upgrade ScyllaDB

**Upgrade Your Driver**

If you're using a :doc:`ScyllaDB driver </using-scylla/drivers/cql-drivers/index>`,
If you're using a `ScyllaDB driver <https://docs.scylladb.com/stable/drivers/index.html>`_,
upgrade the driver before upgrading ScyllaDB. The latest two versions of each driver
are supported.
are supported. See `Driver Support <https://docs.scylladb.com/stable/versioning/driver-support.html>`_.

**Upgrade ScyllaDB Monitoring Stack**

@@ -42,7 +42,7 @@ We recommend upgrading the Monitoring Stack to the latest version.
**Check Feature Updates**

See the ScyllaDB Release Notes for the latest updates. The Release Notes are published
at the `ScyllaDB Community Forum <https://forum.scylladb.com/>`_.
at the `ScyllaDB Community Forum <https://forum.scylladb.com/c/scylladb-release-notes/>`_.

Upgrade Procedure
=================
@@ -150,7 +150,7 @@ You should take note of the current version in case you want to |ROLLBACK|_ the

.. code-block:: console

   sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/scylla-2025.3.list
   sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/scylla-2025.4.list

#. Install the new ScyllaDB version:

@@ -168,7 +168,7 @@ You should take note of the current version in case you want to |ROLLBACK|_ the

.. code-block:: console

   sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/scylla-2025.3.repo
   sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/scylla-2025.4.repo

#. Install the new ScyllaDB version:

@@ -234,8 +234,8 @@ Rollback Procedure
point, the only way to restore a cluster to |SRC_VERSION| is by restoring it
from backup.

The following procedure describes a rollback from |SCYLLA_NAME| |NEW_VERSION|.x to
|SRC_VERSION|.y. Apply this procedure if an upgrade from |SRC_VERSION| to
The following procedure describes a rollback from |SCYLLA_NAME| |NEW_VERSION| to
|SRC_VERSION|. Apply this procedure if an upgrade from |SRC_VERSION| to
|NEW_VERSION| fails before completing on all nodes.

* Use this procedure only on the nodes you upgraded to |NEW_VERSION|.
@@ -32,6 +32,10 @@ target_link_libraries(scylla_encryption
    cql3
    utils
    cpp-jwt::cpp-jwt)
if (Scylla_USE_PRECOMPILED_HEADER_USE)
  target_precompile_headers(scylla_encryption REUSE_FROM scylla-precompiled-header)
endif()

if(kmip_FOUND)
  target_link_libraries(scylla_encryption
    PRIVATE

@@ -12,6 +12,9 @@ target_link_libraries(ldap
    Seastar::seastar
  PRIVATE
    OpenLDAP::ldap OpenLDAP::lber)
if (Scylla_USE_PRECOMPILED_HEADER_USE)
  target_precompile_headers(ldap REUSE_FROM scylla-precompiled-header)
endif()

check_headers(check-headers ldap
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
9 exported_templates.cc Normal file
@@ -0,0 +1,9 @@
/*
 * Copyright (C) 2025-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

#include "exported_templates.hh"

4 exported_templates.hh Normal file
@@ -0,0 +1,4 @@
// Copyright (C) 2025-present ScyllaDB
// SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0

#pragma once
@@ -22,6 +22,9 @@ target_link_libraries(gms
  PRIVATE
    db
    absl::headers)
if (Scylla_USE_PRECOMPILED_HEADER_USE)
  target_precompile_headers(gms REUSE_FROM scylla-precompiled-header)
endif()

check_headers(check-headers gms
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)

@@ -781,7 +781,7 @@ future<> gossiper::remove_endpoint(locator::host_id endpoint, permit_id pid) {

    if (was_alive) {
        try {
            logger.info("InetAddress {}/{} is now DOWN, status = {}", state->get_host_id(), endpoint, get_gossip_status(*state));
            logger.info("InetAddress {}/{} is now DOWN, status = {}", state->get_host_id(), ip, get_gossip_status(*state));
            co_await do_on_dead_notifications(ip, std::move(state), pid);
        } catch (...) {
            logger.warn("Fail to call on_dead callback: {}", std::current_exception());
@@ -846,7 +846,7 @@ future<> gossiper::do_status_check() {
        }
    }

gossiper::endpoint_permit::endpoint_permit(endpoint_locks_map::entry_ptr&& ptr, locator::host_id addr, seastar::compat::source_location caller) noexcept
gossiper::endpoint_permit::endpoint_permit(endpoint_locks_map::entry_ptr&& ptr, locator::host_id addr, std::source_location caller) noexcept
    : _ptr(std::move(ptr))
    , _permit_id(_ptr->pid)
    , _addr(std::move(addr))
@@ -892,7 +892,7 @@ gossiper::endpoint_lock_entry::endpoint_lock_entry() noexcept
    , pid(permit_id::create_null_id())
{}

future<gossiper::endpoint_permit> gossiper::lock_endpoint(locator::host_id ep, permit_id pid, seastar::compat::source_location l) {
future<gossiper::endpoint_permit> gossiper::lock_endpoint(locator::host_id ep, permit_id pid, std::source_location l) {
    if (current_scheduling_group() != _gcfg.gossip_scheduling_group) {
        logger.warn("Incorrect scheduling group used for gossiper::lock_endpoint: {}, should be {}, backtrace {}", current_scheduling_group().name(), _gcfg.gossip_scheduling_group.name(), current_backtrace());
    }
@@ -931,10 +931,10 @@ future<gossiper::endpoint_permit> gossiper::lock_endpoint(locator::host_id ep, p

    // If we didn't rethrow above, the abort had to come from `abort_on_expiry`'s timer.

    static constexpr auto fmt_loc = [] (const seastar::compat::source_location& l) {
    static constexpr auto fmt_loc = [] (const std::source_location& l) {
        return fmt::format("{}({}:{}) `{}`", l.file_name(), l.line(), l.column(), l.function_name());
    };
    static constexpr auto fmt_loc_opt = [] (const std::optional<seastar::compat::source_location>& l) {
    static constexpr auto fmt_loc_opt = [] (const std::optional<std::source_location>& l) {
        if (!l) {
            return "null"s;
        }
@@ -158,10 +158,10 @@ public:
        permit_id pid;
        semaphore_units<> units;
        size_t holders = 0;
        std::optional<seastar::compat::source_location> first_holder;
        std::optional<std::source_location> first_holder;
        // last_holder is the caller of endpoint_permit who last took this entry,
        // it might not be a current holder (the permit might've been destroyed)
        std::optional<seastar::compat::source_location> last_holder;
        std::optional<std::source_location> last_holder;

        endpoint_lock_entry() noexcept;
    };
@@ -170,16 +170,16 @@ public:
        endpoint_locks_map::entry_ptr _ptr;
        permit_id _permit_id;
        locator::host_id _addr;
        seastar::compat::source_location _caller;
        std::source_location _caller;
    public:
        endpoint_permit(endpoint_locks_map::entry_ptr&& ptr, locator::host_id addr, seastar::compat::source_location caller) noexcept;
        endpoint_permit(endpoint_locks_map::entry_ptr&& ptr, locator::host_id addr, std::source_location caller) noexcept;
        endpoint_permit(endpoint_permit&&) noexcept;
        ~endpoint_permit();
        bool release() noexcept;
        const permit_id& id() const noexcept { return _permit_id; }
    };
    // Must be called on shard 0
    future<endpoint_permit> lock_endpoint(locator::host_id, permit_id pid, seastar::compat::source_location l = seastar::compat::source_location::current());
    future<endpoint_permit> lock_endpoint(locator::host_id, permit_id pid, std::source_location l = std::source_location::current());

private:
    void permit_internal_error(const locator::host_id& addr, permit_id pid);

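For context, the change above follows the standard C++20 idiom in which a defaulted `std::source_location` parameter captures the caller's location. A minimal self-contained illustration (not ScyllaDB code):

```c++
#include <iostream>
#include <source_location>

// A defaulted std::source_location parameter is evaluated at the call
// site, so the callee sees its caller's file, line, and function.
void log_caller(std::source_location loc = std::source_location::current()) {
    std::cout << loc.file_name() << ":" << loc.line()
              << " `" << loc.function_name() << "`\n";
}

int main() {
    log_caller();  // prints the location of this call site
}
```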
@@ -8,7 +8,6 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

#include <seastar/util/modules.hh>
#include <seastar/core/shard_id.hh>
#include <seastar/core/on_internal_error.hh>
#include <seastar/core/format.hh>

@@ -66,5 +66,6 @@ struct join_node_response_result {};
verb [[ip]] join_node_query (raft::server_id dst_id, service::join_node_query_params) -> service::join_node_query_result;
verb [[ip]] join_node_request (raft::server_id dst_id, service::join_node_request_params) -> service::join_node_request_result;
verb join_node_response (raft::server_id dst_id, service::join_node_response_params) -> service::join_node_response_result;
verb [[with_client_info, one_way, ip]] notify_banned (raft::server_id dst_id);

}

@@ -93,7 +93,6 @@ verb [[cancellable]] tablet_cleanup (raft::server_id dst_id, locator::global_tab
verb [[cancellable]] table_load_stats_v1 (raft::server_id dst_id) -> locator::load_stats_v1;
verb [[cancellable]] table_load_stats (raft::server_id dst_id) -> locator::load_stats;
verb [[cancellable]] tablet_repair(raft::server_id dst_id, locator::global_tablet_id, service::session_id session [[version 2025.4]]) -> service::tablet_operation_repair_result;
verb [[cancellable]] tablet_repair_colocated(raft::server_id dst_id, locator::global_tablet_id, std::vector<locator::global_tablet_id>, service::session_id session [[version 2025.4]]) -> service::tablet_operation_repair_result;
verb [[]] estimate_sstable_volume(table_id table) -> uint64_t;
verb [[]] sample_sstables(table_id table, uint64_t chunk_size, uint64_t n_chunks) -> utils::chunked_vector<temporary_buffer<char>>;

@@ -15,15 +15,7 @@ class update_backlog {
    size_t get_max_bytes();
};

struct view_task_result {
    enum class command_status: uint8_t {
        success,
        abort,
    };
    db::view::view_task_result::command_status status;
};

}
}

verb [[cancellable]] work_on_view_building_tasks(std::vector<utils::UUID> tasks_ids) -> std::vector<db::view::view_task_result>
verb [[cancellable]] work_on_view_building_tasks(raft::term_t term, shard_id shard, std::vector<utils::UUID> tasks_ids) -> std::vector<utils::UUID>

@@ -15,6 +15,9 @@ target_link_libraries(index
    xxHash::xxhash
  PRIVATE
    cql3)
if (Scylla_USE_PRECOMPILED_HEADER_USE)
  target_precompile_headers(index REUSE_FROM scylla-precompiled-header)
endif()

check_headers(check-headers index
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)

Some files were not shown because too many files have changed in this diff.