mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-20 08:30:35 +00:00
Compare commits
240 Commits
fix_sl_v2_
...
copilot/cr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
410b38fd6d | ||
|
|
b9beb281d3 | ||
|
|
537b1511a5 | ||
|
|
513044fe32 | ||
|
|
b3690f0815 | ||
|
|
6b475a8215 | ||
|
|
a0fb037395 | ||
|
|
d5aa7e3aff | ||
|
|
727c2b529d | ||
|
|
33cf97d688 | ||
|
|
e144d5b0bb | ||
|
|
69249671a7 | ||
|
|
27aaafb8aa | ||
|
|
9c1e310b0d | ||
|
|
f955a90309 | ||
|
|
4ca40929ef | ||
|
|
079fe17e8b | ||
|
|
aef5ff7491 | ||
|
|
38c4a14a5b | ||
|
|
f83f911bae | ||
|
|
a256ba7de0 | ||
|
|
c5239edf2a | ||
|
|
ac4af5f461 | ||
|
|
628e74f157 | ||
|
|
81d11a23ce | ||
|
|
bb99bfe815 | ||
|
|
dc8f7c9d62 | ||
|
|
7a3ce5f91e | ||
|
|
5d1e6243af | ||
|
|
10c278fff7 | ||
|
|
a04dbac369 | ||
|
|
0753d9fae5 | ||
|
|
6eca74b7bb | ||
|
|
b30ecb72d5 | ||
|
|
6b9fcc6ca3 | ||
|
|
47e827262f | ||
|
|
c63f43975f | ||
|
|
03ff091bee | ||
|
|
a427ad3bf9 | ||
|
|
3adf8b58c4 | ||
|
|
19ea05692c | ||
|
|
77480c9d8f | ||
|
|
64b38a2d0a | ||
|
|
48b01e72fa | ||
|
|
ed9a96fdb7 | ||
|
|
3a422e82b4 | ||
|
|
84caa94340 | ||
|
|
8c42704c72 | ||
|
|
b5c3587588 | ||
|
|
a63ad48b0f | ||
|
|
10b81c1e97 | ||
|
|
508bb97089 | ||
|
|
3682c06157 | ||
|
|
df69dbec2a | ||
|
|
f23e796e76 | ||
|
|
88c4ca3697 | ||
|
|
acc54cf304 | ||
|
|
419636ca8f | ||
|
|
2b3f3d9ba7 | ||
|
|
68981cc90b | ||
|
|
c907fc6789 | ||
|
|
b0afd3aa63 | ||
|
|
298aca7da8 | ||
|
|
136db260ca | ||
|
|
ec2f99b3d1 | ||
|
|
1f28a55448 | ||
|
|
bcf0114e90 | ||
|
|
af2d7a146f | ||
|
|
08268eee3f | ||
|
|
ceec703bb7 | ||
|
|
cc03f5c89d | ||
|
|
68b105b21c | ||
|
|
6f3f30ee07 | ||
|
|
b8c75673c8 | ||
|
|
6676953555 | ||
|
|
e216504113 | ||
|
|
b93472d595 | ||
|
|
92dbde54a5 | ||
|
|
7e7b9977c5 | ||
|
|
6c547e1692 | ||
|
|
867a1ca346 | ||
|
|
53f58b85b7 | ||
|
|
408c6ea3ee | ||
|
|
c92962ca45 | ||
|
|
9d4a5ade08 | ||
|
|
a08c53ae4b | ||
|
|
625f292417 | ||
|
|
576ad29ddb | ||
|
|
64c774c23a | ||
|
|
e18b519692 | ||
|
|
71be10b8d6 | ||
|
|
80e627c64b | ||
|
|
f49c9e896a | ||
|
|
3d1558be7e | ||
|
|
f150629948 | ||
|
|
7984925059 | ||
|
|
a6fdda86b5 | ||
|
|
56e212ea8d | ||
|
|
258a1a03e3 | ||
|
|
ea29e4963e | ||
|
|
d974ee1e21 | ||
|
|
a74b442c65 | ||
|
|
3158e9b017 | ||
|
|
937d008d3c | ||
|
|
f6d7f606aa | ||
|
|
e978cc2a80 | ||
|
|
347c69b7e2 | ||
|
|
482ffe06fd | ||
|
|
a8767f36da | ||
|
|
ec6a2661de | ||
|
|
cb1d05d65a | ||
|
|
5d4e2ec522 | ||
|
|
1454228a05 | ||
|
|
3ebd02513a | ||
|
|
2439d27b60 | ||
|
|
5ce12f2404 | ||
|
|
0da1a222fc | ||
|
|
1713d75c0d | ||
|
|
7e1bbbd937 | ||
|
|
8a613960af | ||
|
|
fde09fd136 | ||
|
|
21348050e8 | ||
|
|
2d3a40e023 | ||
|
|
5a7cea00d0 | ||
|
|
df949dc506 | ||
|
|
ee631f31a0 | ||
|
|
7c49711906 | ||
|
|
42fdea7410 | ||
|
|
e1f623dd69 | ||
|
|
a2c1569e04 | ||
|
|
8d2689d1b5 | ||
|
|
2ffe5b7d80 | ||
|
|
47315c63dc | ||
|
|
b7dccdbe93 | ||
|
|
931a38de6e | ||
|
|
834921251b | ||
|
|
335e81cdf7 | ||
|
|
8e831a7b6d | ||
|
|
9715965d0c | ||
|
|
ef0e9ad34a | ||
|
|
4a161bff2d | ||
|
|
7228bd1502 | ||
|
|
615b86e88b | ||
|
|
12fdd205d6 | ||
|
|
0d090aa47b | ||
|
|
f2b0146f0f | ||
|
|
df32318f66 | ||
|
|
834961c308 | ||
|
|
02af292869 | ||
|
|
59f2a3ce72 | ||
|
|
ebac810c4e | ||
|
|
7ac32097da | ||
|
|
32b336e062 | ||
|
|
29da20744a | ||
|
|
d28c841fa9 | ||
|
|
8829098e90 | ||
|
|
3ef594f9eb | ||
|
|
0f86fc680c | ||
|
|
10e9672852 | ||
|
|
87920d16d8 | ||
|
|
966119ce30 | ||
|
|
dded1feeb7 | ||
|
|
20a2b944df | ||
|
|
16b56c2451 | ||
|
|
c61d855250 | ||
|
|
9daa109d2c | ||
|
|
fa5ed619e8 | ||
|
|
3f10f44232 | ||
|
|
f1c6094150 | ||
|
|
0e07c6556d | ||
|
|
19af46d83a | ||
|
|
edc291961b | ||
|
|
5d5e829107 | ||
|
|
55d246ce76 | ||
|
|
3483452d9f | ||
|
|
755e8319ee | ||
|
|
756837c5b4 | ||
|
|
9d1933492a | ||
|
|
32cc593558 | ||
|
|
912c48a806 | ||
|
|
3a31380b2c | ||
|
|
a05a4593a6 | ||
|
|
6eb7dba352 | ||
|
|
25ff4bec2a | ||
|
|
3d3fabf5fb | ||
|
|
f06094aa95 | ||
|
|
0a058e53c7 | ||
|
|
b12f46babd | ||
|
|
7c331b7319 | ||
|
|
440590f823 | ||
|
|
84281f900f | ||
|
|
c25b770342 | ||
|
|
954d18903e | ||
|
|
2c3ab8490c | ||
|
|
114f88cb9b | ||
|
|
87c1c6f40f | ||
|
|
ec70cea2a1 | ||
|
|
77435206b9 | ||
|
|
111b376d0d | ||
|
|
e297ed0b88 | ||
|
|
4639681907 | ||
|
|
2399bb8995 | ||
|
|
6f32290756 | ||
|
|
a93ad3838f | ||
|
|
02d59a0529 | ||
|
|
57b2cd2c16 | ||
|
|
a84b1b8b78 | ||
|
|
1796997ace | ||
|
|
cb2aa85cf5 | ||
|
|
55422593a7 | ||
|
|
cc5ac75d73 | ||
|
|
66a33619da | ||
|
|
5b3e513cba | ||
|
|
ce0c7b5896 | ||
|
|
359d0b7a3e | ||
|
|
bd9d5ad75b | ||
|
|
1aadedc596 | ||
|
|
70f5bc1a50 | ||
|
|
13fb605edb | ||
|
|
3dea15bc9d | ||
|
|
f375288b58 | ||
|
|
21900c55eb | ||
|
|
a1ed73820f | ||
|
|
d228e6eda6 | ||
|
|
32543625fc | ||
|
|
7bf7ff785a | ||
|
|
0d20300313 | ||
|
|
a033b70704 | ||
|
|
9baaddb613 | ||
|
|
51285785fa | ||
|
|
757e9d0f52 | ||
|
|
4ffa070715 | ||
|
|
0fcd369ef2 | ||
|
|
32173ccfe1 | ||
|
|
f89a8c4ec4 | ||
|
|
3b8bf85fbc | ||
|
|
1129599df8 | ||
|
|
1318ff5a0d | ||
|
|
7b1060fad3 | ||
|
|
ef63fe400a |
7
.github/copilot-instructions.md
vendored
7
.github/copilot-instructions.md
vendored
@@ -95,3 +95,10 @@ ninja build/<mode>/scylla
|
||||
- Tests should strive to be repeatable, and not use random input that will make their results unpredictable.
|
||||
- Tests should consume as little resources as possible. Prefer running tests on a single node if it is sufficient, for example.
|
||||
|
||||
## Code Review
|
||||
When performing code reviews, follow the comprehensive patterns and checks documented in:
|
||||
- **`.github/instructions/reviewer.instructions.md`** - Complete review skill with examples and feedback templates
|
||||
- **`.github/instructions/review-checklist.md`** - Quick reference checklist for reviews
|
||||
|
||||
These documents capture common review patterns from ScyllaDB maintainers and provide structured guidance for high-quality code reviews.
|
||||
|
||||
|
||||
8
.github/instructions/cpp.instructions.md
vendored
8
.github/instructions/cpp.instructions.md
vendored
@@ -6,6 +6,14 @@ applyTo: "**/*.{cc,hh}"
|
||||
|
||||
**Important:** Always match the style and conventions of existing code in the file and directory.
|
||||
|
||||
## External C++ Resources
|
||||
|
||||
Follow standard C++ best practices from:
|
||||
- **[ISO C++ FAQ](https://isocpp.org/faq)** - Language features, idioms, and common questions
|
||||
- **[C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)** - Modern C++ best practices
|
||||
|
||||
**Note:** ScyllaDB has specific requirements (Seastar async patterns, performance constraints) that may supersede general guidelines. ScyllaDB-specific conventions documented here take precedence.
|
||||
|
||||
## Memory Management
|
||||
- Prefer stack allocation whenever possible
|
||||
- Use `std::unique_ptr` by default for dynamic allocations
|
||||
|
||||
124
.github/instructions/review-checklist.md
vendored
Normal file
124
.github/instructions/review-checklist.md
vendored
Normal file
@@ -0,0 +1,124 @@
|
||||
# ScyllaDB Code Review Checklist
|
||||
|
||||
Quick reference for code reviewers. See `reviewer.instructions.md` for detailed guidance.
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Critical (P0) - Must Fix Before Merge
|
||||
|
||||
### Async/Seastar Violations
|
||||
- [ ] No `.get()` calls on futures (blocks reactor)
|
||||
- [ ] All async operations use `co_await`
|
||||
- [ ] No blocking I/O operations
|
||||
- [ ] No `std::mutex` (use `seastar::semaphore`)
|
||||
|
||||
### Exception Handling
|
||||
- [ ] No exceptions in hot paths (use `std::expected` or results)
|
||||
- [ ] No exceptions for control flow
|
||||
- [ ] Correct `noexcept` specifications
|
||||
- [ ] Exceptions properly handled in coroutines
|
||||
|
||||
### Memory Management
|
||||
- [ ] No raw `new`/`delete` (use smart pointers)
|
||||
- [ ] Large objects passed by `const&` or `&&`
|
||||
- [ ] Pre-allocation when size is known
|
||||
- [ ] Correct smart pointer type (`lw_shared_ptr` vs `shared_ptr`)
|
||||
|
||||
### Test Quality
|
||||
- [ ] No hardcoded `sleep()` (use proper synchronization)
|
||||
- [ ] Consistency levels specified (CL=ALL for setup)
|
||||
- [ ] Test validates the actual fix
|
||||
- [ ] Test runs in correct mode (debug/release guards)
|
||||
|
||||
---
|
||||
|
||||
## ⚡ High Priority (P1) - Should Fix
|
||||
|
||||
### Naming & API Design
|
||||
- [ ] Function names are specific, not generic
|
||||
- [ ] Consistent verb usage in related functions
|
||||
- [ ] Functions in appropriate namespace
|
||||
- [ ] Clear API boundaries
|
||||
|
||||
### Error Handling
|
||||
- [ ] All function calls checked for errors
|
||||
- [ ] Null pointer checks before dereferencing
|
||||
- [ ] Errors logged with context
|
||||
- [ ] No silently ignored errors
|
||||
|
||||
### Resource Management
|
||||
- [ ] RAII patterns used throughout
|
||||
- [ ] No manual resource management
|
||||
- [ ] Resources cleaned up in error paths
|
||||
- [ ] Appropriate smart pointer usage
|
||||
|
||||
### Test Coverage
|
||||
- [ ] Bug fixes include tests
|
||||
- [ ] New features have tests
|
||||
- [ ] Negative test cases included
|
||||
- [ ] Edge cases covered
|
||||
|
||||
---
|
||||
|
||||
## 📋 Medium Priority (P2) - Nice to Fix
|
||||
|
||||
### Code Style
|
||||
- [ ] Spaces after commas
|
||||
- [ ] Consistent formatting
|
||||
- [ ] `fmt` instead of streams
|
||||
- [ ] Python follows PEP 8
|
||||
|
||||
### Documentation
|
||||
- [ ] Comments explain "why", not "what"
|
||||
- [ ] No obvious comments
|
||||
- [ ] Preconditions documented
|
||||
- [ ] Public APIs documented
|
||||
|
||||
### Code Organization
|
||||
- [ ] Commit messages have subsystem prefixes
|
||||
- [ ] Related changes together
|
||||
- [ ] No copy-paste without refactoring
|
||||
- [ ] Appropriate commit structure
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Quick Spot Checks
|
||||
|
||||
**In every PR, quickly scan for:**
|
||||
|
||||
1. Any `.get()` on a future → P0
|
||||
2. Exception thrown in a loop → P0
|
||||
3. Raw `new` or `delete` → P0
|
||||
4. `sleep()` in a test → P0
|
||||
5. Generic function name like `process()` → P1
|
||||
6. Missing error check → P1
|
||||
7. Manual resource management → P1
|
||||
8. Missing test for bug fix → P1
|
||||
|
||||
---
|
||||
|
||||
## 📚 Reference Mantras
|
||||
|
||||
1. "Make it obvious" - Self-documenting code
|
||||
2. "Don't block the reactor" - Always async
|
||||
3. "Keep commits bisectable"
|
||||
4. "Test what you fix"
|
||||
5. "Subsystem prefixes matter"
|
||||
6. "Don't allocate in hot paths"
|
||||
7. "RAII everything"
|
||||
8. "Fail fast with context"
|
||||
9. "One fiber per connection"
|
||||
10. "Results over exceptions in data path"
|
||||
|
||||
---
|
||||
|
||||
## 🔗 Related Guidelines
|
||||
|
||||
- Full details: `.github/instructions/reviewer.instructions.md`
|
||||
- C++ patterns: `.github/instructions/cpp.instructions.md`
|
||||
- Python patterns: `.github/instructions/python.instructions.md`
|
||||
- Build & test: `.github/copilot-instructions.md`
|
||||
|
||||
---
|
||||
|
||||
**Quick Tip:** Start with P0 issues. If any found, request fixes before deeper review.
|
||||
942
.github/instructions/reviewer.instructions.md
vendored
Normal file
942
.github/instructions/reviewer.instructions.md
vendored
Normal file
@@ -0,0 +1,942 @@
|
||||
# ScyllaDB Code Review Skill
|
||||
|
||||
**Purpose:** Perform in-depth code reviews similar to ScyllaDB maintainers
|
||||
**Based on:** Analysis of **1,009 PRs** (2022-2025) and **~12,222 review comments**
|
||||
**Last Updated:** February 2026
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
This document captures common review patterns, feedback themes, and critical checks from ScyllaDB maintainers. Based on comprehensive analysis of over 1,000 pull requests spanning 4 years, it provides structured guidance for high-quality code reviews that maintain ScyllaDB's standards for correctness, performance, and readability.
|
||||
|
||||
**Analysis Scope:**
|
||||
- 1,009 merged PRs analyzed (2022-2025)
|
||||
- ~12,222 review comments examined
|
||||
- 169 high-activity PRs (30+ comments) analyzed in depth
|
||||
- 25+ distinct review patterns identified
|
||||
|
||||
## External C++ Resources
|
||||
|
||||
This review skill is supplemented by standard C++ best practices:
|
||||
|
||||
- **[ISO C++ FAQ](https://isocpp.org/faq)** - Comprehensive C++ guidance on language features, best practices, and common pitfalls
|
||||
- **[C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)** - Modern C++ best practices covering resource management, interfaces, functions, classes, and more
|
||||
|
||||
When reviewing C++ code, consider these resources alongside ScyllaDB-specific patterns documented here. The Core Guidelines are particularly relevant for:
|
||||
- Resource management (RAII, smart pointers, lifetimes)
|
||||
- Function design (parameter passing, error handling)
|
||||
- Class design (constructors, operators, interfaces)
|
||||
- Concurrency (though ScyllaDB uses Seastar patterns instead of std::thread)
|
||||
|
||||
**Note:** ScyllaDB has specific conventions (e.g., Seastar async patterns, `seastar::lw_shared_ptr`) that may differ from general C++ guidelines. ScyllaDB-specific patterns take precedence.
|
||||
|
||||
## Review Priority Levels
|
||||
|
||||
- **P0 (CRITICAL):** Must be fixed - can cause crashes, data loss, or severe performance issues
|
||||
- **P1 (HIGH):** Should be fixed - impacts correctness, maintainability, or moderate performance
|
||||
- **P2 (MEDIUM):** Nice to fix - style, documentation, minor optimizations
|
||||
|
||||
---
|
||||
|
||||
## P0: Critical Issues (Must Fix Before Merge)
|
||||
|
||||
### 1. Async/Seastar Violations
|
||||
|
||||
**Why Critical:** Can block the reactor thread, causing the entire system to hang
|
||||
|
||||
**Check for:**
|
||||
- `.get()` calls on futures (converts async to blocking)
|
||||
- Missing `co_await` keywords in coroutines
|
||||
- Synchronous I/O or blocking operations in async contexts
|
||||
- Using `std::mutex` instead of `seastar::semaphore`
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ WRONG: Blocks the reactor
|
||||
auto result = some_future().get();
|
||||
|
||||
// ✅ CORRECT: Async all the way
|
||||
auto result = co_await some_future();
|
||||
|
||||
// ❌ WRONG: Blocking I/O
|
||||
std::ifstream file("data.txt");
|
||||
|
||||
// ✅ CORRECT: Seastar async I/O
|
||||
co_await file_io.read(...);
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
This function performs blocking operations using `.get()` calls which can block
|
||||
the entire reactor thread. Consider making this function async and using `co_await`.
|
||||
|
||||
Example:
|
||||
```cpp
|
||||
seastar::future<T> func() {
|
||||
auto result = co_await async_operation();
|
||||
co_return result;
|
||||
}
|
||||
```
|
||||
```
|
||||
|
||||
### 2. Exception Handling in Data Path
|
||||
|
||||
**Why Critical:** Exceptions in hot paths cause performance degradation and can be incorrectly propagated
|
||||
|
||||
**Check for:**
|
||||
- Throwing exceptions in loops or per-row operations
|
||||
- Using exceptions for control flow
|
||||
- Missing `noexcept` when functions truly don't throw
|
||||
- Incorrect `noexcept` on functions that can throw (including transitive calls)
|
||||
- Unhandled exceptions in coroutines
|
||||
- Functions with `noexcept` that call throwing functions
|
||||
|
||||
**Common noexcept Issues (from PR #27476 analysis):**
|
||||
- Container operations beyond inline capacity throw (e.g., `small_vector` when exceeding N)
|
||||
- Functions marked `noexcept` but calling other functions that can throw
|
||||
- Need to check entire call chain, not just direct function body
|
||||
- Coroutines can keep `noexcept` - exceptions convert to exceptional futures
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ WRONG: Exception for control flow
|
||||
try {
|
||||
if (!condition) throw control_exception();
|
||||
} catch (control_exception&) {
|
||||
// handle
|
||||
}
|
||||
|
||||
// ✅ CORRECT: Use return values or std::expected
|
||||
if (!condition) {
|
||||
return std::unexpected(error_code);
|
||||
}
|
||||
|
||||
// ❌ WRONG: Can throw but marked noexcept
|
||||
void process() noexcept {
|
||||
vector.push_back(item); // can throw std::bad_alloc
|
||||
}
|
||||
|
||||
// ✅ CORRECT: Remove noexcept
|
||||
void process() {
|
||||
vector.push_back(item);
|
||||
}
|
||||
|
||||
// ❌ WRONG: small_vector exceeds capacity
|
||||
small_vector<T, 3> get_items() noexcept {
|
||||
small_vector<T, 3> result;
|
||||
for (int i = 0; i < 10; i++) { // Will exceed capacity=3!
|
||||
result.push_back(compute(i)); // Throws std::bad_alloc
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// ✅ CORRECT: Either remove noexcept or ensure size <= capacity
|
||||
small_vector<T, 10> get_items() noexcept { // Increased capacity
|
||||
small_vector<T, 10> result;
|
||||
for (int i = 0; i < 10; i++) {
|
||||
result.push_back(compute(i)); // Won't exceed inline capacity
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// ✅ CORRECT: Coroutines may keep noexcept. An exception thrown inside the
// coroutine body does not escape the initial call: the coroutine machinery
// captures it and delivers it to the caller as an exceptional future, so the
// noexcept specification is not violated. No try/catch is needed — and a
// catch (...) that swallows the exception would silently hide failures.
seastar::future<void> process() noexcept {
    co_await work_that_might_throw();
}
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
In the data path, avoid exceptions for performance. Consider:
|
||||
1. Using `std::expected<T, E>` or `boost::outcome` for results
|
||||
2. Returning error codes or status enums
|
||||
3. If exceptions must be used, catch and convert to exceptional_future
|
||||
|
||||
Also, this function is marked `noexcept` but can throw `std::bad_alloc` from
|
||||
vector allocation. Either remove `noexcept` or ensure the operation truly cannot throw.
|
||||
|
||||
Note: Check the entire call chain - if this function calls other functions that
|
||||
can throw, this function cannot be noexcept (unless it's a coroutine where exceptions
|
||||
automatically convert to exceptional futures).
|
||||
|
||||
See PR #27476 for detailed discussion on noexcept specifications.
|
||||
```
|
||||
|
||||
### 3. Memory Management Issues
|
||||
|
||||
**Why Critical:** Can cause memory leaks, crashes, or severe performance degradation
|
||||
|
||||
**Check for:**
|
||||
- Raw `new`/`delete` usage (should use smart pointers)
|
||||
- Missing RAII patterns
|
||||
- Unnecessary copies of large objects (pass by `const&` or `&&`)
|
||||
- Allocations in hot paths without pre-allocation
|
||||
- Wrong choice of smart pointer type
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ WRONG: Raw pointer, manual management
|
||||
auto* obj = new MyObject();
|
||||
// ... what if exception is thrown?
|
||||
delete obj;
|
||||
|
||||
// ✅ CORRECT: RAII with unique_ptr
|
||||
auto obj = std::make_unique<MyObject>();
|
||||
|
||||
// ❌ WRONG: Expensive copy in hot path
|
||||
void process(large_object obj) { }
|
||||
|
||||
// ✅ CORRECT: Pass by const reference
|
||||
void process(const large_object& obj) { }
|
||||
|
||||
// ❌ WRONG: Repeated allocations
|
||||
for (int i = 0; i < n; i++) {
|
||||
vec.push_back(compute(i));
|
||||
}
|
||||
|
||||
// ✅ CORRECT: Pre-allocate
|
||||
vec.reserve(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
vec.push_back(compute(i));
|
||||
}
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
This code uses raw `new`/`delete` which can leak if an exception is thrown.
|
||||
Use RAII with `std::unique_ptr` or `seastar::lw_shared_ptr`.
|
||||
|
||||
Also, consider pre-allocating the vector since the size is known:
|
||||
```cpp
|
||||
vec.reserve(known_size);
|
||||
```
|
||||
This will avoid multiple reallocations in the hot path.
|
||||
```
|
||||
|
||||
### 4. Test Quality Issues
|
||||
|
||||
**Why Critical:** Flaky tests waste CI resources and hide real bugs
|
||||
|
||||
**Check for:**
|
||||
- Hardcoded `sleep()` calls (use proper synchronization)
|
||||
- Missing consistency levels in distributed tests
|
||||
- Tests that don't validate the fix
|
||||
- Tests without proper mode guards (debug/release)
|
||||
- High tolerance for failures (e.g., "passes if only fails 10% of time")
|
||||
|
||||
**Example Issues:**
|
||||
```python
|
||||
# ❌ WRONG: Race condition with sleep
|
||||
await insert_data(key, value)
|
||||
await asyncio.sleep(1) # Hope it replicates?
|
||||
assert await read_data(key) == value
|
||||
|
||||
# ✅ CORRECT: Use consistency level
|
||||
await insert_data(key, value, consistency_level=Consistency.ALL)
|
||||
assert await read_data(key) == value
|
||||
|
||||
# ❌ WRONG: Test doesn't validate the fix
|
||||
def test_fix():
|
||||
# Does something
|
||||
assert True # Always passes, even without the fix
|
||||
|
||||
# ✅ CORRECT: Ensure test fails without fix
|
||||
def test_fix():
|
||||
# First verify test would fail without fix
|
||||
# Then apply fix and verify it passes
|
||||
result = the_fixed_function()
|
||||
assert result == expected_value # Fails without fix
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
This test has potential reliability issues:
|
||||
|
||||
1. **Timing**: Instead of `sleep(1)`, use `consistency_level=Consistency.ALL`
|
||||
when inserting test data to ensure replicas are synchronized.
|
||||
|
||||
2. **Validation**: Did you verify this test fails without your fix? Please run
|
||||
the test with your fix temporarily disabled to confirm it catches the bug.
|
||||
|
||||
3. **Stability**: Consider running with `--repeat 100` to check for flakiness.
|
||||
```
|
||||
|
||||
### 5. Tablets Compatibility Issues
|
||||
|
||||
**Why Critical:** Code using vnodes assumptions breaks with tablets feature (modern ScyllaDB)
|
||||
|
||||
**Check for:**
|
||||
- Using `calculate_natural_endpoints()` (vnodes only)
|
||||
- Accessing `token_metadata` directly instead of through ERM
|
||||
- Assumptions about token ownership without tablets support
|
||||
- Maintenance/recovery operations incompatible with tablets
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ WRONG: Doesn't work with tablets (vnodes only)
|
||||
auto& strat = table.get_replication_strategy();
|
||||
auto endpoints = strat.calculate_natural_endpoints(token, tm);
|
||||
|
||||
// ✅ CORRECT: Works with both vnodes and tablets
|
||||
auto erm = table.get_effective_replication_map();
|
||||
auto endpoints = erm->get_natural_endpoints(token);
|
||||
|
||||
// ❌ WRONG: Direct token_metadata access
|
||||
auto& tm = get_token_metadata();
|
||||
auto endpoints = tm.get_topology().get_endpoints(token);
|
||||
|
||||
// ✅ CORRECT: Use ERM abstraction
|
||||
auto endpoints = erm->get_natural_endpoints(token);
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
This code uses `calculate_natural_endpoints()` which only works with vnodes.
|
||||
With tablets enabled, this will not work correctly.
|
||||
|
||||
Use the effective replication map (ERM) instead:
|
||||
```cpp
|
||||
auto erm = table.get_effective_replication_map();
|
||||
auto endpoints = erm->get_natural_endpoints(token);
|
||||
```
|
||||
|
||||
This works for both vnodes and tablets configurations.
|
||||
|
||||
See PR #15974, #21207 for examples.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## P1: High Priority Issues (Should Fix)
|
||||
|
||||
### 6. Poor Naming and API Design
|
||||
|
||||
**Check for:**
|
||||
- Overly generic function names without context
|
||||
- Inconsistent verb usage in related functions
|
||||
- Functions in wrong namespace
|
||||
- Unclear API boundaries
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ WRONG: Too generic
|
||||
namespace cql3::restrictions {
|
||||
json to_json(const data& d); // to_json of what?
|
||||
}
|
||||
|
||||
// ✅ CORRECT: Specific and clear
|
||||
namespace vector_search {
|
||||
json to_json(const restrictions& r); // Clear context
|
||||
}
|
||||
// or
|
||||
json to_vector_search_json(const restrictions& r);
|
||||
|
||||
// ❌ WRONG: Inconsistent verbs
|
||||
void parse_data();
|
||||
void extract_info();
|
||||
void process_items();
|
||||
|
||||
// ✅ CORRECT: Consistent naming
|
||||
void parse_data();
|
||||
void parse_info();
|
||||
void parse_items();
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
The function name 'to_json' is too general. Since this is converting restrictions
|
||||
to JSON specifically for vector-search consumption, consider:
|
||||
|
||||
1. `restrictions_to_vector_search_json()` (descriptive)
|
||||
2. Moving to `vector_search::` namespace with name `to_json()`
|
||||
3. Adding a comment explaining this is for the vector-store protocol only
|
||||
|
||||
Also, you used 'parse', 'extract', and 'process' in this patch. Pick one verb
|
||||
for consistency since they all do similar things.
|
||||
```
|
||||
|
||||
### 7. Missing Error Handling
|
||||
|
||||
**Check for:**
|
||||
- Unchecked function calls that can fail
|
||||
- Missing null pointer checks
|
||||
- Silently ignored errors
|
||||
- Poor error messages without context
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ WRONG: Unchecked access
|
||||
auto node = topology.get_node(host_id); // Throws if not found
|
||||
node->process();
|
||||
|
||||
// ✅ CORRECT: Check first
|
||||
auto node = topology.find_node(host_id);
|
||||
if (!node) {
|
||||
return make_exception_future<>(std::runtime_error(
|
||||
format("Node {} not found in topology", host_id)));
|
||||
}
|
||||
node->process();
|
||||
|
||||
// ❌ WRONG: Silent error
|
||||
if (auto err = do_something()) {
|
||||
// Ignore error
|
||||
}
|
||||
|
||||
// ✅ CORRECT: Handle or log
|
||||
if (auto err = do_something()) {
|
||||
logger.warn("Failed to do_something: {}", err);
|
||||
return std::unexpected(err);
|
||||
}
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
This code calls `get_node(host_id)` which throws if the node doesn't exist.
|
||||
During restart scenarios, the node may not be in topology yet.
|
||||
|
||||
Consider using `find_node(host_id)` and checking for nullptr:
|
||||
```cpp
|
||||
auto node = topology.find_node(host_id);
|
||||
if (!node) {
|
||||
on_internal_error(logger, format("Expected node {} in topology", host_id));
|
||||
}
|
||||
```
|
||||
|
||||
If the node must exist at this point, add an assertion with context for debugging.
|
||||
```
|
||||
|
||||
### 8. Resource Management Problems
|
||||
|
||||
**Check for:**
|
||||
- Manual resource management instead of RAII
|
||||
- Incorrect smart pointer choices
|
||||
- Leaked resources in error paths
|
||||
- Missing cleanup on exceptions
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ WRONG: Manual management
|
||||
semaphore sem;
|
||||
sem.acquire().get();
|
||||
try {
|
||||
do_work();
|
||||
} catch (...) {
|
||||
sem.signal();
|
||||
throw;
|
||||
}
|
||||
sem.signal();
|
||||
|
||||
// ✅ CORRECT: RAII
|
||||
auto units = get_units(sem);
|
||||
do_work();
|
||||
|
||||
// ❌ WRONG: std::shared_ptr for single-shard ownership
|
||||
auto obj = std::make_shared<MyObject>();
|
||||
|
||||
// ✅ CORRECT: seastar::lw_shared_ptr for same-shard
|
||||
auto obj = make_lw_shared<MyObject>();
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
This code manually manages semaphore units, which can leak if an exception is thrown.
|
||||
Use RAII with `get_units()`:
|
||||
|
||||
```cpp
|
||||
auto units = co_await get_units(sem, 1);
|
||||
co_await do_work();
|
||||
// units automatically released
|
||||
```
|
||||
|
||||
Also, for same-shard sharing, prefer `seastar::lw_shared_ptr` over `std::shared_ptr`
|
||||
for better performance.
|
||||
```
|
||||
|
||||
### 9. Missing Test Coverage
|
||||
|
||||
**Check for:**
|
||||
- Bug fixes without tests
|
||||
- New features without tests
|
||||
- Missing negative test cases
|
||||
- No edge case testing
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
This PR fixes a bug but doesn't include a test. Please add a test that:
|
||||
1. Reproduces the original bug (fails without your fix)
|
||||
2. Passes with your fix applied
|
||||
3. Documents which issue it tests (add comment with issue number)
|
||||
|
||||
Example:
|
||||
```python
|
||||
async def test_issue_12345_node_restart():
|
||||
"""Test that node restart doesn't cause data loss.
|
||||
|
||||
Reproduces: https://github.com/scylladb/scylladb/issues/12345
|
||||
"""
|
||||
# Test implementation
|
||||
```
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## P2: Medium Priority Issues (Nice to Fix)
|
||||
|
||||
### 10. Code Style and Formatting
|
||||
|
||||
**Check for:**
|
||||
- Missing spaces after commas
|
||||
- Wrong indentation
|
||||
- Old-style streams instead of fmt
|
||||
- PEP 8 violations in Python
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ Style issues
|
||||
function(a,b,c); // Missing spaces
|
||||
std::cout << value << std::endl; // Use fmt
|
||||
|
||||
// ✅ Correct style
|
||||
function(a, b, c);
|
||||
fmt::print("{}\n", value);
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
nit: Missing space after comma
|
||||
nit: Consider using `fmt::print()` instead of streams for consistency
|
||||
```
|
||||
|
||||
### 11. Documentation Issues
|
||||
|
||||
**Check for:**
|
||||
- Comments that explain "what" instead of "why"
|
||||
- Obvious comments that restate code
|
||||
- Missing precondition documentation
|
||||
- Undocumented public APIs
|
||||
|
||||
**Example Issues:**
|
||||
```cpp
|
||||
// ❌ WRONG: States the obvious
|
||||
// Increment counter
|
||||
counter++;
|
||||
|
||||
// ✅ CORRECT: Explains why
|
||||
// Track retries for backoff calculation
|
||||
counter++;
|
||||
|
||||
// ❌ WRONG: Missing preconditions
|
||||
void process(node* n) {
|
||||
n->update(); // What if n is null?
|
||||
}
|
||||
|
||||
// ✅ CORRECT: Document assumptions
|
||||
/// Process node.
|
||||
/// Precondition: n must be non-null and in the topology
|
||||
void process(node* n) {
|
||||
assert(n != nullptr);
|
||||
n->update();
|
||||
}
|
||||
```
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
nit: Eliminate this comment - the function name is self-explanatory.
|
||||
|
||||
For public APIs, please add documentation explaining:
|
||||
- What the function does
|
||||
- Preconditions and assumptions
|
||||
- Return value meaning
|
||||
- Any side effects
|
||||
```
|
||||
|
||||
### 11. Code Organization
|
||||
|
||||
**Check for:**
|
||||
- Missing subsystem prefixes in commit messages
|
||||
- Unrelated changes mixed together
|
||||
- Copy-pasted code that should be extracted
|
||||
- Changes that should be squashed
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
Please add a subsystem prefix to the commit message (e.g., 'raft:', 'cql:', 'test:').
|
||||
This helps with:
|
||||
- Browsing changelog
|
||||
- Quick relevance assessment
|
||||
- Bisecting issues
|
||||
- Generating release notes
|
||||
|
||||
Example: "raft: Fix node restart issue" instead of "Fix node restart issue"
|
||||
```
|
||||
|
||||
### 12. Minor Optimizations
|
||||
|
||||
**Check for:**
|
||||
- Redundant operations
|
||||
- Inefficient data structures for use case
|
||||
- Unnecessary intermediate structures
|
||||
|
||||
**Feedback Template:**
|
||||
```
|
||||
Could you convert `_restrictions` directly to JSON without the intermediate structure?
|
||||
This would avoid an extra allocation and copy.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Code Review Workflow
|
||||
|
||||
### Phase 1: Critical Issues (5 minutes)
|
||||
1. Scan for async violations (`.get()`, blocking calls)
|
||||
2. Check exception handling in data paths
|
||||
3. Identify memory management issues
|
||||
4. Flag test reliability problems
|
||||
|
||||
**If P0 issues found:** Stop here and request fixes before deeper review
|
||||
|
||||
### Phase 2: Design Review (10 minutes)
|
||||
1. Evaluate API design and naming
|
||||
2. Check error handling completeness
|
||||
3. Review resource management patterns
|
||||
4. Assess test coverage
|
||||
|
||||
### Phase 3: Polish (5 minutes)
|
||||
1. Style and formatting
|
||||
2. Documentation quality
|
||||
3. Code simplification opportunities
|
||||
4. Commit message quality
|
||||
|
||||
---
|
||||
|
||||
## Review Output Format
|
||||
|
||||
**IMPORTANT: Keep reviews concise and actionable**
|
||||
|
||||
### Summary (Required)
|
||||
Provide a **single sentence** summarizing the most critical issue(s), **only if confident**:
|
||||
- ✅ "P0: This PR blocks the reactor with `.get()` calls in 3 locations - must use `co_await` instead"
|
||||
- ✅ "P1: Missing error handling for null pointer in `process_node()` - add `find_node()` check"
|
||||
- ✅ "No critical issues found. Minor: Consider pre-allocating vector in hot path (line 42)"
|
||||
|
||||
**If uncertain or no major issues:** Skip the summary, go directly to specific comments.
|
||||
|
||||
### Detailed Comments (As Needed)
|
||||
- Focus on P0/P1 issues
|
||||
- Be specific with file/line references
|
||||
- Provide concrete fix suggestions
|
||||
- Avoid long explanations of "why" unless critical for correctness
|
||||
|
||||
### What to Avoid
|
||||
❌ **Long introductions** explaining what you're about to review
|
||||
❌ **Restating the obvious** ("This PR adds feature X...")
|
||||
❌ **Walls of text** - people will skip them
|
||||
❌ **Explaining your methodology** - just provide findings
|
||||
❌ **Academic-style reviews** - this is engineering, not a thesis defense
|
||||
|
||||
### Example Good Review
|
||||
```
|
||||
P0: Blocks reactor at service.cc:123 - replace `future.get()` with `co_await future`
|
||||
|
||||
P1: Missing null check at topology.cc:45 - use `find_node()` instead of `get_node()`
|
||||
|
||||
nit: Line 67 - missing space after comma
|
||||
```
|
||||
|
||||
### Example Bad Review (Too Verbose)
|
||||
```
|
||||
## Comprehensive Analysis
|
||||
|
||||
I have performed a thorough examination of this pull request, analyzing
|
||||
the changes across multiple dimensions including correctness, performance,
|
||||
and maintainability. Let me walk you through my findings...
|
||||
|
||||
### Background
|
||||
This PR introduces functionality that...
|
||||
|
||||
[3 more paragraphs of context that nobody will read]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Reviewer Tone Guidelines
|
||||
|
||||
### DO: Be Specific and Educational
|
||||
✅ "This can block the reactor thread because `.get()` waits synchronously. Use `co_await` instead."
|
||||
✅ "For consistency with the rest of the codebase, prefer X over Y"
|
||||
✅ "This works, but for better performance consider..."
|
||||
✅ "See similar pattern in `service/storage_service.cc:4351`"
|
||||
|
||||
### DON'T: Be Vague or Harsh
|
||||
❌ "This is wrong" (no context)
|
||||
❌ "You should know better" (condescending)
|
||||
❌ "Maybe fix this?" (too vague)
|
||||
❌ Long tangents on unrelated topics
|
||||
|
||||
### Use "nit:" for Minor Issues
|
||||
```
|
||||
nit: missing space after comma
|
||||
nit: extra blank line
|
||||
nit: can be simplified to X
|
||||
```
|
||||
|
||||
### Offer Alternatives
|
||||
```
|
||||
Consider one of these approaches:
|
||||
1. Option A - simpler but less flexible
|
||||
2. Option B - more complex but handles edge cases
|
||||
3. Option C - matches pattern used in module X
|
||||
|
||||
I'd lean toward B because...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Reviewer Mantras
|
||||
|
||||
Based on frequency and emphasis in ScyllaDB reviews:
|
||||
|
||||
1. **"Make it obvious"** - Self-documenting code > comments
|
||||
2. **"Don't block the reactor"** - Always async/await, never `.get()`
|
||||
3. **"Keep commits bisectable"** - Each commit builds and passes tests
|
||||
4. **"Test what you fix"** - Bug fixes need tests
|
||||
5. **"Subsystem prefixes matter"** - For changelog and bisecting
|
||||
6. **"Don't allocate in hot paths"** - Performance matters
|
||||
7. **"RAII everything"** - No manual resource management
|
||||
8. **"Fail fast with context"** - Check assumptions, log useful info
|
||||
9. **"One fiber per connection"** - Realistic concurrency patterns
|
||||
10. **"Results over exceptions"** - In data path, avoid exception overhead
|
||||
|
||||
---
|
||||
|
||||
## Common Anti-Patterns to Catch
|
||||
|
||||
### 1. Blocking Async Operations
|
||||
```cpp
|
||||
❌ auto result = async_func().get();
|
||||
✅ auto result = co_await async_func();
|
||||
```
|
||||
|
||||
### 2. Exceptions for Control Flow
|
||||
```cpp
|
||||
❌ try { if (!ok) throw control_exception(); } catch { }
|
||||
✅ if (!ok) { return handle_error(); }
|
||||
```
|
||||
|
||||
### 3. Manual Resource Management
|
||||
```cpp
|
||||
❌ sem.acquire().get(); try { work(); } finally { sem.signal(); }
|
||||
✅ auto units = co_await get_units(sem); co_await work();
|
||||
```
|
||||
|
||||
### 4. Generic Naming
|
||||
```cpp
|
||||
❌ void process(data d); // Process how?
|
||||
✅ void parse_vector_search_query(data d);
|
||||
```
|
||||
|
||||
### 5. Flaky Test Timing
|
||||
```python
|
||||
❌ await insert(x); await sleep(1); assert read(x)
|
||||
✅ await insert(x, cl=ALL); assert read(x)
|
||||
```
|
||||
|
||||
### 6. Missing Null Checks
|
||||
```cpp
|
||||
❌ auto node = get_node(id); node->update();
|
||||
✅ auto node = find_node(id); if (node) node->update();
|
||||
```
|
||||
|
||||
### 7. Poor Error Messages
|
||||
```cpp
|
||||
❌ throw std::runtime_error("error");
|
||||
✅ throw std::runtime_error(format("Node {} not found in topology", host_id));
|
||||
```
|
||||
|
||||
### 8. Copy-Paste Without Refactoring
|
||||
```cpp
|
||||
❌ Same 10 lines in 3 places
|
||||
✅ Extract to function, call from 3 places
|
||||
```
|
||||
|
||||
### 9. Overly Coupled Code
|
||||
```cpp
|
||||
❌ One function doing 5 different things
|
||||
✅ Break into focused, single-purpose functions
|
||||
```
|
||||
|
||||
### 10. Missing Test Consistency Levels
|
||||
```python
|
||||
❌ cql.execute("INSERT ...") # Default CL
|
||||
✅ cql.execute("INSERT ...", consistency_level=ConsistencyLevel.ALL)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Integration with Existing Guidelines
|
||||
|
||||
When reviewing, reference these existing instruction files:
|
||||
|
||||
1. **`.github/instructions/cpp.instructions.md`** - C++ style, Seastar patterns, memory management
|
||||
2. **`.github/instructions/python.instructions.md`** - Python style, testing patterns
|
||||
3. **`.github/copilot-instructions.md`** - Build system, test philosophy, code philosophy
|
||||
|
||||
Example reference:
|
||||
```
|
||||
Per the C++ guidelines (cpp.instructions.md), prefer `seastar::lw_shared_ptr`
|
||||
for same-shard sharing rather than `std::shared_ptr`.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Distinguishing Automated vs Human Review
|
||||
|
||||
### Good for Automated Review (This Skill)
|
||||
- ✅ Style violations (spacing, formatting)
|
||||
- ✅ Common async anti-patterns (`.get()` calls)
|
||||
- ✅ Missing `noexcept` specifications
|
||||
- ✅ Generic naming issues
|
||||
- ✅ Test synchronization patterns
|
||||
- ✅ Obvious comments
|
||||
- ✅ Missing error checks
|
||||
|
||||
### Better Left to Human Reviewers
|
||||
- 🧑 Architecture and design decisions
|
||||
- 🧑 Complex performance trade-off analysis
|
||||
- 🧑 API design philosophy
|
||||
- 🧑 Test coverage sufficiency
|
||||
- 🧑 Security implications
|
||||
- 🧑 Business logic correctness
|
||||
- 🧑 Cross-module impact assessment
|
||||
|
||||
**Guideline:** When in doubt, flag for human review with context:
|
||||
```
|
||||
This might need a closer look from a maintainer: [explanation of concern]
|
||||
@avikivity - this touches load shedding logic you authored
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Example High-Value Review Comments
|
||||
|
||||
### Performance Issue
|
||||
```
|
||||
⚠️ P0: Performance issue in hot path
|
||||
|
||||
This code runs for every row processed. Consider:
|
||||
1. Pre-allocating the vector since size is known: `vec.reserve(row_count)`
|
||||
2. Using `small_vector<T, 16>` for common case (avoids heap allocation)
|
||||
3. Passing by `string_view` instead of `string` to avoid allocation
|
||||
|
||||
The current version allocates on every call, which will show up in profiles.
|
||||
```
|
||||
|
||||
### Correctness Issue
|
||||
```
|
||||
⚠️ P0: Potential crash
|
||||
|
||||
This function calls `get_node(id)` which throws if the node doesn't exist.
|
||||
During certain restart scenarios, the node may not be in topology yet.
|
||||
|
||||
Suggested fix:
|
||||
```cpp
|
||||
auto node = topology.find_node(id);
|
||||
if (!node) {
|
||||
on_internal_error(logger, format("Node {} expected in topology", id));
|
||||
co_return make_exception_future<>(std::runtime_error(...));
|
||||
}
|
||||
```
|
||||
|
||||
If the node must exist here, the `on_internal_error` will help debug why it doesn't.
|
||||
```
|
||||
|
||||
### Test Quality Issue
|
||||
```
|
||||
⚠️ P1: Test doesn't validate the fix
|
||||
|
||||
This test has a subtle issue: it doesn't verify that the fix actually works.
|
||||
|
||||
Please:
|
||||
1. Temporarily disable your fix
|
||||
2. Run the test and confirm it fails
|
||||
3. Re-enable the fix
|
||||
4. Run the test and confirm it passes
|
||||
|
||||
This ensures the test is actually validating the behavior and will catch regressions.
|
||||
|
||||
Also consider running `./test.py --mode=dev --repeat 100 test/...` to check for flakiness.
|
||||
```
|
||||
|
||||
### API Design Issue
|
||||
```
|
||||
⚠️ P1: API naming
|
||||
|
||||
The function name `process()` is too generic. Since this converts internal
|
||||
restrictions to JSON for vector-search consumption, consider:
|
||||
|
||||
**Option 1:** More descriptive name
|
||||
```cpp
|
||||
json restrictions_to_vector_search_json(const restrictions& r)
|
||||
```
|
||||
|
||||
**Option 2:** Move to appropriate namespace
|
||||
```cpp
|
||||
namespace vector_search {
|
||||
json to_json(const restrictions& r)
|
||||
}
|
||||
```
|
||||
|
||||
**Option 3:** Add clarifying comment
|
||||
```cpp
|
||||
/// Converts CQL restrictions to JSON format for vector-store protocol
|
||||
json to_json(const restrictions& r)
|
||||
```
|
||||
|
||||
I'd suggest Option 2 as it provides context through namespace and keeps the name concise.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Notable ScyllaDB Reviewers and Focus Areas
|
||||
|
||||
When flagging for human review, consider reviewer expertise:
|
||||
|
||||
- **avikivity**: Performance, async patterns, test quality, architecture
|
||||
- **denesb**: Reader concurrency, memory management, correctness
|
||||
- **bhalevy**: Style, noexcept specifications, resource management
|
||||
- **tgrabiec**: Architecture, load balancing, design patterns
|
||||
- **nyh**: Naming, API clarity, code organization
|
||||
- **patjed41**: Testing, Python style, edge cases
|
||||
- **gleb-cloudius**: State management, topology coordination
|
||||
|
||||
Example: `@avikivity - this changes load shedding logic, please review performance implications`
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
This reviewer skill is designed to:
|
||||
1. **Prevent critical bugs** from merging (P0 issues)
|
||||
2. **Help contributors learn** ScyllaDB patterns through educational feedback
|
||||
3. **Reduce burden** on human reviewers for repetitive issues
|
||||
4. **Maintain consistent** code quality standards
|
||||
|
||||
**Remember:** The goal is not to catch every issue, but to catch the most important ones and provide actionable, educational feedback that helps contributors improve.
|
||||
|
||||
When in doubt:
|
||||
- **Be specific** with examples
|
||||
- **Explain why** something matters
|
||||
- **Offer alternatives** when suggesting changes
|
||||
- **Reference** existing code/guidelines
|
||||
- **Flag for human review** if uncertain
|
||||
|
||||
---
|
||||
|
||||
**Version:** 1.0
|
||||
**Last Updated:** February 2026
|
||||
**Based on:** Analysis of 200+ PRs, 700+ review comments
|
||||
@@ -18,7 +18,7 @@ jobs:
|
||||
|
||||
// Regular expression pattern to check for "Fixes" prefix
|
||||
// Adjusted to dynamically insert the repository full name
|
||||
const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
|
||||
const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
|
||||
const regex = new RegExp(pattern);
|
||||
|
||||
if (!regex.test(body)) {
|
||||
|
||||
5
.github/workflows/iwyu.yaml
vendored
5
.github/workflows/iwyu.yaml
vendored
@@ -14,7 +14,8 @@ env:
|
||||
CLEANER_DIRS: test/unit exceptions alternator api auth cdc compaction db dht gms index lang message mutation mutation_writer node_ops raft redis replica service
|
||||
SEASTAR_BAD_INCLUDE_OUTPUT_PATH: build/seastar-bad-include.log
|
||||
|
||||
permissions: {}
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
# cancel the in-progress run upon a repush
|
||||
concurrency:
|
||||
@@ -34,8 +35,6 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
- run: |
|
||||
sudo dnf -y install clang-tools-extra
|
||||
- name: Generate compilation database
|
||||
run: |
|
||||
cmake \
|
||||
|
||||
22
.github/workflows/trigger-scylla-ci.yaml
vendored
22
.github/workflows/trigger-scylla-ci.yaml
vendored
@@ -9,16 +9,34 @@ on:
|
||||
|
||||
jobs:
|
||||
trigger-jenkins:
|
||||
if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
|
||||
if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Validate Comment Trigger
|
||||
if: github.event_name == 'issue_comment'
|
||||
id: verify_comment
|
||||
shell: bash
|
||||
run: |
|
||||
BODY=$(cat << 'EOF'
|
||||
${{ github.event.comment.body }}
|
||||
EOF
|
||||
)
|
||||
CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
|
||||
|
||||
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
|
||||
echo "trigger=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "trigger=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Trigger Scylla-CI-Route Jenkins Job
|
||||
if: github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true'
|
||||
env:
|
||||
JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
|
||||
JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
|
||||
JENKINS_URL: "https://jenkins.scylladb.com"
|
||||
run: |
|
||||
PR_NUMBER=${{ github.event.issue.number }}
|
||||
PR_NUMBER=${{ github.event.issue.number || github.event.pull_request.number }}
|
||||
PR_REPO_NAME=${{ github.event.repository.full_name }}
|
||||
curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
|
||||
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
|
||||
|
||||
@@ -78,7 +78,7 @@ fi
|
||||
|
||||
# Default scylla product/version tags
|
||||
PRODUCT=scylla
|
||||
VERSION=2026.1.0-dev
|
||||
VERSION=2026.2.0-dev
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -618,7 +618,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
|
||||
// Check if the existing values of the item (previous_item) match the
|
||||
// conditions given by the Expected and ConditionalOperator parameters
|
||||
// (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
|
||||
// This function can throw an ValidationException API error if there
|
||||
// This function can throw a ValidationException API error if there
|
||||
// are errors in the format of the condition itself.
|
||||
bool verify_expected(const rjson::value& req, const rjson::value* previous_item) {
|
||||
const rjson::value* expected = rjson::find(req, "Expected");
|
||||
|
||||
@@ -45,7 +45,7 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
|
||||
}
|
||||
|
||||
void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjson::value& response) const noexcept {
|
||||
if (_should_add_to_reponse) {
|
||||
if (_should_add_to_response) {
|
||||
auto consumption = rjson::empty_object();
|
||||
rjson::add(consumption, "CapacityUnits", get_consumed_capacity_units());
|
||||
rjson::add(response, "ConsumedCapacity", std::move(consumption));
|
||||
|
||||
@@ -28,9 +28,9 @@ namespace alternator {
|
||||
class consumed_capacity_counter {
|
||||
public:
|
||||
consumed_capacity_counter() = default;
|
||||
consumed_capacity_counter(bool should_add_to_reponse) : _should_add_to_reponse(should_add_to_reponse){}
|
||||
consumed_capacity_counter(bool should_add_to_response) : _should_add_to_response(should_add_to_response){}
|
||||
bool operator()() const noexcept {
|
||||
return _should_add_to_reponse;
|
||||
return _should_add_to_response;
|
||||
}
|
||||
|
||||
consumed_capacity_counter& operator +=(uint64_t bytes);
|
||||
@@ -44,7 +44,7 @@ public:
|
||||
uint64_t _total_bytes = 0;
|
||||
static bool should_add_capacity(const rjson::value& request);
|
||||
protected:
|
||||
bool _should_add_to_reponse = false;
|
||||
bool _should_add_to_response = false;
|
||||
};
|
||||
|
||||
class rcu_consumed_capacity_counter : public consumed_capacity_counter {
|
||||
|
||||
@@ -237,7 +237,7 @@ static void validate_is_object(const rjson::value& value, const char* caller) {
|
||||
}
|
||||
|
||||
// This function assumes the given value is an object and returns requested member value.
|
||||
// If it is not possible an api_error::validation is thrown.
|
||||
// If it is not possible, an api_error::validation is thrown.
|
||||
static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
|
||||
validate_is_object(obj, caller);
|
||||
const rjson::value* ret = rjson::find(obj, member_name);
|
||||
@@ -249,7 +249,7 @@ static const rjson::value& get_member(const rjson::value& obj, const char* membe
|
||||
|
||||
|
||||
// This function assumes the given value is an object with a single member, and returns this member.
|
||||
// In case the requirements are not met an api_error::validation is thrown.
|
||||
// In case the requirements are not met, an api_error::validation is thrown.
|
||||
static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
|
||||
if (!v.IsObject() || v.MemberCount() != 1) {
|
||||
throw api_error::validation(format("{}: expected an object with a single member.", caller));
|
||||
@@ -682,7 +682,7 @@ static std::optional<int> get_int_attribute(const rjson::value& value, std::stri
|
||||
}
|
||||
|
||||
// Sets a KeySchema object inside the given JSON parent describing the key
|
||||
// attributes of the the given schema as being either HASH or RANGE keys.
|
||||
// attributes of the given schema as being either HASH or RANGE keys.
|
||||
// Additionally, adds to a given map mappings between the key attribute
|
||||
// names and their type (as a DynamoDB type string).
|
||||
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
|
||||
@@ -916,7 +916,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
|
||||
sstring index_name = cf_name.substr(delim_it + 1);
|
||||
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
|
||||
rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
|
||||
// Add indexes's KeySchema and collect types for AttributeDefinitions:
|
||||
// Add index's KeySchema and collect types for AttributeDefinitions:
|
||||
executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
|
||||
// Add projection type
|
||||
rjson::value projection = rjson::empty_object();
|
||||
@@ -2435,7 +2435,7 @@ std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table
|
||||
// case, this function simply won't be called for this attribute.)
|
||||
//
|
||||
// This function checks if the given attribute update is an update to some
|
||||
// GSI's key, and if the value is unsuitable, a api_error::validation is
|
||||
// GSI's key, and if the value is unsuitable, an api_error::validation is
|
||||
// thrown. The checking here is similar to the checking done in
|
||||
// get_key_from_typed_value() for the base table's key columns.
|
||||
//
|
||||
@@ -3548,7 +3548,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
|
||||
return true;
|
||||
}
|
||||
|
||||
// Add a path to a attribute_path_map. Throws a validation error if the path
|
||||
// Add a path to an attribute_path_map. Throws a validation error if the path
|
||||
// "overlaps" with one already in the filter (one is a sub-path of the other)
|
||||
// or "conflicts" with it (both a member and index is requested).
|
||||
template<typename T>
|
||||
|
||||
@@ -50,7 +50,7 @@ public:
|
||||
_operators.emplace_back(i);
|
||||
check_depth_limit();
|
||||
}
|
||||
void add_dot(std::string(name)) {
|
||||
void add_dot(std::string name) {
|
||||
_operators.emplace_back(std::move(name));
|
||||
check_depth_limit();
|
||||
}
|
||||
@@ -85,7 +85,7 @@ struct constant {
|
||||
}
|
||||
};
|
||||
|
||||
// "value" is is a value used in the right hand side of an assignment
|
||||
// "value" is a value used in the right hand side of an assignment
|
||||
// expression, "SET a = ...". It can be a constant (a reference to a value
|
||||
// included in the request, e.g., ":val"), a path to an attribute from the
|
||||
// existing item (e.g., "a.b[3].c"), or a function of other such values.
|
||||
@@ -205,7 +205,7 @@ public:
|
||||
// The supported primitive conditions are:
|
||||
// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
|
||||
// v1 and v2 are values - from the item (an attribute path), the query
|
||||
// (a ":val" reference), or a function of the the above (only the size()
|
||||
// (a ":val" reference), or a function of the above (only the size()
|
||||
// function is supported).
|
||||
// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
|
||||
// 3. N-ary operator - v1 IN ( v2, v3, ... )
|
||||
|
||||
@@ -55,7 +55,7 @@ partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
|
||||
clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
|
||||
position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema);
|
||||
|
||||
// If v encodes a number (i.e., it is a {"N": [...]}, returns an object representing it. Otherwise,
|
||||
// If v encodes a number (i.e., it is a {"N": [...]}), returns an object representing it. Otherwise,
|
||||
// raises ValidationException with diagnostic.
|
||||
big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);
|
||||
|
||||
|
||||
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
|
||||
if (!opts.enabled()) {
|
||||
rjson::add(ret, "StreamDescription", std::move(stream_desc));
|
||||
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
// TODO: label
|
||||
@@ -502,123 +502,121 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
// filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
|
||||
auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);
|
||||
|
||||
return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {
|
||||
std::map<db_clock::time_point, cdc::streams_version> topologies = co_await _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners });
|
||||
auto e = topologies.end();
|
||||
auto prev = e;
|
||||
auto shards = rjson::empty_array();
|
||||
|
||||
auto e = topologies.end();
|
||||
auto prev = e;
|
||||
auto shards = rjson::empty_array();
|
||||
std::optional<shard_id> last;
|
||||
|
||||
std::optional<shard_id> last;
|
||||
auto i = topologies.begin();
|
||||
// if we're a paged query, skip to the generation where we left off.
|
||||
if (shard_start) {
|
||||
i = topologies.find(shard_start->time);
|
||||
}
|
||||
|
||||
auto i = topologies.begin();
|
||||
// if we're a paged query, skip to the generation where we left off.
|
||||
if (shard_start) {
|
||||
i = topologies.find(shard_start->time);
|
||||
}
|
||||
// for parent-child stuff we need id:s to be sorted by token
|
||||
// (see explanation above) since we want to find closest
|
||||
// token boundary when determining parent.
|
||||
// #7346 - we processed and searched children/parents in
|
||||
// stored order, which is not necessarily token order,
|
||||
// so the finding of "closest" token boundary (using upper bound)
|
||||
// could give somewhat weird results.
|
||||
static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
|
||||
return id1.token() < id2.token();
|
||||
};
|
||||
|
||||
// for parent-child stuff we need id:s to be sorted by token
|
||||
// (see explanation above) since we want to find closest
|
||||
// token boundary when determining parent.
|
||||
// #7346 - we processed and searched children/parents in
|
||||
// stored order, which is not necessarily token order,
|
||||
// so the finding of "closest" token boundary (using upper bound)
|
||||
// could give somewhat weird results.
|
||||
static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
|
||||
return id1.token() < id2.token();
|
||||
};
|
||||
// #7409 - shards must be returned in lexicographical order,
|
||||
// normal bytes compare is string_traits<int8_t>::compare.
|
||||
// thus bytes 0x8000 is less than 0x0000. By doing unsigned
|
||||
// compare instead we inadvertently will sort in string lexical.
|
||||
static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
|
||||
return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
|
||||
};
|
||||
|
||||
// need a prev even if we are skipping stuff
|
||||
if (i != topologies.begin()) {
|
||||
prev = std::prev(i);
|
||||
}
|
||||
|
||||
for (; limit > 0 && i != e; prev = i, ++i) {
|
||||
auto& [ts, sv] = *i;
|
||||
|
||||
last = std::nullopt;
|
||||
|
||||
auto lo = sv.streams.begin();
|
||||
auto end = sv.streams.end();
|
||||
|
||||
// #7409 - shards must be returned in lexicographical order,
|
||||
// normal bytes compare is string_traits<int8_t>::compare.
|
||||
// thus bytes 0x8000 is less than 0x0000. By doing unsigned
|
||||
// compare instead we inadvertently will sort in string lexical.
|
||||
static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
|
||||
return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
|
||||
};
|
||||
std::sort(lo, end, id_cmp);
|
||||
|
||||
// need a prev even if we are skipping stuff
|
||||
if (i != topologies.begin()) {
|
||||
prev = std::prev(i);
|
||||
if (shard_start) {
|
||||
// find next shard position
|
||||
lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
|
||||
shard_start = std::nullopt;
|
||||
}
|
||||
|
||||
for (; limit > 0 && i != e; prev = i, ++i) {
|
||||
auto& [ts, sv] = *i;
|
||||
if (lo != end && prev != e) {
|
||||
// We want older stuff sorted in token order so we can find matching
|
||||
// token range when determining parent shard.
|
||||
std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
|
||||
}
|
||||
|
||||
auto expired = [&]() -> std::optional<db_clock::time_point> {
|
||||
auto j = std::next(i);
|
||||
if (j == e) {
|
||||
return std::nullopt;
|
||||
}
|
||||
// add this so we sort of match potential
|
||||
// sequence numbers in get_records result.
|
||||
return j->first + confidence_interval(db);
|
||||
}();
|
||||
|
||||
while (lo != end) {
|
||||
auto& id = *lo++;
|
||||
|
||||
auto shard = rjson::empty_object();
|
||||
|
||||
if (prev != e) {
|
||||
auto& pids = prev->second.streams;
|
||||
auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
|
||||
return t < id.token();
|
||||
});
|
||||
if (pid != pids.begin()) {
|
||||
pid = std::prev(pid);
|
||||
}
|
||||
if (pid != pids.end()) {
|
||||
rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
|
||||
}
|
||||
}
|
||||
|
||||
last.emplace(ts, id);
|
||||
rjson::add(shard, "ShardId", *last);
|
||||
auto range = rjson::empty_object();
|
||||
rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
|
||||
if (expired) {
|
||||
rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
|
||||
}
|
||||
|
||||
rjson::add(shard, "SequenceNumberRange", std::move(range));
|
||||
rjson::push_back(shards, std::move(shard));
|
||||
|
||||
if (--limit == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
last = std::nullopt;
|
||||
|
||||
auto lo = sv.streams.begin();
|
||||
auto end = sv.streams.end();
|
||||
|
||||
// #7409 - shards must be returned in lexicographical order,
|
||||
std::sort(lo, end, id_cmp);
|
||||
|
||||
if (shard_start) {
|
||||
// find next shard position
|
||||
lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
|
||||
shard_start = std::nullopt;
|
||||
}
|
||||
|
||||
if (lo != end && prev != e) {
|
||||
// We want older stuff sorted in token order so we can find matching
|
||||
// token range when determining parent shard.
|
||||
std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
|
||||
}
|
||||
|
||||
auto expired = [&]() -> std::optional<db_clock::time_point> {
|
||||
auto j = std::next(i);
|
||||
if (j == e) {
|
||||
return std::nullopt;
|
||||
}
|
||||
// add this so we sort of match potential
|
||||
// sequence numbers in get_records result.
|
||||
return j->first + confidence_interval(db);
|
||||
}();
|
||||
|
||||
while (lo != end) {
|
||||
auto& id = *lo++;
|
||||
|
||||
auto shard = rjson::empty_object();
|
||||
|
||||
if (prev != e) {
|
||||
auto& pids = prev->second.streams;
|
||||
auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
|
||||
return t < id.token();
|
||||
});
|
||||
if (pid != pids.begin()) {
|
||||
pid = std::prev(pid);
|
||||
}
|
||||
if (pid != pids.end()) {
|
||||
rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
|
||||
}
|
||||
}
|
||||
|
||||
last.emplace(ts, id);
|
||||
rjson::add(shard, "ShardId", *last);
|
||||
auto range = rjson::empty_object();
|
||||
rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
|
||||
if (expired) {
|
||||
rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
|
||||
}
|
||||
|
||||
rjson::add(shard, "SequenceNumberRange", std::move(range));
|
||||
rjson::push_back(shards, std::move(shard));
|
||||
|
||||
if (--limit == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
last = std::nullopt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (last) {
|
||||
rjson::add(stream_desc, "LastEvaluatedShardId", *last);
|
||||
}
|
||||
if (last) {
|
||||
rjson::add(stream_desc, "LastEvaluatedShardId", *last);
|
||||
}
|
||||
|
||||
rjson::add(stream_desc, "Shards", std::move(shards));
|
||||
rjson::add(ret, "StreamDescription", std::move(stream_desc));
|
||||
|
||||
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
|
||||
});
|
||||
rjson::add(stream_desc, "Shards", std::move(shards));
|
||||
rjson::add(ret, "StreamDescription", std::move(stream_desc));
|
||||
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
enum class shard_iterator_type {
|
||||
@@ -898,172 +896,169 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
|
||||
query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));
|
||||
|
||||
co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
|
||||
[this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {
|
||||
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
|
||||
query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
|
||||
service::storage_proxy::coordinator_query_result qr = co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state));
|
||||
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
|
||||
query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
|
||||
|
||||
auto result_set = builder.build();
|
||||
auto records = rjson::empty_array();
|
||||
auto result_set = builder.build();
|
||||
auto records = rjson::empty_array();
|
||||
|
||||
auto& metadata = result_set->get_metadata();
|
||||
auto& metadata = result_set->get_metadata();
|
||||
|
||||
auto op_index = std::distance(metadata.get_names().begin(),
|
||||
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
|
||||
return cdef->name->name() == op_column_name;
|
||||
})
|
||||
);
|
||||
auto ts_index = std::distance(metadata.get_names().begin(),
|
||||
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
|
||||
return cdef->name->name() == timestamp_column_name;
|
||||
})
|
||||
);
|
||||
auto eor_index = std::distance(metadata.get_names().begin(),
|
||||
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
|
||||
return cdef->name->name() == eor_column_name;
|
||||
})
|
||||
);
|
||||
auto op_index = std::distance(metadata.get_names().begin(),
|
||||
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
|
||||
return cdef->name->name() == op_column_name;
|
||||
})
|
||||
);
|
||||
auto ts_index = std::distance(metadata.get_names().begin(),
|
||||
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
|
||||
return cdef->name->name() == timestamp_column_name;
|
||||
})
|
||||
);
|
||||
auto eor_index = std::distance(metadata.get_names().begin(),
|
||||
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
|
||||
return cdef->name->name() == eor_column_name;
|
||||
})
|
||||
);
|
||||
|
||||
std::optional<utils::UUID> timestamp;
|
||||
auto dynamodb = rjson::empty_object();
|
||||
auto record = rjson::empty_object();
|
||||
const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
|
||||
std::optional<utils::UUID> timestamp;
|
||||
auto dynamodb = rjson::empty_object();
|
||||
auto record = rjson::empty_object();
|
||||
const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
|
||||
|
||||
using op_utype = std::underlying_type_t<cdc::operation>;
|
||||
using op_utype = std::underlying_type_t<cdc::operation>;
|
||||
|
||||
auto maybe_add_record = [&] {
|
||||
if (!dynamodb.ObjectEmpty()) {
|
||||
rjson::add(record, "dynamodb", std::move(dynamodb));
|
||||
dynamodb = rjson::empty_object();
|
||||
}
|
||||
if (!record.ObjectEmpty()) {
|
||||
rjson::add(record, "awsRegion", rjson::from_string(dc_name));
|
||||
rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
|
||||
rjson::add(record, "eventSource", "scylladb:alternator");
|
||||
rjson::add(record, "eventVersion", "1.1");
|
||||
rjson::push_back(records, std::move(record));
|
||||
record = rjson::empty_object();
|
||||
--limit;
|
||||
}
|
||||
};
|
||||
auto maybe_add_record = [&] {
|
||||
if (!dynamodb.ObjectEmpty()) {
|
||||
rjson::add(record, "dynamodb", std::move(dynamodb));
|
||||
dynamodb = rjson::empty_object();
|
||||
}
|
||||
if (!record.ObjectEmpty()) {
|
||||
rjson::add(record, "awsRegion", rjson::from_string(dc_name));
|
||||
rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
|
||||
rjson::add(record, "eventSource", "scylladb:alternator");
|
||||
rjson::add(record, "eventVersion", "1.1");
|
||||
rjson::push_back(records, std::move(record));
|
||||
record = rjson::empty_object();
|
||||
--limit;
|
||||
}
|
||||
};
|
||||
|
||||
for (auto& row : result_set->rows()) {
|
||||
auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
|
||||
auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
|
||||
auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
|
||||
for (auto& row : result_set->rows()) {
|
||||
auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
|
||||
auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
|
||||
auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
|
||||
|
||||
if (!dynamodb.HasMember("Keys")) {
|
||||
auto keys = rjson::empty_object();
|
||||
describe_single_item(*selection, row, key_names, keys);
|
||||
rjson::add(dynamodb, "Keys", std::move(keys));
|
||||
rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
|
||||
rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
|
||||
rjson::add(dynamodb, "StreamViewType", type);
|
||||
// TODO: SizeBytes
|
||||
}
|
||||
|
||||
/**
|
||||
* We merge rows with same timestamp into a single event.
|
||||
* This is pretty much needed, because a CDC row typically
|
||||
* encodes ~half the info of an alternator write.
|
||||
*
|
||||
* A big, big downside to how alternator records are written
|
||||
* (i.e. CQL), is that the distinction between INSERT and UPDATE
|
||||
* is somewhat lost/unmappable to actual eventName.
|
||||
* A write (currently) always looks like an insert+modify
|
||||
* regardless whether we wrote existing record or not.
|
||||
*
|
||||
* Maybe RMW ops could be done slightly differently so
|
||||
* we can distinguish them here...
|
||||
*
|
||||
* For now, all writes will become MODIFY.
|
||||
*
|
||||
* Note: we do not check the current pre/post
|
||||
* flags on CDC log, instead we use data to
|
||||
* drive what is returned. This is (afaict)
|
||||
* consistent with dynamo streams
|
||||
*/
|
||||
switch (op) {
|
||||
case cdc::operation::pre_image:
|
||||
case cdc::operation::post_image:
|
||||
{
|
||||
auto item = rjson::empty_object();
|
||||
describe_single_item(*selection, row, attr_names, item, nullptr, true);
|
||||
describe_single_item(*selection, row, key_names, item);
|
||||
rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
|
||||
break;
|
||||
}
|
||||
case cdc::operation::update:
|
||||
rjson::add(record, "eventName", "MODIFY");
|
||||
break;
|
||||
case cdc::operation::insert:
|
||||
rjson::add(record, "eventName", "INSERT");
|
||||
break;
|
||||
case cdc::operation::service_row_delete:
|
||||
case cdc::operation::service_partition_delete:
|
||||
{
|
||||
auto user_identity = rjson::empty_object();
|
||||
rjson::add(user_identity, "Type", "Service");
|
||||
rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
|
||||
rjson::add(record, "userIdentity", std::move(user_identity));
|
||||
rjson::add(record, "eventName", "REMOVE");
|
||||
break;
|
||||
}
|
||||
default:
|
||||
rjson::add(record, "eventName", "REMOVE");
|
||||
break;
|
||||
}
|
||||
if (eor) {
|
||||
maybe_add_record();
|
||||
timestamp = ts;
|
||||
if (limit == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!dynamodb.HasMember("Keys")) {
|
||||
auto keys = rjson::empty_object();
|
||||
describe_single_item(*selection, row, key_names, keys);
|
||||
rjson::add(dynamodb, "Keys", std::move(keys));
|
||||
rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
|
||||
rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
|
||||
rjson::add(dynamodb, "StreamViewType", type);
|
||||
// TODO: SizeBytes
|
||||
}
|
||||
|
||||
auto ret = rjson::empty_object();
|
||||
auto nrecords = records.Size();
|
||||
rjson::add(ret, "Records", std::move(records));
|
||||
|
||||
if (nrecords != 0) {
|
||||
// #9642. Set next iterators threshold to > last
|
||||
shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
|
||||
// Note that here we unconditionally return NextShardIterator,
|
||||
// without checking if maybe we reached the end-of-shard. If the
|
||||
// shard did end, then the next read will have nrecords == 0 and
|
||||
// will notice end end of shard and not return NextShardIterator.
|
||||
rjson::add(ret, "NextShardIterator", next_iter);
|
||||
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
|
||||
/**
|
||||
* We merge rows with same timestamp into a single event.
|
||||
* This is pretty much needed, because a CDC row typically
|
||||
* encodes ~half the info of an alternator write.
|
||||
*
|
||||
* A big, big downside to how alternator records are written
|
||||
* (i.e. CQL), is that the distinction between INSERT and UPDATE
|
||||
* is somewhat lost/unmappable to actual eventName.
|
||||
* A write (currently) always looks like an insert+modify
|
||||
* regardless whether we wrote existing record or not.
|
||||
*
|
||||
* Maybe RMW ops could be done slightly differently so
|
||||
* we can distinguish them here...
|
||||
*
|
||||
* For now, all writes will become MODIFY.
|
||||
*
|
||||
* Note: we do not check the current pre/post
|
||||
* flags on CDC log, instead we use data to
|
||||
* drive what is returned. This is (afaict)
|
||||
* consistent with dynamo streams
|
||||
*/
|
||||
switch (op) {
|
||||
case cdc::operation::pre_image:
|
||||
case cdc::operation::post_image:
|
||||
{
|
||||
auto item = rjson::empty_object();
|
||||
describe_single_item(*selection, row, attr_names, item, nullptr, true);
|
||||
describe_single_item(*selection, row, key_names, item);
|
||||
rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
|
||||
break;
|
||||
}
|
||||
|
||||
// ugh. figure out if we are and end-of-shard
|
||||
auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
|
||||
|
||||
return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
|
||||
auto& shard = iter.shard;
|
||||
|
||||
if (shard.time < ts && ts < high_ts) {
|
||||
// The DynamoDB documentation states that when a shard is
|
||||
// closed, reading it until the end has NextShardIterator
|
||||
// "set to null". Our test test_streams_closed_read
|
||||
// confirms that by "null" they meant not set at all.
|
||||
} else {
|
||||
// We could have return the same iterator again, but we did
|
||||
// a search from it until high_ts and found nothing, so we
|
||||
// can also start the next search from high_ts.
|
||||
// TODO: but why? It's simpler just to leave the iterator be.
|
||||
shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
|
||||
rjson::add(ret, "NextShardIterator", iter);
|
||||
case cdc::operation::update:
|
||||
rjson::add(record, "eventName", "MODIFY");
|
||||
break;
|
||||
case cdc::operation::insert:
|
||||
rjson::add(record, "eventName", "INSERT");
|
||||
break;
|
||||
case cdc::operation::service_row_delete:
|
||||
case cdc::operation::service_partition_delete:
|
||||
{
|
||||
auto user_identity = rjson::empty_object();
|
||||
rjson::add(user_identity, "Type", "Service");
|
||||
rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
|
||||
rjson::add(record, "userIdentity", std::move(user_identity));
|
||||
rjson::add(record, "eventName", "REMOVE");
|
||||
break;
|
||||
}
|
||||
default:
|
||||
rjson::add(record, "eventName", "REMOVE");
|
||||
break;
|
||||
}
|
||||
if (eor) {
|
||||
maybe_add_record();
|
||||
timestamp = ts;
|
||||
if (limit == 0) {
|
||||
break;
|
||||
}
|
||||
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
if (is_big(ret)) {
|
||||
return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
|
||||
}
|
||||
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
auto ret = rjson::empty_object();
|
||||
auto nrecords = records.Size();
|
||||
rjson::add(ret, "Records", std::move(records));
|
||||
|
||||
if (nrecords != 0) {
|
||||
// #9642. Set next iterators threshold to > last
|
||||
shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
|
||||
// Note that here we unconditionally return NextShardIterator,
|
||||
// without checking if maybe we reached the end-of-shard. If the
|
||||
// shard did end, then the next read will have nrecords == 0 and
|
||||
// will notice end end of shard and not return NextShardIterator.
|
||||
rjson::add(ret, "NextShardIterator", next_iter);
|
||||
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
// ugh. figure out if we are and end-of-shard
|
||||
auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
|
||||
|
||||
db_clock::time_point ts = co_await _sdks.cdc_current_generation_timestamp({ normal_token_owners });
|
||||
auto& shard = iter.shard;
|
||||
|
||||
if (shard.time < ts && ts < high_ts) {
|
||||
// The DynamoDB documentation states that when a shard is
|
||||
// closed, reading it until the end has NextShardIterator
|
||||
// "set to null". Our test test_streams_closed_read
|
||||
// confirms that by "null" they meant not set at all.
|
||||
} else {
|
||||
// We could have return the same iterator again, but we did
|
||||
// a search from it until high_ts and found nothing, so we
|
||||
// can also start the next search from high_ts.
|
||||
// TODO: but why? It's simpler just to leave the iterator be.
|
||||
shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
|
||||
rjson::add(ret, "NextShardIterator", iter);
|
||||
}
|
||||
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
if (is_big(ret)) {
|
||||
co_return make_streamed(std::move(ret));
|
||||
}
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
|
||||
|
||||
@@ -141,7 +141,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta
|
||||
|
||||
// expiration_service is a sharded service responsible for cleaning up expired
|
||||
// items in all tables with per-item expiration enabled. Currently, this means
|
||||
// Alternator tables with TTL configured via a UpdateTimeToLive request.
|
||||
// Alternator tables with TTL configured via an UpdateTimeToLive request.
|
||||
//
|
||||
// Here is a brief overview of how the expiration service works:
|
||||
//
|
||||
@@ -593,7 +593,7 @@ static future<> scan_table_ranges(
|
||||
if (retries >= 10) {
|
||||
// Don't get stuck forever asking the same page, maybe there's
|
||||
// a bug or a real problem in several replicas. Give up on
|
||||
// this scan an retry the scan from a random position later,
|
||||
// this scan and retry the scan from a random position later,
|
||||
// in the next scan period.
|
||||
throw runtime_exception("scanner thread failed after too many timeouts for the same page");
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ namespace alternator {
|
||||
|
||||
// expiration_service is a sharded service responsible for cleaning up expired
|
||||
// items in all tables with per-item expiration enabled. Currently, this means
|
||||
// Alternator tables with TTL configured via a UpdateTimeToLeave request.
|
||||
// Alternator tables with TTL configured via an UpdateTimeToLive request.
|
||||
class expiration_service final : public seastar::peering_sharded_service<expiration_service> {
|
||||
public:
|
||||
// Object holding per-shard statistics related to the expiration service.
|
||||
@@ -52,7 +52,7 @@ private:
|
||||
data_dictionary::database _db;
|
||||
service::storage_proxy& _proxy;
|
||||
gms::gossiper& _gossiper;
|
||||
// _end is set by start(), and resolves when the the background service
|
||||
// _end is set by start(), and resolves when the background service
|
||||
// started by it ends. To ask the background service to end, _abort_source
|
||||
// should be triggered. stop() below uses both _abort_source and _end.
|
||||
std::optional<future<>> _end;
|
||||
|
||||
@@ -209,15 +209,11 @@ future<> audit::stop_audit() {
|
||||
});
|
||||
}
|
||||
|
||||
audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table) {
|
||||
audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch) {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return nullptr;
|
||||
}
|
||||
return std::make_unique<audit_info>(cat, keyspace, table);
|
||||
}
|
||||
|
||||
audit_info_ptr audit::create_no_audit_info() {
|
||||
return audit_info_ptr();
|
||||
return std::make_unique<audit_info>(cat, keyspace, table, batch);
|
||||
}
|
||||
|
||||
future<> audit::start(const db::config& cfg) {
|
||||
@@ -267,18 +263,21 @@ future<> audit::log_login(const sstring& username, socket_address client_ip, boo
|
||||
}
|
||||
|
||||
future<> inspect(shared_ptr<cql3::cql_statement> statement, service::query_state& query_state, const cql3::query_options& options, bool error) {
|
||||
cql3::statements::batch_statement* batch = dynamic_cast<cql3::statements::batch_statement*>(statement.get());
|
||||
if (batch != nullptr) {
|
||||
auto audit_info = statement->get_audit_info();
|
||||
if (!audit_info) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (audit_info->batch()) {
|
||||
cql3::statements::batch_statement* batch = static_cast<cql3::statements::batch_statement*>(statement.get());
|
||||
return do_for_each(batch->statements().begin(), batch->statements().end(), [&query_state, &options, error] (auto&& m) {
|
||||
return inspect(m.statement, query_state, options, error);
|
||||
});
|
||||
} else {
|
||||
auto audit_info = statement->get_audit_info();
|
||||
if (bool(audit_info) && audit::local_audit_instance().should_log(audit_info)) {
|
||||
if (audit::local_audit_instance().should_log(audit_info)) {
|
||||
return audit::local_audit_instance().log(audit_info, query_state, options, error);
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
future<> inspect_login(const sstring& username, socket_address client_ip, bool error) {
|
||||
|
||||
@@ -75,11 +75,13 @@ class audit_info final {
|
||||
sstring _keyspace;
|
||||
sstring _table;
|
||||
sstring _query;
|
||||
bool _batch;
|
||||
public:
|
||||
audit_info(statement_category cat, sstring keyspace, sstring table)
|
||||
audit_info(statement_category cat, sstring keyspace, sstring table, bool batch)
|
||||
: _category(cat)
|
||||
, _keyspace(std::move(keyspace))
|
||||
, _table(std::move(table))
|
||||
, _batch(batch)
|
||||
{ }
|
||||
void set_query_string(const std::string_view& query_string) {
|
||||
_query = sstring(query_string);
|
||||
@@ -89,6 +91,7 @@ public:
|
||||
const sstring& query() const { return _query; }
|
||||
sstring category_string() const;
|
||||
statement_category category() const { return _category; }
|
||||
bool batch() const { return _batch; }
|
||||
};
|
||||
|
||||
using audit_info_ptr = std::unique_ptr<audit_info>;
|
||||
@@ -126,8 +129,7 @@ public:
|
||||
}
|
||||
static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
|
||||
static future<> stop_audit();
|
||||
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
|
||||
static audit_info_ptr create_no_audit_info();
|
||||
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
|
||||
audit(locator::shared_token_metadata& stm,
|
||||
cql3::query_processor& qp,
|
||||
service::migration_manager& mm,
|
||||
|
||||
@@ -52,13 +52,6 @@ static const class_registrator<
|
||||
::service::migration_manager&,
|
||||
cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");
|
||||
|
||||
struct record final {
|
||||
sstring name;
|
||||
bool is_superuser;
|
||||
bool can_login;
|
||||
role_set member_of;
|
||||
};
|
||||
|
||||
static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
|
||||
if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
|
||||
return db::consistency_level::QUORUM;
|
||||
@@ -67,13 +60,13 @@ static db::consistency_level consistency_for_role(std::string_view role_name) no
|
||||
return db::consistency_level::LOCAL_ONE;
|
||||
}
|
||||
|
||||
static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
|
||||
future<std::optional<standard_role_manager::record>> standard_role_manager::legacy_find_record(std::string_view role_name) {
|
||||
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
|
||||
get_auth_ks_name(qp),
|
||||
get_auth_ks_name(_qp),
|
||||
meta::roles_table::name,
|
||||
meta::roles_table::role_col_name);
|
||||
|
||||
const auto results = co_await qp.execute_internal(
|
||||
const auto results = co_await _qp.execute_internal(
|
||||
query,
|
||||
consistency_for_role(role_name),
|
||||
internal_distributed_query_state(),
|
||||
@@ -93,8 +86,25 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
|
||||
: role_set())});
|
||||
}
|
||||
|
||||
static future<record> require_record(cql3::query_processor& qp, std::string_view role_name) {
|
||||
return find_record(qp, role_name).then([role_name](std::optional<record> mr) {
|
||||
future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
|
||||
if (legacy_mode(_qp)) {
|
||||
return legacy_find_record(role_name);
|
||||
}
|
||||
auto name = sstring(role_name);
|
||||
auto role = _cache.get(name);
|
||||
if (!role) {
|
||||
return make_ready_future<std::optional<record>>(std::nullopt);
|
||||
}
|
||||
return make_ready_future<std::optional<record>>(std::make_optional(record{
|
||||
.name = std::move(name),
|
||||
.is_superuser = role->is_superuser,
|
||||
.can_login = role->can_login,
|
||||
.member_of = role->member_of
|
||||
}));
|
||||
}
|
||||
|
||||
future<standard_role_manager::record> standard_role_manager::require_record(std::string_view role_name) {
|
||||
return find_record(role_name).then([role_name](std::optional<record> mr) {
|
||||
if (!mr) {
|
||||
throw nonexistant_role(role_name);
|
||||
}
|
||||
@@ -386,7 +396,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
|
||||
return fmt::to_string(fmt::join(assignments, ", "));
|
||||
};
|
||||
|
||||
return require_record(_qp, role_name).then([this, role_name, &u, &mc](record) {
|
||||
return require_record(role_name).then([this, role_name, &u, &mc](record) {
|
||||
if (!u.is_superuser && !u.can_login) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -620,18 +630,17 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
|
||||
});
|
||||
}
|
||||
|
||||
static future<> collect_roles(
|
||||
cql3::query_processor& qp,
|
||||
future<> standard_role_manager::collect_roles(
|
||||
std::string_view grantee_name,
|
||||
bool recurse,
|
||||
role_set& roles) {
|
||||
return require_record(qp, grantee_name).then([&qp, &roles, recurse](record r) {
|
||||
return do_with(std::move(r.member_of), [&qp, &roles, recurse](const role_set& memberships) {
|
||||
return do_for_each(memberships.begin(), memberships.end(), [&qp, &roles, recurse](const sstring& role_name) {
|
||||
return require_record(grantee_name).then([this, &roles, recurse](standard_role_manager::record r) {
|
||||
return do_with(std::move(r.member_of), [this, &roles, recurse](const role_set& memberships) {
|
||||
return do_for_each(memberships.begin(), memberships.end(), [this, &roles, recurse](const sstring& role_name) {
|
||||
roles.insert(role_name);
|
||||
|
||||
if (recurse) {
|
||||
return collect_roles(qp, role_name, true, roles);
|
||||
return collect_roles(role_name, true, roles);
|
||||
}
|
||||
|
||||
return make_ready_future<>();
|
||||
@@ -646,7 +655,7 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
|
||||
return do_with(
|
||||
role_set{sstring(grantee_name)},
|
||||
[this, grantee_name, recurse](role_set& roles) {
|
||||
return collect_roles(_qp, grantee_name, recurse, roles).then([&roles] { return roles; });
|
||||
return collect_roles(grantee_name, recurse, roles).then([&roles] { return roles; });
|
||||
});
|
||||
}
|
||||
|
||||
@@ -706,27 +715,21 @@ future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
|
||||
}
|
||||
|
||||
future<bool> standard_role_manager::exists(std::string_view role_name) {
|
||||
return find_record(_qp, role_name).then([](std::optional<record> mr) {
|
||||
return find_record(role_name).then([](std::optional<record> mr) {
|
||||
return static_cast<bool>(mr);
|
||||
});
|
||||
}
|
||||
|
||||
future<bool> standard_role_manager::is_superuser(std::string_view role_name) {
|
||||
return require_record(_qp, role_name).then([](record r) {
|
||||
return require_record(role_name).then([](record r) {
|
||||
return r.is_superuser;
|
||||
});
|
||||
}
|
||||
|
||||
future<bool> standard_role_manager::can_login(std::string_view role_name) {
|
||||
if (legacy_mode(_qp)) {
|
||||
const auto r = co_await require_record(_qp, role_name);
|
||||
co_return r.can_login;
|
||||
}
|
||||
auto role = _cache.get(sstring(role_name));
|
||||
if (!role) {
|
||||
throw nonexistant_role(role_name);
|
||||
}
|
||||
co_return role->can_login;
|
||||
return require_record(role_name).then([](record r) {
|
||||
return r.can_login;
|
||||
});
|
||||
}
|
||||
|
||||
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
|
||||
|
||||
@@ -90,6 +90,12 @@ public:
|
||||
|
||||
private:
|
||||
enum class membership_change { add, remove };
|
||||
struct record final {
|
||||
sstring name;
|
||||
bool is_superuser;
|
||||
bool can_login;
|
||||
role_set member_of;
|
||||
};
|
||||
|
||||
future<> create_legacy_metadata_tables_if_missing() const;
|
||||
|
||||
@@ -107,6 +113,14 @@ private:
|
||||
future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);
|
||||
|
||||
future<> modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change, ::service::group0_batch& mc);
|
||||
|
||||
future<std::optional<record>> legacy_find_record(std::string_view role_name);
|
||||
future<std::optional<record>> find_record(std::string_view role_name);
|
||||
future<record> require_record(std::string_view role_name);
|
||||
future<> collect_roles(
|
||||
std::string_view grantee_name,
|
||||
bool recurse,
|
||||
role_set& roles);
|
||||
};
|
||||
|
||||
} // namespace auth
|
||||
|
||||
@@ -814,8 +814,7 @@ generation_service::generation_service(
|
||||
config cfg, gms::gossiper& g, sharded<db::system_distributed_keyspace>& sys_dist_ks,
|
||||
sharded<db::system_keyspace>& sys_ks,
|
||||
abort_source& abort_src, const locator::shared_token_metadata& stm, gms::feature_service& f,
|
||||
replica::database& db,
|
||||
std::function<bool()> raft_topology_change_enabled)
|
||||
replica::database& db)
|
||||
: _cfg(std::move(cfg))
|
||||
, _gossiper(g)
|
||||
, _sys_dist_ks(sys_dist_ks)
|
||||
@@ -824,7 +823,6 @@ generation_service::generation_service(
|
||||
, _token_metadata(stm)
|
||||
, _feature_service(f)
|
||||
, _db(db)
|
||||
, _raft_topology_change_enabled(std::move(raft_topology_change_enabled))
|
||||
{
|
||||
}
|
||||
|
||||
@@ -878,16 +876,7 @@ future<> generation_service::on_join(gms::inet_address ep, locator::host_id id,
|
||||
future<> generation_service::on_change(gms::inet_address ep, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
|
||||
assert_shard_zero(__PRETTY_FUNCTION__);
|
||||
|
||||
if (_raft_topology_change_enabled()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
return on_application_state_change(ep, id, states, gms::application_state::CDC_GENERATION_ID, pid, [this] (gms::inet_address ep, locator::host_id id, const gms::versioned_value& v, gms::permit_id) {
|
||||
auto gen_id = gms::versioned_value::cdc_generation_id_from_string(v.value());
|
||||
cdc_log.debug("Endpoint: {}, CDC generation ID change: {}", ep, gen_id);
|
||||
|
||||
return legacy_handle_cdc_generation(gen_id);
|
||||
});
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
future<> generation_service::check_and_repair_cdc_streams() {
|
||||
|
||||
@@ -79,17 +79,12 @@ private:
|
||||
std::optional<cdc::generation_id> _gen_id;
|
||||
future<> _cdc_streams_rewrite_complete = make_ready_future<>();
|
||||
|
||||
/* Returns true if raft topology changes are enabled.
|
||||
* Can only be called from shard 0.
|
||||
*/
|
||||
std::function<bool()> _raft_topology_change_enabled;
|
||||
public:
|
||||
generation_service(config cfg, gms::gossiper&,
|
||||
sharded<db::system_distributed_keyspace>&,
|
||||
sharded<db::system_keyspace>& sys_ks,
|
||||
abort_source&, const locator::shared_token_metadata&,
|
||||
gms::feature_service&, replica::database& db,
|
||||
std::function<bool()> raft_topology_change_enabled);
|
||||
gms::feature_service&, replica::database& db);
|
||||
|
||||
future<> stop();
|
||||
~generation_service();
|
||||
|
||||
58
configure.py
58
configure.py
@@ -730,28 +730,6 @@ vector_search_tests = set([
|
||||
'test/vector_search/rescoring_test'
|
||||
])
|
||||
|
||||
vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
|
||||
vector_search_validator_deps = set([
|
||||
'test/vector_search_validator/build-validator',
|
||||
'test/vector_search_validator/Cargo.toml',
|
||||
'test/vector_search_validator/crates/validator/Cargo.toml',
|
||||
'test/vector_search_validator/crates/validator/src/main.rs',
|
||||
'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
|
||||
'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
|
||||
'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
|
||||
])
|
||||
|
||||
vector_store_bin = 'vector-search-validator/bin/vector-store'
|
||||
vector_store_deps = set([
|
||||
'test/vector_search_validator/build-env',
|
||||
'test/vector_search_validator/build-vector-store',
|
||||
])
|
||||
|
||||
vector_search_validator_bins = set([
|
||||
vector_search_validator_bin,
|
||||
vector_store_bin,
|
||||
])
|
||||
|
||||
wasms = set([
|
||||
'wasm/return_input.wat',
|
||||
'wasm/test_complex_null_values.wat',
|
||||
@@ -785,7 +763,7 @@ other = set([
|
||||
'iotune',
|
||||
])
|
||||
|
||||
all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins
|
||||
all_artifacts = apps | cpp_apps | tests | other | wasms
|
||||
|
||||
arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
|
||||
@@ -817,6 +795,9 @@ arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clan
|
||||
help='C compiler path')
|
||||
arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
|
||||
help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
|
||||
# Workaround for https://github.com/mozilla/sccache/issues/2575
|
||||
arg_parser.add_argument('--sccache-rust', action=argparse.BooleanOptionalAction, default=False,
|
||||
help='Use sccache for rust code (if sccache is selected as compiler cache). Doesn\'t work with distributed builds.')
|
||||
add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
|
||||
help='Use dpdk (from seastar dpdk sources)')
|
||||
arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
|
||||
@@ -947,8 +928,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'utils/crypt_sha512.cc',
|
||||
'utils/logalloc.cc',
|
||||
'utils/large_bitset.cc',
|
||||
'utils/buffer_input_stream.cc',
|
||||
'utils/limiting_data_source.cc',
|
||||
'test/lib/limiting_data_source.cc',
|
||||
'utils/updateable_value.cc',
|
||||
'message/dictionary_service.cc',
|
||||
'utils/directories.cc',
|
||||
@@ -1557,6 +1537,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
|
||||
'test/perf/perf_fast_forward.cc',
|
||||
'test/perf/perf_row_cache_update.cc',
|
||||
'test/perf/perf_simple_query.cc',
|
||||
'test/perf/perf_cql_raw.cc',
|
||||
'test/perf/perf_sstable.cc',
|
||||
'test/perf/perf_tablets.cc',
|
||||
'test/perf/tablet_load_balancing.cc',
|
||||
@@ -2405,7 +2386,7 @@ def write_build_file(f,
|
||||
# If compiler cache is available, prefix the compiler with it
|
||||
cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
|
||||
# For Rust, sccache is used via RUSTC_WRAPPER environment variable
|
||||
rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
|
||||
rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache and args.sccache_rust else ''
|
||||
f.write(textwrap.dedent('''\
|
||||
configure_args = {configure_args}
|
||||
builddir = {outdir}
|
||||
@@ -2582,11 +2563,10 @@ def write_build_file(f,
|
||||
description = RUST_LIB $out
|
||||
''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
|
||||
f.write(
|
||||
'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
|
||||
'build {mode}-build: phony {artifacts} {wasms}\n'.format(
|
||||
mode=mode,
|
||||
artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
|
||||
artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
|
||||
wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
|
||||
vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
|
||||
)
|
||||
)
|
||||
if profile_recipe := modes[mode].get('profile_recipe'):
|
||||
@@ -2616,7 +2596,7 @@ def write_build_file(f,
|
||||
continue
|
||||
profile_dep = modes[mode].get('profile_target', "")
|
||||
|
||||
if binary in other or binary in wasms or binary in vector_search_validator_bins:
|
||||
if binary in other or binary in wasms:
|
||||
continue
|
||||
srcs = deps[binary]
|
||||
# 'scylla'
|
||||
@@ -2727,11 +2707,10 @@ def write_build_file(f,
|
||||
)
|
||||
|
||||
f.write(
|
||||
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
|
||||
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
|
||||
mode=mode,
|
||||
test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
|
||||
wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
|
||||
vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
|
||||
)
|
||||
)
|
||||
f.write(
|
||||
@@ -2899,19 +2878,6 @@ def write_build_file(f,
|
||||
'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
|
||||
)
|
||||
|
||||
f.write(textwrap.dedent(f'''\
|
||||
rule build-vector-search-validator
|
||||
command = test/vector_search_validator/build-validator $builddir
|
||||
rule build-vector-store
|
||||
command = test/vector_search_validator/build-vector-store $builddir
|
||||
'''))
|
||||
f.write(
|
||||
'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
|
||||
)
|
||||
f.write(
|
||||
'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
|
||||
)
|
||||
|
||||
f.write(textwrap.dedent(f'''\
|
||||
build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
|
||||
build dist-unified: phony dist-unified-tar
|
||||
@@ -3149,7 +3115,7 @@ def configure_using_cmake(args):
|
||||
settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
|
||||
settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
|
||||
# For Rust, sccache is used via RUSTC_WRAPPER
|
||||
if 'sccache' in compiler_cache:
|
||||
if 'sccache' in compiler_cache and args.sccache_rust:
|
||||
settings['Scylla_RUSTC_WRAPPER'] = compiler_cache
|
||||
|
||||
if args.date_stamp:
|
||||
|
||||
12
cql3/Cql.g
12
cql3/Cql.g
@@ -389,8 +389,10 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
|
||||
bool is_ann_ordering = false;
|
||||
}
|
||||
: K_SELECT (
|
||||
( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
|
||||
( K_DISTINCT { is_distinct = true; } )?
|
||||
( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
|
||||
| (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
|
||||
)?
|
||||
( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
|
||||
sclause=selectClause
|
||||
)
|
||||
K_FROM (
|
||||
@@ -425,6 +427,7 @@ selector returns [shared_ptr<raw_selector> s]
|
||||
|
||||
unaliasedSelector returns [uexpression tmp]
|
||||
: ( c=cident { tmp = unresolved_identifier{std::move(c)}; }
|
||||
| v=value { tmp = std::move(v); }
|
||||
| K_COUNT '(' countArgument ')' { tmp = make_count_rows_function_expression(); }
|
||||
| K_WRITETIME '(' c=cident ')' { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
|
||||
unresolved_identifier{std::move(c)}}; }
|
||||
@@ -455,14 +458,11 @@ vectorSimilarityArgs returns [std::vector<expression> a]
|
||||
|
||||
vectorSimilarityArg returns [uexpression a]
|
||||
: s=unaliasedSelector { a = std::move(s); }
|
||||
| v=value { a = std::move(v); }
|
||||
;
|
||||
|
||||
countArgument
|
||||
: '*'
|
||||
| i=INTEGER { if (i->getText() != "1") {
|
||||
add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
|
||||
} }
|
||||
/* COUNT(1) is also allowed, it is recognized via the general function(args) path */
|
||||
;
|
||||
|
||||
whereClause returns [uexpression clause]
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "expr-utils.hh"
|
||||
#include "evaluate.hh"
|
||||
#include "cql3/functions/functions.hh"
|
||||
#include "cql3/functions/aggregate_fcts.hh"
|
||||
#include "cql3/functions/castas_fcts.hh"
|
||||
#include "cql3/functions/scalar_function.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
@@ -1047,8 +1048,47 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
|
||||
return partially_prepared_args;
|
||||
}
|
||||
|
||||
// Special case for count(1) - recognize it as the countRows() function. Note it is quite
|
||||
// artificial and we might relax it to the more general count(expression) later.
|
||||
static
|
||||
std::optional<expression>
|
||||
try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
|
||||
return std::visit(overloaded_functor{
|
||||
[&] (const functions::function_name& name) -> std::optional<expression> {
|
||||
auto native_name = name;
|
||||
if (!native_name.has_keyspace()) {
|
||||
native_name = name.as_native_function();
|
||||
}
|
||||
// Collapse count(1) into countRows()
|
||||
if (native_name == functions::function_name::native_function("count")) {
|
||||
if (fc.args.size() == 1) {
|
||||
if (auto uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0])) {
|
||||
if (uc_arg->partial_type == expr::untyped_constant::type_class::integer
|
||||
&& uc_arg->raw_text == "1") {
|
||||
return expr::function_call{
|
||||
.func = functions::aggregate_fcts::make_count_rows_function(),
|
||||
.args = {},
|
||||
};
|
||||
} else {
|
||||
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
},
|
||||
[] (const shared_ptr<functions::function>&) -> std::optional<expression> {
|
||||
// Already prepared, nothing to do
|
||||
return std::nullopt;
|
||||
},
|
||||
}, fc.func);
|
||||
}
|
||||
|
||||
std::optional<expression>
|
||||
prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
|
||||
if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
|
||||
return prepared;
|
||||
}
|
||||
// Try to extract a column family name from the available information.
|
||||
// Most functions can be prepared without information about the column family, usually just the keyspace is enough.
|
||||
// One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include "index/vector_index.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include "service/client_state.hh"
|
||||
#include "service/paxos/paxos_state.hh"
|
||||
#include "types/types.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "cql3/cql_statement.hh"
|
||||
@@ -329,6 +330,19 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
|
||||
"*/",
|
||||
*table_desc.create_statement);
|
||||
|
||||
table_desc.create_statement = std::move(os).to_managed_string();
|
||||
} else if (service::paxos::paxos_store::try_get_base_table(name)) {
|
||||
// Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
|
||||
// The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
|
||||
fragmented_ostringstream os{};
|
||||
|
||||
fmt::format_to(os.to_iter(),
|
||||
"/* Do NOT execute this statement! It's only for informational purposes.\n"
|
||||
" A paxos state table is created automatically when enabling LWT on a base table.\n"
|
||||
"\n{}\n"
|
||||
"*/",
|
||||
*table_desc.create_statement);
|
||||
|
||||
table_desc.create_statement = std::move(os).to_managed_string();
|
||||
}
|
||||
result.push_back(std::move(table_desc));
|
||||
@@ -364,7 +378,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
|
||||
future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
|
||||
auto& replica_db = db.real_database();
|
||||
auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
|
||||
return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
|
||||
return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
|
||||
}) | std::ranges::to<std::vector<schema_ptr>>();
|
||||
std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));
|
||||
|
||||
|
||||
@@ -50,8 +50,8 @@ public:
|
||||
protected:
|
||||
virtual audit::statement_category category() const override;
|
||||
virtual audit::audit_info_ptr audit_info() const override {
|
||||
// We don't audit batch statements. Instead we audit statements that are inside the batch.
|
||||
return audit::audit::create_no_audit_info();
|
||||
constexpr bool batch = true;
|
||||
return audit::audit::create_audit_info(category(), sstring(), sstring(), batch);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -259,11 +259,9 @@ uint32_t select_statement::get_bound_terms() const {
|
||||
|
||||
future<> select_statement::check_access(query_processor& qp, const service::client_state& state) const {
|
||||
try {
|
||||
const data_dictionary::database db = qp.db();
|
||||
auto&& s = db.find_schema(keyspace(), column_family());
|
||||
auto cdc = db.get_cdc_base_table(*s);
|
||||
auto& cf_name = s->is_view()
|
||||
? s->view_info()->base_name()
|
||||
auto cdc = qp.db().get_cdc_base_table(*_schema);
|
||||
auto& cf_name = _schema->is_view()
|
||||
? _schema->view_info()->base_name()
|
||||
: (cdc ? cdc->cf_name() : column_family());
|
||||
const schema_ptr& base_schema = cdc ? cdc : _schema;
|
||||
bool is_vector_indexed = secondary_index::vector_index::has_vector_index(*base_schema);
|
||||
|
||||
44
db/config.cc
44
db/config.cc
@@ -621,25 +621,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
* @GroupDescription: Provides an overview of the group.
|
||||
*/
|
||||
/**
|
||||
* @Group Ungrouped properties
|
||||
*/
|
||||
, background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
|
||||
"max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
|
||||
, auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
|
||||
"true: auto-adjust memtable shares for flush processes")
|
||||
, memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
|
||||
"If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
|
||||
, compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
|
||||
"If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
|
||||
, compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
|
||||
"Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
|
||||
, compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
|
||||
"If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
|
||||
, compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
|
||||
"Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
|
||||
"This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
|
||||
"Set to 0 to disable automatic flushing all tables before major compaction.")
|
||||
/**
|
||||
* @Group Initialization properties
|
||||
* @GroupDescription The minimal properties needed for configuring a cluster.
|
||||
*/
|
||||
@@ -1394,6 +1375,10 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Start killing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
|
||||
, reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 2,
|
||||
"Admit new reads while there are less than this number of requests that need CPU.")
|
||||
, reader_concurrency_semaphore_preemptive_abort_factor(this, "reader_concurrency_semaphore_preemptive_abort_factor", liveness::LiveUpdate, value_status::Used, 0.3,
|
||||
"Admit new reads while their remaining time is more than this factor times their timeout times when arrived to a semaphore. Its vale means\n"
|
||||
"* <= 0.0 means new reads will never get rejected during admission\n"
|
||||
"* >= 1.0 means new reads will always get rejected during admission\n")
|
||||
, view_update_reader_concurrency_semaphore_serialize_limit_multiplier(this, "view_update_reader_concurrency_semaphore_serialize_limit_multiplier", liveness::LiveUpdate, value_status::Used, 2,
|
||||
"Start serializing view update reads after their collective memory consumption goes above $normal_limit * $multiplier.")
|
||||
, view_update_reader_concurrency_semaphore_kill_limit_multiplier(this, "view_update_reader_concurrency_semaphore_kill_limit_multiplier", liveness::LiveUpdate, value_status::Used, 4,
|
||||
@@ -1513,7 +1498,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, index_cache_fraction(this, "index_cache_fraction", liveness::LiveUpdate, value_status::Used, 0.2,
|
||||
"The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
|
||||
, consistent_cluster_management(this, "consistent_cluster_management", value_status::Deprecated, true, "Use RAFT for cluster management and DDL.")
|
||||
, force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Used, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
|
||||
, force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Deprecated, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
|
||||
, recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, utils::null_uuid(), "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
|
||||
, wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
|
||||
, wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache.")
|
||||
@@ -1602,6 +1587,25 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Sets the maximum difference in percentages between the most loaded and least loaded nodes, below which the load balancer considers nodes balanced.")
|
||||
, minimal_tablet_size_for_balancing(this, "minimal_tablet_size_for_balancing", liveness::LiveUpdate, value_status::Used, service::default_target_tablet_size / 100,
|
||||
"Sets the minimal tablet size for the load balancer. For any tablet smaller than this, the balancer will use this size instead of the actual tablet size.")
|
||||
/**
|
||||
* @Group Ungrouped properties
|
||||
*/
|
||||
, background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
|
||||
"max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
|
||||
, auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
|
||||
"true: auto-adjust memtable shares for flush processes")
|
||||
, memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
|
||||
"If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
|
||||
, compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
|
||||
"If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
|
||||
, compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
|
||||
"Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
|
||||
, compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
|
||||
"If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
|
||||
, compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
|
||||
"Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
|
||||
"This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
|
||||
"Set to 0 to disable automatic flushing all tables before major compaction.")
|
||||
, default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
|
||||
, logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
|
||||
, log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")
|
||||
|
||||
16
db/config.hh
16
db/config.hh
@@ -185,13 +185,6 @@ public:
|
||||
* All values and documentation taken from
|
||||
* http://docs.datastax.com/en/cassandra/2.1/cassandra/configuration/configCassandra_yaml_r.html
|
||||
*/
|
||||
named_value<double> background_writer_scheduling_quota;
|
||||
named_value<bool> auto_adjust_flush_quota;
|
||||
named_value<float> memtable_flush_static_shares;
|
||||
named_value<float> compaction_static_shares;
|
||||
named_value<float> compaction_max_shares;
|
||||
named_value<bool> compaction_enforce_min_threshold;
|
||||
named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
|
||||
named_value<sstring> cluster_name;
|
||||
named_value<sstring> listen_address;
|
||||
named_value<sstring> listen_interface;
|
||||
@@ -446,6 +439,7 @@ public:
|
||||
named_value<uint32_t> reader_concurrency_semaphore_serialize_limit_multiplier;
|
||||
named_value<uint32_t> reader_concurrency_semaphore_kill_limit_multiplier;
|
||||
named_value<uint32_t> reader_concurrency_semaphore_cpu_concurrency;
|
||||
named_value<float> reader_concurrency_semaphore_preemptive_abort_factor;
|
||||
named_value<uint32_t> view_update_reader_concurrency_semaphore_serialize_limit_multiplier;
|
||||
named_value<uint32_t> view_update_reader_concurrency_semaphore_kill_limit_multiplier;
|
||||
named_value<uint32_t> view_update_reader_concurrency_semaphore_cpu_concurrency;
|
||||
@@ -612,6 +606,14 @@ public:
|
||||
named_value<float> size_based_balance_threshold_percentage;
|
||||
named_value<uint64_t> minimal_tablet_size_for_balancing;
|
||||
|
||||
named_value<double> background_writer_scheduling_quota;
|
||||
named_value<bool> auto_adjust_flush_quota;
|
||||
named_value<float> memtable_flush_static_shares;
|
||||
named_value<float> compaction_static_shares;
|
||||
named_value<float> compaction_max_shares;
|
||||
named_value<bool> compaction_enforce_min_threshold;
|
||||
named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
|
||||
|
||||
static const sstring default_tls_priority;
|
||||
private:
|
||||
template<typename T>
|
||||
|
||||
@@ -158,7 +158,7 @@ void hint_endpoint_manager::cancel_draining() noexcept {
|
||||
_sender.cancel_draining();
|
||||
}
|
||||
|
||||
hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager)
|
||||
hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager, scheduling_group send_sg)
|
||||
: _key(key)
|
||||
, _shard_manager(shard_manager)
|
||||
, _store_gate("hint_endpoint_manager")
|
||||
@@ -169,7 +169,7 @@ hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hi
|
||||
// Approximate the position of the last written hint by using the same formula as for segment id calculation in commitlog
|
||||
// TODO: Should this logic be deduplicated with what is in the commitlog?
|
||||
, _last_written_rp(this_shard_id(), std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count())
|
||||
, _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
|
||||
, _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper(), send_sg)
|
||||
{}
|
||||
|
||||
hint_endpoint_manager::hint_endpoint_manager(hint_endpoint_manager&& other)
|
||||
|
||||
@@ -63,7 +63,7 @@ private:
|
||||
hint_sender _sender;
|
||||
|
||||
public:
|
||||
hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager);
|
||||
hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager, scheduling_group send_sg);
|
||||
hint_endpoint_manager(hint_endpoint_manager&&);
|
||||
~hint_endpoint_manager();
|
||||
|
||||
|
||||
@@ -122,7 +122,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
|
||||
return cm_it->second;
|
||||
}
|
||||
|
||||
hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper) noexcept
|
||||
hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept
|
||||
: _stopped(make_ready_future<>())
|
||||
, _ep_key(parent.end_point_key())
|
||||
, _ep_manager(parent)
|
||||
@@ -130,7 +130,7 @@ hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy&
|
||||
, _resource_manager(_shard_manager._resource_manager)
|
||||
, _proxy(local_storage_proxy)
|
||||
, _db(local_db)
|
||||
, _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
|
||||
, _hints_cpu_sched_group(sg)
|
||||
, _gossiper(local_gossiper)
|
||||
, _file_update_mutex(_ep_manager.file_update_mutex())
|
||||
{}
|
||||
|
||||
@@ -120,7 +120,7 @@ private:
|
||||
std::multimap<db::replay_position, lw_shared_ptr<std::optional<promise<>>>> _replay_waiters;
|
||||
|
||||
public:
|
||||
hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper) noexcept;
|
||||
hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept;
|
||||
~hint_sender();
|
||||
|
||||
/// \brief A constructor that should be called from the copy/move-constructor of hint_endpoint_manager.
|
||||
|
||||
@@ -142,7 +142,7 @@ future<> directory_initializer::ensure_rebalanced() {
|
||||
}
|
||||
|
||||
manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter, int64_t max_hint_window_ms,
|
||||
resource_manager& res_manager, sharded<replica::database>& db)
|
||||
resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg)
|
||||
: _hints_dir(fs::path(hints_directory) / fmt::to_string(this_shard_id()))
|
||||
, _host_filter(std::move(filter))
|
||||
, _proxy(proxy)
|
||||
@@ -150,6 +150,7 @@ manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_fi
|
||||
, _local_db(db.local())
|
||||
, _draining_eps_gate(seastar::format("hints::manager::{}", _hints_dir.native()))
|
||||
, _resource_manager(res_manager)
|
||||
, _hints_sending_sched_group(sg)
|
||||
{
|
||||
if (utils::get_local_injector().enter("decrease_hints_flush_period")) {
|
||||
hints_flush_period = std::chrono::seconds{1};
|
||||
@@ -415,7 +416,7 @@ hint_endpoint_manager& manager::get_ep_manager(const endpoint_id& host_id, const
|
||||
|
||||
try {
|
||||
std::filesystem::path hint_directory = hints_dir() / (_uses_host_id ? fmt::to_string(host_id) : fmt::to_string(ip));
|
||||
auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this});
|
||||
auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this, _hints_sending_sched_group});
|
||||
hint_endpoint_manager& ep_man = it->second;
|
||||
|
||||
manager_logger.trace("Created an endpoint manager for {}", host_id);
|
||||
|
||||
@@ -133,6 +133,7 @@ private:
|
||||
|
||||
hint_stats _stats;
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
scheduling_group _hints_sending_sched_group;
|
||||
|
||||
// We need to keep a variant here. Before migrating hinted handoff to using host ID, hint directories will
|
||||
// still represent IP addresses. But after the migration, they will start representing host IDs.
|
||||
@@ -155,7 +156,7 @@ private:
|
||||
|
||||
public:
|
||||
manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter,
|
||||
int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db);
|
||||
int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg);
|
||||
|
||||
manager(const manager&) = delete;
|
||||
manager& operator=(const manager&) = delete;
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
#include "readers/forwardable.hh"
|
||||
#include "readers/nonforwardable.hh"
|
||||
#include "cache_mutation_reader.hh"
|
||||
#include "partition_snapshot_reader.hh"
|
||||
#include "replica/partition_snapshot_reader.hh"
|
||||
#include "keys/clustering_key_filter.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
@@ -845,7 +845,7 @@ mutation_reader row_cache::make_nonpopulating_reader(schema_ptr schema, reader_p
|
||||
cache_entry& e = *i;
|
||||
upgrade_entry(e);
|
||||
tracing::trace(ts, "Reading partition {} from cache", pos);
|
||||
return make_partition_snapshot_flat_reader<false, dummy_accounter>(
|
||||
return replica::make_partition_snapshot_reader<false, dummy_accounter>(
|
||||
schema,
|
||||
std::move(permit),
|
||||
e.key(),
|
||||
|
||||
@@ -215,6 +215,8 @@ public:
|
||||
static constexpr auto BUILT_VIEWS = "built_views";
|
||||
static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
|
||||
static constexpr auto CDC_LOCAL = "cdc_local";
|
||||
static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
|
||||
static constexpr auto CDC_STREAMS = "cdc_streams";
|
||||
|
||||
// auth
|
||||
static constexpr auto ROLES = "roles";
|
||||
|
||||
208
db/view/view.cc
208
db/view/view.cc
@@ -23,6 +23,7 @@
|
||||
|
||||
#include <seastar/core/future-util.hh>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/coroutine/all.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <flat_map>
|
||||
|
||||
@@ -65,6 +66,7 @@
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/small_vector.hh"
|
||||
#include "view_builder.hh"
|
||||
#include "view_info.hh"
|
||||
#include "view_update_checks.hh"
|
||||
#include "types/list.hh"
|
||||
@@ -2238,12 +2240,20 @@ void view_builder::setup_metrics() {
|
||||
}
|
||||
|
||||
future<> view_builder::start_in_background(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
|
||||
auto step_fiber = make_ready_future<>();
|
||||
try {
|
||||
view_builder_init_state vbi;
|
||||
auto fail = defer([&barrier] mutable { barrier.abort(); });
|
||||
// Guard the whole startup routine with a semaphore,
|
||||
// so that it's not intercepted by `on_drop_view`, `on_create_view`
|
||||
// or `on_update_view` events.
|
||||
// Semaphore usage invariants:
|
||||
// - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
|
||||
// (_base_to_build_step, _built_views, build_status, reader resets).
|
||||
// - The unit is held for the whole operation, including the async chain, until the state
|
||||
// is stable for the next operation on that shard.
|
||||
// - Cross-shard operations acquire _sem on shard 0 for the duration of the broadcast.
|
||||
// Other shards acquire their own _sem only around their local handling; shard 0 skips
|
||||
// the local acquire because it already holds the unit from the dispatcher.
|
||||
// Guard the whole startup routine with a semaphore so that it's not intercepted by
|
||||
// `on_drop_view`, `on_create_view`, or `on_update_view` events.
|
||||
auto units = co_await get_units(_sem, view_builder_semaphore_units);
|
||||
// Wait for schema agreement even if we're a seed node.
|
||||
co_await mm.wait_for_schema_agreement(_db, db::timeout_clock::time_point::max(), &_as);
|
||||
@@ -2264,8 +2274,10 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
|
||||
_mnotifier.register_listener(this);
|
||||
co_await calculate_shard_build_step(vbi);
|
||||
_current_step = _base_to_build_step.begin();
|
||||
// Waited on indirectly in stop().
|
||||
(void)_build_step.trigger();
|
||||
|
||||
// If preparation above fails, run_in_background() is not invoked, just
|
||||
// the start_in_background() emits a warning into logs and resolves
|
||||
step_fiber = run_in_background();
|
||||
} catch (...) {
|
||||
auto ex = std::current_exception();
|
||||
auto ll = log_level::error;
|
||||
@@ -2280,10 +2292,12 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
|
||||
}
|
||||
vlogger.log(ll, "start aborted: {}", ex);
|
||||
}
|
||||
|
||||
co_await std::move(step_fiber);
|
||||
}
|
||||
|
||||
future<> view_builder::start(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
|
||||
_started = start_in_background(mm, std::move(barrier));
|
||||
_step_fiber = start_in_background(mm, std::move(barrier));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
@@ -2293,12 +2307,12 @@ future<> view_builder::drain() {
|
||||
}
|
||||
vlogger.info("Draining view builder");
|
||||
_as.request_abort();
|
||||
co_await std::move(_started);
|
||||
co_await _mnotifier.unregister_listener(this);
|
||||
co_await _vug.drain();
|
||||
co_await _sem.wait();
|
||||
_sem.broken();
|
||||
co_await _build_step.join();
|
||||
_build_step.broken();
|
||||
co_await std::move(_step_fiber);
|
||||
co_await coroutine::parallel_for_each(_base_to_build_step, [] (std::pair<const table_id, build_step>& p) {
|
||||
return p.second.reader.close();
|
||||
});
|
||||
@@ -2667,63 +2681,59 @@ static bool should_ignore_tablet_keyspace(const replica::database& db, const sst
|
||||
return db.features().view_building_coordinator && db.has_keyspace(ks_name) && db.find_keyspace(ks_name).uses_tablets();
|
||||
}
|
||||
|
||||
future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
|
||||
if (should_ignore_tablet_keyspace(_db, ks_name)) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
|
||||
// This runs on shard 0 only; seed the global rows before broadcasting.
|
||||
return handle_seed_view_build_progress(ks_name, view_name).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
|
||||
return container().invoke_on_all([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable {
|
||||
return vb.handle_create_view_local(std::move(ks_name), std::move(view_name));
|
||||
});
|
||||
});
|
||||
});
|
||||
future<view_builder::view_builder_units> view_builder::get_or_adopt_view_builder_lock(view_builder_units_opt units) {
|
||||
co_return units ? std::move(*units) : co_await get_units(_sem, view_builder_semaphore_units);
|
||||
}
|
||||
|
||||
future<> view_builder::handle_seed_view_build_progress(sstring ks_name, sstring view_name) {
|
||||
future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
|
||||
if (should_ignore_tablet_keyspace(_db, ks_name)) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
|
||||
co_await handle_seed_view_build_progress(ks_name, view_name);
|
||||
|
||||
co_await coroutine::all(
|
||||
[this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
|
||||
co_await handle_create_view_local(ks_name, view_name, std::move(units)); },
|
||||
[this, ks_name, view_name] mutable -> future<> {
|
||||
co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
|
||||
return vb.handle_create_view_local(ks_name, view_name, std::nullopt); }); });
|
||||
}
|
||||
|
||||
future<> view_builder::handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name) {
|
||||
auto view = view_ptr(_db.find_schema(ks_name, view_name));
|
||||
auto& step = get_or_create_build_step(view->view_info()->base_id());
|
||||
return _sys_ks.register_view_for_building_for_all_shards(view->ks_name(), view->cf_name(), step.current_token());
|
||||
}
|
||||
|
||||
future<> view_builder::handle_create_view_local(sstring ks_name, sstring view_name){
|
||||
if (this_shard_id() == 0) {
|
||||
return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
|
||||
} else {
|
||||
return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
|
||||
return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
future<> view_builder::handle_create_view_local_impl(sstring ks_name, sstring view_name) {
|
||||
future<> view_builder::handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
|
||||
[[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
|
||||
auto view = view_ptr(_db.find_schema(ks_name, view_name));
|
||||
auto& step = get_or_create_build_step(view->view_info()->base_id());
|
||||
return when_all(step.base->await_pending_writes(), step.base->await_pending_streams()).discard_result().then([this, &step] {
|
||||
return flush_base(step.base, _as);
|
||||
}).then([this, view, &step] () {
|
||||
try {
|
||||
co_await coroutine::all(
|
||||
[&step] -> future<> {
|
||||
co_await step.base->await_pending_writes(); },
|
||||
[&step] -> future<> {
|
||||
co_await step.base->await_pending_streams(); });
|
||||
co_await flush_base(step.base, _as);
|
||||
|
||||
// This resets the build step to the current token. It may result in views currently
|
||||
// being built to receive duplicate updates, but it simplifies things as we don't have
|
||||
// to keep around a list of new views to build the next time the reader crosses a token
|
||||
// threshold.
|
||||
return initialize_reader_at_current_token(step).then([this, view, &step] () mutable {
|
||||
return add_new_view(view, step);
|
||||
}).then_wrapped([this, view] (future<>&& f) {
|
||||
try {
|
||||
f.get();
|
||||
} catch (abort_requested_exception&) {
|
||||
vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
|
||||
} catch (raft::request_aborted&) {
|
||||
vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
|
||||
} catch (...) {
|
||||
vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
|
||||
}
|
||||
co_await initialize_reader_at_current_token(step);
|
||||
co_await add_new_view(view, step);
|
||||
} catch (abort_requested_exception&) {
|
||||
vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
|
||||
} catch (raft::request_aborted&) {
|
||||
vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
|
||||
} catch (...) {
|
||||
vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
|
||||
}
|
||||
|
||||
// Waited on indirectly in stop().
|
||||
static_cast<void>(_build_step.trigger());
|
||||
});
|
||||
});
|
||||
_build_step.signal();
|
||||
}
|
||||
|
||||
void view_builder::on_create_view(const sstring& ks_name, const sstring& view_name) {
|
||||
@@ -2760,62 +2770,55 @@ void view_builder::on_update_view(const sstring& ks_name, const sstring& view_na
|
||||
|
||||
future<> view_builder::dispatch_drop_view(sstring ks_name, sstring view_name) {
|
||||
if (should_ignore_tablet_keyspace(_db, ks_name)) {
|
||||
return make_ready_future<>();
|
||||
co_return;
|
||||
}
|
||||
|
||||
return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
|
||||
// This runs on shard 0 only; broadcast local cleanup before global cleanup.
|
||||
return container().invoke_on_all([ks_name, view_name] (view_builder& vb) mutable {
|
||||
return vb.handle_drop_view_local(std::move(ks_name), std::move(view_name));
|
||||
}).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
|
||||
return handle_drop_view_global_cleanup(std::move(ks_name), std::move(view_name));
|
||||
});
|
||||
});
|
||||
auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
|
||||
|
||||
co_await coroutine::all(
|
||||
[this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
|
||||
co_await handle_drop_view_local(ks_name, view_name, std::move(units)); },
|
||||
[this, ks_name, view_name] mutable -> future<> {
|
||||
co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
|
||||
return vb.handle_drop_view_local(ks_name, view_name, std::nullopt); });});
|
||||
co_await handle_drop_view_global_cleanup(ks_name, view_name);
|
||||
}
|
||||
|
||||
future<> view_builder::handle_drop_view_local(sstring ks_name, sstring view_name) {
|
||||
if (this_shard_id() == 0) {
|
||||
return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
|
||||
} else {
|
||||
return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
|
||||
return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
future<> view_builder::handle_drop_view_local_impl(sstring ks_name, sstring view_name) {
|
||||
future<> view_builder::handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
|
||||
[[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
|
||||
vlogger.info0("Stopping to build view {}.{}", ks_name, view_name);
|
||||
// The view is absent from the database at this point, so find it by brute force.
|
||||
([&, this] {
|
||||
for (auto& [_, step] : _base_to_build_step) {
|
||||
if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
|
||||
continue;
|
||||
}
|
||||
for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
|
||||
if (it->view->cf_name() == view_name) {
|
||||
_built_views.erase(it->view->id());
|
||||
step.build_status.erase(it);
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto& [_, step] : _base_to_build_step) {
|
||||
if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
|
||||
continue;
|
||||
}
|
||||
for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
|
||||
if (it->view->cf_name() == view_name) {
|
||||
_built_views.erase(it->view->id());
|
||||
step.build_status.erase(it);
|
||||
co_return;
|
||||
}
|
||||
}
|
||||
})();
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}
|
||||
|
||||
future<> view_builder::handle_drop_view_global_cleanup(sstring ks_name, sstring view_name) {
|
||||
future<> view_builder::handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name) {
|
||||
if (this_shard_id() != 0) {
|
||||
return make_ready_future<>();
|
||||
co_return;
|
||||
}
|
||||
vlogger.info0("Starting view global cleanup {}.{}", ks_name, view_name);
|
||||
return when_all_succeed(
|
||||
_sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name),
|
||||
_sys_ks.remove_built_view(ks_name, view_name),
|
||||
remove_view_build_status(ks_name, view_name))
|
||||
.discard_result()
|
||||
.handle_exception([ks_name, view_name] (std::exception_ptr ep) {
|
||||
vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, ep);
|
||||
});
|
||||
|
||||
try {
|
||||
co_await coroutine::all(
|
||||
[this, &ks_name, &view_name] -> future<> {
|
||||
co_await _sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name); },
|
||||
[this, &ks_name, &view_name] -> future<> {
|
||||
co_await _sys_ks.remove_built_view(ks_name, view_name); },
|
||||
[this, &ks_name, &view_name] -> future<> {
|
||||
co_await remove_view_build_status(ks_name, view_name); });
|
||||
} catch (...) {
|
||||
vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, std::current_exception());
|
||||
}
|
||||
}
|
||||
|
||||
void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name) {
|
||||
@@ -2829,14 +2832,15 @@ void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name
|
||||
}));
|
||||
}
|
||||
|
||||
future<> view_builder::do_build_step() {
|
||||
// Run the view building in the streaming scheduling group
|
||||
// so that it doesn't impact other tasks with higher priority.
|
||||
seastar::thread_attributes attr;
|
||||
attr.sched_group = _db.get_streaming_scheduling_group();
|
||||
return seastar::async(std::move(attr), [this] {
|
||||
future<> view_builder::run_in_background() {
|
||||
return seastar::async([this] {
|
||||
exponential_backoff_retry r(1s, 1min);
|
||||
while (!_base_to_build_step.empty() && !_as.abort_requested()) {
|
||||
while (!_as.abort_requested()) {
|
||||
try {
|
||||
_build_step.wait([this] { return !_base_to_build_step.empty(); }).get();
|
||||
} catch (const seastar::broken_condition_variable&) {
|
||||
return;
|
||||
}
|
||||
auto units = get_units(_sem, view_builder_semaphore_units).get();
|
||||
++_stats.steps_performed;
|
||||
try {
|
||||
|
||||
@@ -11,13 +11,13 @@
|
||||
#include "query/query-request.hh"
|
||||
#include "service/migration_listener.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
#include "utils/serialized_action.hh"
|
||||
#include "utils/cross-shard-barrier.hh"
|
||||
#include "replica/database.hh"
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
@@ -104,6 +104,12 @@ class view_update_generator;
|
||||
* redo the missing step, for simplicity.
|
||||
*/
|
||||
class view_builder final : public service::migration_listener::only_view_notifications, public seastar::peering_sharded_service<view_builder> {
|
||||
//aliasing for semaphore units that will be used throughout the class
|
||||
using view_builder_units = semaphore_units<named_semaphore_exception_factory>;
|
||||
|
||||
//aliasing for optional semaphore units that will be used throughout the class
|
||||
using view_builder_units_opt = std::optional<view_builder_units>;
|
||||
|
||||
/**
|
||||
* Keeps track of the build progress for a particular view.
|
||||
* When the view is built, next_token == first_token.
|
||||
@@ -168,14 +174,24 @@ class view_builder final : public service::migration_listener::only_view_notific
|
||||
reader_permit _permit;
|
||||
base_to_build_step_type _base_to_build_step;
|
||||
base_to_build_step_type::iterator _current_step = _base_to_build_step.end();
|
||||
serialized_action _build_step{std::bind(&view_builder::do_build_step, this)};
|
||||
condition_variable _build_step;
|
||||
static constexpr size_t view_builder_semaphore_units = 1;
|
||||
// Ensures bookkeeping operations are serialized, meaning that while we execute
|
||||
// a build step we don't consider newly added or removed views. This simplifies
|
||||
// the algorithms. Also synchronizes an operation wrt. a call to stop().
|
||||
// Semaphore usage invariants:
|
||||
// - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
|
||||
// (_base_to_build_step, _built_views, build_status, reader resets).
|
||||
// - The unit is held for the whole operation, including the async chain, until the state
|
||||
// is stable for the next operation on that shard.
|
||||
// - Cross-shard operations acquire _sem on shard 0 for the duration of the broadcast.
|
||||
// Other shards acquire their own _sem only around their local handling; shard 0 skips
|
||||
// the local acquire because it already holds the unit from the dispatcher.
|
||||
// Guard the whole startup routine with a semaphore so that it's not intercepted by
|
||||
// `on_drop_view`, `on_create_view`, or `on_update_view` events.
|
||||
seastar::named_semaphore _sem{view_builder_semaphore_units, named_semaphore_exception_factory{"view builder"}};
|
||||
seastar::abort_source _as;
|
||||
future<> _started = make_ready_future<>();
|
||||
future<> _step_fiber = make_ready_future<>();
|
||||
// Used to coordinate between shards the conclusion of the build process for a particular view.
|
||||
std::unordered_set<table_id> _built_views;
|
||||
// Used for testing.
|
||||
@@ -262,19 +278,18 @@ private:
|
||||
void setup_shard_build_step(view_builder_init_state& vbi, std::vector<system_keyspace_view_name>, std::vector<system_keyspace_view_build_progress>);
|
||||
future<> calculate_shard_build_step(view_builder_init_state& vbi);
|
||||
future<> add_new_view(view_ptr, build_step&);
|
||||
future<> do_build_step();
|
||||
future<> run_in_background();
|
||||
void execute(build_step&, exponential_backoff_retry);
|
||||
future<> maybe_mark_view_as_built(view_ptr, dht::token);
|
||||
future<> mark_as_built(view_ptr);
|
||||
void setup_metrics();
|
||||
future<> dispatch_create_view(sstring ks_name, sstring view_name);
|
||||
future<> dispatch_drop_view(sstring ks_name, sstring view_name);
|
||||
future<> handle_seed_view_build_progress(sstring ks_name, sstring view_name);
|
||||
future<> handle_create_view_local(sstring ks_name, sstring view_name);
|
||||
future<> handle_drop_view_local(sstring ks_name, sstring view_name);
|
||||
future<> handle_create_view_local_impl(sstring ks_name, sstring view_name);
|
||||
future<> handle_drop_view_local_impl(sstring ks_name, sstring view_name);
|
||||
future<> handle_drop_view_global_cleanup(sstring ks_name, sstring view_name);
|
||||
future<> handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name);
|
||||
future<> handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
|
||||
future<> handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
|
||||
future<> handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name);
|
||||
future<view_builder_units> get_or_adopt_view_builder_lock(view_builder_units_opt units);
|
||||
|
||||
template <typename Func1, typename Func2>
|
||||
future<> write_view_build_status(Func1&& fn_group0, Func2&& fn_sys_dist) {
|
||||
|
||||
@@ -242,7 +242,7 @@ future<> view_building_worker::create_staging_sstable_tasks() {
|
||||
utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, false,
|
||||
table_id, ::table_id{}, {my_host_id, sst_info.shard}, sst_info.last_token
|
||||
};
|
||||
auto mut = co_await _group0.client().sys_ks().make_view_building_task_mutation(guard.write_timestamp(), task);
|
||||
auto mut = co_await _sys_ks.make_view_building_task_mutation(guard.write_timestamp(), task);
|
||||
cmuts.emplace_back(std::move(mut));
|
||||
}
|
||||
}
|
||||
@@ -386,7 +386,6 @@ future<> view_building_worker::update_built_views() {
|
||||
auto schema = _db.find_schema(table_id);
|
||||
return std::make_pair(schema->ks_name(), schema->cf_name());
|
||||
};
|
||||
auto& sys_ks = _group0.client().sys_ks();
|
||||
|
||||
std::set<std::pair<sstring, sstring>> built_views;
|
||||
for (auto& [id, statuses]: _vb_state_machine.views_state.status_map) {
|
||||
@@ -395,22 +394,22 @@ future<> view_building_worker::update_built_views() {
|
||||
}
|
||||
}
|
||||
|
||||
auto local_built = co_await sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
|
||||
auto local_built = co_await _sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
|
||||
return !_db.has_keyspace(v.first) || _db.find_keyspace(v.first).uses_tablets();
|
||||
}) | std::ranges::to<std::set>();
|
||||
|
||||
// Remove dead entries
|
||||
for (auto& view: local_built) {
|
||||
if (!built_views.contains(view)) {
|
||||
co_await sys_ks.remove_built_view(view.first, view.second);
|
||||
co_await _sys_ks.remove_built_view(view.first, view.second);
|
||||
}
|
||||
}
|
||||
|
||||
// Add new entries
|
||||
for (auto& view: built_views) {
|
||||
if (!local_built.contains(view)) {
|
||||
co_await sys_ks.mark_view_as_built(view.first, view.second);
|
||||
co_await sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
|
||||
co_await _sys_ks.mark_view_as_built(view.first, view.second);
|
||||
co_await _sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -589,11 +588,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
|
||||
utils::get_local_injector().inject("do_build_range_fail",
|
||||
[] { throw std::runtime_error("do_build_range failed due to error injection"); });
|
||||
|
||||
// Run the view building in the streaming scheduling group
|
||||
// so that it doesn't impact other tasks with higher priority.
|
||||
seastar::thread_attributes attr;
|
||||
attr.sched_group = _db.get_streaming_scheduling_group();
|
||||
return seastar::async(std::move(attr), [this, base_id, views_ids = std::move(views_ids), last_token, &as] {
|
||||
return seastar::async([this, base_id, views_ids = std::move(views_ids), last_token, &as] {
|
||||
gc_clock::time_point now = gc_clock::now();
|
||||
auto base_cf = _db.find_column_family(base_id).shared_from_this();
|
||||
reader_permit permit = _db.get_reader_concurrency_semaphore().make_tracking_only_permit(nullptr, "build_views_range", db::no_timeout, {});
|
||||
|
||||
@@ -67,6 +67,7 @@ public:
|
||||
return schema_builder(system_keyspace::NAME, "cluster_status", std::make_optional(id))
|
||||
.with_column("peer", inet_addr_type, column_kind::partition_key)
|
||||
.with_column("dc", utf8_type)
|
||||
.with_column("rack", utf8_type)
|
||||
.with_column("up", boolean_type)
|
||||
.with_column("draining", boolean_type)
|
||||
.with_column("excluded", boolean_type)
|
||||
@@ -111,7 +112,9 @@ public:
|
||||
// Not all entries in gossiper are present in the topology
|
||||
auto& node = tm.get_topology().get_node(hostid);
|
||||
sstring dc = node.dc_rack().dc;
|
||||
sstring rack = node.dc_rack().rack;
|
||||
set_cell(cr, "dc", dc);
|
||||
set_cell(cr, "rack", rack);
|
||||
set_cell(cr, "draining", node.is_draining());
|
||||
set_cell(cr, "excluded", node.is_excluded());
|
||||
}
|
||||
@@ -1345,8 +1348,8 @@ public:
|
||||
|
||||
private:
|
||||
static schema_ptr build_schema() {
|
||||
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
|
||||
return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
|
||||
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
|
||||
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
|
||||
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
|
||||
.with_column("table_name", utf8_type, column_kind::partition_key)
|
||||
.with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
|
||||
@@ -1428,8 +1431,8 @@ public:
|
||||
}
|
||||
private:
|
||||
static schema_ptr build_schema() {
|
||||
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
|
||||
return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
|
||||
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
|
||||
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
|
||||
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
|
||||
.with_column("table_name", utf8_type, column_kind::partition_key)
|
||||
.with_column("timestamp", timestamp_type, column_kind::clustering_key)
|
||||
|
||||
2
debug.cc
2
debug.cc
@@ -11,5 +11,7 @@
|
||||
namespace debug {
|
||||
|
||||
seastar::sharded<replica::database>* volatile the_database = nullptr;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
seastar::scheduling_group gossip_scheduling_group;
|
||||
|
||||
}
|
||||
|
||||
3
debug.hh
3
debug.hh
@@ -17,7 +17,8 @@ class database;
|
||||
namespace debug {
|
||||
|
||||
extern seastar::sharded<replica::database>* volatile the_database;
|
||||
|
||||
extern seastar::scheduling_group streaming_scheduling_group;
|
||||
extern seastar::scheduling_group gossip_scheduling_group;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
### a dictionary of redirections
|
||||
#old path: new path
|
||||
|
||||
# Move the OS Support page
|
||||
|
||||
/stable/getting-started/os-support.html: https://docs.scylladb.com/stable/versioning/os-support-per-version.html
|
||||
|
||||
# Remove an outdated KB
|
||||
|
||||
/stable/kb/perftune-modes-sync.html: /stable/kb/index.html
|
||||
|
||||
# Remove the troubleshooting page relevant for Open Source only
|
||||
|
||||
/stable/troubleshooting/missing-dotmount-files.html: /troubleshooting/index.html
|
||||
|
||||
# Move the diver information to another project
|
||||
|
||||
/stable/using-scylla/drivers/index.html: https://docs.scylladb.com/stable/drivers/index.html
|
||||
|
||||
@@ -1026,7 +1026,29 @@ You can enable the after-repair tombstone GC by setting the ``repair`` mode usin
|
||||
|
||||
ALTER TABLE ks.cf WITH tombstone_gc = {'mode':'repair'} ;
|
||||
|
||||
The following modes are available:
|
||||
To support writes arriving out-of-order -- either due to natural delays, or user provided timestamps -- the repair mode has a propagation delay.
|
||||
Out-of-order writes present a problem for repair mode tombstone gc. Consider the following example sequence of events:
|
||||
|
||||
1) Write ``DELETE FROM table WHERE key = K1`` arrives at the node.
|
||||
2) Repair is run.
|
||||
3) Compaction runs and garbage collects the tombstone for ``key = K1``.
|
||||
4) Write ``INSERT INTO table (key, ...) VALUES (K1, ...)`` arrives at the node with timestamp smaller than that of the delete. The tombstone for ``key = K1`` should apply to this write, but it is already garbage collected, so this data is resurrected.
|
||||
|
||||
Propagation delay solves this problem by establishing a window before repair, where tombstones are not yet garbage collectible: a tombstone is garbage collectible if it was written before the last repair by at least the propagation delay.
|
||||
|
||||
The value of the propagation delay can be set via the ``propagation_delay_in_seconds`` parameter:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
CREATE TABLE ks.cf (key blob PRIMARY KEY, val blob) WITH tombstone_gc = {'mode':'repair', 'propagation_delay_in_seconds': 120};
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER TABLE ks.cf WITH tombstone_gc = {'mode':'repair', 'propagation_delay_in_seconds': 120};
|
||||
|
||||
The default value of the propagation delay is 1 hour. This parameter should only be changed if your application uses user provided timestamps and writes and deletes can arrive out-of-order by more than the default 1 hour.
|
||||
|
||||
The following tombstone gc modes are available:
|
||||
|
||||
.. list-table::
|
||||
:widths: 20 80
|
||||
|
||||
@@ -25,6 +25,8 @@ Querying data from data is done using a ``SELECT`` statement:
|
||||
: | CAST '(' `selector` AS `cql_type` ')'
|
||||
: | `function_name` '(' [ `selector` ( ',' `selector` )* ] ')'
|
||||
: | COUNT '(' '*' ')'
|
||||
: | literal
|
||||
: | bind_marker
|
||||
: )
|
||||
: ( '.' `field_name` | '[' `term` ']' )*
|
||||
where_clause: `relation` ( AND `relation` )*
|
||||
@@ -35,6 +37,8 @@ Querying data from data is done using a ``SELECT`` statement:
|
||||
operator: '=' | '<' | '>' | '<=' | '>=' | IN | NOT IN | CONTAINS | CONTAINS KEY
|
||||
ordering_clause: `column_name` [ ASC | DESC ] ( ',' `column_name` [ ASC | DESC ] )*
|
||||
timeout: `duration`
|
||||
literal: number | 'string' | boolean | NULL | tuple_literal | list_literal | map_literal
|
||||
bind_marker: '?' | ':' `identifier`
|
||||
|
||||
For instance::
|
||||
|
||||
@@ -81,6 +85,13 @@ A :token:`selector` can be one of the following:
|
||||
- A casting, which allows you to convert a nested selector to a (compatible) type.
|
||||
- A function call, where the arguments are selector themselves.
|
||||
- A call to the :ref:`COUNT function <count-function>`, which counts all non-null results.
|
||||
- A literal value (constant).
|
||||
- A bind variable (`?` or `:name`).
|
||||
|
||||
Note that due to a quirk of the type system, literals and bind markers cannot be
|
||||
used as top-level selectors, as the parser cannot infer their type. However, they can be used
|
||||
when nested inside functions, as the function formal parameter types provide the
|
||||
necessary context.
|
||||
|
||||
Aliases
|
||||
```````
|
||||
@@ -281,7 +292,8 @@ For example::
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
|
||||
or columns provided in a definition of the index.
|
||||
|
||||
For example::
|
||||
|
||||
|
||||
@@ -140,17 +140,83 @@ Vector Index :label-note:`ScyllaDB Cloud`
|
||||
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.
|
||||
|
||||
ScyllaDB supports creating vector indexes on tables, allowing queries on the table to use those indexes for efficient
|
||||
similarity search on vector data.
|
||||
similarity search on vector data. Vector indexes can be a global index for indexing vectors per table or a local
|
||||
index for indexing vectors per partition.
|
||||
|
||||
The vector index is the only custom type index supported in ScyllaDB. It is created using
|
||||
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. Example:
|
||||
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. It is also possible to
|
||||
add additional columns to the index for filtering the search results. The partition column
|
||||
specified in the global vector index definition must be the vector column, and any subsequent
|
||||
columns are treated as filtering columns. The local vector index requires that the partition key
|
||||
of the base table is also the partition key of the index and the vector column is the first one
|
||||
from the following columns.
|
||||
|
||||
Example of a simple index:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
|
||||
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
|
||||
USING 'vector_index'
|
||||
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
|
||||
|
||||
The vector column (``embedding``) is indexed to enable similarity search using
|
||||
a global vector index. Additional filtering can be performed on the primary key
|
||||
columns of the base table.
|
||||
|
||||
Example of a global vector index with additional filtering:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding, category, info)
|
||||
USING 'vector_index'
|
||||
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
|
||||
|
||||
The vector column (``embedding``) is indexed to enable similarity search using
|
||||
a global index. Additional columns are added for filtering the search results.
|
||||
The filtering is possible on ``category``, ``info`` and all primary key columns
|
||||
of the base table.
|
||||
|
||||
Example of a local vector index:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings ((id, created_at), embedding, category, info)
|
||||
USING 'vector_index'
|
||||
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
|
||||
|
||||
The vector column (``embedding``) is indexed for similarity search (a local
|
||||
index) and additional columns are added for filtering the search results. The
|
||||
filtering is possible on ``category``, ``info`` and all primary key columns of
|
||||
the base table. The columns ``id`` and ``created_at`` must be the partition key
|
||||
of the base table.
|
||||
|
||||
Vector indexes support additional filtering columns of native data types
|
||||
(excluding counter and duration). The indexed column itself must be a vector
|
||||
column, while the extra columns can be used to filter search results.
|
||||
|
||||
The supported types are:
|
||||
|
||||
* ``ascii``
|
||||
* ``bigint``
|
||||
* ``blob``
|
||||
* ``boolean``
|
||||
* ``date``
|
||||
* ``decimal``
|
||||
* ``double``
|
||||
* ``float``
|
||||
* ``inet``
|
||||
* ``int``
|
||||
* ``smallint``
|
||||
* ``text``
|
||||
* ``varchar``
|
||||
* ``time``
|
||||
* ``timestamp``
|
||||
* ``timeuuid``
|
||||
* ``tinyint``
|
||||
* ``uuid``
|
||||
* ``varint``
|
||||
|
||||
|
||||
The following options are supported for vector indexes. All of them are optional.
|
||||
|
||||
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
|
||||
|
||||
@@ -78,6 +78,7 @@ Permits are in one of the following states:
|
||||
* `active/await` - a previously `active/need_cpu` permit, which needs something other than CPU to proceed, it is waiting on I/O or a remote shards, other permits can be admitted while the permit is in this state, pending resource availability;
|
||||
* `inactive` - the permit was marked inactive, it can be evicted to make room for admitting more permits if needed;
|
||||
* `evicted` - a former inactive permit which was evicted, the permit has to undergo admission again for the read to resume;
|
||||
* `preemptive_aborted` - the permit timed out or was rejected during admission as it was detected the read might time out later during execution;
|
||||
|
||||
Note that some older releases will have different names for some of these states or lack some of the states altogether:
|
||||
|
||||
|
||||
@@ -124,6 +124,7 @@ There are several test directories that are excluded from orchestration by `test
|
||||
- test/cql
|
||||
- test/cqlpy
|
||||
- test/rest_api
|
||||
- test/scylla_gdb
|
||||
|
||||
This means that `test.py` will not run tests directly, but will delegate all work to `pytest`.
|
||||
That's why all these directories do not have `suite.yaml` files.
|
||||
|
||||
@@ -156,7 +156,7 @@ How do I check the current version of ScyllaDB that I am running?
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
* On a regular system or VM (running Ubuntu, CentOS, or RedHat Enterprise): :code:`$ scylla --version`
|
||||
|
||||
Check the :doc:`Operating System Support Guide </getting-started/os-support>` for a list of supported operating systems and versions.
|
||||
Check the `Operating System Support Guide <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_ for a list of supported operating systems and versions.
|
||||
|
||||
* On a docker node: :code:`$ docker exec -it Node_Z scylla --version`
|
||||
|
||||
|
||||
23
docs/features/automatic-repair.rst
Normal file
23
docs/features/automatic-repair.rst
Normal file
@@ -0,0 +1,23 @@
|
||||
.. _automatic-repair:
|
||||
|
||||
Automatic Repair
|
||||
================
|
||||
|
||||
Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
|
||||
|
||||
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
|
||||
Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.
|
||||
|
||||
To enable automatic repair, add this to the configuration (``scylla.yaml``):
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
auto_repair_enabled_default: true
|
||||
auto_repair_threshold_default_in_seconds: 86400
|
||||
|
||||
This will enable automatic repair for all tables with a repair period of 1 day. This configuration has to be set on each node, to an identical value.
|
||||
More featureful configuration methods will be implemented in the future.
|
||||
|
||||
To disable, set ``auto_repair_enabled_default: false``.
|
||||
|
||||
Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
|
||||
@@ -3,7 +3,7 @@
|
||||
Incremental Repair
|
||||
==================
|
||||
|
||||
ScyllaDB's standard repair process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
|
||||
ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
|
||||
|
||||
The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.
|
||||
|
||||
@@ -37,7 +37,12 @@ The available modes are:
|
||||
* ``disabled``: Completely disables the incremental repair logic for the current operation. The repair behaves like a classic, non-incremental repair, and it does not read or update any incremental repair status markers.
|
||||
|
||||
|
||||
The incremental_mode parameter can be specified using nodetool cluster repair, e.g., nodetool cluster repair --incremental-mode incremental. It can also be specified with the REST API, e.g., curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"
|
||||
The incremental_mode parameter can be specified using nodetool cluster repair, e.g., nodetool cluster repair --incremental-mode incremental.
|
||||
It can also be specified with the REST API, e.g.:
|
||||
|
||||
.. code::
|
||||
|
||||
curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"
|
||||
|
||||
Benefits of Incremental Repair
|
||||
------------------------------
|
||||
@@ -46,6 +51,8 @@ Benefits of Incremental Repair
|
||||
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
|
||||
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.
|
||||
|
||||
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ This document highlights ScyllaDB's key data modeling features.
|
||||
Workload Prioritization </features/workload-prioritization>
|
||||
Backup and Restore </features/backup-and-restore>
|
||||
Incremental Repair </features/incremental-repair/>
|
||||
Automatic Repair </features/automatic-repair/>
|
||||
Vector Search </features/vector-search/>
|
||||
|
||||
.. panel-box::
|
||||
@@ -44,5 +45,7 @@ This document highlights ScyllaDB's key data modeling features.
|
||||
* :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
|
||||
efficient and lightweight approach to maintaining data consistency by
|
||||
repairing only the data that has changed since the last repair.
|
||||
* :doc:`Automatic Repair </features/automatic-repair/>` schedules and runs repairs
|
||||
directly in ScyllaDB, without external schedulers.
|
||||
* :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
|
||||
similarity-based queries on vector embeddings.
|
||||
|
||||
@@ -18,7 +18,7 @@ Getting Started
|
||||
:class: my-panel
|
||||
|
||||
* :doc:`ScyllaDB System Requirements Guide</getting-started/system-requirements/>`
|
||||
* :doc:`OS Support by Platform and Version</getting-started/os-support/>`
|
||||
* `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
|
||||
.. panel-box::
|
||||
:title: Install and Configure ScyllaDB
|
||||
|
||||
@@ -24,9 +24,9 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
|
||||
:id: "getting-started"
|
||||
:class: my-panel
|
||||
|
||||
* :doc:`Launch ScyllaDB on AWS </getting-started/install-scylla/launch-on-aws>`
|
||||
* :doc:`Launch ScyllaDB on GCP </getting-started/install-scylla/launch-on-gcp>`
|
||||
* :doc:`Launch ScyllaDB on Azure </getting-started/install-scylla/launch-on-azure>`
|
||||
* :doc:`Launch ScyllaDB |CURRENT_VERSION| on AWS </getting-started/install-scylla/launch-on-aws>`
|
||||
* :doc:`Launch ScyllaDB |CURRENT_VERSION| on GCP </getting-started/install-scylla/launch-on-gcp>`
|
||||
* :doc:`Launch ScyllaDB |CURRENT_VERSION| on Azure </getting-started/install-scylla/launch-on-azure>`
|
||||
|
||||
|
||||
.. panel-box::
|
||||
@@ -35,7 +35,7 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
|
||||
:class: my-panel
|
||||
|
||||
* :doc:`Install ScyllaDB with Web Installer (recommended) </getting-started/installation-common/scylla-web-installer>`
|
||||
* :doc:`Install ScyllaDB Linux Packages </getting-started/install-scylla/install-on-linux>`
|
||||
* :doc:`Install ScyllaDB |CURRENT_VERSION| Linux Packages </getting-started/install-scylla/install-on-linux>`
|
||||
* :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
|
||||
* :doc:`Install ScyllaDB Without root Privileges </getting-started/installation-common/unified-installer>`
|
||||
* :doc:`Air-gapped Server Installation </getting-started/installation-common/air-gapped-install>`
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
.. |RHEL_EPEL_8| replace:: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
|
||||
.. |RHEL_EPEL_9| replace:: https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
|
||||
|
||||
======================================
|
||||
Install ScyllaDB Linux Packages
|
||||
======================================
|
||||
========================================================
|
||||
Install ScyllaDB |CURRENT_VERSION| Linux Packages
|
||||
========================================================
|
||||
|
||||
We recommend installing ScyllaDB using :doc:`ScyllaDB Web Installer for Linux </getting-started/installation-common/scylla-web-installer/>`,
|
||||
a platform-agnostic installation script, to install ScyllaDB on any supported Linux platform.
|
||||
@@ -17,7 +17,7 @@ This article will help you install ScyllaDB on Linux using platform-specific pac
|
||||
Prerequisites
|
||||
----------------
|
||||
|
||||
* Ubuntu, Debian, CentOS, or RHEL (see :doc:`OS Support by Platform and Version </getting-started/os-support>`
|
||||
* Ubuntu, Debian, CentOS, or RHEL (see `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
for details about supported versions and architecture)
|
||||
* Root or ``sudo`` access to the system
|
||||
* Open :ref:`ports used by ScyllaDB <networking-ports>`
|
||||
@@ -46,8 +46,8 @@ Install ScyllaDB
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys a43e06657bac99e3
|
||||
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor a43e06657bac99e3 | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg
|
||||
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys c503c686b007f39e
|
||||
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor c503c686b007f39e | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg
|
||||
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
==========================
|
||||
Launch ScyllaDB on AWS
|
||||
==========================
|
||||
===============================================
|
||||
Launch ScyllaDB |CURRENT_VERSION| on AWS
|
||||
===============================================
|
||||
|
||||
This article will guide you through self-managed ScyllaDB deployment on AWS. For a fully-managed deployment of ScyllaDB
|
||||
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
==========================
|
||||
Launch ScyllaDB on Azure
|
||||
==========================
|
||||
===============================================
|
||||
Launch ScyllaDB |CURRENT_VERSION| on Azure
|
||||
===============================================
|
||||
|
||||
This article will guide you through self-managed ScyllaDB deployment on Azure. For a fully-managed deployment of ScyllaDB
|
||||
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
==========================
|
||||
Launch ScyllaDB on GCP
|
||||
==========================
|
||||
=============================================
|
||||
Launch ScyllaDB |CURRENT_VERSION| on GCP
|
||||
=============================================
|
||||
|
||||
This article will guide you through self-managed ScyllaDB deployment on GCP. For a fully-managed deployment of ScyllaDB
|
||||
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.
|
||||
|
||||
@@ -10,7 +10,7 @@ Prerequisites
|
||||
--------------
|
||||
|
||||
Ensure that your platform is supported by the ScyllaDB version you want to install.
|
||||
See :doc:`OS Support by Platform and Version </getting-started/os-support/>`.
|
||||
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_.
|
||||
|
||||
Install ScyllaDB with Web Installer
|
||||
---------------------------------------
|
||||
|
||||
@@ -12,7 +12,8 @@ the package manager (dnf and apt).
|
||||
Prerequisites
|
||||
---------------
|
||||
Ensure your platform is supported by the ScyllaDB version you want to install.
|
||||
See :doc:`OS Support </getting-started/os-support>` for information about supported Linux distributions and versions.
|
||||
See `OS Support <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
for information about supported Linux distributions and versions.
|
||||
|
||||
Note that if you're on CentOS 7, only root offline installation is supported.
|
||||
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
OS Support by Linux Distributions and Version
|
||||
==============================================
|
||||
|
||||
The following matrix shows which Linux distributions, containers, and images
|
||||
are :ref:`supported <os-support-definition>` with which versions of ScyllaDB.
|
||||
|
||||
.. datatemplate:json:: /_static/data/os-support.json
|
||||
:template: platforms.tmpl
|
||||
|
||||
``*`` 2024.1.9 and later
|
||||
|
||||
All releases are available as a Docker container, EC2 AMI, GCP, and Azure images.
|
||||
|
||||
.. _os-support-definition:
|
||||
|
||||
By *supported*, it is meant that:
|
||||
|
||||
- A binary installation package is available.
|
||||
- The download and install procedures are tested as part of the ScyllaDB release process for each version.
|
||||
- An automated install is included from :doc:`ScyllaDB Web Installer for Linux tool </getting-started/installation-common/scylla-web-installer>` (for the latest versions).
|
||||
|
||||
You can `build ScyllaDB from source <https://github.com/scylladb/scylladb#build-prerequisites>`_
|
||||
on other x86_64 or aarch64 platforms, without any guarantees.
|
||||
|
||||
|
||||
|
||||
@@ -8,12 +8,12 @@ ScyllaDB Requirements
|
||||
:hidden:
|
||||
|
||||
system-requirements
|
||||
OS Support <os-support>
|
||||
OS Support <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>
|
||||
Cloud Instance Recommendations <cloud-instance-recommendations>
|
||||
scylla-in-a-shared-environment
|
||||
|
||||
* :doc:`System Requirements</getting-started/system-requirements/>`
|
||||
* :doc:`OS Support by Platform and Version</getting-started/os-support/>`
|
||||
* `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
* :doc:`Cloud Instance Recommendations AWS, GCP, and Azure </getting-started/cloud-instance-recommendations>`
|
||||
* :doc:`Running ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ Supported Platforms
|
||||
===================
|
||||
ScyllaDB runs on 64-bit Linux. The x86_64 and AArch64 architectures are supported (AArch64 support includes AWS EC2 Graviton).
|
||||
|
||||
See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about
|
||||
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_ for information about
|
||||
supported operating systems, distros, and versions.
|
||||
|
||||
See :doc:`Cloud Instance Recommendations for AWS, GCP, and Azure </getting-started/cloud-instance-recommendations>` for information
|
||||
|
||||
@@ -601,11 +601,7 @@ Scrub has several modes:
|
||||
* **segregate** - Fixes partition/row/mutation-fragment out-of-order errors by segregating the output into as many SStables as required so that the content of each output SStable is properly ordered.
|
||||
* **validate** - Validates the content of the SStable, reporting any corruptions found. Writes no output SStables. In this mode, scrub has the same outcome as the `validate operation <scylla-sstable-validate-operation_>`_ - and the validate operation is recommended over scrub.
|
||||
|
||||
Output SStables are written to the directory specified via ``--output-directory``. They will be written with the ``BIG`` format and the highest supported SStable format, with generations chosen by scylla-sstable. Generations are chosen such
|
||||
that they are unique among the SStables written by the current scrub.
|
||||
|
||||
The output directory must be empty; otherwise, scylla-sstable will abort scrub. You can allow writing to a non-empty directory by setting the ``--unsafe-accept-nonempty-output-dir`` command line flag.
|
||||
Note that scrub will be aborted if an SStable cannot be written because its generation clashes with a pre-existing SStable in the output directory.
|
||||
Output SStables are written to the directory specified via ``--output-dir``. They will be written with the ``BIG`` format and the highest supported SStable format, with random generation.
|
||||
|
||||
validate-checksums
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
@@ -870,7 +866,7 @@ The SSTable version to be used can be overridden with the ``--version`` flag, al
|
||||
SSTables which are already on the designated version are skipped. To force rewriting *all* SSTables, use the ``--all`` flag.
|
||||
|
||||
Output SSTables are written to the path provided by the ``--output-dir`` flag, or to the current directory if not specified.
|
||||
This directory is expected to exist and be empty. If not empty the tool will refuse to run. This can be overridden with the ``--unsafe-accept-nonempty-output-dir`` flag.
|
||||
This directory is expected to exist.
|
||||
|
||||
It is strongly recommended to use the system schema tables as the schema source for this command, see the :ref:`schema options <scylla-sstable-schema>` for more details.
|
||||
A schema which is good enough to read the SSTable and dump its content, may not be good enough to write its content back verbatim.
|
||||
@@ -882,6 +878,25 @@ But even an altered schema which changed only the table options can lead to data
|
||||
|
||||
The mapping of input SSTables to output SSTables is printed to ``stdout``.
|
||||
|
||||
filter
|
||||
^^^^^^
|
||||
|
||||
Filter the SSTable(s), including/excluding specified partitions.
|
||||
|
||||
Similar to ``scylla sstable dump-data --partition|--partition-file``, with some notable differences:
|
||||
|
||||
* Instead of dumping the content to stdout, the filtered content is written back to SSTable(s) on disk.
|
||||
* Also supports negative filters (keep all partitions except the those specified).
|
||||
|
||||
The partition list can be provided either via the ``--partition`` command line argument, or via a file path passed to the the ``--partitions-file`` argument. The file should contain one partition key per line.
|
||||
Partition keys should be provided in the hex format, as produced by `scylla types serialize </operating-scylla/admin-tools/scylla-types/>`_.
|
||||
|
||||
With ``--include``, only the specified partitions are kept from the input SSTable(s). With ``--exclude``, the specified partitions are discarded and won't be written to the output SSTable(s).
|
||||
It is possible that certain input SSTable(s) won't have any content left after the filtering. These input SSTable(s) will not have a matching output SSTable.
|
||||
|
||||
By default, each input sstable is filtered individually. Use ``--merge`` to filter the combined content of all input sstables, producing a single output SSTable.
|
||||
|
||||
Output sstables use the latest supported sstable format (can be changed with ``--sstable-version``).
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
@@ -52,10 +52,14 @@ Row-level repair improves ScyllaDB in two ways:
|
||||
* keeping the data in a temporary buffer.
|
||||
* using the cached data to calculate the checksum and send it to the replicas.
|
||||
|
||||
See also
|
||||
See also the `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/>`_.
|
||||
|
||||
* `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/>`_
|
||||
Incremental Repair
|
||||
------------------
|
||||
|
||||
* `Blog: ScyllaDB Open Source 3.1: Efficiently Maintaining Consistency with Row-Level Repair <https://www.scylladb.com/2019/08/13/scylla-open-source-3-1-efficiently-maintaining-consistency-with-row-level-repair/>`_
|
||||
Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.
|
||||
|
||||
Automatic Repair
|
||||
----------------
|
||||
|
||||
Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.
|
||||
|
||||
@@ -8,7 +8,6 @@ Troubleshooting ScyllaDB
|
||||
|
||||
support/index
|
||||
startup/index
|
||||
upgrade/index
|
||||
cluster/index
|
||||
modeling/index
|
||||
storage/index
|
||||
@@ -29,7 +28,6 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
|
||||
* :doc:`Errors and ScyllaDB Customer Support <support/index>`
|
||||
* :doc:`ScyllaDB Startup <startup/index>`
|
||||
* :doc:`ScyllaDB Cluster and Node <cluster/index>`
|
||||
* :doc:`ScyllaDB Upgrade <upgrade/index>`
|
||||
* :doc:`Data Modeling <modeling/index>`
|
||||
* :doc:`Data Storage and SSTables <storage/index>`
|
||||
* :doc:`CQL errors <CQL/index>`
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
Inaccessible "/var/lib/scylla" and "/var/lib/systemd/coredump" after ScyllaDB upgrade
|
||||
======================================================================================
|
||||
|
||||
Problem
|
||||
^^^^^^^
|
||||
When you reboot the machine after a ScyllaDB upgrade, you cannot access data directories under ``/var/lib/scylla``, and
|
||||
coredump saves to ``rootfs``.
|
||||
|
||||
|
||||
The problem may occur when you upgrade ScylaDB Open Source 4.6 or later to a version of ScyllaDB Enterprise if
|
||||
the ``/etc/systemd/system/var-lib-scylla.mount`` and ``/etc/systemd/system/var-lib-systemd-coredump.mount`` are
|
||||
deleted by RPM.
|
||||
|
||||
To avoid losing the files, the upgrade procedure includes a step to backup the .mount files. The following
|
||||
example shows the command to backup the files before the upgrade from version 5.0:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
for conf in $( rpm -qc $(rpm -qa | grep scylla) | grep -v contains ) /etc/systemd/system/{var-lib-scylla,var-lib-systemd-coredump}.mount; do sudo cp -v $conf $conf.backup-5.0; done
|
||||
|
||||
If you don't backup the .mount files before the upgrade, the files may be lost.
|
||||
|
||||
|
||||
Solution
|
||||
^^^^^^^^
|
||||
|
||||
If you didn't backup the .mount files before the upgrade and the files were deleted during the upgrade,
|
||||
you need to restore them manually.
|
||||
|
||||
To restore ``/etc/systemd/system/var-lib-systemd-coredump.mount``, run the following:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ cat << EOS | sudo tee /etc/systemd/system/var-lib-systemd-coredump.mount
|
||||
[Unit]
|
||||
Description=Save coredump to scylla data directory
|
||||
Conflicts=umount.target
|
||||
Before=scylla-server.service
|
||||
After=local-fs.target
|
||||
DefaultDependencies=no
|
||||
[Mount]
|
||||
What=/var/lib/scylla/coredump
|
||||
Where=/var/lib/systemd/coredump
|
||||
Type=none
|
||||
Options=bind
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
EOS
|
||||
|
||||
To restore ``/etc/systemd/system/var-lib-scylla.mount``, run the following (specifying your data disk):
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ UUID=`blkid -s UUID -o value <specify your data disk, eg: /dev/md0>`
|
||||
$ cat << EOS | sudo tee /etc/systemd/system/var-lib-scylla.mount
|
||||
[Unit]
|
||||
Description=ScyllaDB data directory
|
||||
Before=scylla-server.service
|
||||
After=local-fs.target
|
||||
DefaultDependencies=no
|
||||
[Mount]
|
||||
What=/dev/disk/by-uuid/$UUID
|
||||
Where=/var/lib/scylla
|
||||
Type=xfs
|
||||
Options=noatime
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
EOS
|
||||
|
||||
After restoring .mount files, you need to enable them:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ sudo systemctl daemon-reload
|
||||
$ sudo systemctl enable --now var-lib-scylla.mount
|
||||
$ sudo systemctl enable --now var-lib-systemd-coredump.mount
|
||||
|
||||
|
||||
.. include:: /troubleshooting/_common/ts-return.rst
|
||||
@@ -1,16 +0,0 @@
|
||||
Upgrade
|
||||
=================
|
||||
|
||||
.. toctree::
|
||||
:hidden:
|
||||
:maxdepth: 2
|
||||
|
||||
Inaccessible configuration files after ScyllaDB upgrade </troubleshooting/missing-dotmount-files>
|
||||
|
||||
.. panel-box::
|
||||
:title: Upgrade Issues
|
||||
:id: "getting-started"
|
||||
:class: my-panel
|
||||
|
||||
* :doc:`Inaccessible "/var/lib/scylla" and "/var/lib/systemd/coredump" after ScyllaDB upgrade </troubleshooting//missing-dotmount-files>`
|
||||
|
||||
@@ -14,7 +14,7 @@ if necessary.
|
||||
|
||||
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
|
||||
CentOS, Debian, and Ubuntu.
|
||||
See :doc:`OS Support by Platform and Version </getting-started/os-support>`
|
||||
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
for information about supported versions.
|
||||
|
||||
It also applies to the ScyllaDB official image on EC2, GCP, or Azure.
|
||||
|
||||
@@ -17,7 +17,7 @@ This document describes a step-by-step procedure for upgrading from |SCYLLA_NAME
|
||||
to |SCYLLA_NAME| |NEW_VERSION| and rollback to version |SRC_VERSION| if necessary.
|
||||
|
||||
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL), CentOS, Debian,
|
||||
and Ubuntu. See :doc:`OS Support by Platform and Version </getting-started/os-support>`
|
||||
and Ubuntu. See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
for information about supported versions.
|
||||
|
||||
It also applies when using the ScyllaDB official image on EC2, GCP, or Azure.
|
||||
|
||||
@@ -182,6 +182,7 @@ public:
|
||||
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
|
||||
gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
|
||||
gms::feature topology_noop_request { *this, "TOPOLOGY_NOOP_REQUEST"sv };
|
||||
gms::feature tablets_intermediate_fallback_cleanup { *this, "TABLETS_INTERMEDIATE_FALLBACK_CLEANUP"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
@@ -2424,8 +2424,8 @@ bool gossiper::is_enabled() const {
|
||||
void gossiper::add_expire_time_for_endpoint(locator::host_id endpoint, clk::time_point expire_time) {
|
||||
auto now_ = now();
|
||||
auto diff = std::chrono::duration_cast<std::chrono::seconds>(expire_time - now_).count();
|
||||
logger.info("Node {} will be removed from gossip at [{:%Y-%m-%d %T}]: (expire = {}, now = {}, diff = {} seconds)",
|
||||
endpoint, fmt::localtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
|
||||
logger.info("Node {} will be removed from gossip at [{:%Y-%m-%d %T %z}]: (expire = {}, now = {}, diff = {} seconds)",
|
||||
endpoint, fmt::gmtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
|
||||
now_.time_since_epoch().count(), diff);
|
||||
_expire_time_endpoint_map[endpoint] = expire_time;
|
||||
}
|
||||
|
||||
@@ -153,6 +153,8 @@ public:
|
||||
}
|
||||
const std::set<inet_address>& get_seeds() const noexcept;
|
||||
|
||||
seastar::scheduling_group get_scheduling_group() const noexcept { return _gcfg.gossip_scheduling_group; }
|
||||
|
||||
public:
|
||||
static clk::time_point inline now() noexcept { return clk::now(); }
|
||||
public:
|
||||
|
||||
@@ -17,11 +17,11 @@
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "types/concrete_types.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
|
||||
@@ -147,17 +147,88 @@ std::optional<cql3::description> vector_index::describe(const index_metadata& im
|
||||
}
|
||||
|
||||
void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
|
||||
if (targets.size() != 1) {
|
||||
throw exceptions::invalid_request_exception("Vector index can only be created on a single column");
|
||||
}
|
||||
auto target = targets[0];
|
||||
auto c_def = schema.get_column_definition(to_bytes(target->column_name()));
|
||||
if (!c_def) {
|
||||
throw exceptions::invalid_request_exception(format("Column {} not found in schema", target->column_name()));
|
||||
}
|
||||
auto type = c_def->type;
|
||||
if (!type->is_vector() || static_cast<const vector_type_impl*>(type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
|
||||
throw exceptions::invalid_request_exception(format("Vector indexes are only supported on columns of vectors of floats", target->column_name()));
|
||||
|
||||
struct validate_visitor {
|
||||
const class schema& schema;
|
||||
bool& is_vector;
|
||||
|
||||
/// Vector indexes support filtering on native types that can be used as primary key columns.
|
||||
/// There is no counter (it cannot be used with vector columns)
|
||||
/// and no duration (it cannot be used as a primary key or in secondary indexes).
|
||||
static bool is_supported_filtering_column(abstract_type const & kind_type) {
|
||||
switch (kind_type.get_kind()) {
|
||||
case abstract_type::kind::ascii:
|
||||
case abstract_type::kind::boolean:
|
||||
case abstract_type::kind::byte:
|
||||
case abstract_type::kind::bytes:
|
||||
case abstract_type::kind::date:
|
||||
case abstract_type::kind::decimal:
|
||||
case abstract_type::kind::double_kind:
|
||||
case abstract_type::kind::float_kind:
|
||||
case abstract_type::kind::inet:
|
||||
case abstract_type::kind::int32:
|
||||
case abstract_type::kind::long_kind:
|
||||
case abstract_type::kind::short_kind:
|
||||
case abstract_type::kind::simple_date:
|
||||
case abstract_type::kind::time:
|
||||
case abstract_type::kind::timestamp:
|
||||
case abstract_type::kind::timeuuid:
|
||||
case abstract_type::kind::utf8:
|
||||
case abstract_type::kind::uuid:
|
||||
case abstract_type::kind::varint:
|
||||
return true;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void validate(cql3::column_identifier const& column, bool is_vector) const {
|
||||
auto const& c_name = column.to_string();
|
||||
auto const* c_def = schema.get_column_definition(column.name());
|
||||
if (c_def == nullptr) {
|
||||
throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
|
||||
}
|
||||
|
||||
auto type = c_def->type;
|
||||
|
||||
if (is_vector) {
|
||||
auto const* vector_type = dynamic_cast<const vector_type_impl*>(type.get());
|
||||
if (vector_type == nullptr) {
|
||||
throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
|
||||
}
|
||||
|
||||
auto elements_type = vector_type->get_elements_type();
|
||||
if (elements_type->get_kind() != abstract_type::kind::float_kind) {
|
||||
throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (!is_supported_filtering_column(*type)) {
|
||||
throw exceptions::invalid_request_exception(format("Unsupported vector index filtering column {} type", c_name));
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(const std::vector<::shared_ptr<cql3::column_identifier>>& columns) const {
|
||||
for (const auto& column : columns) {
|
||||
// CQL restricts the secondary local index to have multiple columns with partition key only.
|
||||
// Vectors shouldn't be partition key columns and they aren't supported as a filtering column,
|
||||
// so we can assume here that these are non-vectors filtering columns.
|
||||
validate(*column, false);
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(const ::shared_ptr<cql3::column_identifier>& column) {
|
||||
validate(*column, is_vector);
|
||||
// The first column is the vector column, the rest mustn't be vectors.
|
||||
is_vector = false;
|
||||
}
|
||||
};
|
||||
|
||||
bool is_vector = true;
|
||||
for (const auto& target : targets) {
|
||||
std::visit(validate_visitor{.schema = schema, .is_vector = is_vector}, target->value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
7
init.cc
7
init.cc
@@ -102,13 +102,6 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES)) {
|
||||
disabled.insert("STRONGLY_CONSISTENT_TABLES"s);
|
||||
}
|
||||
if (cfg.force_gossip_topology_changes()) {
|
||||
if (cfg.enable_tablets_by_default()) {
|
||||
throw std::runtime_error("Tablets cannot be enabled with gossip topology changes. Use either --tablets-mode-for-new-keyspaces=enabled|enforced or --force-gossip-topology-changes, but not both.");
|
||||
}
|
||||
startlog.warn("The tablets feature is disabled due to forced gossip topology changes");
|
||||
disabled.insert("TABLETS"s);
|
||||
}
|
||||
if (!cfg.table_digest_insensitive_to_expiry()) {
|
||||
disabled.insert("TABLE_DIGEST_INSENSITIVE_TO_EXPIRY"s);
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ fi
|
||||
|
||||
debian_base_packages=(
|
||||
clang
|
||||
clang-tools
|
||||
gdb
|
||||
cargo
|
||||
wabt
|
||||
@@ -72,6 +73,7 @@ debian_base_packages=(
|
||||
|
||||
fedora_packages=(
|
||||
clang
|
||||
clang-tools-extra
|
||||
compiler-rt
|
||||
libasan
|
||||
libubsan
|
||||
@@ -148,7 +150,6 @@ fedora_packages=(
|
||||
llvm
|
||||
openldap-servers
|
||||
openldap-devel
|
||||
toxiproxy
|
||||
cyrus-sasl
|
||||
fipscheck
|
||||
cpp-jwt-devel
|
||||
@@ -293,6 +294,7 @@ print_usage() {
|
||||
echo " --print-pip-runtime-packages Print required pip packages for Scylla"
|
||||
echo " --print-pip-symlinks Print list of pip provided commands which need to install to /usr/bin"
|
||||
echo " --print-node-exporter-filename Print node_exporter filename"
|
||||
echo " --future Install dependencies for future toolchain (Fedora rawhide based)"
|
||||
exit 1
|
||||
}
|
||||
|
||||
@@ -300,6 +302,7 @@ PRINT_PYTHON3=false
|
||||
PRINT_PIP=false
|
||||
PRINT_PIP_SYMLINK=false
|
||||
PRINT_NODE_EXPORTER=false
|
||||
FUTURE=false
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--print-python3-runtime-packages")
|
||||
@@ -318,6 +321,10 @@ while [ $# -gt 0 ]; do
|
||||
PRINT_NODE_EXPORTER=true
|
||||
shift 1
|
||||
;;
|
||||
"--future")
|
||||
FUTURE=true
|
||||
shift 1
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
@@ -348,6 +355,10 @@ if $PRINT_NODE_EXPORTER; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if ! $FUTURE; then
|
||||
fedora_packages+=(toxiproxy)
|
||||
fi
|
||||
|
||||
umask 0022
|
||||
|
||||
./seastar/install-dependencies.sh
|
||||
@@ -445,3 +456,11 @@ if [ ! -z "${CURL_ARGS}" ]; then
|
||||
else
|
||||
echo "Minio server and client are up-to-date, skipping download"
|
||||
fi
|
||||
|
||||
if $FUTURE ; then
|
||||
toxyproxy_version="v2.12.0"
|
||||
for bin in toxiproxy-cli toxiproxy-server; do
|
||||
curl -fSL -o "/usr/local/bin/${bin}" "https://github.com/Shopify/toxiproxy/releases/download/${toxyproxy_version}/${bin}-linux-$(go_arch)"
|
||||
chmod +x "/usr/local/bin/${bin}"
|
||||
done
|
||||
fi
|
||||
|
||||
@@ -50,6 +50,8 @@ write_replica_set_selector get_selector_for_writes(tablet_transition_stage stage
|
||||
return write_replica_set_selector::previous;
|
||||
case tablet_transition_stage::write_both_read_old:
|
||||
return write_replica_set_selector::both;
|
||||
case tablet_transition_stage::write_both_read_old_fallback_cleanup:
|
||||
return write_replica_set_selector::both;
|
||||
case tablet_transition_stage::streaming:
|
||||
return write_replica_set_selector::both;
|
||||
case tablet_transition_stage::rebuild_repair:
|
||||
@@ -81,6 +83,8 @@ read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage)
|
||||
return read_replica_set_selector::previous;
|
||||
case tablet_transition_stage::write_both_read_old:
|
||||
return read_replica_set_selector::previous;
|
||||
case tablet_transition_stage::write_both_read_old_fallback_cleanup:
|
||||
return read_replica_set_selector::previous;
|
||||
case tablet_transition_stage::streaming:
|
||||
return read_replica_set_selector::previous;
|
||||
case tablet_transition_stage::rebuild_repair:
|
||||
@@ -741,6 +745,7 @@ void tablet_map::set_tablet_raft_info(tablet_id id, tablet_raft_info raft_info)
|
||||
static const std::unordered_map<tablet_transition_stage, sstring> tablet_transition_stage_to_name = {
|
||||
{tablet_transition_stage::allow_write_both_read_old, "allow_write_both_read_old"},
|
||||
{tablet_transition_stage::write_both_read_old, "write_both_read_old"},
|
||||
{tablet_transition_stage::write_both_read_old_fallback_cleanup, "write_both_read_old_fallback_cleanup"},
|
||||
{tablet_transition_stage::write_both_read_new, "write_both_read_new"},
|
||||
{tablet_transition_stage::streaming, "streaming"},
|
||||
{tablet_transition_stage::rebuild_repair, "rebuild_repair"},
|
||||
|
||||
@@ -277,6 +277,7 @@ std::optional<tablet_info> merge_tablet_info(tablet_info a, tablet_info b);
|
||||
enum class tablet_transition_stage {
|
||||
allow_write_both_read_old,
|
||||
write_both_read_old,
|
||||
write_both_read_old_fallback_cleanup,
|
||||
streaming,
|
||||
rebuild_repair,
|
||||
write_both_read_new,
|
||||
|
||||
18
main.cc
18
main.cc
@@ -906,6 +906,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
|
||||
auto background_reclaim_scheduling_group = create_scheduling_group("background_reclaim", "bgre", 50).get();
|
||||
auto maintenance_scheduling_group = create_scheduling_group("streaming", "strm", 200).get();
|
||||
debug::streaming_scheduling_group = maintenance_scheduling_group;
|
||||
|
||||
smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
|
||||
logalloc::tracker::config st_cfg;
|
||||
@@ -1149,6 +1150,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
dbcfg.memtable_scheduling_group = create_scheduling_group("memtable", "mt", 1000).get();
|
||||
dbcfg.memtable_to_cache_scheduling_group = create_scheduling_group("memtable_to_cache", "mt2c", 200).get();
|
||||
dbcfg.gossip_scheduling_group = create_scheduling_group("gossip", "gms", 1000).get();
|
||||
debug::gossip_scheduling_group = dbcfg.gossip_scheduling_group;
|
||||
dbcfg.commitlog_scheduling_group = create_scheduling_group("commitlog", "clog", 1000).get();
|
||||
dbcfg.schema_commitlog_scheduling_group = create_scheduling_group("schema_commitlog", "sclg", 1000).get();
|
||||
dbcfg.available_memory = memory::stats().total_memory();
|
||||
@@ -1306,6 +1308,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
checkpoint(stop_signal, "starting storage proxy");
|
||||
service::storage_proxy::config spcfg {
|
||||
.hints_directory_initializer = hints_dir_initializer,
|
||||
.hints_sched_group = maintenance_scheduling_group,
|
||||
};
|
||||
spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
|
||||
spcfg.available_memory = memory::stats().total_memory();
|
||||
@@ -1677,7 +1680,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
gossiper.local(), feature_service.local(), sys_ks.local(), group0_client, dbcfg.gossip_scheduling_group};
|
||||
|
||||
checkpoint(stop_signal, "starting tablet allocator");
|
||||
service::tablet_allocator::config tacfg;
|
||||
service::tablet_allocator::config tacfg {
|
||||
.background_sg = maintenance_scheduling_group,
|
||||
};
|
||||
sharded<service::tablet_allocator> tablet_allocator;
|
||||
tablet_allocator.start(tacfg, std::ref(mm_notifier), std::ref(db)).get();
|
||||
auto stop_tablet_allocator = defer_verbose_shutdown("tablet allocator", [&tablet_allocator] {
|
||||
@@ -2037,8 +2042,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
cdc_config.ring_delay = std::chrono::milliseconds(cfg->ring_delay_ms());
|
||||
cdc_config.dont_rewrite_streams = cfg->cdc_dont_rewrite_streams();
|
||||
cdc_generation_service.start(std::move(cdc_config), std::ref(gossiper), std::ref(sys_dist_ks), std::ref(sys_ks),
|
||||
std::ref(stop_signal.as_sharded_abort_source()), std::ref(token_metadata), std::ref(feature_service), std::ref(db),
|
||||
[&ss] () -> bool { return ss.local().raft_topology_change_enabled(); }).get();
|
||||
std::ref(stop_signal.as_sharded_abort_source()), std::ref(token_metadata), std::ref(feature_service), std::ref(db)).get();
|
||||
auto stop_cdc_generation_service = defer_verbose_shutdown("CDC Generation Management service", [] {
|
||||
cdc_generation_service.stop().get();
|
||||
});
|
||||
@@ -2073,7 +2077,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
perm_cache_config.refresh = std::chrono::milliseconds(cfg->permissions_update_interval_in_ms());
|
||||
|
||||
auto start_auth_service = [&mm] (sharded<auth::service>& auth_service, std::any& stop_auth_service, const char* what) {
|
||||
supervisor::notify(fmt::format("starting {}", what));
|
||||
auth_service.invoke_on_all(&auth::service::start, std::ref(mm), std::ref(sys_ks)).get();
|
||||
|
||||
stop_auth_service = defer_verbose_shutdown(what, [&auth_service] {
|
||||
@@ -2490,7 +2493,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
|
||||
if (cfg->view_building()) {
|
||||
checkpoint(stop_signal, "starting view builders");
|
||||
view_builder.invoke_on_all(&db::view::view_builder::start, std::ref(mm), utils::cross_shard_barrier()).get();
|
||||
with_scheduling_group(maintenance_scheduling_group, [&mm] {
|
||||
return view_builder.invoke_on_all(&db::view::view_builder::start, std::ref(mm), utils::cross_shard_barrier());
|
||||
}).get();
|
||||
}
|
||||
auto drain_view_builder = defer_verbose_shutdown("draining view builders", [&] {
|
||||
view_builder.invoke_on_all(&db::view::view_builder::drain).get();
|
||||
@@ -2650,7 +2655,8 @@ int main(int ac, char** av) {
|
||||
{"perf-load-balancing", perf::scylla_tablet_load_balancing_main, "run tablet load balancer tests"},
|
||||
{"perf-simple-query", perf::scylla_simple_query_main, "run performance tests by sending simple queries to this server"},
|
||||
{"perf-sstable", perf::scylla_sstable_main, "run performance tests by exercising sstable related operations on this server"},
|
||||
{"perf-alternator", perf::alternator(scylla_main, &after_init_func), "run performance tests on full alternator stack"}
|
||||
{"perf-alternator", perf::alternator(scylla_main, &after_init_func), "run performance tests on full alternator stack"},
|
||||
{"perf-cql-raw", perf::perf_cql_raw(scylla_main, &after_init_func), "run performance tests using raw CQL protocol frames"}
|
||||
};
|
||||
|
||||
main_func_type main_func;
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a4710f1f0b0bb329721c21d133618e811e820f2e70553b0aca28fb278bff89c9
|
||||
size 6492280
|
||||
oid sha256:9034610470ff645fab03da5ad6c690e5b41f3307ea4b529c7e63b0786a1289ed
|
||||
size 6539600
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2433f7a1fc5cda0dd990ab59587eb6046dca0fe1ae48d599953d1936fe014ed9
|
||||
size 6492176
|
||||
oid sha256:0c4bbf51dbe01d684ea5b9a9157781988ed499604d2fde90143bad0b9a5594f0
|
||||
size 6543944
|
||||
|
||||
@@ -148,6 +148,7 @@ public:
|
||||
};
|
||||
|
||||
private:
|
||||
const db::timeout_clock::time_point _created;
|
||||
reader_concurrency_semaphore& _semaphore;
|
||||
schema_ptr _schema;
|
||||
|
||||
@@ -237,17 +238,25 @@ private:
|
||||
break;
|
||||
case state::inactive:
|
||||
_semaphore.evict(*this, reader_concurrency_semaphore::evict_reason::time);
|
||||
break;
|
||||
// Return here on purpose. The evicted permit is destroyed when closing a reader.
|
||||
// As a consequence, any member access beyond this point is invalid.
|
||||
return;
|
||||
case state::evicted:
|
||||
case state::preemptive_aborted:
|
||||
break;
|
||||
}
|
||||
|
||||
// The function call not only sets state to reader_permit::state::preemptive_aborted
|
||||
// but also correctly decreases the statistics i.e. need_cpu_permits and awaits_permits.
|
||||
on_permit_inactive(reader_permit::state::preemptive_aborted);
|
||||
}
|
||||
|
||||
public:
|
||||
struct value_tag {};
|
||||
|
||||
impl(reader_concurrency_semaphore& semaphore, schema_ptr schema, const std::string_view& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_ptr)
|
||||
: _semaphore(semaphore)
|
||||
: _created(db::timeout_clock::now())
|
||||
, _semaphore(semaphore)
|
||||
, _schema(std::move(schema))
|
||||
, _op_name_view(op_name)
|
||||
, _base_resources(base_resources)
|
||||
@@ -258,7 +267,8 @@ public:
|
||||
_semaphore.on_permit_created(*this);
|
||||
}
|
||||
impl(reader_concurrency_semaphore& semaphore, schema_ptr schema, sstring&& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_ptr)
|
||||
: _semaphore(semaphore)
|
||||
: _created(db::timeout_clock::now())
|
||||
, _semaphore(semaphore)
|
||||
, _schema(std::move(schema))
|
||||
, _op_name(std::move(op_name))
|
||||
, _op_name_view(_op_name)
|
||||
@@ -360,6 +370,17 @@ public:
|
||||
on_permit_active();
|
||||
}
|
||||
|
||||
void on_preemptive_aborted() {
|
||||
if (_state != reader_permit::state::waiting_for_admission && _state != reader_permit::state::waiting_for_memory) {
|
||||
on_internal_error(rcslog, format("on_preemptive_aborted(): permit in invalid state {}", _state));
|
||||
}
|
||||
|
||||
_ttl_timer.cancel();
|
||||
_state = reader_permit::state::preemptive_aborted;
|
||||
_aux_data.pr.set_exception(named_semaphore_aborted(_semaphore._name));
|
||||
_semaphore.on_permit_preemptive_aborted();
|
||||
}
|
||||
|
||||
void on_register_as_inactive() {
|
||||
SCYLLA_ASSERT(_state == reader_permit::state::active || _state == reader_permit::state::active_need_cpu || _state == reader_permit::state::waiting_for_memory);
|
||||
on_permit_inactive(reader_permit::state::inactive);
|
||||
@@ -467,6 +488,10 @@ public:
|
||||
return _semaphore.do_wait_admission(*this);
|
||||
}
|
||||
|
||||
db::timeout_clock::time_point created() const noexcept {
|
||||
return _created;
|
||||
}
|
||||
|
||||
db::timeout_clock::time_point timeout() const noexcept {
|
||||
return _ttl_timer.armed() ? _ttl_timer.get_timeout() : db::no_timeout;
|
||||
}
|
||||
@@ -689,6 +714,9 @@ auto fmt::formatter<reader_permit::state>::format(reader_permit::state s, fmt::f
|
||||
case reader_permit::state::evicted:
|
||||
name = "evicted";
|
||||
break;
|
||||
case reader_permit::state::preemptive_aborted:
|
||||
name = "preemptive_aborted";
|
||||
break;
|
||||
}
|
||||
return formatter<string_view>::format(name, ctx);
|
||||
}
|
||||
@@ -1038,6 +1066,7 @@ reader_concurrency_semaphore::reader_concurrency_semaphore(
|
||||
utils::updateable_value<uint32_t> serialize_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> kill_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> cpu_concurrency,
|
||||
utils::updateable_value<float> preemptive_abort_factor,
|
||||
register_metrics metrics)
|
||||
: _initial_resources(count, memory)
|
||||
, _resources(count, memory)
|
||||
@@ -1047,6 +1076,7 @@ reader_concurrency_semaphore::reader_concurrency_semaphore(
|
||||
, _serialize_limit_multiplier(std::move(serialize_limit_multiplier))
|
||||
, _kill_limit_multiplier(std::move(kill_limit_multiplier))
|
||||
, _cpu_concurrency(cpu_concurrency)
|
||||
, _preemptive_abort_factor(preemptive_abort_factor)
|
||||
, _close_readers_gate(format("[reader_concurrency_semaphore {}] close_readers", _name))
|
||||
, _permit_gate(format("[reader_concurrency_semaphore {}] permit", _name))
|
||||
{
|
||||
@@ -1114,6 +1144,7 @@ reader_concurrency_semaphore::reader_concurrency_semaphore(no_limits, sstring na
|
||||
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
|
||||
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
|
||||
utils::updateable_value(uint32_t(1)),
|
||||
utils::updateable_value(float(0.0)),
|
||||
metrics) {}
|
||||
|
||||
reader_concurrency_semaphore::~reader_concurrency_semaphore() {
|
||||
@@ -1489,6 +1520,25 @@ void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
|
||||
auto& permit = _wait_list.front();
|
||||
dequeue_permit(permit);
|
||||
try {
|
||||
// Do not admit the read as it is unlikely to finish before its timeout. The condition is:
|
||||
// permit's remaining time <= preemptive_abort_factor * permit's time budget
|
||||
//
|
||||
// The additional check for remaining_time > 0 is to avoid preemptive aborting reads
|
||||
// that already timed out but are still in the wait list due to scheduling delays.
|
||||
// It also effectively disables preemptive aborting when the factor is set to 0.
|
||||
const auto time_budget = permit.timeout() - permit.created();
|
||||
const auto remaining_time = permit.timeout() - db::timeout_clock::now();
|
||||
if (remaining_time > db::timeout_clock::duration::zero() && remaining_time <= _preemptive_abort_factor() * time_budget) {
|
||||
permit.on_preemptive_aborted();
|
||||
using ms = std::chrono::milliseconds;
|
||||
tracing::trace(permit.trace_state(), "[reader concurrency semaphore {}] read shed as unlikely to finish (elapsed: {}, timeout: {}, preemptive_factor: {})",
|
||||
_name,
|
||||
std::chrono::duration_cast<ms>(time_budget - remaining_time),
|
||||
std::chrono::duration_cast<ms>(time_budget),
|
||||
_preemptive_abort_factor());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (permit.get_state() == reader_permit::state::waiting_for_memory) {
|
||||
_blessed_permit = &permit;
|
||||
permit.on_granted_memory();
|
||||
@@ -1549,7 +1599,11 @@ void reader_concurrency_semaphore::dequeue_permit(reader_permit::impl& permit) {
|
||||
case reader_permit::state::waiting_for_admission:
|
||||
case reader_permit::state::waiting_for_memory:
|
||||
case reader_permit::state::waiting_for_execution:
|
||||
--_stats.waiters;
|
||||
if (_stats.waiters > 0) {
|
||||
--_stats.waiters;
|
||||
} else {
|
||||
on_internal_error_noexcept(rcslog, "reader_concurrency_semaphore::dequeue_permit(): invalid state: no waiters yet dequeueing a waiting permit");
|
||||
}
|
||||
break;
|
||||
case reader_permit::state::inactive:
|
||||
case reader_permit::state::evicted:
|
||||
@@ -1558,12 +1612,17 @@ void reader_concurrency_semaphore::dequeue_permit(reader_permit::impl& permit) {
|
||||
case reader_permit::state::active:
|
||||
case reader_permit::state::active_need_cpu:
|
||||
case reader_permit::state::active_await:
|
||||
case reader_permit::state::preemptive_aborted:
|
||||
on_internal_error_noexcept(rcslog, format("reader_concurrency_semaphore::dequeue_permit(): unrecognized queued state: {}", permit.get_state()));
|
||||
}
|
||||
permit.unlink();
|
||||
_permit_list.push_back(permit);
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::on_permit_preemptive_aborted() noexcept {
|
||||
++_stats.total_reads_shed_due_to_overload;
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::on_permit_created(reader_permit::impl& permit) {
|
||||
_permit_gate.enter();
|
||||
_permit_list.push_back(permit);
|
||||
|
||||
@@ -42,7 +42,7 @@ using mutation_reader_opt = optimized_optional<mutation_reader>;
|
||||
/// number of waiting readers becomes equal or greater than
|
||||
/// `max_queue_length` (upon calling `obtain_permit()`) an exception of
|
||||
/// type `std::runtime_error` is thrown. Optionally, some additional
|
||||
/// code can be executed just before throwing (`prethrow_action`
|
||||
/// code can be executed just before throwing (`prethrow_action`
|
||||
/// constructor parameter).
|
||||
///
|
||||
/// The semaphore has 3 layers of defense against consuming more memory
|
||||
@@ -89,6 +89,7 @@ public:
|
||||
// Total number of failed reads executed through this semaphore.
|
||||
uint64_t total_failed_reads = 0;
|
||||
// Total number of reads rejected because the admission queue reached its max capacity
|
||||
// or rejected due to a high probability of not getting finalized on time.
|
||||
uint64_t total_reads_shed_due_to_overload = 0;
|
||||
// Total number of reads killed due to the memory consumption reaching the kill limit.
|
||||
uint64_t total_reads_killed_due_to_kill_limit = 0;
|
||||
@@ -192,6 +193,8 @@ private:
|
||||
utils::updateable_value<uint32_t> _serialize_limit_multiplier;
|
||||
utils::updateable_value<uint32_t> _kill_limit_multiplier;
|
||||
utils::updateable_value<uint32_t> _cpu_concurrency;
|
||||
utils::updateable_value<float> _preemptive_abort_factor;
|
||||
|
||||
stats _stats;
|
||||
std::optional<seastar::metrics::metric_groups> _metrics;
|
||||
bool _stopped = false;
|
||||
@@ -250,6 +253,8 @@ private:
|
||||
void on_permit_created(reader_permit::impl&);
|
||||
void on_permit_destroyed(reader_permit::impl&) noexcept;
|
||||
|
||||
void on_permit_preemptive_aborted() noexcept;
|
||||
|
||||
void on_permit_need_cpu() noexcept;
|
||||
void on_permit_not_need_cpu() noexcept;
|
||||
|
||||
@@ -287,6 +292,7 @@ public:
|
||||
utils::updateable_value<uint32_t> serialize_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> kill_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> cpu_concurrency,
|
||||
utils::updateable_value<float> preemptive_abort_factor,
|
||||
register_metrics metrics);
|
||||
|
||||
reader_concurrency_semaphore(
|
||||
@@ -296,9 +302,12 @@ public:
|
||||
size_t max_queue_length,
|
||||
utils::updateable_value<uint32_t> serialize_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> kill_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> cpu_concurrency,
|
||||
utils::updateable_value<float> preemptive_abort_factor,
|
||||
register_metrics metrics)
|
||||
: reader_concurrency_semaphore(utils::updateable_value(count), memory, std::move(name), max_queue_length,
|
||||
std::move(serialize_limit_multiplier), std::move(kill_limit_multiplier), utils::updateable_value<uint32_t>(1), metrics)
|
||||
std::move(serialize_limit_multiplier), std::move(kill_limit_multiplier), std::move(cpu_concurrency),
|
||||
std::move(preemptive_abort_factor), metrics)
|
||||
{ }
|
||||
|
||||
/// Create a semaphore with practically unlimited count and memory.
|
||||
@@ -318,9 +327,10 @@ public:
|
||||
utils::updateable_value<uint32_t> serialize_limit_multipler = utils::updateable_value(std::numeric_limits<uint32_t>::max()),
|
||||
utils::updateable_value<uint32_t> kill_limit_multipler = utils::updateable_value(std::numeric_limits<uint32_t>::max()),
|
||||
utils::updateable_value<uint32_t> cpu_concurrency = utils::updateable_value<uint32_t>(1),
|
||||
utils::updateable_value<float> preemptive_abort_factor = utils::updateable_value<float>(0.0f),
|
||||
register_metrics metrics = register_metrics::no)
|
||||
: reader_concurrency_semaphore(utils::updateable_value(count), memory, std::move(name), max_queue_length, std::move(serialize_limit_multipler),
|
||||
std::move(kill_limit_multipler), std::move(cpu_concurrency), metrics)
|
||||
std::move(kill_limit_multipler), std::move(cpu_concurrency), std::move(preemptive_abort_factor), metrics)
|
||||
{}
|
||||
|
||||
virtual ~reader_concurrency_semaphore();
|
||||
|
||||
@@ -70,7 +70,8 @@ reader_concurrency_semaphore& reader_concurrency_semaphore_group::add_or_update(
|
||||
_max_queue_length,
|
||||
_serialize_limit_multiplier,
|
||||
_kill_limit_multiplier,
|
||||
_cpu_concurrency
|
||||
_cpu_concurrency,
|
||||
_preemptive_abort_factor
|
||||
);
|
||||
auto&& it = result.first;
|
||||
// since we serialize all group changes this change wait will be queues and no further operations
|
||||
|
||||
@@ -26,6 +26,7 @@ class reader_concurrency_semaphore_group {
|
||||
utils::updateable_value<uint32_t> _serialize_limit_multiplier;
|
||||
utils::updateable_value<uint32_t> _kill_limit_multiplier;
|
||||
utils::updateable_value<uint32_t> _cpu_concurrency;
|
||||
utils::updateable_value<float> _preemptive_abort_factor;
|
||||
|
||||
friend class database_test_wrapper;
|
||||
|
||||
@@ -36,11 +37,12 @@ class reader_concurrency_semaphore_group {
|
||||
weighted_reader_concurrency_semaphore(size_t shares, int count, sstring name, size_t max_queue_length,
|
||||
utils::updateable_value<uint32_t> serialize_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> kill_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> cpu_concurrency)
|
||||
utils::updateable_value<uint32_t> cpu_concurrency,
|
||||
utils::updateable_value<float> preemptive_abort_factor)
|
||||
: weight(shares)
|
||||
, memory_share(0)
|
||||
, sem(utils::updateable_value(count), 0, name, max_queue_length, std::move(serialize_limit_multiplier), std::move(kill_limit_multiplier),
|
||||
std::move(cpu_concurrency), reader_concurrency_semaphore::register_metrics::yes) {}
|
||||
std::move(cpu_concurrency), std::move(preemptive_abort_factor), reader_concurrency_semaphore::register_metrics::yes) {}
|
||||
};
|
||||
|
||||
std::unordered_map<scheduling_group, weighted_reader_concurrency_semaphore> _semaphores;
|
||||
@@ -54,6 +56,7 @@ public:
|
||||
utils::updateable_value<uint32_t> serialize_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> kill_limit_multiplier,
|
||||
utils::updateable_value<uint32_t> cpu_concurrency,
|
||||
utils::updateable_value<float> preemptive_abort_factor,
|
||||
std::optional<sstring> name_prefix = std::nullopt)
|
||||
: _total_memory(memory)
|
||||
, _total_weight(0)
|
||||
@@ -62,6 +65,7 @@ public:
|
||||
, _serialize_limit_multiplier(std::move(serialize_limit_multiplier))
|
||||
, _kill_limit_multiplier(std::move(kill_limit_multiplier))
|
||||
, _cpu_concurrency(std::move(cpu_concurrency))
|
||||
, _preemptive_abort_factor(std::move(preemptive_abort_factor))
|
||||
, _operations_serializer(1)
|
||||
, _name_prefix(std::move(name_prefix)) { }
|
||||
|
||||
|
||||
@@ -92,6 +92,7 @@ public:
|
||||
active_await,
|
||||
inactive,
|
||||
evicted,
|
||||
preemptive_aborted,
|
||||
};
|
||||
|
||||
class impl;
|
||||
|
||||
@@ -103,8 +103,8 @@ thread_local dirty_memory_manager default_dirty_memory_manager;
|
||||
|
||||
inline
|
||||
flush_controller
|
||||
make_flush_controller(const db::config& cfg, backlog_controller::scheduling_group& sg, std::function<double()> fn) {
|
||||
return flush_controller(sg, cfg.memtable_flush_static_shares(), 50ms, cfg.unspooled_dirty_soft_limit(), std::move(fn));
|
||||
make_flush_controller(const db::config& cfg, const database_config& dbcfg, std::function<double()> fn) {
|
||||
return flush_controller(dbcfg.memtable_scheduling_group, cfg.memtable_flush_static_shares(), 50ms, cfg.unspooled_dirty_soft_limit(), std::move(fn));
|
||||
}
|
||||
|
||||
keyspace::keyspace(config cfg, locator::effective_replication_map_factory& erm_factory)
|
||||
@@ -394,8 +394,7 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
|
||||
, _system_dirty_memory_manager(*this, 10 << 20, cfg.unspooled_dirty_soft_limit(), default_scheduling_group())
|
||||
, _dirty_memory_manager(*this, dbcfg.available_memory * 0.50, cfg.unspooled_dirty_soft_limit(), dbcfg.statement_scheduling_group)
|
||||
, _dbcfg(dbcfg)
|
||||
, _flush_sg(dbcfg.memtable_scheduling_group)
|
||||
, _memtable_controller(make_flush_controller(_cfg, _flush_sg, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
|
||||
, _memtable_controller(make_flush_controller(_cfg, _dbcfg, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
|
||||
auto backlog = (_dirty_memory_manager.unspooled_dirty_memory()) / limit;
|
||||
if (_dirty_memory_manager.has_extraneous_flushes_requested()) {
|
||||
backlog = std::max(backlog, _memtable_controller.backlog_of_shares(200));
|
||||
@@ -412,6 +411,7 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
|
||||
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
|
||||
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
|
||||
utils::updateable_value(uint32_t(1)),
|
||||
utils::updateable_value(0.0f),
|
||||
reader_concurrency_semaphore::register_metrics::yes)
|
||||
// No limits, just for accounting.
|
||||
, _compaction_concurrency_sem(reader_concurrency_semaphore::no_limits{}, "compaction", reader_concurrency_semaphore::register_metrics::no)
|
||||
@@ -423,6 +423,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
|
||||
std::numeric_limits<size_t>::max(),
|
||||
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
|
||||
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
|
||||
utils::updateable_value(uint32_t(1)),
|
||||
utils::updateable_value(0.0f),
|
||||
reader_concurrency_semaphore::register_metrics::yes)
|
||||
, _view_update_read_concurrency_semaphores_group(
|
||||
max_memory_concurrent_view_update_reads(),
|
||||
@@ -431,6 +433,7 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
|
||||
_cfg.view_update_reader_concurrency_semaphore_serialize_limit_multiplier,
|
||||
_cfg.view_update_reader_concurrency_semaphore_kill_limit_multiplier,
|
||||
_cfg.view_update_reader_concurrency_semaphore_cpu_concurrency,
|
||||
utils::updateable_value(0.0f),
|
||||
"view_update")
|
||||
, _row_cache_tracker(_cfg.index_cache_fraction.operator utils::updateable_value<double>(), cache_tracker::register_metrics::yes)
|
||||
, _apply_stage("db_apply", &database::do_apply)
|
||||
@@ -460,7 +463,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
|
||||
, _reader_concurrency_semaphores_group(max_memory_concurrent_reads(), max_count_concurrent_reads, max_inactive_queue_length(),
|
||||
_cfg.reader_concurrency_semaphore_serialize_limit_multiplier,
|
||||
_cfg.reader_concurrency_semaphore_kill_limit_multiplier,
|
||||
_cfg.reader_concurrency_semaphore_cpu_concurrency)
|
||||
_cfg.reader_concurrency_semaphore_cpu_concurrency,
|
||||
_cfg.reader_concurrency_semaphore_preemptive_abort_factor)
|
||||
, _stop_barrier(std::move(barrier))
|
||||
, _update_memtable_flush_static_shares_action([this, &cfg] { return _memtable_controller.update_static_shares(cfg.memtable_flush_static_shares()); })
|
||||
, _memtable_flush_static_shares_observer(cfg.memtable_flush_static_shares.observe(_update_memtable_flush_static_shares_action.make_observer()))
|
||||
|
||||
@@ -1617,7 +1617,6 @@ private:
|
||||
dirty_memory_manager _dirty_memory_manager;
|
||||
|
||||
database_config _dbcfg;
|
||||
backlog_controller::scheduling_group _flush_sg;
|
||||
flush_controller _memtable_controller;
|
||||
drain_progress _drain_progress {};
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "memtable.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "mutation/frozen_mutation.hh"
|
||||
#include "partition_snapshot_reader.hh"
|
||||
#include "replica/partition_snapshot_reader.hh"
|
||||
#include "partition_builder.hh"
|
||||
#include "mutation/mutation_partition_view.hh"
|
||||
#include "readers/empty.hh"
|
||||
@@ -19,7 +19,7 @@
|
||||
|
||||
namespace replica {
|
||||
|
||||
static mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
|
||||
static mutation_reader make_partition_snapshot_reader_from_snp_schema(
|
||||
bool is_reversed,
|
||||
reader_permit permit,
|
||||
dht::decorated_key dk,
|
||||
@@ -482,7 +482,7 @@ public:
|
||||
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), _slice, key_and_snp->first.key());
|
||||
bool digest_requested = _slice.options.contains<query::partition_slice::option::with_digest>();
|
||||
bool is_reversed = _slice.is_reversed();
|
||||
_delegate = make_partition_snapshot_flat_reader_from_snp_schema(is_reversed, _permit, std::move(key_and_snp->first), std::move(cr), std::move(key_and_snp->second), digest_requested, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *mtbl());
|
||||
_delegate = make_partition_snapshot_reader_from_snp_schema(is_reversed, _permit, std::move(key_and_snp->first), std::move(cr), std::move(key_and_snp->second), digest_requested, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *mtbl());
|
||||
_delegate->upgrade_schema(schema());
|
||||
} else {
|
||||
_end_of_stream = true;
|
||||
@@ -604,7 +604,7 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
static mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
|
||||
static mutation_reader make_partition_snapshot_reader_from_snp_schema(
|
||||
bool is_reversed,
|
||||
reader_permit permit,
|
||||
dht::decorated_key dk,
|
||||
@@ -617,10 +617,10 @@ static mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
|
||||
streamed_mutation::forwarding fwd, memtable& memtable) {
|
||||
if (is_reversed) {
|
||||
schema_ptr rev_snp_schema = snp->schema()->make_reversed();
|
||||
return make_partition_snapshot_flat_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
|
||||
return make_partition_snapshot_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
|
||||
} else {
|
||||
schema_ptr snp_schema = snp->schema();
|
||||
return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
|
||||
return make_partition_snapshot_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -660,7 +660,7 @@ private:
|
||||
update_last(key_and_snp->first);
|
||||
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), schema()->full_slice(), key_and_snp->first.key());
|
||||
auto snp_schema = key_and_snp->second->schema();
|
||||
_partition_reader = make_partition_snapshot_flat_reader<false, partition_snapshot_flush_accounter>(snp_schema, _permit, std::move(key_and_snp->first), std::move(cr),
|
||||
_partition_reader = make_partition_snapshot_reader<false, partition_snapshot_flush_accounter>(snp_schema, _permit, std::move(key_and_snp->first), std::move(cr),
|
||||
std::move(key_and_snp->second), false, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *snp_schema, _flushed_memory);
|
||||
_partition_reader->upgrade_schema(schema());
|
||||
}
|
||||
@@ -737,7 +737,7 @@ memtable::make_mutation_reader_opt(schema_ptr query_schema,
|
||||
auto dk = pos.as_decorated_key();
|
||||
auto cr = query::clustering_key_filter_ranges::get_ranges(*query_schema, slice, dk.key());
|
||||
bool digest_requested = slice.options.contains<query::partition_slice::option::with_digest>();
|
||||
auto rd = make_partition_snapshot_flat_reader_from_snp_schema(is_reversed, std::move(permit), std::move(dk), std::move(cr), std::move(snp), digest_requested, *this, _table_shared_data.read_section, shared_from_this(), fwd, *this);
|
||||
auto rd = make_partition_snapshot_reader_from_snp_schema(is_reversed, std::move(permit), std::move(dk), std::move(cr), std::move(snp), digest_requested, *this, _table_shared_data.read_section, shared_from_this(), fwd, *this);
|
||||
rd.upgrade_schema(query_schema);
|
||||
return rd;
|
||||
} else {
|
||||
|
||||
@@ -9,9 +9,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "mutation/partition_version.hh"
|
||||
#include "readers/mutation_reader_fwd.hh"
|
||||
#include "readers/mutation_reader.hh"
|
||||
#include "readers/range_tombstone_change_merger.hh"
|
||||
#include "keys/clustering_key_filter.hh"
|
||||
#include "query/query-request.hh"
|
||||
#include "db/partition_snapshot_row_cursor.hh"
|
||||
@@ -19,8 +17,10 @@
|
||||
|
||||
extern seastar::logger mplog;
|
||||
|
||||
namespace replica {
|
||||
|
||||
template <bool Reversing, typename Accounter>
|
||||
class partition_snapshot_flat_reader : public mutation_reader::impl, public Accounter {
|
||||
class partition_snapshot_reader : public mutation_reader::impl, public Accounter {
|
||||
struct row_info {
|
||||
mutation_fragment_v2 row;
|
||||
tombstone rt_for_row;
|
||||
@@ -232,7 +232,7 @@ private:
|
||||
}
|
||||
public:
|
||||
template <typename... Args>
|
||||
partition_snapshot_flat_reader(schema_ptr s, reader_permit permit, dht::decorated_key dk, partition_snapshot_ptr snp,
|
||||
partition_snapshot_reader(schema_ptr s, reader_permit permit, dht::decorated_key dk, partition_snapshot_ptr snp,
|
||||
query::clustering_key_filter_ranges crr, bool digest_requested,
|
||||
logalloc::region& region, logalloc::allocating_section& read_section,
|
||||
std::any pointer_to_container, Args&&... args)
|
||||
@@ -285,7 +285,7 @@ public:
|
||||
|
||||
template <bool Reversing, typename Accounter, typename... Args>
|
||||
inline mutation_reader
|
||||
make_partition_snapshot_flat_reader(schema_ptr s,
|
||||
make_partition_snapshot_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
dht::decorated_key dk,
|
||||
query::clustering_key_filter_ranges crr,
|
||||
@@ -297,7 +297,7 @@ make_partition_snapshot_flat_reader(schema_ptr s,
|
||||
streamed_mutation::forwarding fwd,
|
||||
Args&&... args)
|
||||
{
|
||||
auto res = make_mutation_reader<partition_snapshot_flat_reader<Reversing, Accounter>>(std::move(s), std::move(permit), std::move(dk),
|
||||
auto res = make_mutation_reader<partition_snapshot_reader<Reversing, Accounter>>(std::move(s), std::move(permit), std::move(dk),
|
||||
snp, std::move(crr), digest_requested, region, read_section, std::move(pointer_to_container), std::forward<Args>(args)...);
|
||||
if (fwd) {
|
||||
return make_forwardable(std::move(res)); // FIXME: optimize
|
||||
@@ -305,3 +305,5 @@ make_partition_snapshot_flat_reader(schema_ptr s,
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace replica
|
||||
163
replica/table.cc
163
replica/table.cc
@@ -7,6 +7,7 @@
|
||||
*/
|
||||
|
||||
#include <seastar/core/seastar.hh>
|
||||
#include <seastar/core/shard_id.hh>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/core/with_scheduling_group.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
@@ -23,7 +24,6 @@
|
||||
#include "replica/data_dictionary_impl.hh"
|
||||
#include "replica/compaction_group.hh"
|
||||
#include "replica/query_state.hh"
|
||||
#include "seastar/core/shard_id.hh"
|
||||
#include "sstables/shared_sstable.hh"
|
||||
#include "sstables/sstable_set.hh"
|
||||
#include "sstables/sstables.hh"
|
||||
@@ -1746,100 +1746,97 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
|
||||
}
|
||||
|
||||
future<>
|
||||
table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtable> old, sstable_write_permit&& permit) {
|
||||
table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtable> old, sstable_write_permit&& permit_) {
|
||||
co_await utils::get_local_injector().inject("flush_memtable_to_sstable_wait", utils::wait_for_message(60s));
|
||||
|
||||
auto try_flush = [this, old = std::move(old), permit = make_lw_shared(std::move(permit)), &cg] () mutable -> future<> {
|
||||
// Note that due to our sharded architecture, it is possible that
|
||||
// in the face of a value change some shards will backup sstables
|
||||
// while others won't.
|
||||
//
|
||||
// This is, in theory, possible to mitigate through a rwlock.
|
||||
// However, this doesn't differ from the situation where all tables
|
||||
// are coming from a single shard and the toggle happens in the
|
||||
// middle of them.
|
||||
//
|
||||
// The code as is guarantees that we'll never partially backup a
|
||||
// single sstable, so that is enough of a guarantee.
|
||||
auto permit = make_lw_shared(std::move(permit_));
|
||||
co_await coroutine::switch_to(_config.memtable_scheduling_group);
|
||||
// Note that due to our sharded architecture, it is possible that
|
||||
// in the face of a value change some shards will backup sstables
|
||||
// while others won't.
|
||||
//
|
||||
// This is, in theory, possible to mitigate through a rwlock.
|
||||
// However, this doesn't differ from the situation where all tables
|
||||
// are coming from a single shard and the toggle happens in the
|
||||
// middle of them.
|
||||
//
|
||||
// The code as is guarantees that we'll never partially backup a
|
||||
// single sstable, so that is enough of a guarantee.
|
||||
|
||||
auto newtabs = std::vector<sstables::shared_sstable>();
|
||||
auto metadata = mutation_source_metadata{};
|
||||
metadata.min_timestamp = old->get_min_timestamp();
|
||||
metadata.max_timestamp = old->get_max_timestamp();
|
||||
auto estimated_partitions = _compaction_strategy.adjust_partition_estimate(metadata, old->partition_count(), _schema);
|
||||
auto newtabs = std::vector<sstables::shared_sstable>();
|
||||
auto metadata = mutation_source_metadata{};
|
||||
metadata.min_timestamp = old->get_min_timestamp();
|
||||
metadata.max_timestamp = old->get_max_timestamp();
|
||||
auto estimated_partitions = _compaction_strategy.adjust_partition_estimate(metadata, old->partition_count(), _schema);
|
||||
|
||||
if (!cg.async_gate().is_closed()) {
|
||||
co_await _compaction_manager.maybe_wait_for_sstable_count_reduction(cg.view_for_unrepaired_data());
|
||||
}
|
||||
if (!cg.async_gate().is_closed()) {
|
||||
co_await _compaction_manager.maybe_wait_for_sstable_count_reduction(cg.view_for_unrepaired_data());
|
||||
}
|
||||
|
||||
auto consumer = _compaction_strategy.make_interposer_consumer(metadata, [this, old, permit, &newtabs, estimated_partitions, &cg] (mutation_reader reader) mutable -> future<> {
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
sstables::sstable_writer_config cfg = get_sstables_manager().configure_writer("memtable");
|
||||
cfg.backup = incremental_backups_enabled();
|
||||
auto consumer = _compaction_strategy.make_interposer_consumer(metadata, [this, old, permit, &newtabs, estimated_partitions, &cg] (mutation_reader reader) mutable -> future<> {
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
sstables::sstable_writer_config cfg = get_sstables_manager().configure_writer("memtable");
|
||||
cfg.backup = incremental_backups_enabled();
|
||||
|
||||
auto newtab = make_sstable();
|
||||
newtabs.push_back(newtab);
|
||||
tlogger.debug("Flushing to {}", newtab->get_filename());
|
||||
auto newtab = make_sstable();
|
||||
newtabs.push_back(newtab);
|
||||
tlogger.debug("Flushing to {}", newtab->get_filename());
|
||||
|
||||
auto monitor = database_sstable_write_monitor(permit, newtab, cg,
|
||||
old->get_max_timestamp());
|
||||
auto monitor = database_sstable_write_monitor(permit, newtab, cg,
|
||||
old->get_max_timestamp());
|
||||
|
||||
co_return co_await write_memtable_to_sstable(std::move(reader), *old, newtab, estimated_partitions, monitor, cfg);
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await reader.close();
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
co_return co_await write_memtable_to_sstable(std::move(reader), *old, newtab, estimated_partitions, monitor, cfg);
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await reader.close();
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
});
|
||||
|
||||
auto f = consumer(old->make_flush_reader(
|
||||
old->schema(),
|
||||
compaction_concurrency_semaphore().make_tracking_only_permit(old->schema(), "try_flush_memtable_to_sstable()", db::no_timeout, {})));
|
||||
|
||||
// Switch back to default scheduling group for post-flush actions, to avoid them being staved by the memtable flush
|
||||
// controller. Cache update does not affect the input of the memtable cpu controller, so it can be subject to
|
||||
// priority inversion.
|
||||
co_await coroutine::switch_to(default_scheduling_group());
|
||||
try {
|
||||
co_await std::move(f);
|
||||
co_await coroutine::parallel_for_each(newtabs, [] (auto& newtab) -> future<> {
|
||||
co_await newtab->open_data();
|
||||
tlogger.debug("Flushing to {} done", newtab->get_filename());
|
||||
});
|
||||
|
||||
auto f = consumer(old->make_flush_reader(
|
||||
old->schema(),
|
||||
compaction_concurrency_semaphore().make_tracking_only_permit(old->schema(), "try_flush_memtable_to_sstable()", db::no_timeout, {})));
|
||||
co_await with_scheduling_group(_config.memtable_to_cache_scheduling_group, [this, old, &newtabs, &cg] {
|
||||
return update_cache(cg, old, newtabs);
|
||||
});
|
||||
|
||||
// Switch back to default scheduling group for post-flush actions, to avoid them being staved by the memtable flush
|
||||
// controller. Cache update does not affect the input of the memtable cpu controller, so it can be subject to
|
||||
// priority inversion.
|
||||
auto post_flush = [this, old = std::move(old), &newtabs, f = std::move(f), &cg] () mutable -> future<> {
|
||||
try {
|
||||
co_await std::move(f);
|
||||
co_await coroutine::parallel_for_each(newtabs, [] (auto& newtab) -> future<> {
|
||||
co_await newtab->open_data();
|
||||
tlogger.debug("Flushing to {} done", newtab->get_filename());
|
||||
});
|
||||
|
||||
co_await with_scheduling_group(_config.memtable_to_cache_scheduling_group, [this, old, &newtabs, &cg] {
|
||||
return update_cache(cg, old, newtabs);
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("replica_post_flush_after_update_cache", [this] (auto& handler) -> future<> {
|
||||
const auto this_table_name = format("{}.{}", _schema->ks_name(), _schema->cf_name());
|
||||
if (this_table_name == handler.get("table_name")) {
|
||||
tlogger.info("error injection handler replica_post_flush_after_update_cache: suspending flush for table {}", this_table_name);
|
||||
handler.set("suspended", true);
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
|
||||
tlogger.info("error injection handler replica_post_flush_after_update_cache: resuming flush for table {}", this_table_name);
|
||||
}
|
||||
});
|
||||
|
||||
cg.memtables()->erase(old);
|
||||
tlogger.debug("Memtable for {}.{} replaced, into {} sstables", old->schema()->ks_name(), old->schema()->cf_name(), newtabs.size());
|
||||
co_return;
|
||||
} catch (const std::exception& e) {
|
||||
for (auto& newtab : newtabs) {
|
||||
newtab->mark_for_deletion();
|
||||
tlogger.error("failed to write sstable {}: {}", newtab->get_filename(), e);
|
||||
}
|
||||
_config.cf_stats->failed_memtables_flushes_count++;
|
||||
// If we failed this write we will try the write again and that will create a new flush reader
|
||||
// that will decrease dirty memory again. So we need to reset the accounting.
|
||||
old->revert_flushed_memory();
|
||||
throw;
|
||||
co_await utils::get_local_injector().inject("replica_post_flush_after_update_cache", [this] (auto& handler) -> future<> {
|
||||
const auto this_table_name = format("{}.{}", _schema->ks_name(), _schema->cf_name());
|
||||
if (this_table_name == handler.get("table_name")) {
|
||||
tlogger.info("error injection handler replica_post_flush_after_update_cache: suspending flush for table {}", this_table_name);
|
||||
handler.set("suspended", true);
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
|
||||
tlogger.info("error injection handler replica_post_flush_after_update_cache: resuming flush for table {}", this_table_name);
|
||||
}
|
||||
};
|
||||
co_return co_await with_scheduling_group(default_scheduling_group(), std::ref(post_flush));
|
||||
};
|
||||
co_return co_await with_scheduling_group(_config.memtable_scheduling_group, std::ref(try_flush));
|
||||
});
|
||||
|
||||
cg.memtables()->erase(old);
|
||||
tlogger.debug("Memtable for {}.{} replaced, into {} sstables", old->schema()->ks_name(), old->schema()->cf_name(), newtabs.size());
|
||||
co_return;
|
||||
} catch (const std::exception& e) {
|
||||
for (auto& newtab : newtabs) {
|
||||
newtab->mark_for_deletion();
|
||||
tlogger.error("failed to write sstable {}: {}", newtab->get_filename(), e);
|
||||
}
|
||||
_config.cf_stats->failed_memtables_flushes_count++;
|
||||
// If we failed this write we will try the write again and that will create a new flush reader
|
||||
// that will decrease dirty memory again. So we need to reset the accounting.
|
||||
old->revert_flushed_memory();
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -20,7 +20,7 @@ set -e
|
||||
trap 'echo "error $? in $0 line $LINENO"' ERR
|
||||
|
||||
SCRIPT_NAME=$(basename $0)
|
||||
SCYLLA_S3_RELOC_SERVER_DEFAULT_URL=http://backtrace.scylladb.com
|
||||
SCYLLA_S3_RELOC_SERVER_DEFAULT_URL=https://api.backtrace.scylladb.com
|
||||
|
||||
function print_usage {
|
||||
cat << EOF
|
||||
@@ -284,7 +284,8 @@ then
|
||||
|
||||
log "Build id: ${BUILD_ID}"
|
||||
|
||||
BUILD=$(curl -s -X GET "${SCYLLA_S3_RELOC_SERVER_URL}/build.json?build_id=${BUILD_ID}")
|
||||
# https://api.backtrace.scylladb.com/api/docs#/default/search_by_build_id_search_build_id_get
|
||||
BUILD=$(curl "${SCYLLA_S3_RELOC_SERVER_URL}/api/search/build_id?build_id=${BUILD_ID}" -H 'accept: application/json')
|
||||
|
||||
if [[ -z "$BUILD" ]]
|
||||
then
|
||||
@@ -293,12 +294,16 @@ then
|
||||
fi
|
||||
|
||||
RESPONSE_BUILD_ID=$(get_json_field "$BUILD" "build_id")
|
||||
VERSION=$(get_json_field "$BUILD" "version")
|
||||
PRODUCT=$(get_json_field "$BUILD" "product")
|
||||
RELEASE=$(get_json_field "$BUILD" "release")
|
||||
ARCH=$(get_json_field "$BUILD" "arch")
|
||||
BUILD_MODE=$(get_json_field "$BUILD" "build_mode")
|
||||
PACKAGE_URL=$(get_json_field "$BUILD" "package_url" 1)
|
||||
BUILD_MODE=$(get_json_field "$BUILD" "build_type")
|
||||
PACKAGE_URL=$(get_json_field "$BUILD" "unstripped_url")
|
||||
BUILD_DATA=$(get_json_field "$BUILD" "build_data")
|
||||
|
||||
VERSION=$(get_json_field "$BUILD_DATA" "version")
|
||||
PRODUCT=$(get_json_field "$BUILD_DATA" "product")
|
||||
RELEASE=$(get_json_field "$BUILD_DATA" "release")
|
||||
ARCH=$(get_json_field "$BUILD_DATA" "platform")
|
||||
TIMESTAMP=$(get_json_field "$BUILD_DATA" "timestamp")
|
||||
|
||||
|
||||
if [[ "$RESPONSE_BUILD_ID" != "$BUILD_ID" ]]
|
||||
then
|
||||
@@ -306,7 +311,7 @@ then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log "Matching build is ${PRODUCT}-${VERSION} ${RELEASE} ${BUILD_MODE}-${ARCH}"
|
||||
log "Matching build is ${PRODUCT}-${VERSION} ${RELEASE} ${BUILD_MODE}-${ARCH} from ${TIMESTAMP}"
|
||||
fi
|
||||
|
||||
if ! [[ -d ${ARTIFACT_DIR}/scylla.package ]]
|
||||
|
||||
@@ -98,16 +98,6 @@ future<> service::client_state::has_column_family_access(const sstring& ks,
|
||||
co_return co_await has_access(ks, {p, r, t}, is_vector_indexed);
|
||||
}
|
||||
|
||||
future<> service::client_state::has_schema_access(const schema& s, auth::permission p) const {
|
||||
auth::resource r = auth::make_data_resource(s.ks_name(), s.cf_name());
|
||||
co_return co_await has_access(s.ks_name(), {p, r});
|
||||
}
|
||||
|
||||
future<> service::client_state::has_schema_access(const sstring& ks_name, const sstring& cf_name, auth::permission p) const {
|
||||
auth::resource r = auth::make_data_resource(ks_name, cf_name);
|
||||
co_return co_await has_access(ks_name, {p, r});
|
||||
}
|
||||
|
||||
future<> service::client_state::check_internal_table_permissions(std::string_view ks, std::string_view table_name, const auth::command_desc& cmd) const {
|
||||
// 1. CDC and $paxos tables are managed internally by Scylla. Users are prohibited
|
||||
// from running ALTER or DROP commands on them.
|
||||
@@ -227,6 +217,8 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
|
||||
static const std::unordered_set<auth::resource> vector_search_system_resources = {
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_STREAMS),
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_TIMESTAMPS),
|
||||
};
|
||||
|
||||
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||
|
||||
@@ -363,4 +355,4 @@ future<> service::client_state::set_client_options(
|
||||
});
|
||||
_client_options.emplace_back(std::move(cached_key), std::move(cached_value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -359,8 +359,6 @@ public:
|
||||
future<> has_keyspace_access(const sstring&, auth::permission) const;
|
||||
future<> has_column_family_access(const sstring&, const sstring&, auth::permission,
|
||||
auth::command_desc::type = auth::command_desc::type::OTHER, std::optional<bool> is_vector_indexed = std::nullopt) const;
|
||||
future<> has_schema_access(const schema& s, auth::permission p) const;
|
||||
future<> has_schema_access(const sstring&, const sstring&, auth::permission p) const;
|
||||
|
||||
future<> has_functions_access(auth::permission p) const;
|
||||
future<> has_functions_access(const sstring& ks, auth::permission p) const;
|
||||
|
||||
@@ -56,6 +56,9 @@ static future<schema_ptr> get_schema_definition(table_schema_version v, locator:
|
||||
migration_manager::migration_manager(migration_notifier& notifier, gms::feature_service& feat, netw::messaging_service& ms,
|
||||
service::storage_proxy& storage_proxy, gms::gossiper& gossiper, service::raft_group0_client& group0_client, sharded<db::system_keyspace>& sysks) :
|
||||
_notifier(notifier)
|
||||
, _background_tasks("migration_manager::background_tasks")
|
||||
, _feat(feat), _messaging(ms), _storage_proxy(storage_proxy), _ss("migration_manager::storage_service"), _gossiper(gossiper), _group0_client(group0_client)
|
||||
, _sys_ks(sysks)
|
||||
, _group0_barrier(this_shard_id() == 0 ?
|
||||
std::function<future<>()>([this] () -> future<> {
|
||||
if ((co_await _group0_client.get_group0_upgrade_state()).second == group0_upgrade_state::use_pre_raft_procedures) {
|
||||
@@ -63,7 +66,7 @@ migration_manager::migration_manager(migration_notifier& notifier, gms::feature_
|
||||
}
|
||||
|
||||
// This will run raft barrier and will sync schema with the leader
|
||||
co_await with_scheduling_group(_storage_proxy.get_db().local().get_gossip_scheduling_group(), [this] {
|
||||
co_await with_scheduling_group(_gossiper.get_scheduling_group(), [this] {
|
||||
return start_group0_operation().discard_result();
|
||||
});
|
||||
}) :
|
||||
@@ -74,9 +77,6 @@ migration_manager::migration_manager(migration_notifier& notifier, gms::feature_
|
||||
});
|
||||
})
|
||||
)
|
||||
, _background_tasks("migration_manager::background_tasks")
|
||||
, _feat(feat), _messaging(ms), _storage_proxy(storage_proxy), _ss("migration_manager::storage_service"), _gossiper(gossiper), _group0_client(group0_client)
|
||||
, _sys_ks(sysks)
|
||||
, _schema_push([this] { return passive_announce(); })
|
||||
, _concurrent_ddl_retries{10}
|
||||
{
|
||||
|
||||
@@ -57,7 +57,6 @@ private:
|
||||
migration_notifier& _notifier;
|
||||
|
||||
std::unordered_map<locator::host_id, serialized_action> _schema_pulls;
|
||||
serialized_action _group0_barrier;
|
||||
std::vector<gms::feature::listener_registration> _feature_listeners;
|
||||
seastar::named_gate _background_tasks;
|
||||
static const std::chrono::milliseconds migration_delay;
|
||||
@@ -69,6 +68,7 @@ private:
|
||||
seastar::abort_source _as;
|
||||
service::raft_group0_client& _group0_client;
|
||||
sharded<db::system_keyspace>& _sys_ks;
|
||||
serialized_action _group0_barrier;
|
||||
serialized_action _schema_push;
|
||||
table_schema_version _schema_version_to_publish;
|
||||
|
||||
|
||||
@@ -338,7 +338,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
|
||||
}
|
||||
|
||||
#ifndef SCYLLA_BUILD_MODE_RELEASE
|
||||
static void ensure_group0_schema(const group0_command& cmd, const replica::database& db) {
|
||||
static void ensure_group0_schema(const group0_command& cmd, data_dictionary::database db) {
|
||||
auto validate_schema = [&db](const utils::chunked_vector<canonical_mutation>& mutations) {
|
||||
for (const auto& mut : mutations) {
|
||||
// Get the schema for the column family
|
||||
@@ -382,7 +382,7 @@ future<> group0_state_machine::apply(std::vector<raft::command_cref> command) {
|
||||
|
||||
// max_mutation_size = 1/2 of commitlog segment size, thus max_command_size is set 1/3 of commitlog segment size to leave space for metadata.
|
||||
size_t max_command_size = _sp.data_dictionary().get_config().commitlog_segment_size_in_mb() * 1024 * 1024 / 3;
|
||||
group0_state_machine_merger m(co_await _client.sys_ks().get_last_group0_state_id(), std::move(read_apply_mutex_holder),
|
||||
group0_state_machine_merger m(co_await _client.get_last_group0_state_id(), std::move(read_apply_mutex_holder),
|
||||
max_command_size, _sp.data_dictionary());
|
||||
|
||||
for (auto&& c : command) {
|
||||
@@ -392,7 +392,7 @@ future<> group0_state_machine::apply(std::vector<raft::command_cref> command) {
|
||||
#ifndef SCYLLA_BUILD_MODE_RELEASE
|
||||
// Ensure that the schema of the mutations is a group0 schema.
|
||||
// This validation is supposed to be only performed in tests, so it is skipped in the release mode.
|
||||
ensure_group0_schema(cmd, _client.sys_ks().local_db());
|
||||
ensure_group0_schema(cmd, _sp.data_dictionary());
|
||||
#endif
|
||||
|
||||
slogger.trace("cmd: prev_state_id: {}, new_state_id: {}, creator_addr: {}, creator_id: {}",
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user