mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-14 03:42:14 +00:00
Compare commits
239 Commits
copilot/us
...
debug_form
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8c293dfe36 | ||
|
|
97fb93a5b4 | ||
|
|
357b94ac60 | ||
|
|
037f291d48 | ||
|
|
b246a70404 | ||
|
|
a071d54160 | ||
|
|
62d369c113 | ||
|
|
bce09e055c | ||
|
|
37004ac4dc | ||
|
|
6f619908ce | ||
|
|
0fe552b074 | ||
|
|
14482e2e07 | ||
|
|
4bda6dded5 | ||
|
|
0be4fac5e5 | ||
|
|
8818a0347e | ||
|
|
489bdb2f93 | ||
|
|
9efd94199d | ||
|
|
10085ddb25 | ||
|
|
8635a2aa6f | ||
|
|
c7bb7b34c0 | ||
|
|
66be0f4577 | ||
|
|
120f381a9d | ||
|
|
f112e42ddd | ||
|
|
56c375b1f3 | ||
|
|
e59a21752d | ||
|
|
85a531819b | ||
|
|
772b32d9f7 | ||
|
|
60fb5270a9 | ||
|
|
3b9398dfc8 | ||
|
|
df68d0c0f7 | ||
|
|
051107f5bc | ||
|
|
c8e7e20c5c | ||
|
|
fb1f995d6b | ||
|
|
32225797cd | ||
|
|
f29525f3a6 | ||
|
|
05b11a3b82 | ||
|
|
f511264831 | ||
|
|
7dce43363e | ||
|
|
cc695bc3f7 | ||
|
|
4bfcd035ae | ||
|
|
9c1c41df03 | ||
|
|
c4a0f6f2e6 | ||
|
|
712ba5a31f | ||
|
|
961fc9e041 | ||
|
|
0a8dc4532b | ||
|
|
bb5c328a16 | ||
|
|
ea2a214959 | ||
|
|
65032877d4 | ||
|
|
de0bdf1a65 | ||
|
|
97430e2df5 | ||
|
|
5573c3b18e | ||
|
|
34473302b0 | ||
|
|
9898e5700b | ||
|
|
10c4b9b5b0 | ||
|
|
f9adbc7548 | ||
|
|
6b18d95dec | ||
|
|
89388510a0 | ||
|
|
6b259babeb | ||
|
|
062751fcec | ||
|
|
969dddb630 | ||
|
|
de21572b31 | ||
|
|
20b1531e6d | ||
|
|
c591b9ebe2 | ||
|
|
06006a6328 | ||
|
|
67d8cde42d | ||
|
|
04f046d2d8 | ||
|
|
e8b37d1a89 | ||
|
|
d2c44722e1 | ||
|
|
821f8696a7 | ||
|
|
d94999f87b | ||
|
|
249a6cec1b | ||
|
|
adc790a8bf | ||
|
|
967b7ff6bf | ||
|
|
5d51501a0b | ||
|
|
8367509b3b | ||
|
|
0a7a69345c | ||
|
|
899ae71349 | ||
|
|
4deeb7ebfc | ||
|
|
2a03c634c0 | ||
|
|
81c4e717e2 | ||
|
|
6b1df5202c | ||
|
|
171504c84f | ||
|
|
5e7fb08bf3 | ||
|
|
4981e72607 | ||
|
|
aa9da87e97 | ||
|
|
f74a54f005 | ||
|
|
151e945d9f | ||
|
|
517bb8655d | ||
|
|
9b24d9ee7d | ||
|
|
537747cf5d | ||
|
|
2535164542 | ||
|
|
86d7c82993 | ||
|
|
399260a6c0 | ||
|
|
f27dc12b7c | ||
|
|
3143134968 | ||
|
|
2e47fd9f56 | ||
|
|
a2ad57062f | ||
|
|
31d339e54a | ||
|
|
ad87eda835 | ||
|
|
a0da07e5b7 | ||
|
|
24379acc76 | ||
|
|
a9d0211a64 | ||
|
|
e7c3942d43 | ||
|
|
d69f7eb0ee | ||
|
|
65cd0b5639 | ||
|
|
b7bdb1010a | ||
|
|
8bd3bd7e2a | ||
|
|
caf5aa47c2 | ||
|
|
6ddb7a4d13 | ||
|
|
bd66edee5c | ||
|
|
489efca47c | ||
|
|
21db4f3ed8 | ||
|
|
37c485e3d1 | ||
|
|
31aefdc07d | ||
|
|
1231fafb46 | ||
|
|
17cb173e18 | ||
|
|
1da1bb9d99 | ||
|
|
b78cc787a6 | ||
|
|
600ec82bec | ||
|
|
009fc3757a | ||
|
|
b3293f8579 | ||
|
|
5a16980845 | ||
|
|
bc9fc96579 | ||
|
|
719f7cca57 | ||
|
|
521fca5c92 | ||
|
|
99c3b1998a | ||
|
|
ddd72a16b0 | ||
|
|
08bea860ef | ||
|
|
28f820eb1c | ||
|
|
5f649dd39f | ||
|
|
a521bcbcee | ||
|
|
1ae1f37ec1 | ||
|
|
2128b1b15c | ||
|
|
9172cc172e | ||
|
|
0b1343747f | ||
|
|
27fd0c119f | ||
|
|
ed852a2af2 | ||
|
|
88b98fac3a | ||
|
|
46a6f8e1d3 | ||
|
|
d6c01be09b | ||
|
|
4410e9c61a | ||
|
|
32f8609b89 | ||
|
|
6017688445 | ||
|
|
f55bb154ec | ||
|
|
1452e92567 | ||
|
|
75e6412b1c | ||
|
|
50dc7c6dd8 | ||
|
|
5e228a8387 | ||
|
|
2d77e4fc28 | ||
|
|
e9c98274b5 | ||
|
|
0e0f9f41b3 | ||
|
|
b6bfdeb111 | ||
|
|
3775593e53 | ||
|
|
6ee9bc63eb | ||
|
|
38d130d9d0 | ||
|
|
5ee61f067d | ||
|
|
2d16083ba6 | ||
|
|
1fbf3a4ba1 | ||
|
|
d4fdeb4839 | ||
|
|
0013f22374 | ||
|
|
ae17596c2a | ||
|
|
8b1ca6dcd6 | ||
|
|
d68c92ec04 | ||
|
|
b1d4fc5e6e | ||
|
|
21c603a79e | ||
|
|
34f3916e7d | ||
|
|
04bf631d7f | ||
|
|
cf578fd81a | ||
|
|
06d16b6ea2 | ||
|
|
7fdb1118f5 | ||
|
|
fca11c5a21 | ||
|
|
6f682f7eb1 | ||
|
|
61952cd985 | ||
|
|
c4cfb278bc | ||
|
|
c2a6d1e930 | ||
|
|
6dc4ea766b | ||
|
|
b09d45b89a | ||
|
|
580cc309d2 | ||
|
|
78c817f71e | ||
|
|
71e6918f28 | ||
|
|
278535e4e3 | ||
|
|
2e4b72c6b9 | ||
|
|
172c786079 | ||
|
|
5d868dcc55 | ||
|
|
f4a6bb1885 | ||
|
|
95bc8911dd | ||
|
|
a8dd13731f | ||
|
|
318aa07158 | ||
|
|
7f597aca67 | ||
|
|
dbe70cddca | ||
|
|
0fd51c4adb | ||
|
|
6cb263bab0 | ||
|
|
9fe19ec9d9 | ||
|
|
1a6a7647c6 | ||
|
|
035aa90d4b | ||
|
|
40d180a7ef | ||
|
|
9de8d6798e | ||
|
|
9318c80203 | ||
|
|
a5df2e79a7 | ||
|
|
edf0148bee | ||
|
|
85d5073234 | ||
|
|
3e4e0c57b8 | ||
|
|
ee87b66033 | ||
|
|
526e5986fe | ||
|
|
b508f3dd38 | ||
|
|
bc0952781a | ||
|
|
755d528135 | ||
|
|
7659a5b878 | ||
|
|
5474cc6cc2 | ||
|
|
60aaea8547 | ||
|
|
d544d8602d | ||
|
|
313985fed7 | ||
|
|
4c4d043a3b | ||
|
|
1256a9faa7 | ||
|
|
7706c9e8c4 | ||
|
|
582a4abeb6 | ||
|
|
279fcdd5ff | ||
|
|
3b9cd52a95 | ||
|
|
92ee959e9b | ||
|
|
6ac1f1333f | ||
|
|
16e7a88a02 | ||
|
|
147b355326 | ||
|
|
419e9aa323 | ||
|
|
3f70611504 | ||
|
|
7cdd979158 | ||
|
|
949fc85217 | ||
|
|
6b413e3959 | ||
|
|
b89840c4b9 | ||
|
|
9280a039ee | ||
|
|
cd13a911cc | ||
|
|
f375aae257 | ||
|
|
44b8cad3df | ||
|
|
aca5284b13 | ||
|
|
29e0b4e08c | ||
|
|
afac984632 | ||
|
|
1a20877afe | ||
|
|
d763bdabc2 | ||
|
|
24e70b30c8 | ||
|
|
329c156600 |
18
.github/copilot-instructions.md
vendored
18
.github/copilot-instructions.md
vendored
@@ -55,22 +55,26 @@ ninja build/<mode>/test/boost/<test_name>
|
||||
ninja build/<mode>/scylla
|
||||
|
||||
# Run all tests in a file
|
||||
./test.py --mode=<mode> <test_path>
|
||||
./test.py --mode=<mode> test/<suite>/<test_name>.py
|
||||
|
||||
# Run a single test case from a file
|
||||
./test.py --mode=<mode> <test_path>::<test_function_name>
|
||||
./test.py --mode=<mode> test/<suite>/<test_name>.py::<test_function_name>
|
||||
|
||||
# Run all tests in a directory
|
||||
./test.py --mode=<mode> test/<suite>/
|
||||
|
||||
# Examples
|
||||
./test.py --mode=dev alternator/
|
||||
./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
|
||||
./test.py --mode=dev test/alternator/
|
||||
./test.py --mode=dev test/cluster/test_raft_voters.py::test_raft_limited_voters_retain_coordinator
|
||||
./test.py --mode=dev test/cqlpy/test_json.py
|
||||
|
||||
# Optional flags
|
||||
./test.py --mode=dev cluster/test_raft_no_quorum -v # Verbose output
|
||||
./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5 # Repeat test 5 times
|
||||
./test.py --mode=dev test/cluster/test_raft_no_quorum.py -v # Verbose output
|
||||
./test.py --mode=dev test/cluster/test_raft_no_quorum.py --repeat 5 # Repeat test 5 times
|
||||
```
|
||||
|
||||
**Important:**
|
||||
- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
|
||||
- Use full path with `.py` extension (e.g., `test/cluster/test_raft_no_quorum.py`, not `cluster/test_raft_no_quorum`)
|
||||
- To run a single test case, append `::<test_function_name>` to the file path
|
||||
- Add `-v` for verbose output
|
||||
- Add `--repeat <num>` to repeat a test multiple times
|
||||
|
||||
@@ -8,6 +8,9 @@ on:
|
||||
jobs:
|
||||
check-fixes-prefix:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
steps:
|
||||
- name: Check PR body for "Fixes" prefix patterns
|
||||
uses: actions/github-script@v7
|
||||
|
||||
35
.github/workflows/call_jira_sync.yml
vendored
35
.github/workflows/call_jira_sync.yml
vendored
@@ -1,8 +1,8 @@
|
||||
name: Sync Jira Based on PR Events
|
||||
name: Sync Jira Based on PR Events
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, ready_for_review, review_requested, labeled, unlabeled, closed]
|
||||
types: [opened, edited, ready_for_review, review_requested, labeled, unlabeled, closed]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -10,32 +10,9 @@ permissions:
|
||||
issues: write
|
||||
|
||||
jobs:
|
||||
jira-sync-pr-opened:
|
||||
if: github.event.action == 'opened'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_opened.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
jira-sync-in-review:
|
||||
if: github.event.action == 'ready_for_review' || github.event.action == 'review_requested'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_in_review.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
jira-sync-add-label:
|
||||
if: github.event.action == 'labeled'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_add_label.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
jira-status-remove-label:
|
||||
if: github.event.action == 'unlabeled'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_remove_label.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
jira-status-pr-closed:
|
||||
if: github.event.action == 'closed'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_closed.yml@main
|
||||
jira-sync:
|
||||
uses: scylladb/github-automation/.github/workflows/main_pr_events_jira_sync.yml@main
|
||||
with:
|
||||
caller_action: ${{ github.event.action }}
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
2
.github/workflows/trigger-scylla-ci.yaml
vendored
2
.github/workflows/trigger-scylla-ci.yaml
vendored
@@ -1,4 +1,6 @@
|
||||
name: Trigger Scylla CI Route
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
on:
|
||||
issue_comment:
|
||||
|
||||
3
.github/workflows/trigger_jenkins.yaml
vendored
3
.github/workflows/trigger_jenkins.yaml
vendored
@@ -1,5 +1,8 @@
|
||||
name: Trigger next gating
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
|
||||
@@ -3463,7 +3463,11 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
|
||||
if (should_add_wcu) {
|
||||
rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
|
||||
}
|
||||
_stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
auto duration = std::chrono::steady_clock::now() - start_time;
|
||||
_stats.api_operations.batch_write_item_latency.mark(duration);
|
||||
for (const auto& w : per_table_wcu) {
|
||||
w.first->api_operations.batch_write_item_latency.mark(duration);
|
||||
}
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
@@ -4974,7 +4978,12 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
|
||||
if (!some_succeeded && eptr) {
|
||||
co_await coroutine::return_exception_ptr(std::move(eptr));
|
||||
}
|
||||
_stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
auto duration = std::chrono::steady_clock::now() - start_time;
|
||||
_stats.api_operations.batch_get_item_latency.mark(duration);
|
||||
for (const table_requests& rs : requests) {
|
||||
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
|
||||
per_table_stats->api_operations.batch_get_item_latency.mark(duration);
|
||||
}
|
||||
if (is_big(response)) {
|
||||
co_return make_streamed(std::move(response));
|
||||
} else {
|
||||
|
||||
@@ -1295,6 +1295,45 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_compaction",
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Trigger compaction of the key-value storage",
|
||||
"type":"void",
|
||||
"nickname":"logstor_compaction",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"major",
|
||||
"description":"When true, perform a major compaction",
|
||||
"required":false,
|
||||
"allowMultiple":false,
|
||||
"type":"boolean",
|
||||
"paramType":"query"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_flush",
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Trigger flush of logstor storage",
|
||||
"type":"void",
|
||||
"nickname":"logstor_flush",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/active_repair/",
|
||||
"operations":[
|
||||
@@ -3229,6 +3268,38 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_info",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Logstor segment information for one table",
|
||||
"type":"table_logstor_info",
|
||||
"nickname":"logstor_info",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"keyspace",
|
||||
"description":"The keyspace",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"table",
|
||||
"description":"table name",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/retrain_dict",
|
||||
"operations":[
|
||||
@@ -3637,6 +3708,47 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"logstor_hist_bucket":{
|
||||
"id":"logstor_hist_bucket",
|
||||
"properties":{
|
||||
"bucket":{
|
||||
"type":"long"
|
||||
},
|
||||
"count":{
|
||||
"type":"long"
|
||||
},
|
||||
"min_data_size":{
|
||||
"type":"long"
|
||||
},
|
||||
"max_data_size":{
|
||||
"type":"long"
|
||||
}
|
||||
}
|
||||
},
|
||||
"table_logstor_info":{
|
||||
"id":"table_logstor_info",
|
||||
"description":"Per-table logstor segment distribution",
|
||||
"properties":{
|
||||
"keyspace":{
|
||||
"type":"string"
|
||||
},
|
||||
"table":{
|
||||
"type":"string"
|
||||
},
|
||||
"compaction_groups":{
|
||||
"type":"long"
|
||||
},
|
||||
"segments":{
|
||||
"type":"long"
|
||||
},
|
||||
"data_size_histogram":{
|
||||
"type":"array",
|
||||
"items":{
|
||||
"$ref":"logstor_hist_bucket"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"tablet_repair_result":{
|
||||
"id":"tablet_repair_result",
|
||||
"description":"Tablet repair result",
|
||||
|
||||
@@ -209,6 +209,21 @@
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/system/chosen_sstable_version",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Get sstable version currently chosen for use in new sstables",
|
||||
"type":"string",
|
||||
"nickname":"get_chosen_sstable_version",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -122,9 +122,9 @@ future<> unset_thrift_controller(http_context& ctx) {
|
||||
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_thrift_controller(ctx, r); });
|
||||
}
|
||||
|
||||
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
|
||||
return ctx.http_server.set_routes([&ctx, &ss, &group0_client] (routes& r) {
|
||||
set_storage_service(ctx, r, ss, group0_client);
|
||||
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
|
||||
return ctx.http_server.set_routes([&ctx, &ss, &ssc, &group0_client] (routes& r) {
|
||||
set_storage_service(ctx, r, ss, ssc, group0_client);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -98,7 +98,7 @@ future<> set_server_config(http_context& ctx, db::config& cfg);
|
||||
future<> unset_server_config(http_context& ctx);
|
||||
future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snitch);
|
||||
future<> unset_server_snitch(http_context& ctx);
|
||||
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
|
||||
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>&, service::raft_group0_client&);
|
||||
future<> unset_server_storage_service(http_context& ctx);
|
||||
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
|
||||
future<> unset_server_client_routes(http_context& ctx);
|
||||
|
||||
@@ -18,7 +18,9 @@
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/estimated_histogram.hh"
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include "db/data_listeners.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "storage_service.hh"
|
||||
#include "compaction/compaction_manager.hh"
|
||||
#include "unimplemented.hh"
|
||||
@@ -342,6 +344,56 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
|
||||
return ret;
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_toppartitions_generic(sharded<replica::database>& db, std::unique_ptr<http::request> req) {
|
||||
bool filters_provided = false;
|
||||
|
||||
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
|
||||
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
table_filters.emplace(parse_fully_qualified_cf_name(filter));
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_set<sstring> keyspace_filters {};
|
||||
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
keyspace_filters.emplace(std::move(filter));
|
||||
}
|
||||
}
|
||||
|
||||
// when the query is empty return immediately
|
||||
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
|
||||
apilog.debug("toppartitions query: processing results");
|
||||
cf::toppartitions_query_results results;
|
||||
|
||||
results.read_cardinality = 0;
|
||||
results.write_cardinality = 0;
|
||||
|
||||
return make_ready_future<json::json_return_type>(results);
|
||||
}
|
||||
|
||||
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
|
||||
api::req_param<unsigned> capacity(*req, "capacity", 256);
|
||||
api::req_param<unsigned> list_size(*req, "list_size", 10);
|
||||
|
||||
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
|
||||
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
|
||||
|
||||
return seastar::do_with(db::toppartitions_query(db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
|
||||
return run_toppartitions_query(q);
|
||||
});
|
||||
}
|
||||
|
||||
void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
|
||||
cf::get_column_family_name.set(r, [&db] (const_req req){
|
||||
std::vector<sstring> res;
|
||||
@@ -1047,6 +1099,10 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
|
||||
});
|
||||
});
|
||||
|
||||
ss::toppartitions_generic.set(r, [&db] (std::unique_ptr<http::request> req) {
|
||||
return rest_toppartitions_generic(db, std::move(req));
|
||||
});
|
||||
|
||||
cf::force_major_compaction.set(r, [&ctx, &db](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
|
||||
if (!req->get_query_param("split_output").empty()) {
|
||||
fail(unimplemented::cause::API);
|
||||
@@ -1213,6 +1269,7 @@ void unset_column_family(http_context& ctx, routes& r) {
|
||||
cf::get_sstable_count_per_level.unset(r);
|
||||
cf::get_sstables_for_key.unset(r);
|
||||
cf::toppartitions.unset(r);
|
||||
ss::toppartitions_generic.unset(r);
|
||||
cf::force_major_compaction.unset(r);
|
||||
ss::get_load.unset(r);
|
||||
ss::get_metrics_load.unset(r);
|
||||
|
||||
@@ -17,9 +17,7 @@
|
||||
#include "gms/feature_service.hh"
|
||||
#include "schema/schema_builder.hh"
|
||||
#include "sstables/sstables_manager.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <time.h>
|
||||
#include <algorithm>
|
||||
@@ -612,56 +610,6 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
|
||||
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
bool filters_provided = false;
|
||||
|
||||
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
|
||||
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
table_filters.emplace(parse_fully_qualified_cf_name(filter));
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_set<sstring> keyspace_filters {};
|
||||
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
keyspace_filters.emplace(std::move(filter));
|
||||
}
|
||||
}
|
||||
|
||||
// when the query is empty return immediately
|
||||
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
|
||||
apilog.debug("toppartitions query: processing results");
|
||||
httpd::column_family_json::toppartitions_query_results results;
|
||||
|
||||
results.read_cardinality = 0;
|
||||
results.write_cardinality = 0;
|
||||
|
||||
return make_ready_future<json::json_return_type>(results);
|
||||
}
|
||||
|
||||
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
|
||||
api::req_param<unsigned> capacity(*req, "capacity", 256);
|
||||
api::req_param<unsigned> list_size(*req, "list_size", 10);
|
||||
|
||||
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
|
||||
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
|
||||
|
||||
return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
|
||||
return run_toppartitions_query(q);
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
json::json_return_type
|
||||
rest_get_release_version(sharded<service::storage_service>& ss, const_req& req) {
|
||||
@@ -835,9 +783,31 @@ rest_force_keyspace_flush(http_context& ctx, std::unique_ptr<http::request> req)
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_decommission(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
|
||||
rest_logstor_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
bool major = false;
|
||||
if (auto major_param = req->get_query_param("major"); !major_param.empty()) {
|
||||
major = validate_bool(major_param);
|
||||
}
|
||||
apilog.info("logstor_compaction: major={}", major);
|
||||
auto& db = ctx.db;
|
||||
co_await replica::database::trigger_logstor_compaction_on_all_shards(db, major);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_logstor_flush(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
apilog.info("logstor_flush");
|
||||
auto& db = ctx.db;
|
||||
co_await replica::database::flush_logstor_separator_on_all_shards(db);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_decommission(sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, std::unique_ptr<http::request> req) {
|
||||
apilog.info("decommission");
|
||||
return ss.local().decommission().then([] {
|
||||
return ss.local().decommission(ssc).then([] {
|
||||
return make_ready_future<json::json_return_type>(json_void());
|
||||
});
|
||||
}
|
||||
@@ -1553,6 +1523,54 @@ rest_sstable_info(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_logstor_info(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
auto keyspace = api::req_param<sstring>(*req, "keyspace", {}).value;
|
||||
auto table = api::req_param<sstring>(*req, "table", {}).value;
|
||||
if (table.empty()) {
|
||||
table = api::req_param<sstring>(*req, "cf", {}).value;
|
||||
}
|
||||
|
||||
if (keyspace.empty()) {
|
||||
throw bad_param_exception("The query parameter 'keyspace' is required");
|
||||
}
|
||||
if (table.empty()) {
|
||||
throw bad_param_exception("The query parameter 'table' is required");
|
||||
}
|
||||
|
||||
keyspace = validate_keyspace(ctx, keyspace);
|
||||
auto tid = validate_table(ctx.db.local(), keyspace, table);
|
||||
|
||||
auto& cf = ctx.db.local().find_column_family(tid);
|
||||
if (!cf.uses_logstor()) {
|
||||
throw bad_param_exception(fmt::format("Table {}.{} does not use logstor", keyspace, table));
|
||||
}
|
||||
|
||||
return do_with(replica::logstor::table_segment_stats{}, [keyspace = std::move(keyspace), table = std::move(table), tid, &ctx] (replica::logstor::table_segment_stats& merged_stats) {
|
||||
return ctx.db.map_reduce([&merged_stats](replica::logstor::table_segment_stats&& shard_stats) {
|
||||
merged_stats += shard_stats;
|
||||
}, [tid](const replica::database& db) {
|
||||
return db.get_logstor_table_segment_stats(tid);
|
||||
}).then([&merged_stats, keyspace = std::move(keyspace), table = std::move(table)] {
|
||||
ss::table_logstor_info result;
|
||||
result.keyspace = keyspace;
|
||||
result.table = table;
|
||||
result.compaction_groups = merged_stats.compaction_group_count;
|
||||
result.segments = merged_stats.segment_count;
|
||||
|
||||
for (const auto& bucket : merged_stats.histogram) {
|
||||
ss::logstor_hist_bucket hist;
|
||||
hist.count = bucket.count;
|
||||
hist.max_data_size = bucket.max_data_size;
|
||||
result.data_size_histogram.push(std::move(hist));
|
||||
}
|
||||
|
||||
return make_ready_future<json::json_return_type>(stream_object(result));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::raft_group0_client& group0_client, std::unique_ptr<http::request> req) {
|
||||
@@ -1782,9 +1800,8 @@ rest_bind(FuncType func, BindArgs&... args) {
|
||||
return std::bind_front(func, std::ref(args)...);
|
||||
}
|
||||
|
||||
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
|
||||
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
|
||||
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
|
||||
ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
|
||||
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
|
||||
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
|
||||
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
|
||||
@@ -1799,7 +1816,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
|
||||
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
|
||||
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
|
||||
ss::decommission.set(r, rest_bind(rest_decommission, ss));
|
||||
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
|
||||
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
|
||||
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
|
||||
ss::move.set(r, rest_bind(rest_move, ss));
|
||||
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
|
||||
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
|
||||
@@ -1848,6 +1867,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
|
||||
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
|
||||
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
|
||||
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
|
||||
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
|
||||
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
|
||||
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
|
||||
@@ -1864,7 +1884,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
|
||||
void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::get_token_endpoint.unset(r);
|
||||
ss::toppartitions_generic.unset(r);
|
||||
ss::get_release_version.unset(r);
|
||||
ss::get_scylla_release_version.unset(r);
|
||||
ss::get_schema_version.unset(r);
|
||||
@@ -1878,6 +1897,8 @@ void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::reset_cleanup_needed.unset(r);
|
||||
ss::force_flush.unset(r);
|
||||
ss::force_keyspace_flush.unset(r);
|
||||
ss::logstor_compaction.unset(r);
|
||||
ss::logstor_flush.unset(r);
|
||||
ss::decommission.unset(r);
|
||||
ss::move.unset(r);
|
||||
ss::remove_node.unset(r);
|
||||
@@ -1925,6 +1946,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::get_ownership.unset(r);
|
||||
ss::get_effective_ownership.unset(r);
|
||||
ss::sstable_info.unset(r);
|
||||
ss::logstor_info.unset(r);
|
||||
ss::reload_raft_topology_state.unset(r);
|
||||
ss::upgrade_to_raft_topology.unset(r);
|
||||
ss::raft_topology_upgrade_status.unset(r);
|
||||
@@ -2141,6 +2163,7 @@ void unset_snapshot(http_context& ctx, routes& r) {
|
||||
ss::start_backup.unset(r);
|
||||
cf::get_true_snapshots_size.unset(r);
|
||||
cf::get_all_true_snapshots_size.unset(r);
|
||||
ss::decommission.unset(r);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -66,7 +66,7 @@ struct scrub_info {
|
||||
|
||||
scrub_info parse_scrub_options(const http_context& ctx, std::unique_ptr<http::request> req);
|
||||
|
||||
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, service::raft_group0_client&);
|
||||
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>&, service::raft_group0_client&);
|
||||
void unset_storage_service(http_context& ctx, httpd::routes& r);
|
||||
void set_sstables_loader(http_context& ctx, httpd::routes& r, sharded<sstables_loader>& sst_loader);
|
||||
void unset_sstables_loader(http_context& ctx, httpd::routes& r);
|
||||
|
||||
@@ -190,6 +190,13 @@ void set_system(http_context& ctx, routes& r) {
|
||||
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
|
||||
});
|
||||
});
|
||||
|
||||
hs::get_chosen_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return smp::submit_to(0, [&ctx] {
|
||||
auto format = ctx.db.local().get_user_sstables_manager().get_preferred_sstable_version();
|
||||
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ void cache::set_permission_loader(permission_loader_func loader) {
|
||||
_permission_loader = std::move(loader);
|
||||
}
|
||||
|
||||
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
|
||||
lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const noexcept {
|
||||
auto it = _roles.find(role);
|
||||
if (it == _roles.end()) {
|
||||
return {};
|
||||
@@ -55,6 +55,16 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
|
||||
return it->second;
|
||||
}
|
||||
|
||||
void cache::for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const {
|
||||
for (const auto& [name, record] : _roles) {
|
||||
func(name, *record);
|
||||
}
|
||||
}
|
||||
|
||||
size_t cache::roles_count() const noexcept {
|
||||
return _roles.size();
|
||||
}
|
||||
|
||||
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
|
||||
std::unordered_map<resource, permission_set>* perms_cache;
|
||||
lw_shared_ptr<role_record> role_ptr;
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <string_view>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
|
||||
@@ -19,7 +20,7 @@
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/metrics_registration.hh>
|
||||
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
#include "absl-flat_hash_map.hh"
|
||||
|
||||
#include "auth/permission.hh"
|
||||
#include "auth/common.hh"
|
||||
@@ -42,8 +43,8 @@ public:
|
||||
std::unordered_set<role_name_t> member_of;
|
||||
std::unordered_set<role_name_t> members;
|
||||
sstring salted_hash;
|
||||
std::unordered_map<sstring, sstring> attributes;
|
||||
std::unordered_map<sstring, permission_set> permissions;
|
||||
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
|
||||
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
|
||||
private:
|
||||
friend cache;
|
||||
// cached permissions include effects of role's inheritance
|
||||
@@ -52,7 +53,7 @@ public:
|
||||
};
|
||||
|
||||
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
|
||||
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
|
||||
lw_shared_ptr<const role_record> get(std::string_view role) const noexcept;
|
||||
void set_permission_loader(permission_loader_func loader);
|
||||
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
|
||||
future<> prune(const resource& r);
|
||||
@@ -61,8 +62,15 @@ public:
|
||||
future<> load_roles(std::unordered_set<role_name_t> roles);
|
||||
static bool includes_table(const table_id&) noexcept;
|
||||
|
||||
// Returns the number of roles in the cache.
|
||||
size_t roles_count() const noexcept;
|
||||
|
||||
// The callback doesn't suspend (no co_await) so it observes the state
|
||||
// of the cache atomically.
|
||||
void for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const;
|
||||
|
||||
private:
|
||||
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
|
||||
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>, sstring_hash, sstring_eq>;
|
||||
roles_map _roles;
|
||||
// anonymous permissions map exists mainly due to compatibility with
|
||||
// higher layers which use role_or_anonymous to get permissions.
|
||||
|
||||
@@ -32,7 +32,7 @@ namespace {
|
||||
logger mylog{"ldap_role_manager"}; // `log` is taken by math.
|
||||
|
||||
struct url_desc_deleter {
|
||||
void operator()(LDAPURLDesc *p) {
|
||||
void operator()(LDAPURLDesc* p) {
|
||||
ldap_free_urldesc(p);
|
||||
}
|
||||
};
|
||||
@@ -40,7 +40,7 @@ struct url_desc_deleter {
|
||||
using url_desc_ptr = std::unique_ptr<LDAPURLDesc, url_desc_deleter>;
|
||||
|
||||
url_desc_ptr parse_url(std::string_view url) {
|
||||
LDAPURLDesc *desc = nullptr;
|
||||
LDAPURLDesc* desc = nullptr;
|
||||
if (ldap_url_parse(url.data(), &desc)) {
|
||||
mylog.error("error in ldap_url_parse({})", url);
|
||||
}
|
||||
@@ -53,8 +53,12 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
|
||||
mylog.debug("Analyzing search results");
|
||||
for (auto e = ldap_first_entry(ld, res); e; e = ldap_next_entry(ld, e)) {
|
||||
struct deleter {
|
||||
void operator()(berval** p) { ldap_value_free_len(p); }
|
||||
void operator()(char* p) { ldap_memfree(p); }
|
||||
void operator()(berval** p) {
|
||||
ldap_value_free_len(p);
|
||||
}
|
||||
void operator()(char* p) {
|
||||
ldap_memfree(p);
|
||||
}
|
||||
};
|
||||
const std::unique_ptr<char, deleter> dname(ldap_get_dn(ld, e));
|
||||
mylog.debug("Analyzing entry {}", dname.get());
|
||||
@@ -75,32 +79,29 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
|
||||
|
||||
namespace auth {
|
||||
|
||||
ldap_role_manager::ldap_role_manager(
|
||||
std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
|
||||
uint32_t permissions_update_interval_in_ms,
|
||||
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
|
||||
cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
|
||||
: _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
|
||||
, _bind_password(bind_password)
|
||||
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
|
||||
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
|
||||
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
|
||||
, _cache(cache)
|
||||
, _cache_pruner(make_ready_future<>()) {
|
||||
ldap_role_manager::ldap_role_manager(std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
|
||||
uint32_t permissions_update_interval_in_ms, utils::observer<uint32_t> permissions_update_interval_in_ms_observer, cql3::query_processor& qp,
|
||||
::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
|
||||
: _std_mgr(qp, rg0c, mm, cache)
|
||||
, _group0_client(rg0c)
|
||||
, _query_template(query_template)
|
||||
, _target_attr(target_attr)
|
||||
, _bind_name(bind_name)
|
||||
, _bind_password(bind_password)
|
||||
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
|
||||
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
|
||||
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
|
||||
, _cache(cache)
|
||||
, _cache_pruner(make_ready_future<>()) {
|
||||
}
|
||||
|
||||
ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
|
||||
: ldap_role_manager(
|
||||
qp.db().get_config().ldap_url_template(),
|
||||
qp.db().get_config().ldap_attr_role(),
|
||||
qp.db().get_config().ldap_bind_dn(),
|
||||
qp.db().get_config().ldap_bind_passwd(),
|
||||
qp.db().get_config().permissions_update_interval_in_ms(),
|
||||
qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
|
||||
qp,
|
||||
rg0c,
|
||||
mm,
|
||||
cache) {
|
||||
: ldap_role_manager(qp.db().get_config().ldap_url_template(), qp.db().get_config().ldap_attr_role(), qp.db().get_config().ldap_bind_dn(),
|
||||
qp.db().get_config().ldap_bind_passwd(), qp.db().get_config().permissions_update_interval_in_ms(),
|
||||
qp.db().get_config().permissions_update_interval_in_ms.observe([this](const uint32_t& v) {
|
||||
_permissions_update_interval_in_ms = v;
|
||||
}),
|
||||
qp, rg0c, mm, cache) {
|
||||
}
|
||||
|
||||
std::string_view ldap_role_manager::qualified_java_name() const noexcept {
|
||||
@@ -113,17 +114,16 @@ const resource_set& ldap_role_manager::protected_resources() const {
|
||||
|
||||
future<> ldap_role_manager::start() {
|
||||
if (!parse_url(get_url("dummy-user"))) { // Just need host and port -- any user should do.
|
||||
return make_exception_future(
|
||||
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
|
||||
return make_exception_future(std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
|
||||
}
|
||||
_cache_pruner = futurize_invoke([this] () -> future<> {
|
||||
_cache_pruner = futurize_invoke([this]() -> future<> {
|
||||
while (true) {
|
||||
try {
|
||||
co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
|
||||
} catch (const seastar::sleep_aborted&) {
|
||||
co_return; // ignore
|
||||
}
|
||||
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
|
||||
co_await _cache.container().invoke_on_all([](cache& c) -> future<> {
|
||||
try {
|
||||
co_await c.reload_all_permissions();
|
||||
} catch (...) {
|
||||
@@ -165,7 +165,7 @@ future<conn_ptr> ldap_role_manager::connect() {
|
||||
future<conn_ptr> ldap_role_manager::reconnect() {
|
||||
unsigned retries_left = 5;
|
||||
using namespace std::literals::chrono_literals;
|
||||
conn_ptr conn = co_await exponential_backoff_retry::do_until_value(1s, 32s, _as, [this, &retries_left] () -> future<std::optional<conn_ptr>> {
|
||||
conn_ptr conn = co_await exponential_backoff_retry::do_until_value(1s, 32s, _as, [this, &retries_left]() -> future<std::optional<conn_ptr>> {
|
||||
if (!retries_left) {
|
||||
co_return conn_ptr{};
|
||||
}
|
||||
@@ -188,11 +188,13 @@ future<conn_ptr> ldap_role_manager::reconnect() {
|
||||
|
||||
future<> ldap_role_manager::stop() {
|
||||
_as.request_abort();
|
||||
return std::move(_cache_pruner).then([this] {
|
||||
return _std_mgr.stop();
|
||||
}).then([this] {
|
||||
return _connection_factory.stop();
|
||||
});
|
||||
return std::move(_cache_pruner)
|
||||
.then([this] {
|
||||
return _std_mgr.stop();
|
||||
})
|
||||
.then([this] {
|
||||
return _connection_factory.stop();
|
||||
});
|
||||
}
|
||||
|
||||
future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {
|
||||
@@ -221,43 +223,42 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
|
||||
if (!desc) {
|
||||
return make_exception_future<role_set>(std::runtime_error(format("Error parsing URL {}", url)));
|
||||
}
|
||||
return _connection_factory.with_connection([this, desc = std::move(desc), grantee_name_ = sstring(grantee_name)]
|
||||
(ldap_connection& conn) -> future<role_set> {
|
||||
sstring grantee_name = std::move(grantee_name_);
|
||||
ldap_msg_ptr res = co_await conn.search(desc->lud_dn, desc->lud_scope, desc->lud_filter, desc->lud_attrs,
|
||||
/*attrsonly=*/0, /*serverctrls=*/nullptr, /*clientctrls=*/nullptr,
|
||||
/*timeout=*/nullptr, /*sizelimit=*/0);
|
||||
mylog.trace("query_granted: got search results");
|
||||
const auto mtype = ldap_msgtype(res.get());
|
||||
if (mtype != LDAP_RES_SEARCH_ENTRY && mtype != LDAP_RES_SEARCH_RESULT && mtype != LDAP_RES_SEARCH_REFERENCE) {
|
||||
mylog.error("ldap search yielded result {} of type {}", static_cast<const void*>(res.get()), mtype);
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error("ldap_role_manager: search result has wrong type")));
|
||||
}
|
||||
std::vector<sstring> values = get_attr_values(conn.get_ldap(), res.get(), _target_attr.c_str());
|
||||
auth::role_set valid_roles{grantee_name};
|
||||
|
||||
// Each value is a role to be granted.
|
||||
co_await parallel_for_each(values, [this, &valid_roles] (const sstring& ldap_role) {
|
||||
return _std_mgr.exists(ldap_role).then([&valid_roles, &ldap_role] (bool exists) {
|
||||
if (exists) {
|
||||
valid_roles.insert(ldap_role);
|
||||
} else {
|
||||
mylog.error("unrecognized role received from LDAP: {}", ldap_role);
|
||||
return _connection_factory.with_connection(
|
||||
[this, desc = std::move(desc), grantee_name_ = sstring(grantee_name)](ldap_connection& conn) -> future<role_set> {
|
||||
sstring grantee_name = std::move(grantee_name_);
|
||||
ldap_msg_ptr res = co_await conn.search(desc->lud_dn, desc->lud_scope, desc->lud_filter, desc->lud_attrs,
|
||||
/*attrsonly=*/0, /*serverctrls=*/nullptr, /*clientctrls=*/nullptr,
|
||||
/*timeout=*/nullptr, /*sizelimit=*/0);
|
||||
mylog.trace("query_granted: got search results");
|
||||
const auto mtype = ldap_msgtype(res.get());
|
||||
if (mtype != LDAP_RES_SEARCH_ENTRY && mtype != LDAP_RES_SEARCH_RESULT && mtype != LDAP_RES_SEARCH_REFERENCE) {
|
||||
mylog.error("ldap search yielded result {} of type {}", static_cast<const void*>(res.get()), mtype);
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error("ldap_role_manager: search result has wrong type")));
|
||||
}
|
||||
});
|
||||
});
|
||||
std::vector<sstring> values = get_attr_values(conn.get_ldap(), res.get(), _target_attr.c_str());
|
||||
auth::role_set valid_roles{grantee_name};
|
||||
|
||||
co_return std::move(valid_roles);
|
||||
});
|
||||
// Each value is a role to be granted.
|
||||
co_await parallel_for_each(values, [this, &valid_roles](const sstring& ldap_role) {
|
||||
return _std_mgr.exists(ldap_role).then([&valid_roles, &ldap_role](bool exists) {
|
||||
if (exists) {
|
||||
valid_roles.insert(ldap_role);
|
||||
} else {
|
||||
mylog.error("unrecognized role received from LDAP: {}", ldap_role);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
co_return std::move(valid_roles);
|
||||
});
|
||||
}
|
||||
|
||||
future<role_to_directly_granted_map>
|
||||
ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
|
||||
future<role_to_directly_granted_map> ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
|
||||
role_to_directly_granted_map result;
|
||||
auto roles = co_await query_all(qs);
|
||||
for (auto& role: roles) {
|
||||
for (auto& role : roles) {
|
||||
auto granted_set = co_await query_granted(role, recursive_role_query::no);
|
||||
for (auto& granted: granted_set) {
|
||||
for (auto& granted : granted_set) {
|
||||
if (granted != role) {
|
||||
result.insert({role, granted});
|
||||
}
|
||||
@@ -271,7 +272,7 @@ future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
|
||||
}
|
||||
|
||||
future<> ldap_role_manager::create_role(std::string_view role_name) {
|
||||
return smp::submit_to(0, [this, role_name] () -> future<> {
|
||||
return smp::submit_to(0, [this, role_name]() -> future<> {
|
||||
int retries = 10;
|
||||
while (true) {
|
||||
auto guard = co_await _group0_client.start_operation(_as, ::service::raft_timeout{});
|
||||
@@ -283,8 +284,8 @@ future<> ldap_role_manager::create_role(std::string_view role_name) {
|
||||
} catch (const role_already_exists&) {
|
||||
// ok
|
||||
} catch (const ::service::group0_concurrent_modification& ex) {
|
||||
mylog.warn("Failed to auto-create role \"{}\" due to guard conflict.{}.",
|
||||
role_name, retries ? " Retrying" : " Number of retries exceeded, giving up");
|
||||
mylog.warn("Failed to auto-create role \"{}\" due to guard conflict.{}.", role_name,
|
||||
retries ? " Retrying" : " Number of retries exceeded, giving up");
|
||||
if (retries--) {
|
||||
continue;
|
||||
}
|
||||
@@ -329,8 +330,7 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
|
||||
return _std_mgr.can_login(role_name);
|
||||
}
|
||||
|
||||
future<std::optional<sstring>> ldap_role_manager::get_attribute(
|
||||
std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
|
||||
future<std::optional<sstring>> ldap_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
|
||||
return _std_mgr.get_attribute(role_name, attribute_name, qs);
|
||||
}
|
||||
|
||||
|
||||
37
auth/maintenance_socket_authorizer.hh
Normal file
37
auth/maintenance_socket_authorizer.hh
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*
|
||||
* Modified by ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "auth/default_authorizer.hh"
|
||||
#include "auth/permission.hh"
|
||||
|
||||
namespace auth {
|
||||
|
||||
// maintenance_socket_authorizer is used for clients connecting to the
|
||||
// maintenance socket. It grants all permissions unconditionally (like
|
||||
// AllowAllAuthorizer) while still supporting grant/revoke operations
|
||||
// (delegated to the underlying CassandraAuthorizer / default_authorizer).
|
||||
class maintenance_socket_authorizer : public default_authorizer {
|
||||
public:
|
||||
using default_authorizer::default_authorizer;
|
||||
|
||||
~maintenance_socket_authorizer() override = default;
|
||||
|
||||
future<> start() override {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
|
||||
return make_ready_future<permission_set>(permissions::ALL);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace auth
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "auth/default_authorizer.hh"
|
||||
#include "auth/ldap_role_manager.hh"
|
||||
#include "auth/maintenance_socket_authenticator.hh"
|
||||
#include "auth/maintenance_socket_authorizer.hh"
|
||||
#include "auth/maintenance_socket_role_manager.hh"
|
||||
#include "auth/password_authenticator.hh"
|
||||
#include "auth/role_or_anonymous.hh"
|
||||
@@ -866,6 +867,12 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
|
||||
};
|
||||
}
|
||||
|
||||
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp) {
|
||||
return [&qp] {
|
||||
return std::make_unique<maintenance_socket_authorizer>(qp.local());
|
||||
};
|
||||
}
|
||||
|
||||
role_manager_factory make_maintenance_socket_role_manager_factory(
|
||||
sharded<cql3::query_processor>& qp,
|
||||
::service::raft_group0_client& g0,
|
||||
|
||||
@@ -434,6 +434,11 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
|
||||
sharded<::service::migration_manager>& mm,
|
||||
sharded<cache>& cache);
|
||||
|
||||
/// Creates a factory for the maintenance socket authorizer.
|
||||
/// This authorizer is not config-selectable and is only used for the maintenance socket.
|
||||
/// It grants all permissions unconditionally while delegating grant/revoke to the default authorizer.
|
||||
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp);
|
||||
|
||||
/// Creates a factory for the maintenance socket role manager.
|
||||
/// This role manager is not config-selectable and is only used for the maintenance socket.
|
||||
role_manager_factory make_maintenance_socket_role_manager_factory(
|
||||
|
||||
@@ -44,13 +44,12 @@ namespace auth {
|
||||
static logging::logger log("standard_role_manager");
|
||||
|
||||
future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
|
||||
auto name = sstring(role_name);
|
||||
auto role = _cache.get(name);
|
||||
auto role = _cache.get(role_name);
|
||||
if (!role) {
|
||||
return make_ready_future<std::optional<record>>(std::nullopt);
|
||||
}
|
||||
return make_ready_future<std::optional<record>>(std::make_optional(record{
|
||||
.name = std::move(name),
|
||||
.name = sstring(role_name),
|
||||
.is_superuser = role->is_superuser,
|
||||
.can_login = role->can_login,
|
||||
.member_of = role->member_of
|
||||
@@ -393,51 +392,21 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
|
||||
}
|
||||
|
||||
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
|
||||
const sstring query = seastar::format("SELECT * FROM {}.{}",
|
||||
db::system_keyspace::NAME,
|
||||
ROLE_MEMBERS_CF);
|
||||
|
||||
const auto results = co_await _qp.execute_internal(
|
||||
query,
|
||||
db::consistency_level::ONE,
|
||||
qs,
|
||||
cql3::query_processor::cache_internal::yes);
|
||||
|
||||
role_to_directly_granted_map roles_map;
|
||||
std::transform(
|
||||
results->begin(),
|
||||
results->end(),
|
||||
std::inserter(roles_map, roles_map.begin()),
|
||||
[] (const cql3::untyped_result_set_row& row) {
|
||||
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
|
||||
);
|
||||
|
||||
_cache.for_each_role([&roles_map] (const cache::role_name_t& name, const cache::role_record& record) {
|
||||
for (const auto& granted_role : record.member_of) {
|
||||
roles_map.emplace(name, granted_role);
|
||||
}
|
||||
});
|
||||
co_return roles_map;
|
||||
}
|
||||
|
||||
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
|
||||
const sstring query = seastar::format("SELECT {} FROM {}.{}",
|
||||
meta::roles_table::role_col_name,
|
||||
db::system_keyspace::NAME,
|
||||
meta::roles_table::name);
|
||||
|
||||
// To avoid many copies of a view.
|
||||
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
|
||||
|
||||
const auto results = co_await _qp.execute_internal(
|
||||
query,
|
||||
db::consistency_level::LOCAL_ONE,
|
||||
qs,
|
||||
cql3::query_processor::cache_internal::yes);
|
||||
|
||||
role_set roles;
|
||||
std::transform(
|
||||
results->begin(),
|
||||
results->end(),
|
||||
std::inserter(roles, roles.begin()),
|
||||
[] (const cql3::untyped_result_set_row& row) {
|
||||
return row.get_as<sstring>(role_col_name_string);}
|
||||
);
|
||||
roles.reserve(_cache.roles_count());
|
||||
_cache.for_each_role([&roles] (const cache::role_name_t& name, const cache::role_record&) {
|
||||
roles.insert(name);
|
||||
});
|
||||
co_return roles;
|
||||
}
|
||||
|
||||
@@ -460,31 +429,26 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
|
||||
}
|
||||
|
||||
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
|
||||
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
|
||||
db::system_keyspace::NAME,
|
||||
ROLE_ATTRIBUTES_CF);
|
||||
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
|
||||
if (!result_set->empty()) {
|
||||
const cql3::untyped_result_set_row &row = result_set->one();
|
||||
co_return std::optional<sstring>(row.get_as<sstring>("value"));
|
||||
auto role = _cache.get(role_name);
|
||||
if (!role) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
co_return std::optional<sstring>{};
|
||||
auto it = role->attributes.find(attribute_name);
|
||||
if (it != role->attributes.end()) {
|
||||
co_return it->second;
|
||||
}
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
|
||||
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
|
||||
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
|
||||
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
|
||||
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
|
||||
if (att_val) {
|
||||
role_to_att_val.emplace(std::move(role), std::move(*att_val));
|
||||
}
|
||||
});
|
||||
}).then([&role_to_att_val] () {
|
||||
return make_ready_future<attribute_vals>(std::move(role_to_att_val));
|
||||
});
|
||||
});
|
||||
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
|
||||
attribute_vals result;
|
||||
_cache.for_each_role([&result, attribute_name] (const cache::role_name_t& name, const cache::role_record& record) {
|
||||
auto it = record.attributes.find(attribute_name);
|
||||
if (it != record.attributes.end()) {
|
||||
result.emplace(name, it->second);
|
||||
}
|
||||
});
|
||||
co_return result;
|
||||
}
|
||||
|
||||
future<> standard_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
|
||||
|
||||
219
cdc/split.cc
219
cdc/split.cc
@@ -76,14 +76,14 @@ struct partition_deletion {
|
||||
|
||||
using clustered_column_set = std::map<clustering_key, cdc::one_kind_column_set, clustering_key::less_compare>;
|
||||
|
||||
template<typename Container>
|
||||
template <typename Container>
|
||||
concept EntryContainer = requires(Container& container) {
|
||||
// Parenthesized due to https://bugs.llvm.org/show_bug.cgi?id=45088
|
||||
{ (container.atomic_entries) } -> std::same_as<std::vector<atomic_column_update>&>;
|
||||
{ (container.nonatomic_entries) } -> std::same_as<std::vector<nonatomic_column_update>&>;
|
||||
};
|
||||
|
||||
template<EntryContainer Container>
|
||||
template <EntryContainer Container>
|
||||
static void add_columns_affected_by_entries(cdc::one_kind_column_set& cset, const Container& cont) {
|
||||
for (const auto& entry : cont.atomic_entries) {
|
||||
cset.set(entry.id);
|
||||
@@ -134,7 +134,7 @@ struct batch {
|
||||
ret.emplace(clustering_key::make_empty(), all_columns);
|
||||
}
|
||||
|
||||
auto process_change_type = [&] (const auto& changes) {
|
||||
auto process_change_type = [&](const auto& changes) {
|
||||
for (const auto& change : changes) {
|
||||
auto& cset = ret[change.key];
|
||||
cset.resize(s.regular_columns_count());
|
||||
@@ -211,7 +211,9 @@ private:
|
||||
|
||||
public:
|
||||
extract_collection_visitor(column_id id, std::map<change_key_t, row_update>& updates)
|
||||
: _id(id), _updates(updates) {}
|
||||
: _id(id)
|
||||
, _updates(updates) {
|
||||
}
|
||||
|
||||
void collection_tombstone(const tombstone& t) {
|
||||
auto& entry = get_or_append_entry(t.timestamp + 1, gc_clock::duration(0));
|
||||
@@ -226,7 +228,9 @@ public:
|
||||
cell(key, c);
|
||||
}
|
||||
|
||||
constexpr bool finished() const { return false; }
|
||||
constexpr bool finished() const {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
/* Visits all cells and tombstones in a row, putting the encountered changes into buckets
|
||||
@@ -249,41 +253,46 @@ struct extract_row_visitor {
|
||||
|
||||
void collection_column(const column_definition& cdef, auto&& visit_collection) {
|
||||
visit(*cdef.type, make_visitor(
|
||||
[&] (const collection_type_impl& ctype) {
|
||||
struct collection_visitor : public extract_collection_visitor<collection_visitor> {
|
||||
data_type _value_type;
|
||||
[&](const collection_type_impl& ctype) {
|
||||
struct collection_visitor : public extract_collection_visitor<collection_visitor> {
|
||||
data_type _value_type;
|
||||
|
||||
collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
|
||||
: extract_collection_visitor<collection_visitor>(id, updates), _value_type(ctype.value_comparator()) {}
|
||||
collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
|
||||
: extract_collection_visitor<collection_visitor>(id, updates)
|
||||
, _value_type(ctype.value_comparator()) {
|
||||
}
|
||||
|
||||
data_type get_value_type(bytes_view) {
|
||||
return _value_type;
|
||||
}
|
||||
} v(cdef.id, _updates, ctype);
|
||||
data_type get_value_type(bytes_view) {
|
||||
return _value_type;
|
||||
}
|
||||
} v(cdef.id, _updates, ctype);
|
||||
|
||||
visit_collection(v);
|
||||
},
|
||||
[&] (const user_type_impl& utype) {
|
||||
struct udt_visitor : public extract_collection_visitor<udt_visitor> {
|
||||
const user_type_impl& _utype;
|
||||
visit_collection(v);
|
||||
},
|
||||
[&](const user_type_impl& utype) {
|
||||
struct udt_visitor : public extract_collection_visitor<udt_visitor> {
|
||||
const user_type_impl& _utype;
|
||||
|
||||
udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
|
||||
: extract_collection_visitor<udt_visitor>(id, updates), _utype(utype) {}
|
||||
udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
|
||||
: extract_collection_visitor<udt_visitor>(id, updates)
|
||||
, _utype(utype) {
|
||||
}
|
||||
|
||||
data_type get_value_type(bytes_view key) {
|
||||
return _utype.type(deserialize_field_index(key));
|
||||
}
|
||||
} v(cdef.id, _updates, utype);
|
||||
data_type get_value_type(bytes_view key) {
|
||||
return _utype.type(deserialize_field_index(key));
|
||||
}
|
||||
} v(cdef.id, _updates, utype);
|
||||
|
||||
visit_collection(v);
|
||||
},
|
||||
[&] (const abstract_type& o) {
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
|
||||
}
|
||||
));
|
||||
visit_collection(v);
|
||||
},
|
||||
[&](const abstract_type& o) {
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
|
||||
}));
|
||||
}
|
||||
|
||||
constexpr bool finished() const { return false; }
|
||||
constexpr bool finished() const {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
struct extract_changes_visitor {
|
||||
@@ -293,12 +302,8 @@ struct extract_changes_visitor {
|
||||
extract_row_visitor v;
|
||||
visit_row_cells(v);
|
||||
|
||||
for (auto& [ts_ttl, row_update]: v._updates) {
|
||||
_result[ts_ttl.first].static_updates.push_back({
|
||||
ts_ttl.second,
|
||||
std::move(row_update.atomic_entries),
|
||||
std::move(row_update.nonatomic_entries)
|
||||
});
|
||||
for (auto& [ts_ttl, row_update] : v._updates) {
|
||||
_result[ts_ttl.first].static_updates.push_back({ts_ttl.second, std::move(row_update.atomic_entries), std::move(row_update.nonatomic_entries)});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -319,24 +324,18 @@ struct extract_changes_visitor {
|
||||
} v;
|
||||
visit_row_cells(v);
|
||||
|
||||
for (auto& [ts_ttl, row_update]: v._updates) {
|
||||
for (auto& [ts_ttl, row_update] : v._updates) {
|
||||
// It is important that changes in the resulting `set_of_changes` are listed
|
||||
// in increasing TTL order. The reason is explained in a comment in cdc/log.cc,
|
||||
// search for "#6070".
|
||||
auto [ts, ttl] = ts_ttl;
|
||||
|
||||
if (v._marker && ts == v._marker_ts && ttl == v._marker_ttl) {
|
||||
_result[ts].clustered_inserts.push_back({
|
||||
ttl,
|
||||
ckey,
|
||||
*v._marker,
|
||||
std::move(row_update.atomic_entries),
|
||||
{}
|
||||
});
|
||||
_result[ts].clustered_inserts.push_back({ttl, ckey, *v._marker, std::move(row_update.atomic_entries), {}});
|
||||
|
||||
auto& cr_insert = _result[ts].clustered_inserts.back();
|
||||
bool clustered_update_exists = false;
|
||||
for (auto& nonatomic_up: row_update.nonatomic_entries) {
|
||||
for (auto& nonatomic_up : row_update.nonatomic_entries) {
|
||||
// Updating a collection column with an INSERT statement implies inserting a tombstone.
|
||||
//
|
||||
// For example, suppose that we have:
|
||||
@@ -362,12 +361,7 @@ struct extract_changes_visitor {
|
||||
cr_insert.nonatomic_entries.push_back(std::move(nonatomic_up));
|
||||
} else {
|
||||
if (!clustered_update_exists) {
|
||||
_result[ts].clustered_updates.push_back({
|
||||
ttl,
|
||||
ckey,
|
||||
{},
|
||||
{}
|
||||
});
|
||||
_result[ts].clustered_updates.push_back({ttl, ckey, {}, {}});
|
||||
|
||||
// Multiple iterations of this `for` loop (for different collection columns)
|
||||
// might want to put their `nonatomic_up`s into an UPDATE change;
|
||||
@@ -390,12 +384,7 @@ struct extract_changes_visitor {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
_result[ts].clustered_updates.push_back({
|
||||
ttl,
|
||||
ckey,
|
||||
std::move(row_update.atomic_entries),
|
||||
std::move(row_update.nonatomic_entries)
|
||||
});
|
||||
_result[ts].clustered_updates.push_back({ttl, ckey, std::move(row_update.atomic_entries), std::move(row_update.nonatomic_entries)});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -412,7 +401,9 @@ struct extract_changes_visitor {
|
||||
_result[t.timestamp].partition_deletions = partition_deletion{t};
|
||||
}
|
||||
|
||||
constexpr bool finished() const { return false; }
|
||||
constexpr bool finished() const {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
set_of_changes extract_changes(const mutation& m) {
|
||||
@@ -426,13 +417,23 @@ namespace cdc {
|
||||
struct find_timestamp_visitor {
|
||||
api::timestamp_type _ts = api::missing_timestamp;
|
||||
|
||||
bool finished() const { return _ts != api::missing_timestamp; }
|
||||
bool finished() const {
|
||||
return _ts != api::missing_timestamp;
|
||||
}
|
||||
|
||||
void visit(api::timestamp_type ts) { _ts = ts; }
|
||||
void visit(const atomic_cell_view& cell) { visit(cell.timestamp()); }
|
||||
void visit(api::timestamp_type ts) {
|
||||
_ts = ts;
|
||||
}
|
||||
void visit(const atomic_cell_view& cell) {
|
||||
visit(cell.timestamp());
|
||||
}
|
||||
|
||||
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
|
||||
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
|
||||
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void collection_tombstone(const tombstone& t) {
|
||||
// A collection tombstone with timestamp T can be created with:
|
||||
// UPDATE ks.t USING TIMESTAMP T + 1 SET X = null WHERE ...
|
||||
@@ -441,15 +442,33 @@ struct find_timestamp_visitor {
|
||||
// with cdc$time using timestamp T + 1 instead of T.
|
||||
visit(t.timestamp + 1);
|
||||
}
|
||||
void live_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
|
||||
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
|
||||
void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
|
||||
void marker(const row_marker& rm) { visit(rm.timestamp()); }
|
||||
void static_row_cells(auto&& visit_row_cells) { visit_row_cells(*this); }
|
||||
void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) { visit_row_cells(*this); }
|
||||
void clustered_row_delete(const clustering_key&, const tombstone& t) { visit(t.timestamp); }
|
||||
void range_delete(const range_tombstone& t) { visit(t.tomb.timestamp); }
|
||||
void partition_delete(const tombstone& t) { visit(t.timestamp); }
|
||||
void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void collection_column(const column_definition&, auto&& visit_collection) {
|
||||
visit_collection(*this);
|
||||
}
|
||||
void marker(const row_marker& rm) {
|
||||
visit(rm.timestamp());
|
||||
}
|
||||
void static_row_cells(auto&& visit_row_cells) {
|
||||
visit_row_cells(*this);
|
||||
}
|
||||
void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) {
|
||||
visit_row_cells(*this);
|
||||
}
|
||||
void clustered_row_delete(const clustering_key&, const tombstone& t) {
|
||||
visit(t.timestamp);
|
||||
}
|
||||
void range_delete(const range_tombstone& t) {
|
||||
visit(t.tomb.timestamp);
|
||||
}
|
||||
void partition_delete(const tombstone& t) {
|
||||
visit(t.timestamp);
|
||||
}
|
||||
};
|
||||
|
||||
/* Find some timestamp inside the given mutation.
|
||||
@@ -505,8 +524,12 @@ struct should_split_visitor {
|
||||
|
||||
virtual ~should_split_visitor() = default;
|
||||
|
||||
inline bool finished() const { return _result; }
|
||||
inline void stop() { _result = true; }
|
||||
inline bool finished() const {
|
||||
return _result;
|
||||
}
|
||||
inline void stop() {
|
||||
_result = true;
|
||||
}
|
||||
|
||||
void visit(api::timestamp_type ts, gc_clock::duration ttl = gc_clock::duration(0)) {
|
||||
if (_ts != api::missing_timestamp && _ts != ts) {
|
||||
@@ -517,15 +540,23 @@ struct should_split_visitor {
|
||||
if (_ttl && *_ttl != ttl) {
|
||||
return stop();
|
||||
}
|
||||
_ttl = { ttl };
|
||||
_ttl = {ttl};
|
||||
}
|
||||
|
||||
void visit(const atomic_cell_view& cell) { visit(cell.timestamp(), get_ttl(cell)); }
|
||||
void visit(const atomic_cell_view& cell) {
|
||||
visit(cell.timestamp(), get_ttl(cell));
|
||||
}
|
||||
|
||||
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
|
||||
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
|
||||
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
|
||||
void collection_tombstone(const tombstone& t) { visit(t.timestamp + 1); }
|
||||
void collection_tombstone(const tombstone& t) {
|
||||
visit(t.timestamp + 1);
|
||||
}
|
||||
|
||||
virtual void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
|
||||
if (_had_row_marker) {
|
||||
@@ -534,8 +565,12 @@ struct should_split_visitor {
|
||||
}
|
||||
visit(cell);
|
||||
}
|
||||
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
|
||||
void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
|
||||
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void collection_column(const column_definition&, auto&& visit_collection) {
|
||||
visit_collection(*this);
|
||||
}
|
||||
|
||||
virtual void marker(const row_marker& rm) {
|
||||
_had_row_marker = true;
|
||||
@@ -606,8 +641,8 @@ bool should_split(const mutation& m, const per_request_options& options) {
|
||||
cdc::inspect_mutation(m, v);
|
||||
|
||||
return v._result
|
||||
// A mutation with no timestamp will be split into 0 mutations:
|
||||
|| v._ts == api::missing_timestamp;
|
||||
// A mutation with no timestamp will be split into 0 mutations:
|
||||
|| v._ts == api::missing_timestamp;
|
||||
}
|
||||
|
||||
// Returns true if the row state and the atomic and nonatomic entries represent
|
||||
@@ -642,7 +677,7 @@ static bool entries_match_row_state(const schema_ptr& base_schema, const cell_ma
|
||||
if (current_values.size() != update.cells.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
std::unordered_map<sstring_view, bytes> current_values_map;
|
||||
for (const auto& entry : current_values) {
|
||||
const auto attr_name = std::string_view(value_cast<sstring>(entry.first));
|
||||
@@ -711,8 +746,8 @@ bool should_skip(batch& changes, const mutation& base_mutation, change_processor
|
||||
return true;
|
||||
}
|
||||
|
||||
void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
|
||||
bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
|
||||
void process_changes_with_splitting(
|
||||
const mutation& base_mutation, change_processor& processor, bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
|
||||
const auto base_schema = base_mutation.schema();
|
||||
auto changes = extract_changes(base_mutation);
|
||||
auto pk = base_mutation.key();
|
||||
@@ -824,8 +859,8 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
|
||||
}
|
||||
}
|
||||
|
||||
void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
|
||||
bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
|
||||
void process_changes_without_splitting(
|
||||
const mutation& base_mutation, change_processor& processor, bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
|
||||
if (alternator_strict_compatibility) {
|
||||
auto changes = extract_changes(base_mutation);
|
||||
if (should_skip(changes.begin()->second, base_mutation, processor)) {
|
||||
@@ -842,7 +877,7 @@ void process_changes_without_splitting(const mutation& base_mutation, change_pro
|
||||
|
||||
one_kind_column_set columns{base_schema->static_columns_count()};
|
||||
if (!p.static_row().empty()) {
|
||||
p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
|
||||
p.static_row().get().for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
|
||||
columns.set(id);
|
||||
});
|
||||
processor.produce_preimage(nullptr, columns);
|
||||
@@ -855,7 +890,7 @@ void process_changes_without_splitting(const mutation& base_mutation, change_pro
|
||||
// Row deleted - include all columns in preimage
|
||||
columns.set(0, base_schema->regular_columns_count(), true);
|
||||
} else {
|
||||
cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
|
||||
cr.row().cells().for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
|
||||
columns.set(id);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -946,7 +946,7 @@ sstables::shared_sstable sstables_task_executor::consume_sstable() {
|
||||
auto sst = _sstables.back();
|
||||
_sstables.pop_back();
|
||||
--_cm._stats.pending_tasks; // from this point on, switch_state(pending|active) works the same way as any other task
|
||||
cmlog.debug("{}", format("consumed {}", sst->get_filename()));
|
||||
cmlog.debug("consumed {}", sst->get_filename());
|
||||
return sst;
|
||||
}
|
||||
|
||||
@@ -1208,7 +1208,6 @@ future<> compaction_manager::await_tasks(std::vector<shared_ptr<compaction_task_
|
||||
|
||||
std::vector<shared_ptr<compaction_task_executor>>
|
||||
compaction_manager::do_stop_ongoing_compactions(sstring reason, std::function<bool(const compaction_group_view*)> filter, std::optional<compaction_type> type_opt) noexcept {
|
||||
auto ongoing_compactions = get_compactions(filter).size();
|
||||
auto tasks = _tasks
|
||||
| std::views::filter([&filter, type_opt] (const auto& task) {
|
||||
return filter(task.compacting_table()) && (!type_opt || task.compaction_type() == *type_opt);
|
||||
@@ -1217,6 +1216,7 @@ compaction_manager::do_stop_ongoing_compactions(sstring reason, std::function<bo
|
||||
| std::ranges::to<std::vector<shared_ptr<compaction_task_executor>>>();
|
||||
logging::log_level level = tasks.empty() ? log_level::debug : log_level::info;
|
||||
if (cmlog.is_enabled(level)) {
|
||||
auto ongoing_compactions = get_compactions(filter).size();
|
||||
std::string scope = "";
|
||||
if (!tasks.empty()) {
|
||||
const compaction_group_view* t = tasks.front()->compacting_table();
|
||||
@@ -1268,9 +1268,15 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
|
||||
if (dsm && (this_shard_id() == 0)) {
|
||||
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
|
||||
if (threshold_reached) {
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = true;
|
||||
return cm.drain();
|
||||
});
|
||||
}
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = false;
|
||||
cm.enable();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1420,11 +1426,17 @@ protected:
|
||||
compaction_strategy cs = t.get_compaction_strategy();
|
||||
compaction_descriptor descriptor = co_await cs.get_sstables_for_compaction(t, _cm.get_strategy_control());
|
||||
int weight = calculate_weight(descriptor);
|
||||
cmlog.debug("Started minor compaction sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
|
||||
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
|
||||
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
|
||||
bool debug_enabled = cmlog.is_enabled(log_level::debug);
|
||||
if (debug_enabled) {
|
||||
cmlog.debug("Started minor compaction sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
|
||||
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
|
||||
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
|
||||
}
|
||||
|
||||
auto old_sstables = ::format("{}", descriptor.sstables);
|
||||
sstring old_sstables;
|
||||
if (debug_enabled) {
|
||||
old_sstables = ::format("{}", descriptor.sstables);
|
||||
}
|
||||
|
||||
if (descriptor.sstables.empty() || !can_proceed() || t.is_auto_compaction_disabled_by_user()) {
|
||||
cmlog.debug("{}: sstables={} can_proceed={} auto_compaction={}", *this, descriptor.sstables.size(), can_proceed(), t.is_auto_compaction_disabled_by_user());
|
||||
@@ -1454,8 +1466,10 @@ protected:
|
||||
try {
|
||||
bool should_update_history = this->should_update_history(descriptor.options.type());
|
||||
compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, on_replace);
|
||||
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
|
||||
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
|
||||
if (debug_enabled) {
|
||||
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
|
||||
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
|
||||
}
|
||||
finish_compaction();
|
||||
if (should_update_history) {
|
||||
// update_history can take a long time compared to
|
||||
@@ -2348,6 +2362,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
|
||||
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
|
||||
}
|
||||
|
||||
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
|
||||
std::exception_ptr ex;
|
||||
if (_in_critical_disk_utilization_mode) {
|
||||
ex = std::make_exception_ptr(std::runtime_error("critical disk utilization"));
|
||||
} else {
|
||||
ex = std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
|
||||
}
|
||||
return ex;
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
|
||||
@@ -2357,8 +2381,7 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
|
||||
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
|
||||
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
|
||||
if (is_disabled()) {
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
|
||||
"reason might be out of space prevention", sst->get_filename()))));
|
||||
co_return coroutine::exception(make_disabled_exception(t));
|
||||
}
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
|
||||
@@ -115,6 +115,8 @@ private:
|
||||
uint32_t _disabled_state_count = 0;
|
||||
|
||||
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
|
||||
// precondition: is_disabled() is true.
|
||||
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
|
||||
|
||||
std::optional<future<>> _stop_future;
|
||||
|
||||
@@ -170,6 +172,7 @@ private:
|
||||
shared_tombstone_gc_state _shared_tombstone_gc_state;
|
||||
|
||||
utils::disk_space_monitor::subscription _out_of_space_subscription;
|
||||
bool _in_critical_disk_utilization_mode = false;
|
||||
private:
|
||||
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
|
||||
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
|
||||
|
||||
@@ -33,8 +33,10 @@ future<compaction_descriptor> leveled_compaction_strategy::get_sstables_for_comp
|
||||
auto candidate = manifest.get_compaction_candidates(*state->last_compacted_keys, state->compaction_counter);
|
||||
|
||||
if (!candidate.sstables.empty()) {
|
||||
auto main_set = co_await table_s.main_sstable_set();
|
||||
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
|
||||
if (leveled_manifest::logger.is_enabled(logging::log_level::debug)) {
|
||||
auto main_set = co_await table_s.main_sstable_set();
|
||||
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
|
||||
}
|
||||
co_return candidate;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "compaction_strategy_state.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
|
||||
#include <seastar/util/lazy.hh>
|
||||
#include <ranges>
|
||||
|
||||
namespace compaction {
|
||||
@@ -28,12 +29,12 @@ time_window_compaction_strategy_state_ptr time_window_compaction_strategy::get_s
|
||||
}
|
||||
|
||||
const std::unordered_map<sstring, std::chrono::seconds> time_window_compaction_strategy_options::valid_window_units = {
|
||||
{ "MINUTES", 60s }, { "HOURS", 3600s }, { "DAYS", 86400s }
|
||||
};
|
||||
{"MINUTES", 60s}, {"HOURS", 3600s}, {"DAYS", 86400s}};
|
||||
|
||||
const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions> time_window_compaction_strategy_options::valid_timestamp_resolutions = {
|
||||
{ "MICROSECONDS", timestamp_resolutions::microsecond },
|
||||
{ "MILLISECONDS", timestamp_resolutions::millisecond },
|
||||
const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions>
|
||||
time_window_compaction_strategy_options::valid_timestamp_resolutions = {
|
||||
{"MICROSECONDS", timestamp_resolutions::microsecond},
|
||||
{"MILLISECONDS", timestamp_resolutions::millisecond},
|
||||
};
|
||||
|
||||
static std::chrono::seconds validate_compaction_window_unit(const std::map<sstring, sstring>& options) {
|
||||
@@ -43,7 +44,8 @@ static std::chrono::seconds validate_compaction_window_unit(const std::map<sstri
|
||||
if (tmp_value) {
|
||||
auto valid_window_units_it = time_window_compaction_strategy_options::valid_window_units.find(tmp_value.value());
|
||||
if (valid_window_units_it == time_window_compaction_strategy_options::valid_window_units.end()) {
|
||||
throw exceptions::configuration_exception(fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
|
||||
throw exceptions::configuration_exception(
|
||||
fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
|
||||
}
|
||||
window_unit = valid_window_units_it->second;
|
||||
}
|
||||
@@ -59,10 +61,12 @@ static std::chrono::seconds validate_compaction_window_unit(const std::map<sstri
|
||||
|
||||
static int validate_compaction_window_size(const std::map<sstring, sstring>& options) {
|
||||
auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY);
|
||||
int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value, time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
|
||||
int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value,
|
||||
time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
|
||||
|
||||
if (window_size <= 0) {
|
||||
throw exceptions::configuration_exception(fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
|
||||
throw exceptions::configuration_exception(
|
||||
fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
|
||||
}
|
||||
|
||||
return window_size;
|
||||
@@ -82,26 +86,30 @@ static db_clock::duration validate_expired_sstable_check_frequency_seconds(const
|
||||
try {
|
||||
expired_sstable_check_frequency = std::chrono::seconds(std::stol(tmp_value.value()));
|
||||
} catch (const std::exception& e) {
|
||||
throw exceptions::syntax_exception(fmt::format("Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
|
||||
throw exceptions::syntax_exception(fmt::format(
|
||||
"Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
|
||||
}
|
||||
}
|
||||
|
||||
return expired_sstable_check_frequency;
|
||||
}
|
||||
|
||||
static db_clock::duration validate_expired_sstable_check_frequency_seconds(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
|
||||
static db_clock::duration validate_expired_sstable_check_frequency_seconds(
|
||||
const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
|
||||
db_clock::duration expired_sstable_check_frequency = validate_expired_sstable_check_frequency_seconds(options);
|
||||
unchecked_options.erase(time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
|
||||
return expired_sstable_check_frequency;
|
||||
}
|
||||
|
||||
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options) {
|
||||
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
|
||||
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution =
|
||||
time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
|
||||
|
||||
auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
|
||||
if (tmp_value) {
|
||||
if (!time_window_compaction_strategy_options::valid_timestamp_resolutions.contains(tmp_value.value())) {
|
||||
throw exceptions::configuration_exception(fmt::format("Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
|
||||
} else {
|
||||
timestamp_resolution = time_window_compaction_strategy_options::valid_timestamp_resolutions.at(tmp_value.value());
|
||||
}
|
||||
@@ -110,7 +118,8 @@ static time_window_compaction_strategy_options::timestamp_resolutions validate_t
|
||||
return timestamp_resolution;
|
||||
}
|
||||
|
||||
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
|
||||
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(
|
||||
const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
|
||||
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = validate_timestamp_resolution(options);
|
||||
unchecked_options.erase(time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
|
||||
return timestamp_resolution;
|
||||
@@ -145,7 +154,7 @@ void time_window_compaction_strategy_options::validate(const std::map<sstring, s
|
||||
compaction_strategy_impl::validate_min_max_threshold(options, unchecked_options);
|
||||
|
||||
auto it = options.find("enable_optimized_twcs_queries");
|
||||
if (it != options.end() && it->second != "true" && it->second != "false") {
|
||||
if (it != options.end() && it->second != "true" && it->second != "false") {
|
||||
throw exceptions::configuration_exception(fmt::format("enable_optimized_twcs_queries value ({}) must be \"true\" or \"false\"", it->second));
|
||||
}
|
||||
unchecked_options.erase("enable_optimized_twcs_queries");
|
||||
@@ -162,7 +171,9 @@ class classify_by_timestamp {
|
||||
std::vector<int64_t> _known_windows;
|
||||
|
||||
public:
|
||||
explicit classify_by_timestamp(time_window_compaction_strategy_options options) : _options(std::move(options)) { }
|
||||
explicit classify_by_timestamp(time_window_compaction_strategy_options options)
|
||||
: _options(std::move(options)) {
|
||||
}
|
||||
int64_t operator()(api::timestamp_type ts) {
|
||||
const auto window = time_window_compaction_strategy::get_window_for(_options, ts);
|
||||
if (const auto it = std::ranges::find(_known_windows, window); it != _known_windows.end()) {
|
||||
@@ -190,7 +201,7 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
|
||||
auto estimated_window_count = max_data_segregation_window_count;
|
||||
auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live());
|
||||
bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
|
||||
auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) {
|
||||
auto estimate_window_count = [this](timestamp_type min_window, timestamp_type max_window) {
|
||||
const auto window_size = get_window_size(_options);
|
||||
return (max_window + (window_size - 1) - min_window) / window_size;
|
||||
};
|
||||
@@ -210,21 +221,19 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
|
||||
return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
|
||||
}
|
||||
|
||||
mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
|
||||
if (ms_meta.min_timestamp && ms_meta.max_timestamp
|
||||
&& get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
|
||||
mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(
|
||||
const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
|
||||
if (ms_meta.min_timestamp && ms_meta.max_timestamp &&
|
||||
get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
|
||||
return end_consumer;
|
||||
}
|
||||
return [options = _options, end_consumer = std::move(end_consumer)] (mutation_reader rd) mutable -> future<> {
|
||||
return mutation_writer::segregate_by_timestamp(
|
||||
std::move(rd),
|
||||
classify_by_timestamp(std::move(options)),
|
||||
end_consumer);
|
||||
return [options = _options, end_consumer = std::move(end_consumer)](mutation_reader rd) mutable -> future<> {
|
||||
return mutation_writer::segregate_by_timestamp(std::move(rd), classify_by_timestamp(std::move(options)), end_consumer);
|
||||
};
|
||||
}
|
||||
|
||||
compaction_descriptor
|
||||
time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
|
||||
compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
|
||||
std::vector<sstables::shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
|
||||
auto mode = cfg.mode;
|
||||
std::vector<sstables::shared_sstable> single_window;
|
||||
std::vector<sstables::shared_sstable> multi_window;
|
||||
@@ -239,7 +248,7 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_
|
||||
|
||||
// Sort input sstables by first_key order
|
||||
// to allow efficient reshaping of disjoint sstables.
|
||||
std::sort(input.begin(), input.end(), [&schema] (const sstables::shared_sstable& a, const sstables::shared_sstable& b) {
|
||||
std::sort(input.begin(), input.end(), [&schema](const sstables::shared_sstable& a, const sstables::shared_sstable& b) {
|
||||
return dht::ring_position(a->get_first_decorated_key()).less_compare(*schema, dht::ring_position(b->get_first_decorated_key()));
|
||||
});
|
||||
|
||||
@@ -253,31 +262,34 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_
|
||||
}
|
||||
}
|
||||
|
||||
auto is_disjoint = [&schema, mode, max_sstables] (const std::vector<sstables::shared_sstable>& ssts) {
|
||||
auto is_disjoint = [&schema, mode, max_sstables](const std::vector<sstables::shared_sstable>& ssts) {
|
||||
size_t tolerance = (mode == reshape_mode::relaxed) ? max_sstables : 0;
|
||||
return sstable_set_overlapping_count(schema, ssts) <= tolerance;
|
||||
};
|
||||
|
||||
clogger.debug("time_window_compaction_strategy::get_reshaping_job: offstrategy_threshold={} max_sstables={} multi_window={} disjoint={} single_window={} disjoint={}",
|
||||
offstrategy_threshold, max_sstables,
|
||||
multi_window.size(), !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0,
|
||||
single_window.size(), !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0);
|
||||
clogger.debug("time_window_compaction_strategy::get_reshaping_job: offstrategy_threshold={} max_sstables={} multi_window={} disjoint={} "
|
||||
"single_window={} disjoint={}",
|
||||
offstrategy_threshold, max_sstables, multi_window.size(), seastar::value_of([&] {
|
||||
return !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0;
|
||||
}),
|
||||
single_window.size(), seastar::value_of([&] {
|
||||
return !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0;
|
||||
}));
|
||||
|
||||
auto get_job_size = [] (const std::vector<sstables::shared_sstable>& ssts) {
|
||||
auto get_job_size = [](const std::vector<sstables::shared_sstable>& ssts) {
|
||||
return std::ranges::fold_left(ssts | std::views::transform(std::mem_fn(&sstables::sstable::bytes_on_disk)), uint64_t(0), std::plus{});
|
||||
};
|
||||
|
||||
// Targets a space overhead of 10%. All disjoint sstables can be compacted together as long as they won't
|
||||
// cause an overhead above target. Otherwise, the job targets a maximum of #max_threshold sstables.
|
||||
auto need_trimming = [&] (const std::vector<sstables::shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
|
||||
auto need_trimming = [&](const std::vector<sstables::shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
|
||||
const size_t min_sstables = 2;
|
||||
auto is_above_target_size = job_size > target_job_size;
|
||||
|
||||
return (ssts.size() > max_sstables && !is_disjoint) ||
|
||||
(ssts.size() > min_sstables && is_above_target_size);
|
||||
return (ssts.size() > max_sstables && !is_disjoint) || (ssts.size() > min_sstables && is_above_target_size);
|
||||
};
|
||||
|
||||
auto maybe_trim_job = [&need_trimming] (std::vector<sstables::shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
|
||||
auto maybe_trim_job = [&need_trimming](std::vector<sstables::shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
|
||||
while (need_trimming(ssts, job_size, is_disjoint)) {
|
||||
auto sst = ssts.back();
|
||||
ssts.pop_back();
|
||||
@@ -294,7 +306,7 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_
|
||||
// For example, if there are N sstables spanning window W, where N <= 32, then we can produce all data for W
|
||||
// in a single compaction round, removing the need to later compact W to reduce its number of files.
|
||||
auto sort_size = std::min(max_sstables, multi_window.size());
|
||||
std::ranges::partial_sort(multi_window, multi_window.begin() + sort_size, std::ranges::less(), [] (const sstables::shared_sstable &a) {
|
||||
std::ranges::partial_sort(multi_window, multi_window.begin() + sort_size, std::ranges::less(), [](const sstables::shared_sstable& a) {
|
||||
return a->get_stats_metadata().max_timestamp;
|
||||
});
|
||||
maybe_trim_job(multi_window, job_size, disjoint);
|
||||
@@ -334,8 +346,7 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_
|
||||
return compaction_descriptor();
|
||||
}
|
||||
|
||||
future<compaction_descriptor>
|
||||
time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
|
||||
future<compaction_descriptor> time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
|
||||
auto state = get_state(table_s);
|
||||
auto compaction_time = gc_clock::now();
|
||||
auto candidates = co_await control.candidates(table_s);
|
||||
@@ -369,10 +380,8 @@ time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_vi
|
||||
co_return compaction_descriptor(std::move(compaction_candidates));
|
||||
}
|
||||
|
||||
time_window_compaction_strategy::bucket_compaction_mode
|
||||
time_window_compaction_strategy::compaction_mode(const time_window_compaction_strategy_state& state,
|
||||
const bucket_t& bucket, timestamp_type bucket_key,
|
||||
timestamp_type now, size_t min_threshold) const {
|
||||
time_window_compaction_strategy::bucket_compaction_mode time_window_compaction_strategy::compaction_mode(
|
||||
const time_window_compaction_strategy_state& state, const bucket_t& bucket, timestamp_type bucket_key, timestamp_type now, size_t min_threshold) const {
|
||||
// STCS will also be performed on older window buckets, to avoid a bad write and
|
||||
// space amplification when something like read repair cause small updates to
|
||||
// those past windows.
|
||||
@@ -385,8 +394,7 @@ time_window_compaction_strategy::compaction_mode(const time_window_compaction_st
|
||||
return bucket_compaction_mode::none;
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable>
|
||||
time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
|
||||
std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
|
||||
std::vector<sstables::shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time, time_window_compaction_strategy_state& state) {
|
||||
auto most_interesting = get_compaction_candidates(table_s, control, non_expiring_sstables, state);
|
||||
|
||||
@@ -400,31 +408,29 @@ time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_
|
||||
|
||||
// if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
|
||||
// ratio is greater than threshold.
|
||||
std::erase_if(non_expiring_sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
|
||||
std::erase_if(non_expiring_sstables, [this, compaction_time, &table_s](const sstables::shared_sstable& sst) -> bool {
|
||||
return !worth_dropping_tombstones(sst, compaction_time, table_s);
|
||||
});
|
||||
if (non_expiring_sstables.empty()) {
|
||||
return {};
|
||||
}
|
||||
auto it = std::ranges::min_element(non_expiring_sstables, [] (auto& i, auto& j) {
|
||||
auto it = std::ranges::min_element(non_expiring_sstables, [](auto& i, auto& j) {
|
||||
return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
|
||||
});
|
||||
return { *it };
|
||||
return {*it};
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable>
|
||||
time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
|
||||
std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
|
||||
std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
|
||||
std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
|
||||
auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
|
||||
// Update the highest window seen, if necessary
|
||||
state.highest_window_seen = std::max(state.highest_window_seen, max_timestamp);
|
||||
|
||||
return newest_bucket(table_s, control, std::move(buckets), table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold(),
|
||||
state.highest_window_seen, state);
|
||||
state.highest_window_seen, state);
|
||||
}
|
||||
|
||||
timestamp_type
|
||||
time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
|
||||
timestamp_type time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
|
||||
using namespace std::chrono;
|
||||
// mask out window size from timestamp to get lower bound of its window
|
||||
auto num_windows = microseconds(timestamp) / sstable_window_size;
|
||||
@@ -432,8 +438,8 @@ time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sst
|
||||
return duration_cast<microseconds>(num_windows * sstable_window_size).count();
|
||||
}
|
||||
|
||||
std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, timestamp_type>
|
||||
time_window_compaction_strategy::get_buckets(std::vector<sstables::shared_sstable> files, const time_window_compaction_strategy_options& options) {
|
||||
std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, timestamp_type> time_window_compaction_strategy::get_buckets(
|
||||
std::vector<sstables::shared_sstable> files, const time_window_compaction_strategy_options& options) {
|
||||
std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets;
|
||||
|
||||
timestamp_type max_timestamp = 0;
|
||||
@@ -450,11 +456,13 @@ time_window_compaction_strategy::get_buckets(std::vector<sstables::shared_sstabl
|
||||
return std::make_pair(std::move(buckets), max_timestamp);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace compaction
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<std::map<compaction::timestamp_type, std::vector<sstables::shared_sstable>>> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
constexpr auto parse(format_parse_context& ctx) {
|
||||
return ctx.begin();
|
||||
}
|
||||
auto format(const std::map<compaction::timestamp_type, std::vector<sstables::shared_sstable>>& buckets, fmt::format_context& ctx) const {
|
||||
auto out = fmt::format_to(ctx.out(), " buckets = {{\n");
|
||||
for (auto& [timestamp, sstables] : buckets | std::views::reverse) {
|
||||
@@ -466,9 +474,9 @@ struct fmt::formatter<std::map<compaction::timestamp_type, std::vector<sstables:
|
||||
|
||||
namespace compaction {
|
||||
|
||||
std::vector<sstables::shared_sstable>
|
||||
time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets,
|
||||
int min_threshold, int max_threshold, timestamp_type now, time_window_compaction_strategy_state& state) {
|
||||
std::vector<sstables::shared_sstable> time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control,
|
||||
std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets, int min_threshold, int max_threshold, timestamp_type now,
|
||||
time_window_compaction_strategy_state& state) {
|
||||
clogger.debug("time_window_compaction_strategy::newest_bucket:\n now {}\n{}", now, buckets);
|
||||
|
||||
for (auto&& [key, bucket] : buckets | std::views::reverse) {
|
||||
@@ -509,8 +517,7 @@ time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, s
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable>
|
||||
time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold) {
|
||||
std::vector<sstables::shared_sstable> time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold) {
|
||||
auto n = std::min(bucket.size(), size_t(max_threshold));
|
||||
// Trim the largest sstables off the end to meet the maxThreshold
|
||||
std::ranges::partial_sort(bucket, bucket.begin() + n, std::ranges::less(), std::mem_fn(&sstables::sstable::ondisk_data_size));
|
||||
@@ -542,8 +549,8 @@ future<int64_t> time_window_compaction_strategy::estimated_pending_compactions(c
|
||||
co_return n;
|
||||
}
|
||||
|
||||
std::vector<compaction_descriptor>
|
||||
time_window_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
|
||||
std::vector<compaction_descriptor> time_window_compaction_strategy::get_cleanup_compaction_jobs(
|
||||
compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
|
||||
std::vector<compaction_descriptor> ret;
|
||||
for (auto&& [_, sstables] : get_buckets(std::move(candidates), _options).first) {
|
||||
auto per_window_jobs = size_tiered_compaction_strategy(_stcs_options).get_cleanup_compaction_jobs(table_s, std::move(sstables));
|
||||
@@ -556,4 +563,4 @@ std::unique_ptr<sstables::sstable_set_impl> time_window_compaction_strategy::mak
|
||||
return std::make_unique<sstables::time_series_sstable_set>(ts.schema(), _options.enable_optimized_twcs_queries);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace compaction
|
||||
|
||||
@@ -397,6 +397,17 @@ commitlog_total_space_in_mb: -1
|
||||
# you can cache more hot rows
|
||||
# column_index_size_in_kb: 64
|
||||
|
||||
# sstable format version for newly written sstables.
|
||||
# Currently allowed values are `me` and `ms`.
|
||||
# If not specified in the config, this defaults to `me`.
|
||||
#
|
||||
# The difference between `me` and `ms` are the data structures used
|
||||
# in the primary index.
|
||||
# In short, `ms` needs more CPU during sstable writes,
|
||||
# but should behave better during reads,
|
||||
# although it might behave worse for very long clustering keys.
|
||||
sstable_format: ms
|
||||
|
||||
# Auto-scaling of the promoted index prevents running out of memory
|
||||
# when the promoted index grows too large (due to partitions with many rows
|
||||
# vs. too small column_index_size_in_kb). When the serialized representation
|
||||
|
||||
@@ -597,7 +597,6 @@ scylla_tests = set([
|
||||
'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
|
||||
'test/boost/logalloc_test',
|
||||
'test/boost/lru_string_map_test',
|
||||
'test/boost/lru_test',
|
||||
'test/boost/managed_bytes_test',
|
||||
'test/boost/managed_vector_test',
|
||||
'test/boost/map_difference_test',
|
||||
@@ -897,6 +896,9 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'replica/multishard_query.cc',
|
||||
'replica/mutation_dump.cc',
|
||||
'replica/querier.cc',
|
||||
'replica/logstor/segment_manager.cc',
|
||||
'replica/logstor/logstor.cc',
|
||||
'replica/logstor/write_buffer.cc',
|
||||
'mutation/atomic_cell.cc',
|
||||
'mutation/canonical_mutation.cc',
|
||||
'mutation/frozen_mutation.cc',
|
||||
@@ -1468,6 +1470,7 @@ idls = ['idl/gossip_digest.idl.hh',
|
||||
'idl/query.idl.hh',
|
||||
'idl/idl_test.idl.hh',
|
||||
'idl/commitlog.idl.hh',
|
||||
'idl/logstor.idl.hh',
|
||||
'idl/tracing.idl.hh',
|
||||
'idl/consistency_level.idl.hh',
|
||||
'idl/cache_temperature.idl.hh',
|
||||
@@ -1584,7 +1587,6 @@ pure_boost_tests = set([
|
||||
'test/boost/like_matcher_test',
|
||||
'test/boost/linearizing_input_stream_test',
|
||||
'test/boost/lru_string_map_test',
|
||||
'test/boost/lru_test',
|
||||
'test/boost/map_difference_test',
|
||||
'test/boost/nonwrapping_interval_test',
|
||||
'test/boost/observable_test',
|
||||
|
||||
@@ -48,13 +48,15 @@ const sstring query_processor::CQL_VERSION = "3.3.1";
|
||||
const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
|
||||
|
||||
struct query_processor::remote {
|
||||
remote(service::migration_manager& mm, service::mapreduce_service& fwd,
|
||||
service::storage_service& ss, service::raft_group0_client& group0_client,
|
||||
service::strong_consistency::coordinator& _sc_coordinator)
|
||||
: mm(mm), mapreducer(fwd), ss(ss), group0_client(group0_client)
|
||||
, sc_coordinator(_sc_coordinator)
|
||||
, gate("query_processor::remote")
|
||||
{}
|
||||
remote(service::migration_manager& mm, service::mapreduce_service& fwd, service::storage_service& ss, service::raft_group0_client& group0_client,
|
||||
service::strong_consistency::coordinator& _sc_coordinator)
|
||||
: mm(mm)
|
||||
, mapreducer(fwd)
|
||||
, ss(ss)
|
||||
, group0_client(group0_client)
|
||||
, sc_coordinator(_sc_coordinator)
|
||||
, gate("query_processor::remote") {
|
||||
}
|
||||
|
||||
service::migration_manager& mm;
|
||||
service::mapreduce_service& mapreducer;
|
||||
@@ -77,24 +79,34 @@ static service::query_state query_state_for_internal_call() {
|
||||
return {service::client_state::for_internal_calls(), empty_service_permit()};
|
||||
}
|
||||
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
|
||||
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
|
||||
, _proxy(proxy)
|
||||
, _db(db)
|
||||
, _mnotifier(mn)
|
||||
, _vector_store_client(vsc)
|
||||
, _mcfg(mcfg)
|
||||
, _cql_config(cql_cfg)
|
||||
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
|
||||
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
|
||||
, _auth_prepared_cache_cfg_cb([this] (uint32_t) { (void) _authorized_prepared_cache_config_action.trigger_later(); })
|
||||
, _authorized_prepared_cache_config_action([this] { update_authorized_prepared_cache_config(); return make_ready_future<>(); })
|
||||
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
, _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
, _lang_manager(langm)
|
||||
, _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) { _write_consistency_levels_warned = to_consistency_level_set(v); }))
|
||||
, _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) { _write_consistency_levels_disallowed = to_consistency_level_set(v); }))
|
||||
{
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn,
|
||||
vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg,
|
||||
lang::manager& langm)
|
||||
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
|
||||
, _proxy(proxy)
|
||||
, _db(db)
|
||||
, _mnotifier(mn)
|
||||
, _vector_store_client(vsc)
|
||||
, _mcfg(mcfg)
|
||||
, _cql_config(cql_cfg)
|
||||
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
|
||||
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
|
||||
, _auth_prepared_cache_cfg_cb([this](uint32_t) {
|
||||
(void)_authorized_prepared_cache_config_action.trigger_later();
|
||||
})
|
||||
, _authorized_prepared_cache_config_action([this] {
|
||||
update_authorized_prepared_cache_config();
|
||||
return make_ready_future<>();
|
||||
})
|
||||
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
, _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
, _lang_manager(langm)
|
||||
, _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) {
|
||||
_write_consistency_levels_warned = to_consistency_level_set(v);
|
||||
}))
|
||||
, _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) {
|
||||
_write_consistency_levels_disallowed = to_consistency_level_set(v);
|
||||
})) {
|
||||
_write_consistency_levels_warned = to_consistency_level_set(_db.get_config().write_consistency_levels_warned());
|
||||
_write_consistency_levels_disallowed = to_consistency_level_set(_db.get_config().write_consistency_levels_disallowed());
|
||||
namespace sm = seastar::metrics;
|
||||
@@ -102,7 +114,7 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
using clevel = db::consistency_level;
|
||||
sm::label cl_label("consistency_level");
|
||||
|
||||
sm::label who_label("who"); // Who queried system tables
|
||||
sm::label who_label("who"); // Who queried system tables
|
||||
const auto user_who_label_instance = who_label("user");
|
||||
const auto internal_who_label_instance = who_label("internal");
|
||||
|
||||
@@ -110,17 +122,11 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
const auto system_ks_label_instance = ks_label("system");
|
||||
|
||||
std::vector<sm::metric_definition> qp_group;
|
||||
qp_group.push_back(sm::make_counter(
|
||||
"statements_prepared",
|
||||
_stats.prepare_invocations,
|
||||
sm::description("Counts the total number of parsed CQL requests.")));
|
||||
qp_group.push_back(sm::make_counter("statements_prepared", _stats.prepare_invocations, sm::description("Counts the total number of parsed CQL requests.")));
|
||||
for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
|
||||
qp_group.push_back(
|
||||
sm::make_counter(
|
||||
"queries",
|
||||
_stats.queries_by_cl[cl],
|
||||
sm::description("Counts queries by consistency level."),
|
||||
{cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
|
||||
qp_group.push_back(sm::make_counter(
|
||||
"queries", _stats.queries_by_cl[cl], sm::description("Counts queries by consistency level."), {cl_label(clevel(cl)), basic_level})
|
||||
.set_skip_when_empty());
|
||||
}
|
||||
_metrics.add_group("query_processor", qp_group);
|
||||
|
||||
@@ -521,29 +527,23 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
|
||||
std::vector<sm::metric_definition> cql_cl_group;
|
||||
for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
|
||||
cql_cl_group.push_back(
|
||||
sm::make_counter(
|
||||
"writes_per_consistency_level",
|
||||
_cql_stats.writes_per_consistency_level[cl],
|
||||
sm::description("Counts the number of writes for each consistency level."),
|
||||
{cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
|
||||
cql_cl_group.push_back(sm::make_counter("writes_per_consistency_level", _cql_stats.writes_per_consistency_level[cl],
|
||||
sm::description("Counts the number of writes for each consistency level."), {cl_label(clevel(cl)), basic_level})
|
||||
.set_skip_when_empty());
|
||||
}
|
||||
_metrics.add_group("cql", cql_cl_group);
|
||||
|
||||
_metrics.add_group("cql", {
|
||||
sm::make_counter(
|
||||
"write_consistency_levels_disallowed_violations",
|
||||
_cql_stats.write_consistency_levels_disallowed_violations,
|
||||
sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
|
||||
"i.e. attempts to write with a forbidden consistency level."),
|
||||
{basic_level}),
|
||||
sm::make_counter(
|
||||
"write_consistency_levels_warned_violations",
|
||||
_cql_stats.write_consistency_levels_warned_violations,
|
||||
sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
|
||||
"i.e. attempts to write with a discouraged consistency level."),
|
||||
{basic_level}),
|
||||
});
|
||||
_metrics.add_group(
|
||||
"cql", {
|
||||
sm::make_counter("write_consistency_levels_disallowed_violations", _cql_stats.write_consistency_levels_disallowed_violations,
|
||||
sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
|
||||
"i.e. attempts to write with a forbidden consistency level."),
|
||||
{basic_level}),
|
||||
sm::make_counter("write_consistency_levels_warned_violations", _cql_stats.write_consistency_levels_warned_violations,
|
||||
sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
|
||||
"i.e. attempts to write with a discouraged consistency level."),
|
||||
{basic_level}),
|
||||
});
|
||||
|
||||
_mnotifier.register_listener(_migration_subscriber.get());
|
||||
}
|
||||
@@ -554,15 +554,13 @@ query_processor::~query_processor() {
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
|
||||
query_processor::acquire_strongly_consistent_coordinator() {
|
||||
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder> query_processor::acquire_strongly_consistent_coordinator() {
|
||||
auto [remote_, holder] = remote();
|
||||
return {remote_.get().sc_coordinator, std::move(holder)};
|
||||
}
|
||||
|
||||
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer,
|
||||
service::storage_service& ss, service::raft_group0_client& group0_client,
|
||||
service::strong_consistency::coordinator& sc_coordinator) {
|
||||
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer, service::storage_service& ss,
|
||||
service::raft_group0_client& group0_client, service::strong_consistency::coordinator& sc_coordinator) {
|
||||
_remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client, sc_coordinator);
|
||||
}
|
||||
|
||||
@@ -582,7 +580,9 @@ future<> query_processor::stop() {
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> query_processor::execute_with_guard(
|
||||
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)> fn,
|
||||
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(
|
||||
service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)>
|
||||
fn,
|
||||
::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options) {
|
||||
// execute all statements that need group0 guard on shard0
|
||||
if (this_shard_id() != 0) {
|
||||
@@ -591,13 +591,13 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
|
||||
|
||||
auto [remote_, holder] = remote();
|
||||
size_t retries = remote_.get().mm.get_concurrent_ddl_retries();
|
||||
while (true) {
|
||||
while (true) {
|
||||
try {
|
||||
auto guard = co_await remote_.get().mm.start_group0_operation();
|
||||
co_return co_await fn(query_state, statement, options, std::move(guard));
|
||||
} catch (const service::group0_concurrent_modification& ex) {
|
||||
log.warn("Failed to execute statement \"{}\" due to guard conflict.{}.",
|
||||
statement->raw_cql_statement, retries ? " Retrying" : " Number of retries exceeded, giving up");
|
||||
log.warn("Failed to execute statement \"{}\" due to guard conflict.{}.", statement->raw_cql_statement,
|
||||
retries ? " Retrying" : " Number of retries exceeded, giving up");
|
||||
if (retries--) {
|
||||
continue;
|
||||
}
|
||||
@@ -606,29 +606,30 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
|
||||
}
|
||||
}
|
||||
|
||||
template<typename... Args>
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::execute_maybe_with_guard(service::query_state& query_state, ::shared_ptr<cql_statement> statement, const query_options& options,
|
||||
future<::shared_ptr<result_message>>(query_processor::*fn)(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>, Args...), Args... args) {
|
||||
template <typename... Args>
|
||||
future<::shared_ptr<result_message>> query_processor::execute_maybe_with_guard(service::query_state& query_state, ::shared_ptr<cql_statement> statement,
|
||||
const query_options& options,
|
||||
future<::shared_ptr<result_message>> (query_processor::*fn)(
|
||||
service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>, Args...),
|
||||
Args... args) {
|
||||
if (!statement->needs_guard(*this, query_state)) {
|
||||
return (this->*fn)(query_state, std::move(statement), options, std::nullopt, std::forward<Args>(args)...);
|
||||
}
|
||||
static auto exec = [fn] (query_processor& qp, Args... args, service::query_state& query_state, ::shared_ptr<cql_statement> statement, const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
static auto exec = [fn](query_processor& qp, Args... args, service::query_state& query_state, ::shared_ptr<cql_statement> statement,
|
||||
const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
return (qp.*fn)(query_state, std::move(statement), options, std::move(guard), std::forward<Args>(args)...);
|
||||
};
|
||||
return execute_with_guard(std::bind_front(exec, std::ref(*this), std::forward<Args>(args)...), std::move(statement), query_state, options);
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::execute_direct_without_checking_exception_message(const std::string_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
|
||||
future<::shared_ptr<result_message>> query_processor::execute_direct_without_checking_exception_message(
|
||||
const std::string_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
|
||||
log.trace("execute_direct: \"{}\"", query_string);
|
||||
tracing::trace(query_state.get_trace_state(), "Parsing a statement");
|
||||
auto p = get_statement(query_string, query_state.get_client_state(), d);
|
||||
auto statement = p->statement;
|
||||
if (statement->get_bound_terms() != options.get_values_count()) {
|
||||
const auto msg = format("Invalid amount of bind variables: expected {:d} received {:d}",
|
||||
statement->get_bound_terms(),
|
||||
options.get_values_count());
|
||||
const auto msg = format("Invalid amount of bind variables: expected {:d} received {:d}", statement->get_bound_terms(), options.get_values_count());
|
||||
throw exceptions::invalid_request_exception(msg);
|
||||
}
|
||||
options.prepare(p->bound_names);
|
||||
@@ -639,17 +640,13 @@ query_processor::execute_direct_without_checking_exception_message(const std::st
|
||||
metrics.regularStatementsExecuted.inc();
|
||||
#endif
|
||||
auto user = query_state.get_client_state().user();
|
||||
tracing::trace(query_state.get_trace_state(), "Processing a statement for authenticated user: {}", user ? (user->name ? *user->name : "anonymous") : "no user authenticated");
|
||||
tracing::trace(query_state.get_trace_state(), "Processing a statement for authenticated user: {}",
|
||||
user ? (user->name ? *user->name : "anonymous") : "no user authenticated");
|
||||
return execute_maybe_with_guard(query_state, std::move(statement), options, &query_processor::do_execute_direct, std::move(p->warnings));
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::do_execute_direct(
|
||||
service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement,
|
||||
const query_options& options,
|
||||
std::optional<service::group0_guard> guard,
|
||||
cql3::cql_warnings_vec warnings) {
|
||||
future<::shared_ptr<result_message>> query_processor::do_execute_direct(service::query_state& query_state, shared_ptr<cql_statement> statement,
|
||||
const query_options& options, std::optional<service::group0_guard> guard, cql3::cql_warnings_vec warnings) {
|
||||
auto access_future = co_await coroutine::as_future(statement->check_access(*this, query_state.get_client_state()));
|
||||
if (access_future.failed()) {
|
||||
co_await audit::inspect(statement, query_state, options, true);
|
||||
@@ -674,26 +671,16 @@ query_processor::do_execute_direct(
|
||||
co_return std::move(m);
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::execute_prepared_without_checking_exception_message(
|
||||
service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement,
|
||||
const query_options& options,
|
||||
statements::prepared_statement::checked_weak_ptr prepared,
|
||||
cql3::prepared_cache_key_type cache_key,
|
||||
bool needs_authorization) {
|
||||
return execute_maybe_with_guard(query_state, std::move(statement), options, &query_processor::do_execute_prepared, std::move(prepared), std::move(cache_key), needs_authorization);
|
||||
future<::shared_ptr<result_message>> query_processor::execute_prepared_without_checking_exception_message(service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement, const query_options& options, statements::prepared_statement::checked_weak_ptr prepared,
|
||||
cql3::prepared_cache_key_type cache_key, bool needs_authorization) {
|
||||
return execute_maybe_with_guard(
|
||||
query_state, std::move(statement), options, &query_processor::do_execute_prepared, std::move(prepared), std::move(cache_key), needs_authorization);
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::do_execute_prepared(
|
||||
service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement,
|
||||
const query_options& options,
|
||||
std::optional<service::group0_guard> guard,
|
||||
statements::prepared_statement::checked_weak_ptr prepared,
|
||||
cql3::prepared_cache_key_type cache_key,
|
||||
bool needs_authorization) {
|
||||
future<::shared_ptr<result_message>> query_processor::do_execute_prepared(service::query_state& query_state, shared_ptr<cql_statement> statement,
|
||||
const query_options& options, std::optional<service::group0_guard> guard, statements::prepared_statement::checked_weak_ptr prepared,
|
||||
cql3::prepared_cache_key_type cache_key, bool needs_authorization) {
|
||||
if (needs_authorization) {
|
||||
co_await statement->check_access(*this, query_state.get_client_state());
|
||||
try {
|
||||
@@ -707,8 +694,8 @@ query_processor::do_execute_prepared(
|
||||
co_return co_await process_authorized_statement(std::move(statement), query_state, options, std::move(guard));
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::process_authorized_statement(const ::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
future<::shared_ptr<result_message>> query_processor::process_authorized_statement(const ::shared_ptr<cql_statement> statement,
|
||||
service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
auto& client_state = query_state.get_client_state();
|
||||
|
||||
++_stats.queries_by_cl[size_t(options.get_consistency())];
|
||||
@@ -718,43 +705,39 @@ query_processor::process_authorized_statement(const ::shared_ptr<cql_statement>
|
||||
auto msg = co_await statement->execute_without_checking_exception_message(*this, query_state, options, std::move(guard));
|
||||
|
||||
if (msg) {
|
||||
co_return std::move(msg);
|
||||
co_return std::move(msg);
|
||||
}
|
||||
co_return ::make_shared<result_message::void_message>();
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(sstring query_string, service::query_state& query_state, cql3::dialect d) {
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>> query_processor::prepare(
|
||||
sstring query_string, service::query_state& query_state, cql3::dialect d) {
|
||||
auto& client_state = query_state.get_client_state();
|
||||
return prepare(std::move(query_string), client_state, d);
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>> query_processor::prepare(
|
||||
sstring query_string, const service::client_state& client_state, cql3::dialect d) {
|
||||
try {
|
||||
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
|
||||
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
|
||||
auto prepared = get_statement(query_string, client_state, d);
|
||||
prepared->calculate_metadata_id();
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}",
|
||||
bound_terms,
|
||||
std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
throwing_assert(bound_terms == prepared->bound_names.size());
|
||||
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
|
||||
});
|
||||
auto prepared = get_statement(query_string, client_state, d);
|
||||
prepared->calculate_metadata_id();
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}", bound_terms, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
throwing_assert(bound_terms == prepared->bound_names.size());
|
||||
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("query_processor_prepare_wait_after_cache_get", utils::wait_for_message(std::chrono::seconds(60)));
|
||||
|
||||
co_await utils::get_local_injector().inject(
|
||||
"query_processor_prepare_wait_after_cache_get",
|
||||
utils::wait_for_message(std::chrono::seconds(60)));
|
||||
|
||||
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
|
||||
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
|
||||
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
|
||||
co_return std::move(msg);
|
||||
} catch(typename prepared_statements_cache::statement_is_too_big&) {
|
||||
} catch (typename prepared_statements_cache::statement_is_too_big&) {
|
||||
throw prepared_statement_is_too_big(query_string);
|
||||
}
|
||||
}
|
||||
@@ -765,15 +748,11 @@ static std::string hash_target(std::string_view query_string, std::string_view k
|
||||
return ret;
|
||||
}
|
||||
|
||||
prepared_cache_key_type query_processor::compute_id(
|
||||
std::string_view query_string,
|
||||
std::string_view keyspace,
|
||||
dialect d) {
|
||||
prepared_cache_key_type query_processor::compute_id(std::string_view query_string, std::string_view keyspace, dialect d) {
|
||||
return prepared_cache_key_type(md5_hasher::calculate(hash_target(query_string, keyspace)), d);
|
||||
}
|
||||
|
||||
std::unique_ptr<prepared_statement>
|
||||
query_processor::get_statement(const std::string_view& query, const service::client_state& client_state, dialect d) {
|
||||
std::unique_ptr<prepared_statement> query_processor::get_statement(const std::string_view& query, const service::client_state& client_state, dialect d) {
|
||||
// Measuring allocation cost requires that no yield points exist
|
||||
// between bytes_before and bytes_after. It needs fixing if this
|
||||
// function is ever futurized.
|
||||
@@ -798,8 +777,7 @@ query_processor::get_statement(const std::string_view& query, const service::cli
|
||||
return p;
|
||||
}
|
||||
|
||||
std::unique_ptr<raw::parsed_statement>
|
||||
query_processor::parse_statement(const std::string_view& query, dialect d) {
|
||||
std::unique_ptr<raw::parsed_statement> query_processor::parse_statement(const std::string_view& query, dialect d) {
|
||||
try {
|
||||
{
|
||||
const char* error_injection_key = "query_processor-parse_statement-test_failure";
|
||||
@@ -824,8 +802,7 @@ query_processor::parse_statement(const std::string_view& query, dialect d) {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<raw::parsed_statement>>
|
||||
query_processor::parse_statements(std::string_view queries, dialect d) {
|
||||
std::vector<std::unique_ptr<raw::parsed_statement>> query_processor::parse_statements(std::string_view queries, dialect d) {
|
||||
try {
|
||||
auto statements = util::do_with_parser(queries, d, std::mem_fn(&cql3_parser::CqlParser::queries));
|
||||
if (statements.empty()) {
|
||||
@@ -854,15 +831,10 @@ std::pair<std::reference_wrapper<struct query_processor::remote>, gate::holder>
|
||||
on_internal_error(log, "attempted to perform distributed query when `query_processor::remote` is unavailable");
|
||||
}
|
||||
|
||||
query_options query_processor::make_internal_options(
|
||||
const statements::prepared_statement::checked_weak_ptr& p,
|
||||
const std::vector<data_value_or_unset>& values,
|
||||
db::consistency_level cl,
|
||||
int32_t page_size,
|
||||
service::node_local_only node_local_only) const {
|
||||
query_options query_processor::make_internal_options(const statements::prepared_statement::checked_weak_ptr& p, const std::vector<data_value_or_unset>& values,
|
||||
db::consistency_level cl, int32_t page_size, service::node_local_only node_local_only) const {
|
||||
if (p->bound_names.size() != values.size()) {
|
||||
throw std::invalid_argument(
|
||||
format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
|
||||
throw std::invalid_argument(format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
|
||||
}
|
||||
auto ni = p->bound_names.begin();
|
||||
raw_value_vector_with_unset bound_values;
|
||||
@@ -870,32 +842,28 @@ query_options query_processor::make_internal_options(
|
||||
bound_values.unset.resize(values.size());
|
||||
for (auto& var : values) {
|
||||
auto& n = *ni;
|
||||
std::visit(overloaded_functor {
|
||||
[&] (const data_value& v) {
|
||||
if (v.type() == bytes_type) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_value(value_cast<bytes>(v)));
|
||||
} else if (v.is_null()) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_null());
|
||||
} else {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_value(n->type->decompose(v)));
|
||||
}
|
||||
}, [&] (const unset_value&) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_null());
|
||||
bound_values.unset[std::distance(p->bound_names.begin(), ni)] = true;
|
||||
}
|
||||
}, var);
|
||||
std::visit(overloaded_functor{[&](const data_value& v) {
|
||||
if (v.type() == bytes_type) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_value(value_cast<bytes>(v)));
|
||||
} else if (v.is_null()) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_null());
|
||||
} else {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_value(n->type->decompose(v)));
|
||||
}
|
||||
},
|
||||
[&](const unset_value&) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_null());
|
||||
bound_values.unset[std::distance(p->bound_names.begin(), ni)] = true;
|
||||
}},
|
||||
var);
|
||||
++ni;
|
||||
}
|
||||
return query_options(
|
||||
cl,
|
||||
std::move(bound_values),
|
||||
cql3::query_options::specific_options {
|
||||
.page_size = page_size,
|
||||
.state = {},
|
||||
.serial_consistency = db::consistency_level::SERIAL,
|
||||
.timestamp = api::missing_timestamp,
|
||||
.node_local_only = node_local_only
|
||||
});
|
||||
return query_options(cl, std::move(bound_values),
|
||||
cql3::query_options::specific_options{.page_size = page_size,
|
||||
.state = {},
|
||||
.serial_consistency = db::consistency_level::SERIAL,
|
||||
.timestamp = api::missing_timestamp,
|
||||
.node_local_only = node_local_only});
|
||||
}
|
||||
|
||||
statements::prepared_statement::checked_weak_ptr query_processor::prepare_internal(const sstring& query_string) {
|
||||
@@ -917,11 +885,7 @@ struct internal_query_state {
|
||||
};
|
||||
|
||||
internal_query_state query_processor::create_paged_state(
|
||||
const sstring& query_string,
|
||||
db::consistency_level cl,
|
||||
const data_value_list& values,
|
||||
int32_t page_size,
|
||||
std::optional<service::query_state> qs) {
|
||||
const sstring& query_string, db::consistency_level cl, const data_value_list& values, int32_t page_size, std::optional<service::query_state> qs) {
|
||||
auto p = prepare_internal(query_string);
|
||||
auto opts = make_internal_options(p, values, cl, page_size);
|
||||
if (!qs) {
|
||||
@@ -935,8 +899,7 @@ bool query_processor::has_more_results(cql3::internal_query_state& state) const
|
||||
}
|
||||
|
||||
future<> query_processor::for_each_cql_result(
|
||||
cql3::internal_query_state& state,
|
||||
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
|
||||
cql3::internal_query_state& state, noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
|
||||
do {
|
||||
auto msg = co_await execute_paged_internal(state);
|
||||
for (auto& row : *msg) {
|
||||
@@ -947,17 +910,18 @@ future<> query_processor::for_each_cql_result(
|
||||
} while (has_more_results(state));
|
||||
}
|
||||
|
||||
future<::shared_ptr<untyped_result_set>>
|
||||
query_processor::execute_paged_internal(internal_query_state& state) {
|
||||
future<::shared_ptr<untyped_result_set>> query_processor::execute_paged_internal(internal_query_state& state) {
|
||||
state.p->statement->validate(*this, service::client_state::for_internal_calls());
|
||||
::shared_ptr<cql_transport::messages::result_message> msg =
|
||||
co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
|
||||
::shared_ptr<cql_transport::messages::result_message> msg = co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
|
||||
|
||||
class visitor : public result_message::visitor_base {
|
||||
internal_query_state& _state;
|
||||
query_processor& _qp;
|
||||
|
||||
public:
|
||||
visitor(internal_query_state& state, query_processor& qp) : _state(state), _qp(qp) {
|
||||
visitor(internal_query_state& state, query_processor& qp)
|
||||
: _state(state)
|
||||
, _qp(qp) {
|
||||
}
|
||||
virtual ~visitor() = default;
|
||||
void visit(const result_message::rows& rmrs) override {
|
||||
@@ -986,23 +950,14 @@ query_processor::execute_paged_internal(internal_query_state& state) {
|
||||
co_return ::make_shared<untyped_result_set>(msg);
|
||||
}
|
||||
|
||||
future<::shared_ptr<untyped_result_set>>
|
||||
query_processor::execute_internal(
|
||||
const sstring& query_string,
|
||||
db::consistency_level cl,
|
||||
const data_value_list& values,
|
||||
cache_internal cache) {
|
||||
future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
|
||||
const sstring& query_string, db::consistency_level cl, const data_value_list& values, cache_internal cache) {
|
||||
auto qs = query_state_for_internal_call();
|
||||
co_return co_await execute_internal(query_string, cl, qs, values, cache);
|
||||
}
|
||||
|
||||
future<::shared_ptr<untyped_result_set>>
|
||||
query_processor::execute_internal(
|
||||
const sstring& query_string,
|
||||
db::consistency_level cl,
|
||||
service::query_state& query_state,
|
||||
const data_value_list& values,
|
||||
cache_internal cache) {
|
||||
future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
|
||||
const sstring& query_string, db::consistency_level cl, service::query_state& query_state, const data_value_list& values, cache_internal cache) {
|
||||
|
||||
if (log.is_enabled(logging::log_level::trace)) {
|
||||
log.trace("execute_internal: {}\"{}\" ({})", cache ? "(cached) " : "", query_string, fmt::join(values, ", "));
|
||||
@@ -1020,10 +975,7 @@ query_processor::execute_internal(
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
|
||||
const sstring query_string,
|
||||
service::query_state& query_state,
|
||||
api::timestamp_type timestamp,
|
||||
std::vector<data_value_or_unset> values) {
|
||||
const sstring query_string, service::query_state& query_state, api::timestamp_type timestamp, std::vector<data_value_or_unset> values) {
|
||||
log.debug("get_mutations_internal: \"{}\" ({})", query_string, fmt::join(values, ", "));
|
||||
auto stmt = prepare_internal(query_string);
|
||||
auto mod_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(stmt->statement);
|
||||
@@ -1041,12 +993,8 @@ future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
|
||||
co_return co_await mod_stmt->get_mutations(*this, opts, timeout, true, timestamp, query_state, json_cache, std::move(keys));
|
||||
}
|
||||
|
||||
future<::shared_ptr<untyped_result_set>>
|
||||
query_processor::execute_with_params(
|
||||
statements::prepared_statement::checked_weak_ptr p,
|
||||
db::consistency_level cl,
|
||||
service::query_state& query_state,
|
||||
const data_value_list& values) {
|
||||
future<::shared_ptr<untyped_result_set>> query_processor::execute_with_params(
|
||||
statements::prepared_statement::checked_weak_ptr p, db::consistency_level cl, service::query_state& query_state, const data_value_list& values) {
|
||||
auto opts = make_internal_options(p, values, cl);
|
||||
auto statement = p->statement;
|
||||
|
||||
@@ -1054,30 +1002,24 @@ query_processor::execute_with_params(
|
||||
co_return ::make_shared<untyped_result_set>(msg);
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::do_execute_with_params(
|
||||
service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement,
|
||||
const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
future<::shared_ptr<result_message>> query_processor::do_execute_with_params(
|
||||
service::query_state& query_state, shared_ptr<cql_statement> statement, const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
statement->validate(*this, service::client_state::for_internal_calls());
|
||||
co_return co_await coroutine::try_future(statement->execute(*this, query_state, options, std::move(guard)));
|
||||
}
|
||||
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
query_processor::execute_batch_without_checking_exception_message(
|
||||
::shared_ptr<statements::batch_statement> batch,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> query_processor::execute_batch_without_checking_exception_message(
|
||||
::shared_ptr<statements::batch_statement> batch, service::query_state& query_state, query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
|
||||
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) -> future<> {
|
||||
try {
|
||||
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
|
||||
} catch (...) {
|
||||
log.error("failed to cache the entry: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state](auto& e) -> future<> {
|
||||
try {
|
||||
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
|
||||
} catch (...) {
|
||||
log.error("failed to cache the entry: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
if (access_future.failed()) {
|
||||
@@ -1086,30 +1028,28 @@ query_processor::execute_batch_without_checking_exception_message(
|
||||
batch->validate();
|
||||
batch->validate(*this, query_state.get_client_state());
|
||||
_stats.queries_by_cl[size_t(options.get_consistency())] += batch->get_statements().size();
|
||||
if (log.is_enabled(logging::log_level::trace)) {
|
||||
if (log.is_enabled(logging::log_level::trace)) {
|
||||
std::ostringstream oss;
|
||||
for (const auto& s: batch->get_statements()) {
|
||||
oss << std::endl << s.statement->raw_cql_statement;
|
||||
for (const auto& s : batch->get_statements()) {
|
||||
oss << std::endl << s.statement->raw_cql_statement;
|
||||
}
|
||||
log.trace("execute_batch({}): {}", batch->get_statements().size(), oss.str());
|
||||
}
|
||||
co_return co_await batch->execute(*this, query_state, options, std::nullopt);
|
||||
}
|
||||
|
||||
future<service::broadcast_tables::query_result>
|
||||
query_processor::execute_broadcast_table_query(const service::broadcast_tables::query& query) {
|
||||
future<service::broadcast_tables::query_result> query_processor::execute_broadcast_table_query(const service::broadcast_tables::query& query) {
|
||||
auto [remote_, holder] = remote();
|
||||
co_return co_await service::broadcast_tables::execute(remote_.get().group0_client, query);
|
||||
}
|
||||
|
||||
future<query::mapreduce_result>
|
||||
query_processor::mapreduce(query::mapreduce_request req, tracing::trace_state_ptr tr_state) {
|
||||
future<query::mapreduce_result> query_processor::mapreduce(query::mapreduce_request req, tracing::trace_state_ptr tr_state) {
|
||||
auto [remote_, holder] = remote();
|
||||
co_return co_await remote_.get().mapreducer.dispatch(std::move(req), std::move(tr_state));
|
||||
}
|
||||
|
||||
future<::shared_ptr<messages::result_message>>
|
||||
query_processor::execute_schema_statement(const statements::schema_altering_statement& stmt, service::query_state& state, const query_options& options, service::group0_batch& mc) {
|
||||
future<::shared_ptr<messages::result_message>> query_processor::execute_schema_statement(
|
||||
const statements::schema_altering_statement& stmt, service::query_state& state, const query_options& options, service::group0_batch& mc) {
|
||||
if (this_shard_id() != 0) {
|
||||
on_internal_error(log, "DDL must be executed on shard 0");
|
||||
}
|
||||
@@ -1163,7 +1103,8 @@ future<> query_processor::announce_schema_statement(const statements::schema_alt
|
||||
co_await remote_.get().mm.announce(std::move(m), std::move(guard), description);
|
||||
}
|
||||
|
||||
query_processor::migration_subscriber::migration_subscriber(query_processor* qp) : _qp{qp} {
|
||||
query_processor::migration_subscriber::migration_subscriber(query_processor* qp)
|
||||
: _qp{qp} {
|
||||
}
|
||||
|
||||
void query_processor::migration_subscriber::on_create_keyspace(const sstring& ks_name) {
|
||||
@@ -1189,10 +1130,7 @@ void query_processor::migration_subscriber::on_create_view(const sstring& ks_nam
|
||||
void query_processor::migration_subscriber::on_update_keyspace(const sstring& ks_name) {
|
||||
}
|
||||
|
||||
void query_processor::migration_subscriber::on_update_column_family(
|
||||
const sstring& ks_name,
|
||||
const sstring& cf_name,
|
||||
bool columns_changed) {
|
||||
void query_processor::migration_subscriber::on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) {
|
||||
// #1255: Ignoring columns_changed deliberately.
|
||||
log.info("Column definitions for {}.{} changed, invalidating related prepared statements", ks_name, cf_name);
|
||||
remove_invalid_prepared_statements(ks_name, cf_name);
|
||||
@@ -1207,9 +1145,7 @@ void query_processor::migration_subscriber::on_update_function(const sstring& ks
|
||||
void query_processor::migration_subscriber::on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) {
|
||||
}
|
||||
|
||||
void query_processor::migration_subscriber::on_update_view(
|
||||
const sstring& ks_name,
|
||||
const sstring& view_name, bool columns_changed) {
|
||||
void query_processor::migration_subscriber::on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) {
|
||||
// scylladb/scylladb#16392 - Materialized views are also tables so we need at least handle
|
||||
// them as such when changed.
|
||||
on_update_column_family(ks_name, view_name, columns_changed);
|
||||
@@ -1238,39 +1174,28 @@ void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name,
|
||||
remove_invalid_prepared_statements(ks_name, view_name);
|
||||
}
|
||||
|
||||
void query_processor::migration_subscriber::remove_invalid_prepared_statements(
|
||||
sstring ks_name,
|
||||
std::optional<sstring> cf_name) {
|
||||
_qp->_prepared_cache.remove_if([&] (::shared_ptr<cql_statement> stmt) {
|
||||
void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::optional<sstring> cf_name) {
|
||||
_qp->_prepared_cache.remove_if([&](::shared_ptr<cql_statement> stmt) {
|
||||
return this->should_invalidate(ks_name, cf_name, stmt);
|
||||
});
|
||||
}
|
||||
|
||||
bool query_processor::migration_subscriber::should_invalidate(
|
||||
sstring ks_name,
|
||||
std::optional<sstring> cf_name,
|
||||
::shared_ptr<cql_statement> statement) {
|
||||
bool query_processor::migration_subscriber::should_invalidate(sstring ks_name, std::optional<sstring> cf_name, ::shared_ptr<cql_statement> statement) {
|
||||
return statement->depends_on(ks_name, cf_name);
|
||||
}
|
||||
|
||||
future<> query_processor::query_internal(
|
||||
const sstring& query_string,
|
||||
db::consistency_level cl,
|
||||
const data_value_list& values,
|
||||
int32_t page_size,
|
||||
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
|
||||
std::optional<service::query_state> qs) {
|
||||
future<> query_processor::query_internal(const sstring& query_string, db::consistency_level cl, const data_value_list& values, int32_t page_size,
|
||||
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f, std::optional<service::query_state> qs) {
|
||||
auto query_state = create_paged_state(query_string, cl, values, page_size, std::move(qs));
|
||||
co_return co_await for_each_cql_result(query_state, std::move(f));
|
||||
}
|
||||
|
||||
future<> query_processor::query_internal(
|
||||
const sstring& query_string,
|
||||
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
|
||||
future<> query_processor::query_internal(const sstring& query_string, noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
|
||||
return query_internal(query_string, db::consistency_level::ONE, {}, 1000, std::move(f));
|
||||
}
|
||||
|
||||
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_shard(unsigned shard, cql3::computed_function_values cached_fn_calls, bool track) {
|
||||
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_shard(
|
||||
unsigned shard, cql3::computed_function_values cached_fn_calls, bool track) {
|
||||
if (track) {
|
||||
_proxy.get_stats().replica_cross_shard_ops++;
|
||||
}
|
||||
@@ -1278,7 +1203,8 @@ shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_s
|
||||
return ::make_shared<cql_transport::messages::result_message::bounce>(my_host_id, shard, std::move(cached_fn_calls));
|
||||
}
|
||||
|
||||
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_node(locator::tablet_replica replica, cql3::computed_function_values cached_fn_calls, seastar::lowres_clock::time_point timeout, bool is_write) {
|
||||
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_node(
|
||||
locator::tablet_replica replica, cql3::computed_function_values cached_fn_calls, seastar::lowres_clock::time_point timeout, bool is_write) {
|
||||
get_cql_stats().forwarded_requests++;
|
||||
return ::make_shared<cql_transport::messages::result_message::bounce>(replica.host, replica.shard, std::move(cached_fn_calls), timeout, is_write);
|
||||
}
|
||||
@@ -1295,7 +1221,7 @@ void query_processor::update_authorized_prepared_cache_config() {
|
||||
utils::loading_cache_config cfg;
|
||||
cfg.max_size = _mcfg.authorized_prepared_cache_size;
|
||||
cfg.expiry = std::min(std::chrono::milliseconds(_db.get_config().permissions_validity_in_ms()),
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(prepared_statements_cache::entry_expiry));
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(prepared_statements_cache::entry_expiry));
|
||||
cfg.refresh = std::chrono::milliseconds(_db.get_config().permissions_update_interval_in_ms());
|
||||
|
||||
if (!_authorized_prepared_cache.update_config(std::move(cfg))) {
|
||||
@@ -1307,4 +1233,4 @@ void query_processor::reset_cache() {
|
||||
_authorized_prepared_cache.reset();
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace cql3
|
||||
|
||||
@@ -265,7 +265,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
|
||||
return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(
|
||||
exceptions::invalid_request_exception(
|
||||
format("Consistency level {} is not allowed for write operations", cl)));
|
||||
format("Write consistency level {} is forbidden by the current configuration "
|
||||
"setting of write_consistency_levels_disallowed. Please use a different "
|
||||
"consistency level, or remove {} from write_consistency_levels_disallowed "
|
||||
"set in the configuration.", cl, cl)));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < _statements.size(); ++i) {
|
||||
@@ -277,7 +280,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
_stats.statements_in_cas_batches += _statements.size();
|
||||
return execute_with_conditions(qp, options, query_state).then([guardrail_state, cl] (auto result) {
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
}
|
||||
return result;
|
||||
});
|
||||
@@ -297,7 +301,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
}
|
||||
auto result = make_shared<cql_transport::messages::result_message::void_message>();
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
}
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(result));
|
||||
});
|
||||
|
||||
@@ -59,6 +59,8 @@ const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
|
||||
|
||||
const sstring cf_prop_defs::KW_TABLETS = "tablets";
|
||||
|
||||
const sstring cf_prop_defs::KW_STORAGE_ENGINE = "storage_engine";
|
||||
|
||||
schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const {
|
||||
schema::extensions_map er;
|
||||
for (auto& p : exts.schema_extensions()) {
|
||||
@@ -106,6 +108,7 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
|
||||
KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS,
|
||||
KW_SYNCHRONOUS_UPDATES, KW_TABLETS,
|
||||
KW_STORAGE_ENGINE,
|
||||
});
|
||||
static std::set<sstring> obsolete_keywords({
|
||||
sstring("index_interval"),
|
||||
@@ -196,6 +199,20 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
}
|
||||
db::tablet_options::validate(*tablet_options_map);
|
||||
}
|
||||
|
||||
if (has_property(KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor") {
|
||||
if (!db.features().logstor) {
|
||||
throw exceptions::configuration_exception(format("The experimental feature 'logstor' must be enabled in order to use the 'logstor' storage engine."));
|
||||
}
|
||||
if (!db.get_config().enable_logstor()) {
|
||||
throw exceptions::configuration_exception(format("The configuration option 'enable_logstor' must be set to true in the configuration in order to use the 'logstor' storage engine."));
|
||||
}
|
||||
} else {
|
||||
throw exceptions::configuration_exception(format("Illegal value for '{}'", KW_STORAGE_ENGINE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::map<sstring, sstring> cf_prop_defs::get_compaction_type_options() const {
|
||||
@@ -396,6 +413,13 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
|
||||
if (auto tablet_options_opt = get_map(KW_TABLETS)) {
|
||||
builder.set_tablet_options(std::move(*tablet_options_opt));
|
||||
}
|
||||
|
||||
if (has_property(KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor") {
|
||||
builder.set_logstor();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const
|
||||
|
||||
@@ -64,6 +64,8 @@ public:
|
||||
|
||||
static const sstring KW_TABLETS;
|
||||
|
||||
static const sstring KW_STORAGE_ENGINE;
|
||||
|
||||
// FIXME: In origin the following consts are in CFMetaData.
|
||||
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
|
||||
static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128;
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
|
||||
#include "cql3/statements/cf_prop_defs.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include <inttypes.h>
|
||||
#include <boost/regex.hpp>
|
||||
@@ -266,6 +267,13 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
|
||||
stmt_warning("CREATE TABLE WITH COMPACT STORAGE is deprecated and will eventually be removed in a future version.");
|
||||
}
|
||||
|
||||
if (_properties.properties()->has_property(cf_prop_defs::KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = _properties.properties()->get_string(cf_prop_defs::KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor" && !_column_aliases.empty()) {
|
||||
throw exceptions::configuration_exception("The 'logstor' storage engine cannot be used with tables that have clustering columns");
|
||||
}
|
||||
}
|
||||
|
||||
auto& key_aliases = _key_aliases[0];
|
||||
std::vector<data_type> key_types;
|
||||
for (auto&& alias : key_aliases) {
|
||||
|
||||
@@ -273,7 +273,10 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
|
||||
co_return coroutine::exception(
|
||||
std::make_exception_ptr(exceptions::invalid_request_exception(
|
||||
format("Consistency level {} is not allowed for write operations", cl))));
|
||||
format("Write consistency level {} is forbidden by the current configuration "
|
||||
"setting of write_consistency_levels_disallowed. Please use a different "
|
||||
"consistency level, or remove {} from write_consistency_levels_disallowed "
|
||||
"set in the configuration.", cl, cl))));
|
||||
}
|
||||
|
||||
_restrictions->validate_primary_key(options);
|
||||
@@ -281,7 +284,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
if (has_conditions()) {
|
||||
auto result = co_await execute_with_condition(qp, qs, options);
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
}
|
||||
co_return result;
|
||||
}
|
||||
@@ -303,7 +307,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
|
||||
auto result = seastar::make_shared<cql_transport::messages::result_message::void_message>();
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
}
|
||||
if (keys_size_one) {
|
||||
auto&& table = s->table();
|
||||
|
||||
15
db/config.cc
15
db/config.cc
@@ -679,6 +679,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"The directory where hints files are stored if hinted handoff is enabled.")
|
||||
, view_hints_directory(this, "view_hints_directory", value_status::Used, "",
|
||||
"The directory where materialized-view updates are stored while a view replica is unreachable.")
|
||||
, logstor_directory(this, "logstor_directory", value_status::Used, "",
|
||||
"The directory where data files for logstor storage are stored.")
|
||||
, saved_caches_directory(this, "saved_caches_directory", value_status::Unused, "",
|
||||
"The directory location where table key and row caches are stored.")
|
||||
/**
|
||||
@@ -862,6 +864,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"* offheap_objects Native memory, eliminating NIO buffer heap overhead.")
|
||||
, memtable_cleanup_threshold(this, "memtable_cleanup_threshold", value_status::Invalid, .11,
|
||||
"Ratio of occupied non-flushing memtable size to total permitted size for triggering a flush of the largest memtable. Larger values mean larger flushes and less compaction, but also less concurrent flush activity, which can make it difficult to keep your disks saturated under heavy write load.")
|
||||
, logstor_disk_size_in_mb(this, "logstor_disk_size_in_mb", value_status::Used, 2048,
|
||||
"Total size in megabytes allocated for logstor storage on disk.")
|
||||
, logstor_file_size_in_mb(this, "logstor_file_size_in_mb", value_status::Used, 32,
|
||||
"Total size in megabytes allocated for each logstor data file on disk.")
|
||||
, logstor_separator_delay_limit_ms(this, "logstor_separator_delay_limit_ms", value_status::Used, 100,
|
||||
"Maximum delay in milliseconds for logstor separator debt control.")
|
||||
, logstor_separator_max_memory_in_mb(this, "logstor_separator_max_memory_in_mb", value_status::Used, 256,
|
||||
"Maximum memory in megabytes for logstor separator memory buffers.")
|
||||
, file_cache_size_in_mb(this, "file_cache_size_in_mb", value_status::Unused, 512,
|
||||
"Total memory to use for SSTable-reading buffers.")
|
||||
, memtable_flush_queue_size(this, "memtable_flush_queue_size", value_status::Unused, 4,
|
||||
@@ -1281,6 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, enable_in_memory_data_store(this, "enable_in_memory_data_store", value_status::Used, false, "Enable in memory mode (system tables are always persisted).")
|
||||
, enable_cache(this, "enable_cache", value_status::Used, true, "Enable cache.")
|
||||
, enable_commitlog(this, "enable_commitlog", value_status::Used, true, "Enable commitlog.")
|
||||
, enable_logstor(this, "enable_logstor", value_status::Used, false, "Enable the logstor storage engine.")
|
||||
, volatile_system_keyspace_for_testing(this, "volatile_system_keyspace_for_testing", value_status::Used, false, "Don't persist system keyspace - testing only!")
|
||||
, api_port(this, "api_port", value_status::Used, 10000, "Http Rest API port.")
|
||||
, api_address(this, "api_address", value_status::Used, "", "Http Rest API address.")
|
||||
@@ -1692,6 +1703,7 @@ void db::config::setup_directories() {
|
||||
maybe_in_workdir(data_file_directories, "data");
|
||||
maybe_in_workdir(hints_directory, "hints");
|
||||
maybe_in_workdir(view_hints_directory, "view_hints");
|
||||
maybe_in_workdir(logstor_directory, "logstor");
|
||||
maybe_in_workdir(saved_caches_directory, "saved_caches");
|
||||
}
|
||||
|
||||
@@ -1861,7 +1873,8 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
|
||||
{"keyspace-storage-options", feature::KEYSPACE_STORAGE_OPTIONS},
|
||||
{"tablets", feature::UNUSED},
|
||||
{"views-with-tablets", feature::UNUSED},
|
||||
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES}
|
||||
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES},
|
||||
{"logstor", feature::LOGSTOR}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -117,7 +117,8 @@ struct experimental_features_t {
|
||||
ALTERNATOR_STREAMS,
|
||||
BROADCAST_TABLES,
|
||||
KEYSPACE_STORAGE_OPTIONS,
|
||||
STRONGLY_CONSISTENT_TABLES
|
||||
STRONGLY_CONSISTENT_TABLES,
|
||||
LOGSTOR,
|
||||
};
|
||||
static std::map<sstring, feature> map(); // See enum_option.
|
||||
static std::vector<enum_option<experimental_features_t>> all();
|
||||
@@ -201,6 +202,7 @@ public:
|
||||
named_value<uint64_t> data_file_capacity;
|
||||
named_value<sstring> hints_directory;
|
||||
named_value<sstring> view_hints_directory;
|
||||
named_value<sstring> logstor_directory;
|
||||
named_value<sstring> saved_caches_directory;
|
||||
named_value<sstring> commit_failure_policy;
|
||||
named_value<sstring> disk_failure_policy;
|
||||
@@ -244,6 +246,10 @@ public:
|
||||
named_value<bool> defragment_memory_on_idle;
|
||||
named_value<sstring> memtable_allocation_type;
|
||||
named_value<double> memtable_cleanup_threshold;
|
||||
named_value<uint32_t> logstor_disk_size_in_mb;
|
||||
named_value<uint32_t> logstor_file_size_in_mb;
|
||||
named_value<uint32_t> logstor_separator_delay_limit_ms;
|
||||
named_value<uint32_t> logstor_separator_max_memory_in_mb;
|
||||
named_value<uint32_t> file_cache_size_in_mb;
|
||||
named_value<uint32_t> memtable_flush_queue_size;
|
||||
named_value<uint32_t> memtable_flush_writers;
|
||||
@@ -364,6 +370,7 @@ public:
|
||||
named_value<bool> enable_in_memory_data_store;
|
||||
named_value<bool> enable_cache;
|
||||
named_value<bool> enable_commitlog;
|
||||
named_value<bool> enable_logstor;
|
||||
named_value<bool> volatile_system_keyspace_for_testing;
|
||||
named_value<uint16_t> api_port;
|
||||
named_value<sstring> api_address;
|
||||
|
||||
@@ -214,7 +214,11 @@ void cache_tracker::clear() {
|
||||
}
|
||||
|
||||
void cache_tracker::touch(rows_entry& e) {
|
||||
_lru.touch(e);
|
||||
// last dummy may not be linked if evicted
|
||||
if (e.is_linked()) {
|
||||
_lru.remove(e);
|
||||
}
|
||||
_lru.add(e);
|
||||
}
|
||||
|
||||
void cache_tracker::insert(cache_entry& entry) {
|
||||
|
||||
@@ -63,15 +63,14 @@ namespace db {
|
||||
|
||||
namespace schema_tables {
|
||||
|
||||
static constexpr std::initializer_list<table_kind> all_table_kinds = {
|
||||
table_kind::table,
|
||||
table_kind::view
|
||||
};
|
||||
static constexpr std::initializer_list<table_kind> all_table_kinds = {table_kind::table, table_kind::view};
|
||||
|
||||
static schema_ptr get_table_holder(table_kind k) {
|
||||
switch (k) {
|
||||
case table_kind::table: return tables();
|
||||
case table_kind::view: return views();
|
||||
case table_kind::table:
|
||||
return tables();
|
||||
case table_kind::view:
|
||||
return views();
|
||||
}
|
||||
abort();
|
||||
}
|
||||
@@ -94,15 +93,18 @@ void table_selector::add(sstring name) {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace schema_tables
|
||||
|
||||
}
|
||||
} // namespace db
|
||||
|
||||
template <> struct fmt::formatter<db::schema_tables::table_kind> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
template <>
|
||||
struct fmt::formatter<db::schema_tables::table_kind> {
|
||||
constexpr auto parse(format_parse_context& ctx) {
|
||||
return ctx.begin();
|
||||
}
|
||||
auto format(db::schema_tables::table_kind k, fmt::format_context& ctx) const {
|
||||
switch (k) {
|
||||
using enum db::schema_tables::table_kind;
|
||||
using enum db::schema_tables::table_kind;
|
||||
case table:
|
||||
return fmt::format_to(ctx.out(), "table");
|
||||
case view:
|
||||
@@ -125,11 +127,8 @@ static std::optional<table_id> table_id_from_mutations(const schema_mutations& s
|
||||
return table_id(table_row.get_nonnull<utils::UUID>("id"));
|
||||
}
|
||||
|
||||
static
|
||||
future<std::map<table_id, schema_mutations>>
|
||||
read_tables_for_keyspaces(sharded<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names, table_kind kind,
|
||||
const std::unordered_map<sstring, table_selector>& tables_per_keyspace)
|
||||
{
|
||||
static future<std::map<table_id, schema_mutations>> read_tables_for_keyspaces(sharded<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names,
|
||||
table_kind kind, const std::unordered_map<sstring, table_selector>& tables_per_keyspace) {
|
||||
std::map<table_id, schema_mutations> result;
|
||||
for (auto&& [keyspace_name, sel] : tables_per_keyspace) {
|
||||
if (!sel.tables.contains(kind)) {
|
||||
@@ -149,32 +148,30 @@ read_tables_for_keyspaces(sharded<service::storage_proxy>& proxy, const std::set
|
||||
|
||||
// Extracts the names of tables affected by a schema mutation.
|
||||
// The mutation must target one of the tables in schema_tables_holding_schema_mutations().
|
||||
static
|
||||
table_selector get_affected_tables(const sstring& keyspace_name, const mutation& m) {
|
||||
static table_selector get_affected_tables(const sstring& keyspace_name, const mutation& m) {
|
||||
const schema& s = *m.schema();
|
||||
auto get_table_name = [&] (const clustering_key& ck) {
|
||||
auto get_table_name = [&](const clustering_key& ck) {
|
||||
// The first component of the clustering key in each table listed in
|
||||
// schema_tables_holding_schema_mutations contains the table name.
|
||||
return value_cast<sstring>(utf8_type->deserialize(ck.get_component(s, 0)));
|
||||
};
|
||||
table_selector result;
|
||||
if (m.partition().partition_tombstone()) {
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} contains a partition tombstone",
|
||||
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} contains a partition tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
result.all_in_keyspace = true;
|
||||
}
|
||||
for (auto&& e : m.partition().row_tombstones()) {
|
||||
const range_tombstone& rt = e.tombstone();
|
||||
if (rt.start.size(s) == 0 || rt.end.size(s) == 0) {
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} contains a multi-table range tombstone",
|
||||
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
slogger.trace(
|
||||
"Mutation of {}.{} for keyspace {} contains a multi-table range tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
result.all_in_keyspace = true;
|
||||
break;
|
||||
}
|
||||
auto table_name = get_table_name(rt.start);
|
||||
if (table_name != get_table_name(rt.end)) {
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} contains a multi-table range tombstone",
|
||||
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
slogger.trace(
|
||||
"Mutation of {}.{} for keyspace {} contains a multi-table range tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
result.all_in_keyspace = true;
|
||||
break;
|
||||
}
|
||||
@@ -183,16 +180,17 @@ table_selector get_affected_tables(const sstring& keyspace_name, const mutation&
|
||||
for (auto&& row : m.partition().clustered_rows()) {
|
||||
result.add(get_table_name(row.key()));
|
||||
}
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} affects tables: {}, all_in_keyspace: {}",
|
||||
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name, result.tables, result.all_in_keyspace);
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} affects tables: {}, all_in_keyspace: {}", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name,
|
||||
result.tables, result.all_in_keyspace);
|
||||
return result;
|
||||
}
|
||||
|
||||
future<schema_result>
|
||||
static read_schema_for_keyspaces(sharded<service::storage_proxy>& proxy, const sstring& schema_table_name, const std::set<sstring>& keyspace_names)
|
||||
{
|
||||
auto map = [&proxy, schema_table_name] (const sstring& keyspace_name) { return read_schema_partition_for_keyspace(proxy, schema_table_name, keyspace_name); };
|
||||
auto insert = [] (schema_result&& result, auto&& schema_entity) {
|
||||
future<schema_result> static read_schema_for_keyspaces(
|
||||
sharded<service::storage_proxy>& proxy, const sstring& schema_table_name, const std::set<sstring>& keyspace_names) {
|
||||
auto map = [&proxy, schema_table_name](const sstring& keyspace_name) {
|
||||
return read_schema_partition_for_keyspace(proxy, schema_table_name, keyspace_name);
|
||||
};
|
||||
auto insert = [](schema_result&& result, auto&& schema_entity) {
|
||||
if (!schema_entity.second->empty()) {
|
||||
result.insert(std::move(schema_entity));
|
||||
}
|
||||
@@ -202,11 +200,11 @@ static read_schema_for_keyspaces(sharded<service::storage_proxy>& proxy, const s
|
||||
}
|
||||
|
||||
// Returns names of live table definitions of given keyspace
|
||||
future<std::vector<sstring>>
|
||||
static read_table_names_of_keyspace(sharded<service::storage_proxy>& proxy, const sstring& keyspace_name, schema_ptr schema_table) {
|
||||
future<std::vector<sstring>> static read_table_names_of_keyspace(
|
||||
sharded<service::storage_proxy>& proxy, const sstring& keyspace_name, schema_ptr schema_table) {
|
||||
auto pkey = dht::decorate_key(*schema_table, partition_key::from_singular(*schema_table, keyspace_name));
|
||||
auto&& rs = co_await db::system_keyspace::query(proxy.local().get_db(), schema_table->ks_name(), schema_table->cf_name(), pkey);
|
||||
co_return rs->rows() | std::views::transform([schema_table] (const query::result_set_row& row) {
|
||||
co_return rs->rows() | std::views::transform([schema_table](const query::result_set_row& row) {
|
||||
const sstring name = schema_table->clustering_key_columns().begin()->name_as_text();
|
||||
return row.get_nonnull<sstring>(name);
|
||||
}) | std::ranges::to<std::vector>();
|
||||
@@ -242,8 +240,7 @@ static void maybe_delete_schema_version(mutation& m) {
|
||||
}
|
||||
}
|
||||
|
||||
future<> schema_applier::merge_keyspaces()
|
||||
{
|
||||
future<> schema_applier::merge_keyspaces() {
|
||||
/*
|
||||
* - we don't care about entriesOnlyOnLeft() or entriesInCommon(), because only the changes are of interest to us
|
||||
* - of all entriesOnlyOnRight(), we only care about ones that have live columns; it's possible to have a ColumnFamily
|
||||
@@ -280,21 +277,16 @@ future<> schema_applier::merge_keyspaces()
|
||||
for (auto& name : created) {
|
||||
slogger.info("Creating keyspace {}", name);
|
||||
auto sk_after_v = _after.scylla_keyspaces.contains(name) ? _after.scylla_keyspaces.at(name) : nullptr;
|
||||
auto ksm = co_await create_keyspace_metadata(
|
||||
schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
|
||||
auto ksm = co_await create_keyspace_metadata(schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
|
||||
_affected_keyspaces.created.push_back(
|
||||
co_await replica::database::prepare_create_keyspace_on_all_shards(
|
||||
sharded_db, _proxy, *ksm, _pending_token_metadata));
|
||||
co_await replica::database::prepare_create_keyspace_on_all_shards(sharded_db, _proxy, *ksm, _pending_token_metadata));
|
||||
_affected_keyspaces.names.created.insert(name);
|
||||
}
|
||||
for (auto& name : altered) {
|
||||
slogger.info("Altering keyspace {}", name);
|
||||
auto sk_after_v = _after.scylla_keyspaces.contains(name) ? _after.scylla_keyspaces.at(name) : nullptr;
|
||||
auto tmp_ksm = co_await create_keyspace_metadata(
|
||||
schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
|
||||
_affected_keyspaces.altered.push_back(
|
||||
co_await replica::database::prepare_update_keyspace_on_all_shards(
|
||||
sharded_db, *tmp_ksm, _pending_token_metadata));
|
||||
auto tmp_ksm = co_await create_keyspace_metadata(schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
|
||||
_affected_keyspaces.altered.push_back(co_await replica::database::prepare_update_keyspace_on_all_shards(sharded_db, *tmp_ksm, _pending_token_metadata));
|
||||
_affected_keyspaces.names.altered.insert(name);
|
||||
}
|
||||
for (auto& key : _affected_keyspaces.names.dropped) {
|
||||
@@ -327,7 +319,7 @@ static std::vector<column_definition> get_primary_key_definition(const schema_pt
|
||||
static std::vector<bytes> get_primary_key(const std::vector<column_definition>& primary_key, const query::result_set_row* row) {
|
||||
std::vector<bytes> key;
|
||||
for (const auto& column : primary_key) {
|
||||
const data_value *val = row->get_data_value(column.name_as_text());
|
||||
const data_value* val = row->get_data_value(column.name_as_text());
|
||||
key.push_back(val->serialize_nonnull());
|
||||
}
|
||||
return key;
|
||||
@@ -338,7 +330,7 @@ static std::map<std::vector<bytes>, const query::result_set_row*> build_row_map(
|
||||
const std::vector<query::result_set_row>& rows = result.rows();
|
||||
auto primary_key = get_primary_key_definition(result.schema());
|
||||
std::map<std::vector<bytes>, const query::result_set_row*> ret;
|
||||
for (const auto& row: rows) {
|
||||
for (const auto& row : rows) {
|
||||
auto key = get_primary_key(primary_key, &row);
|
||||
ret.insert(std::pair(std::move(key), &row));
|
||||
}
|
||||
@@ -391,8 +383,8 @@ struct aggregate_diff {
|
||||
std::vector<std::pair<const query::result_set_row*, const query::result_set_row*>> dropped;
|
||||
};
|
||||
|
||||
static aggregate_diff diff_aggregates_rows(const schema_result& aggr_before, const schema_result& aggr_after,
|
||||
const schema_result& scylla_aggr_before, const schema_result& scylla_aggr_after) {
|
||||
static aggregate_diff diff_aggregates_rows(
|
||||
const schema_result& aggr_before, const schema_result& aggr_after, const schema_result& scylla_aggr_before, const schema_result& scylla_aggr_after) {
|
||||
using map = std::map<std::vector<bytes>, const query::result_set_row*>;
|
||||
auto aggr_diff = difference(aggr_before, aggr_after, indirect_equal_to<lw_shared_ptr<query::result_set>>());
|
||||
|
||||
@@ -436,15 +428,11 @@ static aggregate_diff diff_aggregates_rows(const schema_result& aggr_before, con
|
||||
|
||||
for (const auto& k : diff.entries_only_on_left) {
|
||||
auto entry = scylla_aggr_rows_before.find(k);
|
||||
dropped.push_back({
|
||||
aggr_before_rows.find(k)->second, (entry != scylla_aggr_rows_before.end()) ? entry->second : nullptr
|
||||
});
|
||||
dropped.push_back({aggr_before_rows.find(k)->second, (entry != scylla_aggr_rows_before.end()) ? entry->second : nullptr});
|
||||
}
|
||||
for (const auto& k : diff.entries_only_on_right) {
|
||||
auto entry = scylla_aggr_rows_after.find(k);
|
||||
created.push_back({
|
||||
aggr_after_rows.find(k)->second, (entry != scylla_aggr_rows_after.end()) ? entry->second : nullptr
|
||||
});
|
||||
created.push_back({aggr_after_rows.find(k)->second, (entry != scylla_aggr_rows_after.end()) ? entry->second : nullptr});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -452,11 +440,10 @@ static aggregate_diff diff_aggregates_rows(const schema_result& aggr_before, con
|
||||
}
|
||||
|
||||
// see the comments for merge_keyspaces()
|
||||
future<> schema_applier::merge_types()
|
||||
{
|
||||
future<> schema_applier::merge_types() {
|
||||
auto diff = diff_rows(_before.types, _after.types);
|
||||
co_await _affected_user_types.start();
|
||||
co_await _affected_user_types.invoke_on_all([&] (affected_user_types_per_shard& af) mutable -> future<> {
|
||||
co_await _affected_user_types.invoke_on_all([&](affected_user_types_per_shard& af) mutable -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
|
||||
std::map<sstring, std::reference_wrapper<replica::keyspace>> new_keyspaces_per_shard;
|
||||
@@ -478,16 +465,12 @@ future<> schema_applier::merge_types()
|
||||
// version of view to "before" version of base table and "after" to "after"
|
||||
// respectively.
|
||||
enum class schema_diff_side {
|
||||
left, // old, before
|
||||
left, // old, before
|
||||
right, // new, after
|
||||
};
|
||||
|
||||
static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>& proxy,
|
||||
const std::map<table_id, schema_mutations>& before,
|
||||
const std::map<table_id, schema_mutations>& after,
|
||||
bool reload,
|
||||
noncopyable_function<schema_ptr (schema_mutations sm, schema_diff_side)> create_schema)
|
||||
{
|
||||
static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>& proxy, const std::map<table_id, schema_mutations>& before,
|
||||
const std::map<table_id, schema_mutations>& after, bool reload, noncopyable_function<schema_ptr(schema_mutations sm, schema_diff_side)> create_schema) {
|
||||
schema_diff_per_shard d;
|
||||
auto diff = difference(before, after);
|
||||
for (auto&& key : diff.entries_only_on_left) {
|
||||
@@ -507,10 +490,10 @@ static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>&
|
||||
d.altered.emplace_back(schema_diff_per_shard::altered_schema{s_before, s});
|
||||
}
|
||||
if (reload) {
|
||||
for (auto&& key: diff.entries_in_common) {
|
||||
for (auto&& key : diff.entries_in_common) {
|
||||
auto s = create_schema(std::move(after.at(key)), schema_diff_side::right);
|
||||
slogger.info("Reloading {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
|
||||
d.altered.emplace_back(schema_diff_per_shard::altered_schema {s, s});
|
||||
d.altered.emplace_back(schema_diff_per_shard::altered_schema{s, s});
|
||||
}
|
||||
}
|
||||
return d;
|
||||
@@ -524,7 +507,9 @@ static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>&
|
||||
constexpr size_t max_concurrent = 8;
|
||||
|
||||
|
||||
in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(replica::database& db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) : _stored_user_types(db.as_user_types_storage()) {
|
||||
in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(
|
||||
replica::database& db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types)
|
||||
: _stored_user_types(db.as_user_types_storage()) {
|
||||
// initialize metadata for new keyspaces
|
||||
for (auto& ks_per_shard : affected_keyspaces.created) {
|
||||
auto metadata = ks_per_shard[this_shard_id()]->metadata();
|
||||
@@ -552,7 +537,7 @@ in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(replica
|
||||
auto& ks_name = type->_keyspace;
|
||||
_in_progress_types[ks_name].remove_type(type);
|
||||
}
|
||||
for (const auto &ks_name : affected_keyspaces.names.dropped) {
|
||||
for (const auto& ks_name : affected_keyspaces.names.dropped) {
|
||||
// can't reference a type when it's keyspace is being dropped
|
||||
_in_progress_types[ks_name] = data_dictionary::user_types_metadata();
|
||||
}
|
||||
@@ -570,8 +555,9 @@ std::shared_ptr<data_dictionary::user_types_storage> in_progress_types_storage_p
|
||||
return _stored_user_types;
|
||||
}
|
||||
|
||||
future<> in_progress_types_storage::init(sharded<replica::database>& sharded_db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) {
|
||||
co_await sharded_db.invoke_on_all([&] (replica::database& db) {
|
||||
future<> in_progress_types_storage::init(
|
||||
sharded<replica::database>& sharded_db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) {
|
||||
co_await sharded_db.invoke_on_all([&](replica::database& db) {
|
||||
shards[this_shard_id()] = make_foreign(seastar::make_shared<in_progress_types_storage_per_shard>(db, affected_keyspaces, affected_types));
|
||||
});
|
||||
}
|
||||
@@ -585,8 +571,7 @@ in_progress_types_storage_per_shard& in_progress_types_storage::local() {
|
||||
// that when a base schema and a subset of its views are modified together (i.e.,
|
||||
// upon an alter table or alter type statement), then they are published together
|
||||
// as well, without any deferring in-between.
|
||||
future<> schema_applier::merge_tables_and_views()
|
||||
{
|
||||
future<> schema_applier::merge_tables_and_views() {
|
||||
auto& user_types = _types_storage.local();
|
||||
co_await _affected_tables_and_views.tables_and_views.start();
|
||||
|
||||
@@ -597,10 +582,10 @@ future<> schema_applier::merge_tables_and_views()
|
||||
|
||||
// Create CDC tables before non-CDC base tables, because we want the base tables with CDC enabled
|
||||
// to point to their CDC tables.
|
||||
local_cdc = diff_table_or_view(_proxy, _before.cdc, _after.cdc, _reload, [&] (schema_mutations sm, schema_diff_side) {
|
||||
local_cdc = diff_table_or_view(_proxy, _before.cdc, _after.cdc, _reload, [&](schema_mutations sm, schema_diff_side) {
|
||||
return create_table_from_mutations(_proxy, std::move(sm), user_types, nullptr);
|
||||
});
|
||||
local_tables = diff_table_or_view(_proxy, _before.tables, _after.tables, _reload, [&] (schema_mutations sm, schema_diff_side side) {
|
||||
local_tables = diff_table_or_view(_proxy, _before.tables, _after.tables, _reload, [&](schema_mutations sm, schema_diff_side side) {
|
||||
// If the table has CDC enabled, find the CDC schema version and set it in the table schema.
|
||||
// If the table is created or altered with CDC enabled, then the CDC
|
||||
// table is also created or altered in the same operation, so we can
|
||||
@@ -636,7 +621,7 @@ future<> schema_applier::merge_tables_and_views()
|
||||
|
||||
return create_table_from_mutations(_proxy, std::move(sm), user_types, cdc_schema);
|
||||
});
|
||||
local_views = diff_table_or_view(_proxy, _before.views, _after.views, _reload, [&] (schema_mutations sm, schema_diff_side side) {
|
||||
local_views = diff_table_or_view(_proxy, _before.views, _after.views, _reload, [&](schema_mutations sm, schema_diff_side side) {
|
||||
// The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
|
||||
// If we don't do it we are leaving a window where write commands to this schema are illegal.
|
||||
// There are 3 possibilities:
|
||||
@@ -683,31 +668,26 @@ future<> schema_applier::merge_tables_and_views()
|
||||
frozen_schema_diff tables_frozen = co_await local_tables.freeze();
|
||||
frozen_schema_diff cdc_frozen = co_await local_cdc.freeze();
|
||||
frozen_schema_diff views_frozen = co_await local_views.freeze();
|
||||
co_await _affected_tables_and_views.tables_and_views.invoke_on_others([this, &tables_frozen, &cdc_frozen, &views_frozen] (affected_tables_and_views_per_shard& tables_and_views) -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
tables_and_views.tables = co_await schema_diff_per_shard::copy_from(
|
||||
db, _types_storage, tables_frozen);
|
||||
tables_and_views.cdc = co_await schema_diff_per_shard::copy_from(
|
||||
db, _types_storage, cdc_frozen);
|
||||
tables_and_views.views = co_await schema_diff_per_shard::copy_from(
|
||||
db, _types_storage, views_frozen);
|
||||
});
|
||||
co_await _affected_tables_and_views.tables_and_views.invoke_on_others(
|
||||
[this, &tables_frozen, &cdc_frozen, &views_frozen](affected_tables_and_views_per_shard& tables_and_views) -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
tables_and_views.tables = co_await schema_diff_per_shard::copy_from(db, _types_storage, tables_frozen);
|
||||
tables_and_views.cdc = co_await schema_diff_per_shard::copy_from(db, _types_storage, cdc_frozen);
|
||||
tables_and_views.views = co_await schema_diff_per_shard::copy_from(db, _types_storage, views_frozen);
|
||||
});
|
||||
|
||||
auto& db = _proxy.local().get_db();
|
||||
co_await max_concurrent_for_each(local_views.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
|
||||
co_await max_concurrent_for_each(local_views.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
|
||||
auto uuid = dt->id();
|
||||
_affected_tables_and_views.table_shards.insert({uuid,
|
||||
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
});
|
||||
co_await max_concurrent_for_each(local_tables.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
|
||||
co_await max_concurrent_for_each(local_tables.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
|
||||
auto uuid = dt->id();
|
||||
_affected_tables_and_views.table_shards.insert({uuid,
|
||||
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
});
|
||||
co_await max_concurrent_for_each(local_cdc.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
|
||||
co_await max_concurrent_for_each(local_cdc.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
|
||||
auto uuid = dt->id();
|
||||
_affected_tables_and_views.table_shards.insert({uuid,
|
||||
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -719,8 +699,8 @@ future<frozen_schema_diff> schema_diff_per_shard::freeze() const {
|
||||
}
|
||||
for (const auto& a : altered) {
|
||||
result.altered.push_back(frozen_schema_diff::altered_schema{
|
||||
.old_schema = extended_frozen_schema(a.old_schema),
|
||||
.new_schema = extended_frozen_schema(a.new_schema),
|
||||
.old_schema = extended_frozen_schema(a.old_schema),
|
||||
.new_schema = extended_frozen_schema(a.new_schema),
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
@@ -743,8 +723,8 @@ future<schema_diff_per_shard> schema_diff_per_shard::copy_from(replica::database
|
||||
}
|
||||
for (const auto& a : oth.altered) {
|
||||
result.altered.push_back(schema_diff_per_shard::altered_schema{
|
||||
.old_schema = a.old_schema.unfreeze(commited_ctxt),
|
||||
.new_schema = a.new_schema.unfreeze(ctxt),
|
||||
.old_schema = a.old_schema.unfreeze(commited_ctxt),
|
||||
.new_schema = a.new_schema.unfreeze(ctxt),
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
@@ -758,7 +738,7 @@ future<schema_diff_per_shard> schema_diff_per_shard::copy_from(replica::database
|
||||
|
||||
static future<> notify_tables_and_views(service::migration_notifier& notifier, const affected_tables_and_views& diff) {
|
||||
auto it = diff.tables_and_views.local().columns_changed.cbegin();
|
||||
auto notify = [&] (auto& r, auto&& f) -> future<> {
|
||||
auto notify = [&](auto& r, auto&& f) -> future<> {
|
||||
co_await max_concurrent_for_each(r, max_concurrent, std::move(f));
|
||||
};
|
||||
|
||||
@@ -767,24 +747,41 @@ static future<> notify_tables_and_views(service::migration_notifier& notifier, c
|
||||
const auto& views = diff.tables_and_views.local().views;
|
||||
|
||||
// View drops are notified first, because a table can only be dropped if its views are already deleted
|
||||
co_await notify(views.dropped, [&] (auto&& dt) { return notifier.drop_view(view_ptr(dt)); });
|
||||
co_await notify(tables.dropped, [&] (auto&& dt) { return notifier.drop_column_family(dt); });
|
||||
co_await notify(cdc.dropped, [&] (auto&& dt) { return notifier.drop_column_family(dt); });
|
||||
co_await notify(views.dropped, [&](auto&& dt) {
|
||||
return notifier.drop_view(view_ptr(dt));
|
||||
});
|
||||
co_await notify(tables.dropped, [&](auto&& dt) {
|
||||
return notifier.drop_column_family(dt);
|
||||
});
|
||||
co_await notify(cdc.dropped, [&](auto&& dt) {
|
||||
return notifier.drop_column_family(dt);
|
||||
});
|
||||
// Table creations are notified first, in case a view is created right after the table
|
||||
co_await notify(tables.created, [&] (auto&& gs) { return notifier.create_column_family(gs); });
|
||||
co_await notify(cdc.created, [&] (auto&& gs) { return notifier.create_column_family(gs); });
|
||||
co_await notify(views.created, [&] (auto&& gs) { return notifier.create_view(view_ptr(gs)); });
|
||||
co_await notify(tables.created, [&](auto&& gs) {
|
||||
return notifier.create_column_family(gs);
|
||||
});
|
||||
co_await notify(cdc.created, [&](auto&& gs) {
|
||||
return notifier.create_column_family(gs);
|
||||
});
|
||||
co_await notify(views.created, [&](auto&& gs) {
|
||||
return notifier.create_view(view_ptr(gs));
|
||||
});
|
||||
// Table altering is notified first, in case new base columns appear
|
||||
co_await notify(tables.altered, [&] (auto&& altered) { return notifier.update_column_family(altered.new_schema, *it++); });
|
||||
co_await notify(cdc.altered, [&] (auto&& altered) { return notifier.update_column_family(altered.new_schema, *it++); });
|
||||
co_await notify(views.altered, [&] (auto&& altered) { return notifier.update_view(view_ptr(altered.new_schema), *it++); });
|
||||
co_await notify(tables.altered, [&](auto&& altered) {
|
||||
return notifier.update_column_family(altered.new_schema, *it++);
|
||||
});
|
||||
co_await notify(cdc.altered, [&](auto&& altered) {
|
||||
return notifier.update_column_family(altered.new_schema, *it++);
|
||||
});
|
||||
co_await notify(views.altered, [&](auto&& altered) {
|
||||
return notifier.update_view(view_ptr(altered.new_schema), *it++);
|
||||
});
|
||||
}
|
||||
|
||||
static void drop_cached_func(replica::database& db, const query::result_set_row& row) {
|
||||
auto language = row.get_nonnull<sstring>("language");
|
||||
if (language == "wasm") {
|
||||
cql3::functions::function_name name{
|
||||
row.get_nonnull<sstring>("keyspace_name"), row.get_nonnull<sstring>("function_name")};
|
||||
cql3::functions::function_name name{row.get_nonnull<sstring>("keyspace_name"), row.get_nonnull<sstring>("function_name")};
|
||||
auto arg_types = read_arg_types(row, name.keyspace, db.user_types());
|
||||
db.lang().remove(name, arg_types);
|
||||
}
|
||||
@@ -793,14 +790,13 @@ static void drop_cached_func(replica::database& db, const query::result_set_row&
|
||||
future<> schema_applier::merge_functions() {
|
||||
auto diff = diff_rows(_before.functions, _after.functions);
|
||||
co_await _functions_batch.start();
|
||||
co_await _functions_batch.invoke_on_all(coroutine::lambda([&] (cql3::functions::change_batch& batch) -> future<> {
|
||||
co_await _functions_batch.invoke_on_all(coroutine::lambda([&](cql3::functions::change_batch& batch) -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
for (const auto& val : diff.created) {
|
||||
batch.add_function(co_await create_func(db, *val, _types_storage.local()));
|
||||
}
|
||||
for (const auto& val : diff.dropped) {
|
||||
cql3::functions::function_name name{
|
||||
val->get_nonnull<sstring>("keyspace_name"), val->get_nonnull<sstring>("function_name")};
|
||||
cql3::functions::function_name name{val->get_nonnull<sstring>("keyspace_name"), val->get_nonnull<sstring>("function_name")};
|
||||
auto commited_storage = _types_storage.local().committed_storage();
|
||||
auto arg_types = read_arg_types(*val, name.keyspace, *commited_storage);
|
||||
// as we don't yield between dropping cache and committing batch
|
||||
@@ -818,14 +814,13 @@ future<> schema_applier::merge_functions() {
|
||||
future<> schema_applier::merge_aggregates() {
|
||||
auto diff = diff_aggregates_rows(_before.aggregates, _after.aggregates, _before.scylla_aggregates, _after.scylla_aggregates);
|
||||
|
||||
co_await _functions_batch.invoke_on_all([&] (cql3::functions::change_batch& batch)-> future<> {
|
||||
co_await _functions_batch.invoke_on_all([&](cql3::functions::change_batch& batch) -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
for (const auto& val : diff.created) {
|
||||
batch.add_function(create_aggregate(db, *val.first, val.second, batch, _types_storage.local()));
|
||||
}
|
||||
for (const auto& val : diff.dropped) {
|
||||
cql3::functions::function_name name{
|
||||
val.first->get_nonnull<sstring>("keyspace_name"), val.first->get_nonnull<sstring>("aggregate_name")};
|
||||
cql3::functions::function_name name{val.first->get_nonnull<sstring>("keyspace_name"), val.first->get_nonnull<sstring>("aggregate_name")};
|
||||
auto commited_storage = _types_storage.local().committed_storage();
|
||||
auto arg_types = read_arg_types(*val.first, name.keyspace, *commited_storage);
|
||||
batch.remove_aggregate(name, arg_types);
|
||||
@@ -860,15 +855,15 @@ future<schema_persisted_state> schema_applier::get_schema_persisted_state() {
|
||||
auto [tables, cdc] = extract_cdc(std::move(tables_and_cdc));
|
||||
|
||||
schema_persisted_state v{
|
||||
.keyspaces = co_await read_schema_for_keyspaces(_proxy, KEYSPACES, _keyspaces),
|
||||
.scylla_keyspaces = co_await read_schema_for_keyspaces(_proxy, SCYLLA_KEYSPACES, _keyspaces),
|
||||
.tables = std::move(tables),
|
||||
.types = co_await read_schema_for_keyspaces(_proxy, TYPES, _keyspaces),
|
||||
.views = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::view, _affected_tables),
|
||||
.cdc = std::move(cdc),
|
||||
.functions = co_await read_schema_for_keyspaces(_proxy, FUNCTIONS, _keyspaces),
|
||||
.aggregates = co_await read_schema_for_keyspaces(_proxy, AGGREGATES, _keyspaces),
|
||||
.scylla_aggregates = co_await read_schema_for_keyspaces(_proxy, SCYLLA_AGGREGATES, _keyspaces),
|
||||
.keyspaces = co_await read_schema_for_keyspaces(_proxy, KEYSPACES, _keyspaces),
|
||||
.scylla_keyspaces = co_await read_schema_for_keyspaces(_proxy, SCYLLA_KEYSPACES, _keyspaces),
|
||||
.tables = std::move(tables),
|
||||
.types = co_await read_schema_for_keyspaces(_proxy, TYPES, _keyspaces),
|
||||
.views = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::view, _affected_tables),
|
||||
.cdc = std::move(cdc),
|
||||
.functions = co_await read_schema_for_keyspaces(_proxy, FUNCTIONS, _keyspaces),
|
||||
.aggregates = co_await read_schema_for_keyspaces(_proxy, AGGREGATES, _keyspaces),
|
||||
.scylla_aggregates = co_await read_schema_for_keyspaces(_proxy, SCYLLA_AGGREGATES, _keyspaces),
|
||||
};
|
||||
co_return v;
|
||||
}
|
||||
@@ -924,10 +919,11 @@ class pending_schema_getter : public service::schema_getter {
|
||||
private:
|
||||
schema_applier& _sa;
|
||||
sharded<replica::database>& _db;
|
||||
|
||||
public:
|
||||
pending_schema_getter(schema_applier& sa) :
|
||||
_sa(sa), _db(sa._proxy.local().get_db()) {
|
||||
};
|
||||
pending_schema_getter(schema_applier& sa)
|
||||
: _sa(sa)
|
||||
, _db(sa._proxy.local().get_db()) {};
|
||||
|
||||
virtual flat_hash_map<sstring, locator::replication_strategy_ptr> get_keyspaces_replication() const override {
|
||||
flat_hash_map<sstring, locator::replication_strategy_ptr> out;
|
||||
@@ -989,8 +985,7 @@ future<> schema_applier::update_tablets() {
|
||||
if (_tablet_hint) {
|
||||
slogger.info("Tablet metadata changed");
|
||||
pending_schema_getter getter{*this};
|
||||
_token_metadata_change = co_await _ss.local().prepare_token_metadata_change(
|
||||
_pending_token_metadata.local(), getter);
|
||||
_token_metadata_change = co_await _ss.local().prepare_token_metadata_change(_pending_token_metadata.local(), getter);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -999,8 +994,7 @@ future<> schema_applier::update_tablets() {
|
||||
future<> schema_applier::load_mutable_token_metadata() {
|
||||
locator::mutable_token_metadata_ptr current_token_metadata = co_await _ss.local().get_mutable_token_metadata_ptr();
|
||||
if (_tablet_hint) {
|
||||
auto new_token_metadata = co_await _ss.local().prepare_tablet_metadata(
|
||||
_tablet_hint, current_token_metadata);
|
||||
auto new_token_metadata = co_await _ss.local().prepare_tablet_metadata(_tablet_hint, current_token_metadata);
|
||||
co_return co_await _pending_token_metadata.assign(new_token_metadata);
|
||||
}
|
||||
co_await _pending_token_metadata.assign(current_token_metadata);
|
||||
@@ -1115,14 +1109,13 @@ future<> schema_applier::commit() {
|
||||
// However, we can only acquire the (write) lock after preparing all
|
||||
// entities for the pending schema change that need to iterate over tables_metadata;
|
||||
// otherwise, such iteration would deadlock.
|
||||
_metadata_locks = std::make_unique<replica::tables_metadata_lock_on_all_shards>(
|
||||
co_await replica::database::lock_tables_metadata(sharded_db));
|
||||
_metadata_locks = std::make_unique<replica::tables_metadata_lock_on_all_shards>(co_await replica::database::lock_tables_metadata(sharded_db));
|
||||
// Run func first on shard 0
|
||||
// to allow "seeding" of the effective_replication_map
|
||||
// with a new e_r_m instance.
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
commit_on_shard(sharded_db.local());
|
||||
co_await sharded_db.invoke_on_others([this] (replica::database& db) {
|
||||
co_await sharded_db.invoke_on_others([this](replica::database& db) {
|
||||
commit_on_shard(db);
|
||||
});
|
||||
// unlock as some functions in post_commit() may read data under those locks
|
||||
@@ -1154,12 +1147,11 @@ future<> schema_applier::finalize_tables_and_views() {
|
||||
|
||||
if (_tablet_hint) {
|
||||
auto& db = sharded_db.local();
|
||||
co_await db.get_compaction_manager().get_shared_tombstone_gc_state().
|
||||
flush_pending_repair_time_update(db);
|
||||
co_await db.get_compaction_manager().get_shared_tombstone_gc_state().flush_pending_repair_time_update(db);
|
||||
_ss.local().wake_up_topology_state_machine();
|
||||
}
|
||||
|
||||
co_await sharded_db.invoke_on_all([&diff] (replica::database& db) -> future<> {
|
||||
co_await sharded_db.invoke_on_all([&diff](replica::database& db) -> future<> {
|
||||
const auto& tables = diff.tables_and_views.local().tables;
|
||||
const auto& cdc = diff.tables_and_views.local().cdc;
|
||||
const auto& views = diff.tables_and_views.local().views;
|
||||
@@ -1184,15 +1176,14 @@ future<> schema_applier::finalize_tables_and_views() {
|
||||
//
|
||||
// Drop column mapping entries for dropped tables since these will not be TTLed automatically
|
||||
// and will stay there forever if we don't clean them up manually
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.created, max_concurrent, [this] (const schema_ptr& gs) -> future<> {
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.created, max_concurrent, [this](const schema_ptr& gs) -> future<> {
|
||||
co_await store_column_mapping(_proxy, gs, false);
|
||||
});
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.altered, max_concurrent, [this] (const schema_diff_per_shard::altered_schema& altered) -> future<> {
|
||||
co_await when_all_succeed(
|
||||
store_column_mapping(_proxy, altered.old_schema, true),
|
||||
store_column_mapping(_proxy, altered.new_schema, false));
|
||||
});
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.dropped, max_concurrent, [this] (const schema_ptr& s) -> future<> {
|
||||
co_await max_concurrent_for_each(
|
||||
diff.tables_and_views.local().tables.altered, max_concurrent, [this](const schema_diff_per_shard::altered_schema& altered) -> future<> {
|
||||
co_await when_all_succeed(store_column_mapping(_proxy, altered.old_schema, true), store_column_mapping(_proxy, altered.new_schema, false));
|
||||
});
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.dropped, max_concurrent, [this](const schema_ptr& s) -> future<> {
|
||||
co_await drop_column_mapping(_sys_ks.local(), s->id(), s->version());
|
||||
});
|
||||
}
|
||||
@@ -1200,7 +1191,7 @@ future<> schema_applier::finalize_tables_and_views() {
|
||||
future<> schema_applier::post_commit() {
|
||||
co_await finalize_tables_and_views();
|
||||
auto& sharded_db = _proxy.local().get_db();
|
||||
co_await sharded_db.invoke_on_all([&] (replica::database& db) -> future<> {
|
||||
co_await sharded_db.invoke_on_all([&](replica::database& db) -> future<> {
|
||||
auto& notifier = db.get_notifier();
|
||||
// notify about keyspaces
|
||||
for (const auto& name : _affected_keyspaces.names.created) {
|
||||
@@ -1260,8 +1251,8 @@ static future<> execute_do_merge_schema(sharded<service::storage_proxy>& proxy,
|
||||
co_await ap.post_commit();
|
||||
}
|
||||
|
||||
static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, sharded<db::system_keyspace>& sys_ks, utils::chunked_vector<mutation> mutations, bool reload)
|
||||
{
|
||||
static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, sharded<db::system_keyspace>& sys_ks,
|
||||
utils::chunked_vector<mutation> mutations, bool reload) {
|
||||
slogger.trace("do_merge_schema: {}", mutations);
|
||||
schema_applier ap(proxy, ss, sys_ks, reload);
|
||||
co_await execute_do_merge_schema(proxy, ap, std::move(mutations)).finally([&ap]() {
|
||||
@@ -1278,22 +1269,22 @@ static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded
|
||||
* @throws ConfigurationException If one of metadata attributes has invalid value
|
||||
* @throws IOException If data was corrupted during transportation or failed to apply fs operations
|
||||
*/
|
||||
future<> merge_schema(sharded<db::system_keyspace>& sys_ks, sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, utils::chunked_vector<mutation> mutations, bool reload)
|
||||
{
|
||||
future<> merge_schema(sharded<db::system_keyspace>& sys_ks, sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss,
|
||||
utils::chunked_vector<mutation> mutations, bool reload) {
|
||||
if (this_shard_id() != 0) {
|
||||
// mutations must be applied on the owning shard (0).
|
||||
co_await smp::submit_to(0, coroutine::lambda([&, fmuts = freeze(mutations)] () mutable -> future<> {
|
||||
co_await smp::submit_to(0, coroutine::lambda([&, fmuts = freeze(mutations)]() mutable -> future<> {
|
||||
co_await merge_schema(sys_ks, proxy, ss, co_await unfreeze_gently(fmuts), reload);
|
||||
}));
|
||||
co_return;
|
||||
}
|
||||
co_await with_merge_lock([&] () mutable -> future<> {
|
||||
co_await with_merge_lock([&]() mutable -> future<> {
|
||||
co_await do_merge_schema(proxy, ss, sys_ks, std::move(mutations), reload);
|
||||
auto version = co_await get_group0_schema_version(sys_ks.local());
|
||||
co_await update_schema_version_and_announce(sys_ks, proxy, version);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace schema_tables
|
||||
|
||||
}
|
||||
} // namespace db
|
||||
|
||||
@@ -336,6 +336,8 @@ schema_ptr scylla_tables(schema_features features) {
|
||||
// since it is written to only after the cluster feature is enabled.
|
||||
sb.with_column("tablets", map_type_impl::get_instance(utf8_type, utf8_type, false));
|
||||
|
||||
sb.with_column("storage_engine", utf8_type);
|
||||
|
||||
sb.with_hash_version();
|
||||
s = sb.build();
|
||||
}
|
||||
@@ -1676,6 +1678,9 @@ mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type times
|
||||
m.set_clustered_cell(ckey, cdef, make_map_mutation(map, cdef, timestamp));
|
||||
}
|
||||
}
|
||||
if (table->logstor_enabled()) {
|
||||
m.set_clustered_cell(ckey, "storage_engine", "logstor", timestamp);
|
||||
}
|
||||
// In-memory tables are deprecated since scylla-2024.1.0
|
||||
// FIXME: delete the column when there's no live version supporting it anymore.
|
||||
// Writing it here breaks upgrade rollback to versions that do not support the in_memory schema_feature
|
||||
@@ -2161,6 +2166,13 @@ static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, sche
|
||||
auto tablet_options = db::tablet_options(*opt_map);
|
||||
builder.set_tablet_options(tablet_options.to_map());
|
||||
}
|
||||
if (auto storage_engine = table_row.get<sstring>("storage_engine")) {
|
||||
if (*storage_engine == "logstor") {
|
||||
builder.set_logstor();
|
||||
} else {
|
||||
throw std::invalid_argument(format("Invalid value for storage_engine: {}", *storage_engine));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, const data_dictionary::user_types_storage& user_types, schema_ptr cdc_schema, std::optional<table_schema_version> version)
|
||||
|
||||
@@ -39,10 +39,19 @@ snapshot_ctl::snapshot_ctl(sharded<replica::database>& db, sharded<service::stor
|
||||
}
|
||||
|
||||
future<> snapshot_ctl::stop() {
|
||||
co_await _ops.close();
|
||||
co_await disable_all_operations();
|
||||
co_await _task_manager_module->stop();
|
||||
}
|
||||
|
||||
future<> snapshot_ctl::disable_all_operations() {
|
||||
if (!_ops.is_closed()) {
|
||||
if (_ops.get_count()) {
|
||||
snap_log.info("Waiting for snapshot/backup tasks to finish");
|
||||
}
|
||||
co_await _ops.close();
|
||||
}
|
||||
}
|
||||
|
||||
future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter) {
|
||||
auto& ks = _db.local().find_keyspace(ks_name);
|
||||
return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name), filter = std::move(filter)] (auto& pair) {
|
||||
|
||||
@@ -120,6 +120,8 @@ public:
|
||||
|
||||
future<int64_t> true_snapshots_size();
|
||||
future<int64_t> true_snapshots_size(sstring ks, sstring cf);
|
||||
|
||||
future<> disable_all_operations();
|
||||
private:
|
||||
config _config;
|
||||
sharded<replica::database>& _db;
|
||||
|
||||
@@ -3052,7 +3052,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
const bool strongly_consistent_tables = _db.features().strongly_consistent_tables;
|
||||
const bool tablet_balancing_not_supported = _db.features().strongly_consistent_tables || _db.features().logstor;
|
||||
|
||||
for (auto& row : *rs) {
|
||||
if (!row.has("host_id")) {
|
||||
@@ -3289,7 +3289,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
ret.session = service::session_id(some_row.get_as<utils::UUID>("session"));
|
||||
}
|
||||
|
||||
if (strongly_consistent_tables) {
|
||||
if (tablet_balancing_not_supported) {
|
||||
ret.tablet_balancing_enabled = false;
|
||||
} else if (some_row.has("tablet_balancing_enabled")) {
|
||||
ret.tablet_balancing_enabled = some_row.get_as<bool>("tablet_balancing_enabled");
|
||||
|
||||
@@ -2647,7 +2647,7 @@ future<> view_builder::add_new_view(view_ptr view, build_step& step) {
|
||||
}
|
||||
|
||||
if (this_shard_id() == smp::count - 1) {
|
||||
co_await utils::get_local_injector().inject("add_new_view_pause_last_shard", utils::wait_for_message(5min));
|
||||
inject_failure("add_new_view_fail_last_shard");
|
||||
}
|
||||
|
||||
co_await _sys_ks.register_view_for_building(view->ks_name(), view->cf_name(), step.current_token());
|
||||
|
||||
@@ -29,8 +29,8 @@ static logging::logger blogger("boot_strapper");
|
||||
|
||||
namespace dht {
|
||||
|
||||
future<> boot_strapper::bootstrap(streaming::stream_reason reason, gms::gossiper& gossiper, service::frozen_topology_guard topo_guard,
|
||||
locator::host_id replace_address) {
|
||||
future<> boot_strapper::bootstrap(
|
||||
streaming::stream_reason reason, gms::gossiper& gossiper, service::frozen_topology_guard topo_guard, locator::host_id replace_address) {
|
||||
blogger.debug("Beginning bootstrap process: sorted_tokens={}", get_token_metadata().sorted_tokens());
|
||||
sstring description;
|
||||
if (reason == streaming::stream_reason::bootstrap) {
|
||||
@@ -41,7 +41,8 @@ future<> boot_strapper::bootstrap(streaming::stream_reason reason, gms::gossiper
|
||||
throw std::runtime_error("Wrong stream_reason provided: it can only be replace or bootstrap");
|
||||
}
|
||||
try {
|
||||
auto streamer = make_lw_shared<range_streamer>(_db, _stream_manager, _token_metadata_ptr, _abort_source, _tokens, _address, _dr, description, reason, topo_guard);
|
||||
auto streamer = make_lw_shared<range_streamer>(
|
||||
_db, _stream_manager, _token_metadata_ptr, _abort_source, _tokens, _address, _dr, description, reason, topo_guard);
|
||||
auto nodes_to_filter = gossiper.get_unreachable_members();
|
||||
if (reason == streaming::stream_reason::replace) {
|
||||
nodes_to_filter.insert(std::move(replace_address));
|
||||
@@ -71,7 +72,8 @@ std::unordered_set<token> boot_strapper::get_random_bootstrap_tokens(const token
|
||||
}
|
||||
|
||||
if (num_tokens == 1) {
|
||||
blogger.warn("Picking random token for a single vnode. You should probably add more vnodes; failing that, you should probably specify the token manually");
|
||||
blogger.warn(
|
||||
"Picking random token for a single vnode. You should probably add more vnodes; failing that, you should probably specify the token manually");
|
||||
}
|
||||
|
||||
auto tokens = get_random_tokens(std::move(tmptr), num_tokens);
|
||||
@@ -86,7 +88,8 @@ std::unordered_set<token> boot_strapper::get_bootstrap_tokens(token_metadata_ptr
|
||||
return get_bootstrap_tokens(std::move(tmptr), cfg.initial_token(), cfg.num_tokens(), check);
|
||||
}
|
||||
|
||||
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(const token_metadata_ptr tmptr, sstring tokens_string, uint32_t num_tokens, check_token_endpoint check) {
|
||||
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(
|
||||
const token_metadata_ptr tmptr, sstring tokens_string, uint32_t num_tokens, check_token_endpoint check) {
|
||||
std::unordered_set<sstring> initial_tokens;
|
||||
try {
|
||||
boost::split(initial_tokens, tokens_string, boost::is_any_of(sstring(", ")));
|
||||
@@ -102,7 +105,8 @@ std::unordered_set<token> boot_strapper::get_bootstrap_tokens(const token_metada
|
||||
for (auto& token_string : initial_tokens) {
|
||||
auto token = dht::token::from_sstring(token_string);
|
||||
if (check && tmptr->get_endpoint(token)) {
|
||||
throw std::runtime_error(format("Bootstrapping to existing token {} is not allowed (decommission/removenode the old node first).", token_string));
|
||||
throw std::runtime_error(
|
||||
format("Bootstrapping to existing token {} is not allowed (decommission/removenode the old node first).", token_string));
|
||||
}
|
||||
tokens.insert(token);
|
||||
}
|
||||
|
||||
@@ -26,10 +26,9 @@ static logging::logger logger("range_streamer");
|
||||
|
||||
using inet_address = gms::inet_address;
|
||||
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector>
|
||||
range_streamer::get_range_fetch_map(const std::unordered_map<dht::token_range, std::vector<locator::host_id>>& ranges_with_sources,
|
||||
const std::unordered_set<std::unique_ptr<i_source_filter>>& source_filters,
|
||||
const sstring& keyspace) {
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector> range_streamer::get_range_fetch_map(
|
||||
const std::unordered_map<dht::token_range, std::vector<locator::host_id>>& ranges_with_sources,
|
||||
const std::unordered_set<std::unique_ptr<i_source_filter>>& source_filters, const sstring& keyspace) {
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map_map;
|
||||
const auto& topo = _token_metadata_ptr->get_topology();
|
||||
for (const auto& x : ranges_with_sources) {
|
||||
@@ -79,8 +78,8 @@ range_streamer::get_range_fetch_map(const std::unordered_map<dht::token_range, s
|
||||
}
|
||||
|
||||
// Must be called from a seastar thread
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>>
|
||||
range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges) {
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_streamer::get_all_ranges_with_sources_for(
|
||||
const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges) {
|
||||
logger.debug("{} ks={}", __func__, keyspace_name);
|
||||
|
||||
auto range_addresses = erm->get_range_host_ids().get();
|
||||
@@ -114,24 +113,24 @@ range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, co
|
||||
}
|
||||
|
||||
// Must be called from a seastar thread
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>>
|
||||
range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges, gms::gossiper& gossiper) {
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_streamer::get_all_ranges_with_strict_sources_for(
|
||||
const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges, gms::gossiper& gossiper) {
|
||||
logger.debug("{} ks={}", __func__, keyspace_name);
|
||||
SCYLLA_ASSERT (_tokens.empty() == false);
|
||||
SCYLLA_ASSERT(_tokens.empty() == false);
|
||||
|
||||
auto& strat = erm->get_replication_strategy();
|
||||
|
||||
//Active ranges
|
||||
// Active ranges
|
||||
auto metadata_clone = get_token_metadata().clone_only_token_map().get();
|
||||
auto range_addresses = strat.get_range_host_ids(metadata_clone).get();
|
||||
|
||||
//Pending ranges
|
||||
// Pending ranges
|
||||
metadata_clone.update_topology(_address, _dr);
|
||||
metadata_clone.update_normal_tokens(_tokens, _address).get();
|
||||
auto pending_range_addresses = strat.get_range_host_ids(metadata_clone).get();
|
||||
auto pending_range_addresses = strat.get_range_host_ids(metadata_clone).get();
|
||||
metadata_clone.clear_gently().get();
|
||||
|
||||
//Collects the source that will have its range moved to the new node
|
||||
// Collects the source that will have its range moved to the new node
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_sources;
|
||||
|
||||
logger.debug("keyspace={}, desired_ranges.size={}, range_addresses.size={}", keyspace_name, desired_ranges.size(), range_addresses.size());
|
||||
@@ -150,11 +149,12 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n
|
||||
}
|
||||
|
||||
std::unordered_set<locator::host_id> new_endpoints(it->second.begin(), it->second.end());
|
||||
//Due to CASSANDRA-5953 we can have a higher RF then we have endpoints.
|
||||
//So we need to be careful to only be strict when endpoints == RF
|
||||
// Due to CASSANDRA-5953 we can have a higher RF then we have endpoints.
|
||||
// So we need to be careful to only be strict when endpoints == RF
|
||||
if (old_endpoints.size() == erm->get_replication_factor()) {
|
||||
std::erase_if(old_endpoints,
|
||||
[&new_endpoints] (locator::host_id ep) { return new_endpoints.contains(ep); });
|
||||
std::erase_if(old_endpoints, [&new_endpoints](locator::host_id ep) {
|
||||
return new_endpoints.contains(ep);
|
||||
});
|
||||
if (old_endpoints.size() != 1) {
|
||||
throw std::runtime_error(format("Expected 1 endpoint but found {:d}", old_endpoints.size()));
|
||||
}
|
||||
@@ -163,7 +163,7 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n
|
||||
}
|
||||
}
|
||||
|
||||
//Validate
|
||||
// Validate
|
||||
auto it = range_sources.find(desired_range);
|
||||
if (it == range_sources.end()) {
|
||||
throw std::runtime_error(format("No sources found for {}", desired_range));
|
||||
@@ -176,7 +176,9 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n
|
||||
locator::host_id source_id = it->second.front();
|
||||
|
||||
if (gossiper.is_enabled() && !gossiper.is_alive(source_id)) {
|
||||
throw std::runtime_error(format("A node required to move the data consistently is down ({}). If you wish to move the data from a potentially inconsistent replica, restart the node with consistent_rangemovement=false", source_id));
|
||||
throw std::runtime_error(format("A node required to move the data consistently is down ({}). If you wish to move the data from a potentially "
|
||||
"inconsistent replica, restart the node with consistent_rangemovement=false",
|
||||
source_id));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -188,12 +190,8 @@ bool range_streamer::use_strict_sources_for_ranges(const sstring& keyspace_name,
|
||||
auto nr_nodes_in_ring = get_token_metadata().get_normal_token_owners().size();
|
||||
bool everywhere_topology = erm.get_replication_strategy().get_type() == locator::replication_strategy_type::everywhere_topology;
|
||||
// Use strict when number of nodes in the ring is equal or more than RF
|
||||
auto strict = _db.local().get_config().consistent_rangemovement()
|
||||
&& !_tokens.empty()
|
||||
&& !everywhere_topology
|
||||
&& nr_nodes_in_ring >= rf;
|
||||
logger.debug("use_strict_sources_for_ranges: ks={}, nr_nodes_in_ring={}, rf={}, strict={}",
|
||||
keyspace_name, nr_nodes_in_ring, rf, strict);
|
||||
auto strict = _db.local().get_config().consistent_rangemovement() && !_tokens.empty() && !everywhere_topology && nr_nodes_in_ring >= rf;
|
||||
logger.debug("use_strict_sources_for_ranges: ks={}, nr_nodes_in_ring={}, rf={}, strict={}", keyspace_name, nr_nodes_in_ring, rf, strict);
|
||||
return strict;
|
||||
}
|
||||
|
||||
@@ -214,34 +212,36 @@ void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_
|
||||
}
|
||||
|
||||
// TODO: This is the legacy range_streamer interface, it is add_rx_ranges which adds rx ranges.
|
||||
future<> range_streamer::add_ranges(const sstring& keyspace_name, locator::static_effective_replication_map_ptr erm, dht::token_range_vector ranges, gms::gossiper& gossiper, bool is_replacing) {
|
||||
return seastar::async([this, keyspace_name, ermp = std::move(erm), ranges= std::move(ranges), &gossiper, is_replacing] () mutable {
|
||||
if (_nr_tx_added) {
|
||||
throw std::runtime_error("Mixed sending and receiving is not supported");
|
||||
}
|
||||
_nr_rx_added++;
|
||||
auto erm = ermp->maybe_as_vnode_effective_replication_map();
|
||||
SCYLLA_ASSERT(erm != nullptr);
|
||||
auto ranges_for_keyspace = !is_replacing && use_strict_sources_for_ranges(keyspace_name, *erm)
|
||||
? get_all_ranges_with_strict_sources_for(keyspace_name, erm, std::move(ranges), gossiper)
|
||||
: get_all_ranges_with_sources_for(keyspace_name, erm, std::move(ranges));
|
||||
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
for (auto& x : ranges_for_keyspace) {
|
||||
logger.debug("{} : keyspace {} range {} exists on {}", _description, keyspace_name, x.first, x.second);
|
||||
future<> range_streamer::add_ranges(const sstring& keyspace_name, locator::static_effective_replication_map_ptr erm, dht::token_range_vector ranges,
|
||||
gms::gossiper& gossiper, bool is_replacing) {
|
||||
return seastar::async([this, keyspace_name, ermp = std::move(erm), ranges = std::move(ranges), &gossiper, is_replacing]() mutable {
|
||||
if (_nr_tx_added) {
|
||||
throw std::runtime_error("Mixed sending and receiving is not supported");
|
||||
}
|
||||
}
|
||||
_nr_rx_added++;
|
||||
auto erm = ermp->maybe_as_vnode_effective_replication_map();
|
||||
SCYLLA_ASSERT(erm != nullptr);
|
||||
auto ranges_for_keyspace = !is_replacing && use_strict_sources_for_ranges(keyspace_name, *erm)
|
||||
? get_all_ranges_with_strict_sources_for(keyspace_name, erm, std::move(ranges), gossiper)
|
||||
: get_all_ranges_with_sources_for(keyspace_name, erm, std::move(ranges));
|
||||
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map = get_range_fetch_map(ranges_for_keyspace, _source_filters, keyspace_name);
|
||||
utils::clear_gently(ranges_for_keyspace).get();
|
||||
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
for (auto& x : range_fetch_map) {
|
||||
logger.debug("{} : keyspace={}, ranges={} from source={}, range_size={}", _description, keyspace_name, x.second, x.first, x.second.size());
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
for (auto& x : ranges_for_keyspace) {
|
||||
logger.debug("{} : keyspace {} range {} exists on {}", _description, keyspace_name, x.first, x.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
|
||||
});
|
||||
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map =
|
||||
get_range_fetch_map(ranges_for_keyspace, _source_filters, keyspace_name);
|
||||
utils::clear_gently(ranges_for_keyspace).get();
|
||||
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
for (auto& x : range_fetch_map) {
|
||||
logger.debug("{} : keyspace={}, ranges={} from source={}, range_size={}", _description, keyspace_name, x.second, x.first, x.second.size());
|
||||
}
|
||||
}
|
||||
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
|
||||
});
|
||||
}
|
||||
|
||||
future<> range_streamer::stream_async() {
|
||||
@@ -250,73 +250,73 @@ future<> range_streamer::stream_async() {
|
||||
_token_metadata_ptr = nullptr;
|
||||
logger.info("{} starts, nr_ranges_remaining={}", _description, _nr_ranges_remaining);
|
||||
auto start = lowres_clock::now();
|
||||
return do_for_each(_to_stream, [this, description = _description] (auto& stream) {
|
||||
return do_for_each(_to_stream, [this, description = _description](auto& stream) {
|
||||
const auto& keyspace = stream.first;
|
||||
auto& ip_range_vec = stream.second;
|
||||
auto ips = ip_range_vec | std::views::keys | std::ranges::to<std::list>();
|
||||
// Fetch from or send to peer node in parallel
|
||||
logger.info("{} with {} for keyspace={} started, nodes_to_stream={}", description, ips, keyspace, ip_range_vec.size());
|
||||
return parallel_for_each(ip_range_vec, [this, description, keyspace] (auto& ip_range) {
|
||||
auto& source = ip_range.first;
|
||||
auto& range_vec = ip_range.second;
|
||||
return seastar::with_semaphore(_limiter, 1, [this, description, keyspace, source, &range_vec] () mutable {
|
||||
return seastar::async([this, description, keyspace, source, &range_vec] () mutable {
|
||||
// TODO: It is better to use fiber instead of thread here because
|
||||
// creating a thread per peer can be some memory in a large cluster.
|
||||
auto start_time = lowres_clock::now();
|
||||
unsigned sp_index = 0;
|
||||
unsigned nr_ranges_streamed = 0;
|
||||
size_t nr_ranges_total = range_vec.size();
|
||||
auto do_streaming = [&] (dht::token_range_vector&& ranges_to_stream) {
|
||||
auto sp = stream_plan(_stream_manager.local(), format("{}-{}-index-{:d}", description, keyspace, sp_index++),
|
||||
_reason, _topo_guard);
|
||||
auto abort_listener = _abort_source.subscribe([&] () noexcept { sp.abort(); });
|
||||
_abort_source.check();
|
||||
logger.info("{} with {} for keyspace={}, streaming [{}, {}) out of {} ranges",
|
||||
description, source, keyspace,
|
||||
nr_ranges_streamed, nr_ranges_streamed + ranges_to_stream.size(), nr_ranges_total);
|
||||
auto ranges_streamed = ranges_to_stream.size();
|
||||
if (_nr_rx_added) {
|
||||
sp.request_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
|
||||
} else if (_nr_tx_added) {
|
||||
sp.transfer_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
|
||||
}
|
||||
sp.execute().discard_result().get();
|
||||
// Update finished percentage
|
||||
nr_ranges_streamed += ranges_streamed;
|
||||
_nr_ranges_remaining -= ranges_streamed;
|
||||
float percentage = _nr_total_ranges == 0 ? 1 : (_nr_total_ranges - _nr_ranges_remaining) / (float)_nr_total_ranges;
|
||||
_stream_manager.local().update_finished_percentage(_reason, percentage);
|
||||
logger.info("Finished {} out of {} ranges for {}, finished percentage={}",
|
||||
_nr_total_ranges - _nr_ranges_remaining, _nr_total_ranges, _reason, percentage);
|
||||
};
|
||||
dht::token_range_vector ranges_to_stream;
|
||||
try {
|
||||
for (auto it = range_vec.begin(); it < range_vec.end();) {
|
||||
ranges_to_stream.push_back(*it);
|
||||
++it;
|
||||
auto fraction = _db.local().get_config().stream_plan_ranges_fraction();
|
||||
size_t nr_ranges_per_stream_plan = nr_ranges_total * fraction;
|
||||
if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
|
||||
continue;
|
||||
} else {
|
||||
do_streaming(std::exchange(ranges_to_stream, {}));
|
||||
it = range_vec.erase(range_vec.begin(), it);
|
||||
return parallel_for_each(ip_range_vec, [this, description, keyspace](auto& ip_range) {
|
||||
auto& source = ip_range.first;
|
||||
auto& range_vec = ip_range.second;
|
||||
return seastar::with_semaphore(_limiter, 1, [this, description, keyspace, source, &range_vec]() mutable {
|
||||
return seastar::async([this, description, keyspace, source, &range_vec]() mutable {
|
||||
// TODO: It is better to use fiber instead of thread here because
|
||||
// creating a thread per peer can be some memory in a large cluster.
|
||||
auto start_time = lowres_clock::now();
|
||||
unsigned sp_index = 0;
|
||||
unsigned nr_ranges_streamed = 0;
|
||||
size_t nr_ranges_total = range_vec.size();
|
||||
auto do_streaming = [&](dht::token_range_vector&& ranges_to_stream) {
|
||||
auto sp = stream_plan(_stream_manager.local(), format("{}-{}-index-{:d}", description, keyspace, sp_index++), _reason, _topo_guard);
|
||||
auto abort_listener = _abort_source.subscribe([&]() noexcept {
|
||||
sp.abort();
|
||||
});
|
||||
_abort_source.check();
|
||||
logger.info("{} with {} for keyspace={}, streaming [{}, {}) out of {} ranges", description, source, keyspace, nr_ranges_streamed,
|
||||
nr_ranges_streamed + ranges_to_stream.size(), nr_ranges_total);
|
||||
auto ranges_streamed = ranges_to_stream.size();
|
||||
if (_nr_rx_added) {
|
||||
sp.request_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
|
||||
} else if (_nr_tx_added) {
|
||||
sp.transfer_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
|
||||
}
|
||||
sp.execute().discard_result().get();
|
||||
// Update finished percentage
|
||||
nr_ranges_streamed += ranges_streamed;
|
||||
_nr_ranges_remaining -= ranges_streamed;
|
||||
float percentage = _nr_total_ranges == 0 ? 1 : (_nr_total_ranges - _nr_ranges_remaining) / (float)_nr_total_ranges;
|
||||
_stream_manager.local().update_finished_percentage(_reason, percentage);
|
||||
logger.info("Finished {} out of {} ranges for {}, finished percentage={}", _nr_total_ranges - _nr_ranges_remaining, _nr_total_ranges,
|
||||
_reason, percentage);
|
||||
};
|
||||
dht::token_range_vector ranges_to_stream;
|
||||
try {
|
||||
for (auto it = range_vec.begin(); it < range_vec.end();) {
|
||||
ranges_to_stream.push_back(*it);
|
||||
++it;
|
||||
auto fraction = _db.local().get_config().stream_plan_ranges_fraction();
|
||||
size_t nr_ranges_per_stream_plan = nr_ranges_total * fraction;
|
||||
if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
|
||||
continue;
|
||||
} else {
|
||||
do_streaming(std::exchange(ranges_to_stream, {}));
|
||||
it = range_vec.erase(range_vec.begin(), it);
|
||||
}
|
||||
}
|
||||
if (ranges_to_stream.size() > 0) {
|
||||
do_streaming(std::exchange(ranges_to_stream, {}));
|
||||
range_vec.clear();
|
||||
}
|
||||
} catch (...) {
|
||||
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
|
||||
logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
|
||||
throw;
|
||||
}
|
||||
if (ranges_to_stream.size() > 0) {
|
||||
do_streaming(std::exchange(ranges_to_stream, {}));
|
||||
range_vec.clear();
|
||||
}
|
||||
} catch (...) {
|
||||
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
|
||||
logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
|
||||
throw;
|
||||
}
|
||||
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
|
||||
logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
|
||||
});
|
||||
});
|
||||
logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
|
||||
});
|
||||
});
|
||||
});
|
||||
}).finally([this, start] {
|
||||
auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start).count();
|
||||
@@ -344,4 +344,4 @@ size_t range_streamer::nr_ranges_to_stream() {
|
||||
return nr_ranges_remaining;
|
||||
}
|
||||
|
||||
} // dht
|
||||
} // namespace dht
|
||||
|
||||
63
dht/token.hh
63
dht/token.hh
@@ -30,6 +30,31 @@ enum class token_kind {
|
||||
after_all_keys,
|
||||
};
|
||||
|
||||
// Represents a token for partition keys.
|
||||
// Has a disengaged state, which sorts before all engaged states.
|
||||
struct raw_token {
|
||||
int64_t value;
|
||||
|
||||
/// Constructs a disengaged token.
|
||||
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
|
||||
|
||||
/// Constructs an engaged token.
|
||||
/// The token must be of token_kind::key kind.
|
||||
explicit raw_token(const token&);
|
||||
|
||||
explicit raw_token(int64_t v) : value(v) {};
|
||||
|
||||
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
|
||||
std::strong_ordering operator<=>(const token& o) const noexcept;
|
||||
|
||||
/// Returns true iff engaged.
|
||||
explicit operator bool() const noexcept {
|
||||
return value != std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
};
|
||||
|
||||
using raw_token_opt = seastar::optimized_optional<raw_token>;
|
||||
|
||||
class token {
|
||||
// INT64_MIN is not a legal token, but a special value used to represent
|
||||
// infinity in token intervals.
|
||||
@@ -52,6 +77,10 @@ public:
|
||||
|
||||
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
|
||||
|
||||
token(raw_token raw) noexcept
|
||||
: token(raw ? kind::key : kind::before_all_keys, raw.value)
|
||||
{ }
|
||||
|
||||
// This constructor seems redundant with the bytes_view constructor, but
|
||||
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
|
||||
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
|
||||
@@ -223,6 +252,29 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
raw_token::raw_token(const token& t)
|
||||
: value(t.raw())
|
||||
{
|
||||
#ifdef DEBUG
|
||||
assert(t._kind == token::kind::key);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline
|
||||
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
|
||||
switch (o._kind) {
|
||||
case token::kind::after_all_keys:
|
||||
return std::strong_ordering::less;
|
||||
case token::kind::before_all_keys:
|
||||
// before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
|
||||
// So we can order them by just comparing raw values.
|
||||
[[fallthrough]];
|
||||
case token::kind::key:
|
||||
return value <=> o._data;
|
||||
}
|
||||
}
|
||||
|
||||
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
|
||||
if (l1 == l2) {
|
||||
return std::strong_ordering::equal;
|
||||
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
|
||||
template <typename FormatContext>
|
||||
auto format(const dht::raw_token& t, FormatContext& ctx) const {
|
||||
if (!t) {
|
||||
return fmt::format_to(ctx.out(), "null");
|
||||
}
|
||||
return fmt::format_to(ctx.out(), "{}", t.value);
|
||||
}
|
||||
};
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
|
||||
2
dist/common/sysconfig/scylla-node-exporter
vendored
2
dist/common/sysconfig/scylla-node-exporter
vendored
@@ -1 +1 @@
|
||||
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
|
||||
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --collector.systemd --collector.systemd.unit-include='^(scylla-server|systemd-coredump.*)\.service$' --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
|
||||
|
||||
@@ -139,7 +139,7 @@ The ``WHERE`` clause
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``WHERE`` clause specifies which rows must be queried. It is composed of relations on the columns that are part of
|
||||
the ``PRIMARY KEY``.
|
||||
the ``PRIMARY KEY``, and relations can be joined only with ``AND`` (``OR`` and other logical operators are not supported).
|
||||
|
||||
Not all relations are allowed in a query. For instance, non-equal relations (where ``IN`` is considered as an equal
|
||||
relation) on a partition key are not supported (see the use of the ``TOKEN`` method below to do non-equal queries on
|
||||
@@ -200,6 +200,23 @@ The tuple notation may also be used for ``IN`` clauses on clustering columns::
|
||||
WHERE userid = 'john doe'
|
||||
AND (blog_title, posted_at) IN (('John''s Blog', '2012-01-01'), ('Extreme Chess', '2014-06-01'))
|
||||
|
||||
This tuple notation is different from boolean grouping. For example, the following query is not supported::
|
||||
|
||||
SELECT * FROM users
|
||||
WHERE (country = 'BR' AND state = 'SP')
|
||||
|
||||
because parentheses are only allowed around a single relation, so this works: ``(country = 'BR') AND (state = 'SP')``, but this does not: ``(country = 'BR' AND state = 'SP')``.
|
||||
Similarly, an extended query of the form of::
|
||||
|
||||
SELECT * FROM users
|
||||
WHERE (country = 'BR' AND state = 'SP')
|
||||
OR (country = 'BR' AND state = 'RJ')
|
||||
|
||||
won't work due to both: grouping boolean expressions and not supporting ``OR``, so when possible,
|
||||
rewrite such queries with ``IN`` on the varying column, for example
|
||||
``country = 'BR' AND state IN ('SP', 'RJ')``, or run multiple queries and merge
|
||||
the results client-side.
|
||||
|
||||
The ``CONTAINS`` operator may only be used on collection columns (lists, sets, and maps). In the case of maps,
|
||||
``CONTAINS`` applies to the map values. The ``CONTAINS KEY`` operator may only be used on map columns and applies to the
|
||||
map keys.
|
||||
@@ -292,8 +309,8 @@ For example::
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
|
||||
or columns provided in a definition of the index.
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
|
||||
See :ref:`WHERE <where-clause>`.
|
||||
|
||||
For example::
|
||||
|
||||
@@ -301,10 +318,6 @@ For example::
|
||||
WHERE user_id = 'user123'
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.
|
||||
|
||||
Other filtering scenarios are currently not supported.
|
||||
|
||||
.. note::
|
||||
|
||||
Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.
|
||||
|
||||
236
docs/cql/guardrails.rst
Normal file
236
docs/cql/guardrails.rst
Normal file
@@ -0,0 +1,236 @@
|
||||
.. highlight:: cql
|
||||
|
||||
.. _cql-guardrails:
|
||||
|
||||
CQL Guardrails
|
||||
==============
|
||||
|
||||
ScyllaDB provides a set of configurable guardrail parameters that help operators
|
||||
enforce best practices and prevent misconfigurations that could degrade cluster
|
||||
health, availability, or performance. Guardrails operate at two severity levels:
|
||||
|
||||
* **Warn**: The request succeeds, but the server includes a warning in the CQL
|
||||
response. Depending on the specific guardrail, the warning may also be logged on the server side.
|
||||
* **Fail**: The request is rejected with an error/exception (the specific type
|
||||
depends on the guardrail). The user must correct the request or adjust the
|
||||
guardrail configuration to proceed.
|
||||
|
||||
.. note::
|
||||
|
||||
Guardrails are checked only when a statement is
|
||||
executed. They do not retroactively validate existing keyspaces, tables, or
|
||||
previously completed writes.
|
||||
|
||||
For the full list of configuration properties, including types, defaults, and
|
||||
liveness information, see :doc:`Configuration Parameters </reference/configuration-parameters>`.
|
||||
|
||||
.. _guardrails-replication-factor:
|
||||
|
||||
Replication Factor Guardrails
|
||||
-----------------------------
|
||||
|
||||
These four parameters control the minimum and maximum allowed replication factor
|
||||
(RF) values. They are evaluated whenever a ``CREATE KEYSPACE`` or
|
||||
``ALTER KEYSPACE`` statement is executed. Each data center's RF is checked
|
||||
individually.
|
||||
|
||||
An RF of ``0`` — which means "do not replicate to this data center" — is
|
||||
always allowed and never triggers a guardrail.
|
||||
|
||||
A threshold value of ``-1`` disables the corresponding check.
|
||||
|
||||
``minimum_replication_factor_warn_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF is set to a value greater than ``0`` and lower than
|
||||
this threshold, the server attaches a warning to the CQL response identifying
|
||||
the offending data center and RF value.
|
||||
|
||||
**When to use.** The default of ``3`` is the standard recommendation for
|
||||
production clusters. An RF below ``3`` means that the cluster cannot tolerate
|
||||
even a single node failure without data loss or read unavailability (assuming
|
||||
``QUORUM`` consistency). Keep this at ``3`` unless your deployment has specific
|
||||
constraints (e.g., a development or test cluster with fewer than 3 nodes).
|
||||
|
||||
``minimum_replication_factor_fail_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF is set to a value greater than ``0`` and lower than
|
||||
this threshold, the request is rejected with a ``ConfigurationException``
|
||||
identifying the offending data center and RF value.
|
||||
|
||||
**When to use.** Enable this parameter (e.g., set to ``3``) in production
|
||||
environments where allowing a low RF would be operationally dangerous. Unlike
|
||||
the warn threshold, this provides a hard guarantee that no keyspace can be
|
||||
created or altered to have an RF below the limit.
|
||||
|
||||
``maximum_replication_factor_warn_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF exceeds this threshold, the server attaches a warning to the CQL response identifying
|
||||
the offending data center and RF value.
|
||||
|
||||
**When to use.** An excessively high RF increases write amplification and
|
||||
storage costs proportionally. For example, an RF of ``5`` means every write
|
||||
is replicated to five nodes. Set this threshold to alert operators who
|
||||
may unintentionally set an RF that is too high.
|
||||
|
||||
``maximum_replication_factor_fail_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF exceeds this threshold, the request is rejected with a ``ConfigurationException``
|
||||
identifying the offending data center and RF value.
|
||||
|
||||
**When to use.** Enable this parameter to prevent accidental creation of
|
||||
keyspaces with an unreasonably high RF. An extremely high RF wastes storage and
|
||||
network bandwidth and can lead to write latency spikes. This is a hard limit —
|
||||
the keyspace creation or alteration will not proceed until the RF is lowered.
|
||||
|
||||
**Metrics.** ScyllaDB exposes per-shard metrics that track the number of
|
||||
times each replication factor guardrail has been triggered:
|
||||
|
||||
* ``scylla_cql_minimum_replication_factor_warn_violations``
|
||||
* ``scylla_cql_minimum_replication_factor_fail_violations``
|
||||
* ``scylla_cql_maximum_replication_factor_warn_violations``
|
||||
* ``scylla_cql_maximum_replication_factor_fail_violations``
|
||||
|
||||
A sustained increase in any of these metrics indicates that
|
||||
``CREATE KEYSPACE`` or ``ALTER KEYSPACE`` requests are hitting the configured
|
||||
thresholds.
|
||||
|
||||
.. _guardrails-replication-strategy:
|
||||
|
||||
Replication Strategy Guardrails
|
||||
-------------------------------
|
||||
|
||||
These two parameters control which replication strategies trigger warnings or
|
||||
are rejected when a keyspace is created or altered.
|
||||
|
||||
``replication_strategy_warn_list``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
|
||||
statement is on this list, the server attaches a warning to the CQL response
|
||||
identifying the discouraged strategy and the affected keyspace.
|
||||
|
||||
**When to use.** ``SimpleStrategy`` is not recommended for production use.
|
||||
It places replicas without awareness of data center or rack topology, which
|
||||
can undermine fault tolerance in multi-DC deployments. Even in single-DC
|
||||
deployments, ``NetworkTopologyStrategy`` is recommended because it keeps the
|
||||
schema ready for future topology changes.
|
||||
|
||||
The default configuration warns on ``SimpleStrategy``, which is appropriate
|
||||
for most deployments. If you have existing keyspaces that use
|
||||
``SimpleStrategy``, see :doc:`Update Topology Strategy From Simple to Network
|
||||
</operating-scylla/procedures/cluster-management/update-topology-strategy-from-simple-to-network>`
|
||||
for the migration procedure.
|
||||
|
||||
``replication_strategy_fail_list``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
|
||||
statement is on this list, the request is rejected with a
|
||||
``ConfigurationException`` identifying the forbidden strategy and the affected
|
||||
keyspace.
|
||||
|
||||
**When to use.** In production environments, add ``SimpleStrategy`` to this
|
||||
list to enforce ``NetworkTopologyStrategy`` across all keyspaces. This helps
|
||||
prevent new production keyspaces from being created with a topology-unaware
|
||||
strategy.
|
||||
|
||||
**Metrics.** The following per-shard metrics track replication strategy
|
||||
guardrail violations:
|
||||
|
||||
* ``scylla_cql_replication_strategy_warn_list_violations``
|
||||
* ``scylla_cql_replication_strategy_fail_list_violations``
|
||||
|
||||
.. _guardrails-write-consistency-level:
|
||||
|
||||
Write Consistency Level Guardrails
|
||||
----------------------------------
|
||||
|
||||
These two parameters control which consistency levels (CL) are allowed for
|
||||
write operations (``INSERT``, ``UPDATE``, ``DELETE``, and ``BATCH``
|
||||
statements).
|
||||
|
||||
Be aware that adding warnings to CQL responses can significantly increase
|
||||
network traffic and reduce overall throughput.
|
||||
|
||||
``write_consistency_levels_warned``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If a write operation uses a consistency level on this list, the server attaches
|
||||
a warning to the CQL response identifying the discouraged consistency level.
|
||||
|
||||
**When to use.** Use this parameter to alert application developers when they
|
||||
use a consistency level that, while technically functional, is not recommended
|
||||
for the workload. Common examples:
|
||||
|
||||
* **Warn on** ``ANY``: writes at ``ANY`` are acknowledged as soon as at least
|
||||
one node (including a coordinator acting as a hinted handoff store) receives
|
||||
the mutation. This means data may not be persisted on any replica node at
|
||||
the time of acknowledgement, risking data loss if the coordinator fails
|
||||
before hinted handoff completes.
|
||||
* **Warn on** ``ALL``: writes at ``ALL`` require every replica to acknowledge
|
||||
the write. If any single replica is down, the write fails. This significantly
|
||||
reduces write availability.
|
||||
|
||||
``write_consistency_levels_disallowed``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If a write operation uses a consistency level on this list, the request is
|
||||
rejected with an ``InvalidRequestException`` identifying the forbidden
|
||||
consistency level.
|
||||
|
||||
**When to use.** Use this parameter to hard-block consistency levels that are
|
||||
considered unsafe for your deployment:
|
||||
|
||||
* **Disallow** ``ANY``: in production environments, ``ANY`` is almost never
|
||||
appropriate. It provides the weakest durability guarantee and is a common
|
||||
source of data-loss incidents when operators or application developers use it
|
||||
unintentionally.
|
||||
* **Disallow** ``ALL``: in clusters where high write availability is critical,
|
||||
blocking ``ALL`` prevents a single node failure from causing write
|
||||
unavailability.
|
||||
|
||||
**Metrics.** The following per-shard metrics track write consistency level
|
||||
guardrail violations:
|
||||
|
||||
* ``scylla_cql_write_consistency_levels_warned_violations``
|
||||
* ``scylla_cql_write_consistency_levels_disallowed_violations``
|
||||
|
||||
Additionally, ScyllaDB exposes the
|
||||
``scylla_cql_writes_per_consistency_level`` metric, labeled by consistency
|
||||
level, which tracks the total number of write requests per CL. This metric is
|
||||
useful for understanding the current write-CL distribution across the cluster
|
||||
*before* deciding which levels to warn on or disallow. For example, querying
|
||||
this metric can reveal whether any application is inadvertently using ``ANY``
|
||||
or ``ALL`` for writes.
|
||||
|
||||
.. _guardrails-compact-storage:
|
||||
|
||||
Compact Storage Guardrail
|
||||
-------------------------
|
||||
|
||||
``enable_create_table_with_compact_storage``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This boolean parameter controls whether ``CREATE TABLE`` statements with the
|
||||
deprecated ``COMPACT STORAGE`` option are allowed. Unlike the other guardrails,
|
||||
it acts as a simple on/off switch rather than using separate warn and fail
|
||||
thresholds.
|
||||
|
||||
**When to use.** Leave this at the default (``false``) for all new
|
||||
deployments. ``COMPACT STORAGE`` is a legacy feature that will be permanently
|
||||
removed in a future version of ScyllaDB. Set to ``true`` only if you have a specific,
|
||||
temporary need to create compact storage tables (e.g., compatibility with legacy
|
||||
applications during a migration). For details on the ``COMPACT STORAGE`` option, see
|
||||
:ref:`Compact Tables <compact-tables>` in the Data Definition documentation.
|
||||
|
||||
Additional References
|
||||
---------------------
|
||||
|
||||
* :doc:`Consistency Level </cql/consistency>`
|
||||
* :doc:`Data Definition (CREATE/ALTER KEYSPACE) </cql/ddl>`
|
||||
* :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`
|
||||
* :doc:`Metrics Reference </reference/metrics>`
|
||||
@@ -17,6 +17,7 @@ CQL Reference
|
||||
secondary-indexes
|
||||
time-to-live
|
||||
functions
|
||||
guardrails
|
||||
wasm
|
||||
json
|
||||
mv
|
||||
@@ -46,6 +47,7 @@ It allows you to create keyspaces and tables, insert and query tables, and more.
|
||||
* :doc:`Data Types </cql/types>`
|
||||
* :doc:`Definitions </cql/definitions>`
|
||||
* :doc:`Global Secondary Indexes </cql/secondary-indexes>`
|
||||
* :doc:`CQL Guardrails </cql/guardrails>`
|
||||
* :doc:`Expiring Data with Time to Live (TTL) </cql/time-to-live>`
|
||||
* :doc:`Functions </cql/functions>`
|
||||
* :doc:`JSON Support </cql/json>`
|
||||
|
||||
124
docs/dev/logstor.md
Normal file
124
docs/dev/logstor.md
Normal file
@@ -0,0 +1,124 @@
|
||||
# Logstor
|
||||
|
||||
## Introduction
|
||||
|
||||
Logstor is a log-structured storage engine for ScyllaDB optimized for key-value workloads. It provides an alternative storage backend for key-value tables - tables with a partition key only, with no clustering columns.
|
||||
|
||||
Unlike the traditional LSM-tree based storage, logstor uses a log-structured approach with in-memory indexing, making it particularly suitable for workloads with frequent overwrites and point lookups.
|
||||
|
||||
## Architecture
|
||||
|
||||
Logstor consists of several key components:
|
||||
|
||||
### Components
|
||||
|
||||
#### Primary Index
|
||||
|
||||
The primary index is entirely in memory, and it maps a partition key to its location in the log segments. It consists of one B-tree per table, ordered by token.
|
||||
|
||||
#### Segment Manager
|
||||
|
||||
The `segment_manager` handles the allocation and management of fixed-size segments (default 128KB). Segments are grouped into large files (default 32MB). Key responsibilities include:
|
||||
|
||||
- **Segment allocation**: Provides segments for writing new data
|
||||
- **Space reclamation**: Tracks free space in each segment
|
||||
- **Compaction**: Copies live data from sparse segments to reclaim space
|
||||
- **Recovery**: Scans segments on startup to rebuild the index
|
||||
- **Separator**: Rewrites segments that have records from different compaction groups into new segments that are separated by compaction group.
|
||||
|
||||
The data in the segments consists of records of type `log_record`. Each record contains the value for some key as a `canonical_mutation` and additional metadata.
|
||||
|
||||
The `segment_manager` receives new writes via a `write_buffer` and writes them sequentially to the active segment with 4k-block alignment.
|
||||
|
||||
#### Write Buffer
|
||||
|
||||
The `write_buffer` manages a buffer of log records and handles the serialization of the records including headers and alignment. It can be used to write multiple records to the buffer and then write the buffer to the segment manager.
|
||||
|
||||
The `buffered_writer` manages multiple write buffers for user writes, an active buffer and multiple flushing ones, to batch writes and manage backpressure.
|
||||
|
||||
### Data Flow
|
||||
|
||||
**Write Path:**
|
||||
1. Application writes mutation to logstor
|
||||
2. Mutation is converted to a log record
|
||||
3. Record is written to write buffer
|
||||
4. The buffer is switched and written to the active segment.
|
||||
5. Index is updated with new record locations
|
||||
6. Old record locations (for overwrites) are marked as free
|
||||
|
||||
**Read Path:**
|
||||
1. Application requests data for a partition key
|
||||
2. Index lookup returns record location
|
||||
3. Segment manager reads record from disk
|
||||
4. Record is deserialized into a mutation and returned
|
||||
|
||||
**Separator:**
|
||||
1. When a record is written to the active segment, it is also written to its compaction group's separator buffer. The separator buffer holds a reference to the original segment.
|
||||
2. The separator buffer is flushed when it's full, or when a flush is requested for another reason. It is written into a new segment in the compaction group, and the locations of the records are updated from the original mixed segments to the new segments in the compaction group.
|
||||
3. After the separator buffer is flushed and all records from the original segment have been moved, it releases its reference to the segment. When there are no more references to the segment, it is freed.
|
||||
|
||||
**Compaction:**
|
||||
1. The amount of live data is tracked for each segment in its segment_descriptor. The segment descriptors are stored in a histogram by live data.
|
||||
2. A segment set from a single compaction group is submitted for compaction.
|
||||
3. Compaction picks segments for compaction from the segment set. It chooses the segments with the lowest utilization, such that compacting them results in a net gain of free segments.
|
||||
4. It reads the segments, finds all live records, and writes them into a write buffer. When the buffer is full, it is flushed into a new segment, and the index location of each record is updated to point to the new location.
|
||||
5. After all live records are rewritten the old segments are freed.
|
||||
|
||||
## Usage
|
||||
|
||||
### Enabling Logstor
|
||||
|
||||
To use logstor, enable it in the configuration:
|
||||
|
||||
```yaml
|
||||
enable_logstor: true
|
||||
|
||||
experimental_features:
|
||||
- logstor
|
||||
```
|
||||
|
||||
### Creating Tables
|
||||
|
||||
Tables using logstor must have no clustering columns, and must be created with the `storage_engine` property set to 'logstor':
|
||||
|
||||
```cql
|
||||
CREATE TABLE keyspace.user_profiles (
|
||||
user_id uuid PRIMARY KEY,
|
||||
name text,
|
||||
email text,
|
||||
metadata frozen<map<text, text>>
|
||||
) WITH storage_engine = 'logstor';
|
||||
```
|
||||
|
||||
### Basic Operations
|
||||
|
||||
**Insert/Update:**
|
||||
|
||||
```cql
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'value1');
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (2, 'value2');
|
||||
|
||||
-- Overwrite with new value
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'updated_value');
|
||||
```
|
||||
|
||||
Currently, updates must write the full row. Updating individual columns is not yet supported. Each write replaces the entire partition.
|
||||
|
||||
**Select:**
|
||||
|
||||
```cql
|
||||
SELECT * FROM keyspace.table_name WHERE pk = 1;
|
||||
-- Returns: (1, 'updated_value')
|
||||
|
||||
SELECT pk, v FROM keyspace.table_name WHERE pk = 2;
|
||||
-- Returns: (2, 'value2')
|
||||
|
||||
SELECT * FROM keyspace.table_name;
|
||||
-- Returns: (1, 'updated_value'), (2, 'value2')
|
||||
```
|
||||
|
||||
**Delete:**
|
||||
|
||||
```cql
|
||||
DELETE FROM keyspace.table_name WHERE pk = 1;
|
||||
```
|
||||
@@ -10,27 +10,7 @@ Cache is always paired with its underlying mutation source which it mirrors. Tha
|
||||
|
||||
Eviction is about removing parts of the data from memory and recording the fact that information about those parts is missing. Eviction doesn't change the set of writes represented by cache as part of its `mutation_source` interface.
|
||||
|
||||
The smallest object which can be evicted, called eviction unit, is currently a single row (`rows_entry`). Eviction units are managed by a W-TinyLFU policy owned by a `cache_tracker`. The W-TinyLFU policy determines eviction order. It is shared among many tables. Currently, there is one per `database`.
|
||||
|
||||
### W-TinyLFU Eviction Policy
|
||||
|
||||
The cache uses a W-TinyLFU (Window Tiny Least Frequently Used) eviction policy,
|
||||
which combines recency and frequency information for better hit rates than plain LRU.
|
||||
|
||||
The policy organizes entries into three segments:
|
||||
|
||||
- **Window** (~1% of cache): A small LRU that admits all new entries. This allows
|
||||
new entries to build up frequency information before competing for main cache space.
|
||||
- **Probation** (~19% of cache): Part of the main SLRU cache. Entries from the window
|
||||
compete with probation victims for admission using a TinyLFU frequency filter.
|
||||
- **Protected** (~80% of cache): The other part of the main SLRU cache. Entries are
|
||||
promoted here from probation when accessed again.
|
||||
|
||||
The TinyLFU frequency filter uses a Count-Min Sketch to compactly estimate access
|
||||
frequency. When eviction is needed, the window victim competes with the probation
|
||||
victim: the entry with higher estimated frequency survives in probation while the
|
||||
other is evicted. The sketch is periodically aged (all counts halved) to adapt to
|
||||
changing access patterns.
|
||||
The smallest object which can be evicted, called eviction unit, is currently a single row (`rows_entry`). Eviction units are linked in an LRU owned by a `cache_tracker`. The LRU determines eviction order. The LRU is shared among many tables. Currently, there is one per `database`.
|
||||
|
||||
All `rows_entry` objects which are owned by a `cache_tracker` are assumed to be either contained in a cache (in some `row_cache::partitions_type`) or
|
||||
be owned by a (detached) `partition_snapshot`. When the last row from a `partition_entry` is evicted, the containing `cache_entry` is evicted from the cache.
|
||||
|
||||
@@ -52,7 +52,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
@@ -125,7 +125,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
|
||||
@@ -133,19 +133,19 @@ Install ScyllaDB
|
||||
|
||||
sudo yum install scylla
|
||||
|
||||
Running the command installs the latest official version of ScyllaDB Open Source.
|
||||
Alternatively, you can to install a specific patch version:
|
||||
Running the command installs the latest official version of ScyllaDB.
|
||||
Alternatively, you can install a specific patch version:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo yum install scylla-<your patch version>
|
||||
|
||||
Example: The following example shows the command to install ScyllaDB 5.2.3.
|
||||
Example: The following example shows installing ScyllaDB 2025.3.1.
|
||||
|
||||
.. code-block:: console
|
||||
:class: hide-copy-button
|
||||
|
||||
sudo yum install scylla-5.2.3
|
||||
sudo yum install scylla-2025.3.1
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
|
||||
|
||||
@@ -36,11 +36,8 @@ release versions, run:
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
|
||||
|
||||
|
||||
Versions 2025.1 and Later
|
||||
==============================
|
||||
|
||||
Run the command with the ``--scylla-version`` option to specify the version
|
||||
you want to install.
|
||||
To install a non-default version, run the command with the ``--scylla-version``
|
||||
option to specify the version you want to install.
|
||||
|
||||
**Example**
|
||||
|
||||
@@ -50,20 +47,4 @@ you want to install.
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
|
||||
|
||||
|
||||
Versions Earlier than 2025.1
|
||||
================================
|
||||
|
||||
To install a supported version of *ScyllaDB Enterprise*, run the command with:
|
||||
|
||||
* ``--scylla-product scylla-enterprise`` to specify that you want to install
|
||||
ScyllaDB Enterprise.
|
||||
* ``--scylla-version`` to specify the version you want to install.
|
||||
|
||||
For example:
|
||||
|
||||
.. code:: console
|
||||
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
|
||||
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
@@ -57,12 +57,11 @@ To enable shared dictionaries:
|
||||
internode_compression_enable_advanced: true
|
||||
rpc_dict_training_when: when_leader
|
||||
|
||||
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
|
||||
.. note::
|
||||
|
||||
Trained dictionaries contain randomly chosen samples of data transferred between
|
||||
nodes. The data samples are persisted in the Raft log, which is not encrypted.
|
||||
As a result, some data from otherwise encrypted tables might be stored on disk
|
||||
unencrypted.
|
||||
Some dictionary training data may be encrypted using storage-level encryption
|
||||
(if enabled) instead of database-level encryption, meaning protection is
|
||||
applied at the storage layer rather than within the database itself.
|
||||
|
||||
|
||||
Reference
|
||||
|
||||
@@ -727,7 +727,12 @@ public:
|
||||
|
||||
// now we need one page more to be able to save one for next lap
|
||||
auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
|
||||
auto buf2 = co_await _input.read_exactly(fill_size);
|
||||
// If the underlying stream is already at EOF (e.g. buf1 came from
|
||||
// cached _next while the previous read_exactly drained the source),
|
||||
// skip the read_exactly call — it would return empty anyway.
|
||||
auto buf2 = _input.eof()
|
||||
? temporary_buffer<char>()
|
||||
: co_await _input.read_exactly(fill_size);
|
||||
|
||||
temporary_buffer<char> output(buf1.size() + buf2.size());
|
||||
|
||||
|
||||
@@ -172,6 +172,7 @@ public:
|
||||
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
|
||||
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
|
||||
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
|
||||
gms::feature logstor { *this, "LOGSTOR"sv };
|
||||
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
|
||||
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
|
||||
gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
|
||||
|
||||
593
gms/gossiper.cc
593
gms/gossiper.cc
File diff suppressed because it is too large
Load Diff
@@ -48,6 +48,7 @@ set(idl_headers
|
||||
messaging_service.idl.hh
|
||||
paxos.idl.hh
|
||||
raft.idl.hh
|
||||
raft_util.idl.hh
|
||||
raft_storage.idl.hh
|
||||
group0.idl.hh
|
||||
hinted_handoff.idl.hh
|
||||
@@ -55,6 +56,7 @@ set(idl_headers
|
||||
storage_proxy.idl.hh
|
||||
storage_service.idl.hh
|
||||
strong_consistency/state_machine.idl.hh
|
||||
logstor.idl.hh
|
||||
group0_state_machine.idl.hh
|
||||
mapreduce_request.idl.hh
|
||||
replica_exception.idl.hh
|
||||
|
||||
28
idl/logstor.idl.hh
Normal file
28
idl/logstor.idl.hh
Normal file
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "idl/frozen_schema.idl.hh"
|
||||
#include "idl/token.idl.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
|
||||
namespace replica {
|
||||
namespace logstor {
|
||||
|
||||
struct primary_index_key {
|
||||
dht::decorated_key dk;
|
||||
};
|
||||
|
||||
class log_record {
|
||||
replica::logstor::primary_index_key key;
|
||||
replica::logstor::record_generation generation;
|
||||
table_id table;
|
||||
canonical_mutation mut;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
3
init.cc
3
init.cc
@@ -96,6 +96,9 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES)) {
|
||||
disabled.insert("STRONGLY_CONSISTENT_TABLES"s);
|
||||
}
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::LOGSTOR)) {
|
||||
disabled.insert("LOGSTOR"s);
|
||||
}
|
||||
if (!cfg.table_digest_insensitive_to_expiry()) {
|
||||
disabled.insert("TABLE_DIGEST_INSENSITIVE_TO_EXPIRY"s);
|
||||
}
|
||||
|
||||
@@ -33,15 +33,14 @@ size_t hash<locator::endpoint_dc_rack>::operator()(const locator::endpoint_dc_ra
|
||||
return utils::tuple_hash()(std::tie(v.dc, v.rack));
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace std
|
||||
|
||||
namespace locator {
|
||||
|
||||
static logging::logger logger("network_topology_strategy");
|
||||
|
||||
network_topology_strategy::network_topology_strategy(replication_strategy_params params, const topology* topo) :
|
||||
abstract_replication_strategy(params,
|
||||
replication_strategy_type::network_topology) {
|
||||
network_topology_strategy::network_topology_strategy(replication_strategy_params params, const topology* topo)
|
||||
: abstract_replication_strategy(params, replication_strategy_type::network_topology) {
|
||||
auto opts = _config_options;
|
||||
|
||||
logger.debug("options={}", opts);
|
||||
@@ -65,8 +64,7 @@ network_topology_strategy::network_topology_strategy(replication_strategy_params
|
||||
if (boost::equals(key, "replication_factor")) {
|
||||
on_internal_error(rslogger, "replication_factor should have been replaced with a DC:RF mapping by now");
|
||||
} else {
|
||||
throw exceptions::configuration_exception(format(
|
||||
"'{}' is not a valid option, did you mean (lowercase) 'replication_factor'?", key));
|
||||
throw exceptions::configuration_exception(format("'{}' is not a valid option, did you mean (lowercase) 'replication_factor'?", key));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,8 +107,8 @@ class natural_endpoints_tracker {
|
||||
, _rf_left(std::min(rf, node_count))
|
||||
// If there aren't enough racks in this DC to fill the RF, we'll still use at least one node from each rack,
|
||||
// and the difference is to be filled by the first encountered nodes.
|
||||
, _acceptable_rack_repeats(rf - rack_count)
|
||||
{}
|
||||
, _acceptable_rack_repeats(rf - rack_count) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts to add an endpoint to the replicas for this datacenter, adding to the endpoints set if successful.
|
||||
@@ -201,8 +199,7 @@ public:
|
||||
, _tp(_tm.get_topology())
|
||||
, _dc_rep_factor(dc_rep_factor)
|
||||
, _token_owners(_tm.get_datacenter_token_owners())
|
||||
, _racks(_tm.get_datacenter_racks_token_owners())
|
||||
{
|
||||
, _racks(_tm.get_datacenter_racks_token_owners()) {
|
||||
// not aware of any cluster members
|
||||
SCYLLA_ASSERT(!_token_owners.empty() && !_racks.empty());
|
||||
|
||||
@@ -251,16 +248,14 @@ public:
|
||||
for (const auto& [dc, rf_data] : dc_rf) {
|
||||
auto rf = rf_data.count();
|
||||
if (rf > endpoints_in(dc)) {
|
||||
throw exceptions::configuration_exception(seastar::format(
|
||||
"Datacenter {} doesn't have enough token-owning nodes for replication_factor={}", dc, rf));
|
||||
throw exceptions::configuration_exception(
|
||||
seastar::format("Datacenter {} doesn't have enough token-owning nodes for replication_factor={}", dc, rf));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
future<host_id_set>
|
||||
network_topology_strategy::calculate_natural_endpoints(
|
||||
const token& search_token, const token_metadata& tm) const {
|
||||
future<host_id_set> network_topology_strategy::calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const {
|
||||
|
||||
natural_endpoints_tracker tracker(tm, _dc_rep_factor);
|
||||
|
||||
@@ -285,12 +280,14 @@ void network_topology_strategy::validate_options(const gms::feature_service& fs,
|
||||
for (auto& c : _config_options) {
|
||||
if (c.first == sstring("replication_factor")) {
|
||||
on_internal_error(rslogger, fmt::format("'replication_factor' tag should be unrolled into a list of DC:RF by now."
|
||||
"_config_options:{}", _config_options));
|
||||
"_config_options:{}",
|
||||
_config_options));
|
||||
}
|
||||
auto dc = dcs.find(c.first);
|
||||
if (dc == dcs.end()) {
|
||||
throw exceptions::configuration_exception(format("Unrecognized strategy option {{{}}} "
|
||||
"passed to NetworkTopologyStrategy", this->to_qualified_class_name(c.first)));
|
||||
"passed to NetworkTopologyStrategy",
|
||||
this->to_qualified_class_name(c.first)));
|
||||
}
|
||||
auto racks = dc->second | std::views::keys | std::ranges::to<std::unordered_set<sstring>>();
|
||||
auto rf = parse_replication_factor(c.second);
|
||||
@@ -311,8 +308,8 @@ future<tablet_map> network_topology_strategy::allocate_tablets_for_new_table(sch
|
||||
rslogger.info("Rounding up tablet count from {} to {} for table {}.{}", tablet_count, aligned_tablet_count, s->ks_name(), s->cf_name());
|
||||
tablet_count = aligned_tablet_count;
|
||||
}
|
||||
co_return co_await reallocate_tablets(std::move(s), std::move(tm),
|
||||
tablet_map(tablet_count, get_consistency() != data_dictionary::consistency_config_option::eventual));
|
||||
co_return co_await reallocate_tablets(
|
||||
std::move(s), std::move(tm), tablet_map(tablet_count, get_consistency() != data_dictionary::consistency_config_option::eventual));
|
||||
}
|
||||
|
||||
future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, tablet_map tablets) const {
|
||||
@@ -321,16 +318,15 @@ future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, t
|
||||
co_await load.populate_with_normalized_load();
|
||||
co_await load.populate(std::nullopt, s->id());
|
||||
|
||||
tablet_logger.debug("Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());
|
||||
tablet_logger.debug(
|
||||
"Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());
|
||||
|
||||
for (tablet_id tb : tablets.tablet_ids()) {
|
||||
auto tinfo = tablets.get_tablet_info(tb);
|
||||
tinfo.replicas = co_await reallocate_tablets(s, tm, load, tablets, tb);
|
||||
if (tablets.has_raft_info()) {
|
||||
if (!tablets.get_tablet_raft_info(tb).group_id) {
|
||||
tablets.set_tablet_raft_info(tb, tablet_raft_info {
|
||||
.group_id = raft::group_id{utils::UUID_gen::get_time_UUID()}
|
||||
});
|
||||
tablets.set_tablet_raft_info(tb, tablet_raft_info{.group_id = raft::group_id{utils::UUID_gen::get_time_UUID()}});
|
||||
}
|
||||
}
|
||||
tablets.set_tablet(tb, std::move(tinfo));
|
||||
@@ -340,7 +336,8 @@ future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, t
|
||||
co_return tablets;
|
||||
}
|
||||
|
||||
future<tablet_replica_set> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, load_sketch& load, const tablet_map& cur_tablets, tablet_id tb) const {
|
||||
future<tablet_replica_set> network_topology_strategy::reallocate_tablets(
|
||||
schema_ptr s, token_metadata_ptr tm, load_sketch& load, const tablet_map& cur_tablets, tablet_id tb) const {
|
||||
tablet_replica_set replicas;
|
||||
// Current number of replicas per dc
|
||||
std::unordered_map<sstring, size_t> nodes_per_dc;
|
||||
@@ -364,8 +361,8 @@ future<tablet_replica_set> network_topology_strategy::reallocate_tablets(schema_
|
||||
if (new_rf && new_rf->is_rack_based()) {
|
||||
auto diff = diff_racks(old_racks_per_dc[dc], new_rf->get_rack_list());
|
||||
|
||||
tablet_logger.debug("reallocate_tablets {}.{} tablet_id={} dc={} old_racks={} add_racks={} del_racks={}",
|
||||
s->ks_name(), s->cf_name(), tb, dc, old_racks_per_dc[dc], diff.added, diff.removed);
|
||||
tablet_logger.debug("reallocate_tablets {}.{} tablet_id={} dc={} old_racks={} add_racks={} del_racks={}", s->ks_name(), s->cf_name(), tb, dc,
|
||||
old_racks_per_dc[dc], diff.added, diff.removed);
|
||||
|
||||
if (!diff) {
|
||||
continue;
|
||||
@@ -395,23 +392,18 @@ future<tablet_replica_set> network_topology_strategy::reallocate_tablets(schema_
|
||||
co_return replicas;
|
||||
}
|
||||
|
||||
tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s,
|
||||
token_metadata_ptr tm,
|
||||
load_sketch& load,
|
||||
tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas,
|
||||
const sstring& dc,
|
||||
const rack_list& racks_to_drop) const {
|
||||
tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas, const sstring& dc, const rack_list& racks_to_drop) const {
|
||||
auto& topo = tm->get_topology();
|
||||
tablet_replica_set filtered;
|
||||
auto is_rack_to_drop = [&racks_to_drop] (const sstring& rack) {
|
||||
auto is_rack_to_drop = [&racks_to_drop](const sstring& rack) {
|
||||
return std::ranges::contains(racks_to_drop, rack);
|
||||
};
|
||||
for (const auto& tr : cur_replicas) {
|
||||
auto& node = topo.get_node(tr.host);
|
||||
if (node.dc_rack().dc == dc && is_rack_to_drop(node.dc_rack().rack)) {
|
||||
tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}",
|
||||
s->ks_name(), s->cf_name(), tb, node.dc_rack().dc, node.dc_rack().rack, tr);
|
||||
tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}", s->ks_name(), s->cf_name(), tb, node.dc_rack().dc,
|
||||
node.dc_rack().rack, tr);
|
||||
load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
|
||||
} else {
|
||||
filtered.emplace_back(tr);
|
||||
@@ -420,22 +412,17 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s
|
||||
return filtered;
|
||||
}
|
||||
|
||||
tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
|
||||
token_metadata_ptr tm,
|
||||
load_sketch& load,
|
||||
tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas,
|
||||
const sstring& dc,
|
||||
const rack_list& racks_to_add) const {
|
||||
tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas, const sstring& dc, const rack_list& racks_to_add) const {
|
||||
auto nodes = tm->get_datacenter_racks_token_owners_nodes();
|
||||
auto& dc_nodes = nodes.at(dc);
|
||||
auto new_replicas = cur_replicas;
|
||||
|
||||
for (auto&& rack: racks_to_add) {
|
||||
for (auto&& rack : racks_to_add) {
|
||||
host_id min_node;
|
||||
double min_load = std::numeric_limits<double>::max();
|
||||
|
||||
for (auto&& node: dc_nodes.at(rack)) {
|
||||
for (auto&& node : dc_nodes.at(rack)) {
|
||||
if (!node.get().is_normal()) {
|
||||
continue;
|
||||
}
|
||||
@@ -450,29 +437,26 @@ tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
|
||||
}
|
||||
|
||||
if (!min_node) {
|
||||
throw std::runtime_error(
|
||||
fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
|
||||
throw std::runtime_error(fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
|
||||
}
|
||||
|
||||
auto new_replica = tablet_replica{min_node, load.next_shard(min_node, 1, service::default_target_tablet_size)};
|
||||
new_replicas.push_back(new_replica);
|
||||
|
||||
tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, dc, rack, min_load, new_replica);
|
||||
tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}", s->ks_name(), s->cf_name(), tb.id, dc, rack, min_load,
|
||||
new_replica);
|
||||
}
|
||||
return new_replicas;
|
||||
}
|
||||
|
||||
future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
|
||||
std::map<sstring, std::unordered_set<locator::host_id>>& replicas_per_rack,
|
||||
const tablet_replica_set& cur_replicas,
|
||||
sstring dc, size_t dc_node_count, size_t dc_rf) const {
|
||||
std::map<sstring, std::unordered_set<locator::host_id>>& replicas_per_rack, const tablet_replica_set& cur_replicas, sstring dc, size_t dc_node_count,
|
||||
size_t dc_rf) const {
|
||||
static thread_local std::default_random_engine rnd_engine{std::random_device{}()};
|
||||
|
||||
auto replicas = cur_replicas;
|
||||
// all_dc_racks is ordered lexicographically on purpose
|
||||
auto all_dc_racks = tm->get_datacenter_racks_token_owners_nodes().at(dc)
|
||||
| std::ranges::to<std::map>();
|
||||
auto all_dc_racks = tm->get_datacenter_racks_token_owners_nodes().at(dc) | std::ranges::to<std::map>();
|
||||
|
||||
// Track all nodes with no replicas on them for this tablet, per rack.
|
||||
struct node_load {
|
||||
@@ -481,7 +465,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
};
|
||||
// for sorting in descending load order
|
||||
// (in terms of load)
|
||||
auto node_load_cmp = [] (const node_load& a, const node_load& b) {
|
||||
auto node_load_cmp = [](const node_load& a, const node_load& b) {
|
||||
return a.load > b.load;
|
||||
};
|
||||
|
||||
@@ -533,7 +517,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
|
||||
// ensure fairness across racks (in particular if rf < number_of_racks)
|
||||
// by rotating the racks order
|
||||
auto append_candidate_racks = [&] (candidates_list& racks) {
|
||||
auto append_candidate_racks = [&](candidates_list& racks) {
|
||||
if (auto size = racks.size()) {
|
||||
auto it = racks.begin() + tb.id % size;
|
||||
std::move(it, racks.end(), std::back_inserter(candidate_racks));
|
||||
@@ -545,20 +529,19 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
append_candidate_racks(existing_racks);
|
||||
|
||||
if (candidate_racks.empty()) {
|
||||
on_internal_error(tablet_logger,
|
||||
seastar::format("allocate_replica {}.{}: no candidate racks found for dc={} allocated={} rf={}: existing={}",
|
||||
s->ks_name(), s->cf_name(), dc, dc_node_count, dc_rf, replicas_per_rack));
|
||||
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{}: no candidate racks found for dc={} allocated={} rf={}: existing={}",
|
||||
s->ks_name(), s->cf_name(), dc, dc_node_count, dc_rf, replicas_per_rack));
|
||||
}
|
||||
|
||||
auto candidate_rack = candidate_racks.begin();
|
||||
|
||||
auto allocate_replica = [&] (candidates_list::iterator& candidate) {
|
||||
auto allocate_replica = [&](candidates_list::iterator& candidate) {
|
||||
const auto& rack = candidate->rack;
|
||||
auto& nodes = candidate->nodes;
|
||||
if (nodes.empty()) {
|
||||
on_internal_error(tablet_logger,
|
||||
seastar::format("allocate_replica {}.{} tablet_id={}: candidates vector for rack={} is empty for allocating tablet replicas in dc={} allocated={} rf={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
|
||||
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{} tablet_id={}: candidates vector for rack={} is empty for allocating "
|
||||
"tablet replicas in dc={} allocated={} rf={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
|
||||
}
|
||||
auto host_id = nodes.back().host;
|
||||
auto replica = tablet_replica{host_id, load.next_shard(host_id, 1, service::default_target_tablet_size)};
|
||||
@@ -566,13 +549,13 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
auto inserted = replicas_per_rack[node.dc_rack().rack].insert(host_id).second;
|
||||
// Sanity check that a node is not used more than once
|
||||
if (!inserted) {
|
||||
on_internal_error(tablet_logger,
|
||||
seastar::format("allocate_replica {}.{} tablet_id={}: allocated replica={} node already used when allocating tablet replicas in dc={} allocated={} rf={}: replicas={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, replica, dc, dc_node_count, dc_rf, replicas));
|
||||
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{} tablet_id={}: allocated replica={} node already used when allocating "
|
||||
"tablet replicas in dc={} allocated={} rf={}: replicas={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, replica, dc, dc_node_count, dc_rf, replicas));
|
||||
}
|
||||
nodes.pop_back();
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: allocated tablet replica={} dc={} rack={}: nodes remaining in rack={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, replica, node.dc_rack().dc, node.dc_rack().rack, nodes.size());
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: allocated tablet replica={} dc={} rack={}: nodes remaining in rack={}", s->ks_name(),
|
||||
s->cf_name(), tb.id, replica, node.dc_rack().dc, node.dc_rack().rack, nodes.size());
|
||||
if (nodes.empty()) {
|
||||
candidate = candidate_racks.erase(candidate);
|
||||
} else {
|
||||
@@ -583,7 +566,8 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
}
|
||||
if (tablet_logger.is_enabled(log_level::trace)) {
|
||||
if (candidate != candidate_racks.end()) {
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: next rack={} nodes={}", s->ks_name(), s->cf_name(), tb.id, candidate->rack, candidate->nodes.size());
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: next rack={} nodes={}", s->ks_name(), s->cf_name(), tb.id, candidate->rack,
|
||||
candidate->nodes.size());
|
||||
} else {
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: no candidate racks left", s->ks_name(), s->cf_name(), tb.id);
|
||||
}
|
||||
@@ -591,15 +575,15 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
return replica;
|
||||
};
|
||||
|
||||
tablet_logger.debug("allocate_replica {}.{} tablet_id={}: allocating tablet replicas in dc={} allocated={} rf={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);
|
||||
tablet_logger.debug("allocate_replica {}.{} tablet_id={}: allocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id, dc,
|
||||
dc_node_count, dc_rf);
|
||||
|
||||
for (size_t remaining = dc_rf - dc_node_count; remaining; --remaining) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (candidate_rack == candidate_racks.end()) {
|
||||
on_internal_error(tablet_logger,
|
||||
format("allocate_replica {}.{} tablet_id={}: ran out of candidates for allocating tablet replicas in dc={} allocated={} rf={}: remaining={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf, remaining));
|
||||
on_internal_error(tablet_logger, format("allocate_replica {}.{} tablet_id={}: ran out of candidates for allocating tablet replicas in dc={} "
|
||||
"allocated={} rf={}: remaining={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf, remaining));
|
||||
}
|
||||
replicas.emplace_back(allocate_replica(candidate_rack));
|
||||
}
|
||||
@@ -608,9 +592,9 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
}
|
||||
|
||||
tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, const locator::topology& topo, load_sketch& load, tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas,
|
||||
sstring dc, size_t dc_node_count, size_t dc_rf) const {
|
||||
tablet_logger.debug("drop_tablets_in_dc {}.{} tablet_id={}: deallocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);
|
||||
const tablet_replica_set& cur_replicas, sstring dc, size_t dc_node_count, size_t dc_rf) const {
|
||||
tablet_logger.debug("drop_tablets_in_dc {}.{} tablet_id={}: deallocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id,
|
||||
dc, dc_node_count, dc_rf);
|
||||
|
||||
// Leave dc_rf replicas in dc, effectively deallocating in reverse order,
|
||||
// to maintain replica pairing between the base table and its materialized views.
|
||||
@@ -629,8 +613,7 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, c
|
||||
return filtered;
|
||||
}
|
||||
|
||||
sstring network_topology_strategy::sanity_check_read_replicas(const effective_replication_map& erm,
|
||||
const host_id_vector_replica_set& read_replicas) const {
|
||||
sstring network_topology_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
|
||||
const auto& topology = erm.get_topology();
|
||||
|
||||
struct rf_node_count {
|
||||
@@ -663,4 +646,4 @@ sstring network_topology_strategy::sanity_check_read_replicas(const effective_re
|
||||
using registry = class_registrator<abstract_replication_strategy, network_topology_strategy, replication_strategy_params, const topology*>;
|
||||
static registry registrator("org.apache.cassandra.locator.NetworkTopologyStrategy");
|
||||
static registry registrator_short_name("NetworkTopologyStrategy");
|
||||
}
|
||||
} // namespace locator
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -611,6 +611,10 @@ public:
|
||||
/// Returns tablet_id of a tablet which owns a given token.
|
||||
tablet_id get_tablet_id(token) const;
|
||||
|
||||
// Returns the side of the tablet's range that a given token belongs to.
|
||||
// Less expensive than get_tablet_id_and_range_side() when tablet_id is already known.
|
||||
tablet_range_side get_tablet_range_side(token) const;
|
||||
|
||||
// Returns tablet_id and also the side of the tablet's range that a given token belongs to.
|
||||
std::pair<tablet_id, tablet_range_side> get_tablet_id_and_range_side(token) const;
|
||||
|
||||
|
||||
@@ -26,12 +26,16 @@
|
||||
|
||||
struct node_printer {
|
||||
const locator::node* v;
|
||||
node_printer(const locator::node* n) noexcept : v(n) {}
|
||||
node_printer(const locator::node* n) noexcept
|
||||
: v(n) {
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<node_printer> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
constexpr auto parse(format_parse_context& ctx) {
|
||||
return ctx.begin();
|
||||
}
|
||||
auto format(const node_printer& np, fmt::format_context& ctx) const {
|
||||
const locator::node* node = np.v;
|
||||
auto out = fmt::format_to(ctx.out(), "node={}", fmt::ptr(node));
|
||||
@@ -43,7 +47,9 @@ struct fmt::formatter<node_printer> {
|
||||
};
|
||||
|
||||
static auto lazy_backtrace() {
|
||||
return seastar::value_of([] { return current_backtrace(); });
|
||||
return seastar::value_of([] {
|
||||
return current_backtrace();
|
||||
});
|
||||
}
|
||||
|
||||
namespace locator {
|
||||
@@ -51,11 +57,12 @@ namespace locator {
|
||||
static logging::logger tlogger("topology");
|
||||
|
||||
thread_local const endpoint_dc_rack endpoint_dc_rack::default_location = {
|
||||
.dc = locator::production_snitch_base::default_dc,
|
||||
.rack = locator::production_snitch_base::default_rack,
|
||||
.dc = locator::production_snitch_base::default_dc,
|
||||
.rack = locator::production_snitch_base::default_rack,
|
||||
};
|
||||
|
||||
node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded, this_node is_this_node, node::idx_type idx, bool draining)
|
||||
node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded,
|
||||
this_node is_this_node, node::idx_type idx, bool draining)
|
||||
: _topology(topology)
|
||||
, _host_id(id)
|
||||
, _dc_rack(std::move(dc_rack))
|
||||
@@ -64,10 +71,11 @@ node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_r
|
||||
, _excluded(excluded)
|
||||
, _draining(draining)
|
||||
, _is_this_node(is_this_node)
|
||||
, _idx(idx)
|
||||
{}
|
||||
, _idx(idx) {
|
||||
}
|
||||
|
||||
node_holder node::make(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded, node::this_node is_this_node, node::idx_type idx, bool draining) {
|
||||
node_holder node::make(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded,
|
||||
node::this_node is_this_node, node::idx_type idx, bool draining) {
|
||||
return std::make_unique<node>(topology, std::move(id), std::move(dc_rack), std::move(state), shard_count, excluded, is_this_node, idx, draining);
|
||||
}
|
||||
|
||||
@@ -77,14 +85,22 @@ node_holder node::clone() const {
|
||||
|
||||
std::string node::to_string(node::state s) {
|
||||
switch (s) {
|
||||
case state::none: return "none";
|
||||
case state::bootstrapping: return "bootstrapping";
|
||||
case state::replacing: return "replacing";
|
||||
case state::normal: return "normal";
|
||||
case state::being_decommissioned: return "being_decommissioned";
|
||||
case state::being_removed: return "being_removed";
|
||||
case state::being_replaced: return "being_replaced";
|
||||
case state::left: return "left";
|
||||
case state::none:
|
||||
return "none";
|
||||
case state::bootstrapping:
|
||||
return "bootstrapping";
|
||||
case state::replacing:
|
||||
return "replacing";
|
||||
case state::normal:
|
||||
return "normal";
|
||||
case state::being_decommissioned:
|
||||
return "being_decommissioned";
|
||||
case state::being_removed:
|
||||
return "being_removed";
|
||||
case state::being_replaced:
|
||||
return "being_replaced";
|
||||
case state::left:
|
||||
return "left";
|
||||
}
|
||||
__builtin_unreachable();
|
||||
}
|
||||
@@ -101,21 +117,19 @@ future<> topology::clear_gently() noexcept {
|
||||
}
|
||||
|
||||
topology::topology(shallow_copy, config cfg)
|
||||
: _shard(this_shard_id())
|
||||
, _cfg(cfg)
|
||||
, _sort_by_proximity(true)
|
||||
{
|
||||
: _shard(this_shard_id())
|
||||
, _cfg(cfg)
|
||||
, _sort_by_proximity(true) {
|
||||
// constructor for shallow copying of token_metadata_impl
|
||||
}
|
||||
|
||||
topology::topology(config cfg)
|
||||
: _shard(this_shard_id())
|
||||
, _cfg(cfg)
|
||||
, _sort_by_proximity(!cfg.disable_proximity_sorting)
|
||||
, _random_engine(std::random_device{}())
|
||||
{
|
||||
tlogger.trace("topology[{}]: constructing using config: endpoint={} id={} dc={} rack={}", fmt::ptr(this),
|
||||
cfg.this_endpoint, cfg.this_host_id, cfg.local_dc_rack.dc, cfg.local_dc_rack.rack);
|
||||
: _shard(this_shard_id())
|
||||
, _cfg(cfg)
|
||||
, _sort_by_proximity(!cfg.disable_proximity_sorting)
|
||||
, _random_engine(std::random_device{}()) {
|
||||
tlogger.trace("topology[{}]: constructing using config: endpoint={} id={} dc={} rack={}", fmt::ptr(this), cfg.this_endpoint, cfg.this_host_id,
|
||||
cfg.local_dc_rack.dc, cfg.local_dc_rack.rack);
|
||||
add_node(cfg.this_host_id, cfg.local_dc_rack, node::state::none);
|
||||
}
|
||||
|
||||
@@ -131,8 +145,7 @@ topology::topology(topology&& o) noexcept
|
||||
, _dc_racks(std::move(o._dc_racks))
|
||||
, _sort_by_proximity(o._sort_by_proximity)
|
||||
, _datacenters(std::move(o._datacenters))
|
||||
, _random_engine(std::move(o._random_engine))
|
||||
{
|
||||
, _random_engine(std::move(o._random_engine)) {
|
||||
SCYLLA_ASSERT(_shard == this_shard_id());
|
||||
tlogger.trace("topology[{}]: move from [{}]", fmt::ptr(this), fmt::ptr(&o));
|
||||
|
||||
@@ -153,16 +166,18 @@ topology& topology::operator=(topology&& o) noexcept {
|
||||
|
||||
void topology::set_host_id_cfg(host_id this_host_id) {
|
||||
if (_cfg.this_host_id) {
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg can be caller only once current id {} new id {}", fmt::ptr(this), _cfg.this_host_id, this_host_id));
|
||||
on_internal_error(tlogger,
|
||||
fmt::format("topology[{}] set_host_id_cfg can be caller only once current id {} new id {}", fmt::ptr(this), _cfg.this_host_id, this_host_id));
|
||||
}
|
||||
if (_nodes.size() != 1) {
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while nodes size is greater than 1", fmt::ptr(this)));
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while nodes size is greater than 1", fmt::ptr(this)));
|
||||
}
|
||||
if (!_this_node) {
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes is null", fmt::ptr(this)));
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes is null", fmt::ptr(this)));
|
||||
}
|
||||
if (_this_node->host_id()) {
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes has non null id {}", fmt::ptr(this), _this_node->host_id()));
|
||||
on_internal_error(
|
||||
tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes has non null id {}", fmt::ptr(this), _this_node->host_id()));
|
||||
}
|
||||
|
||||
remove_node(*_this_node);
|
||||
@@ -203,7 +218,8 @@ const node& topology::add_node(node_holder nptr) {
|
||||
|
||||
if (nptr->topology() != this) {
|
||||
if (nptr->topology()) {
|
||||
on_fatal_internal_error(tlogger, seastar::format("topology[{}]: {} belongs to different topology={}", fmt::ptr(this), node_printer(node), fmt::ptr(node->topology())));
|
||||
on_fatal_internal_error(tlogger,
|
||||
seastar::format("topology[{}]: {} belongs to different topology={}", fmt::ptr(this), node_printer(node), fmt::ptr(node->topology())));
|
||||
}
|
||||
nptr->set_topology(this);
|
||||
}
|
||||
@@ -219,7 +235,8 @@ const node& topology::add_node(node_holder nptr) {
|
||||
try {
|
||||
if (is_configured_this_node(*node)) {
|
||||
if (_this_node) {
|
||||
on_internal_error(tlogger, seastar::format("topology[{}]: {}: local node already mapped to {}", fmt::ptr(this), node_printer(node), node_printer(this_node())));
|
||||
on_internal_error(tlogger,
|
||||
seastar::format("topology[{}]: {}: local node already mapped to {}", fmt::ptr(this), node_printer(node), node_printer(this_node())));
|
||||
}
|
||||
locator::node& n = *_nodes.back();
|
||||
n._is_this_node = node::this_node::yes;
|
||||
@@ -238,14 +255,25 @@ const node& topology::add_node(node_holder nptr) {
|
||||
return *node;
|
||||
}
|
||||
|
||||
void topology::update_node(node& node, std::optional<host_id> opt_id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> opt_shard_count) {
|
||||
void topology::update_node(node& node, std::optional<host_id> opt_id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st,
|
||||
std::optional<shard_id> opt_shard_count) {
|
||||
tlogger.debug("topology[{}]: update_node: {}: to: host_id={} dc={} rack={} state={} shard_count={}, at {}", fmt::ptr(this), node_printer(&node),
|
||||
opt_id ? format("{}", *opt_id) : "unchanged",
|
||||
opt_dr ? format("{}", opt_dr->dc) : "unchanged",
|
||||
opt_dr ? format("{}", opt_dr->rack) : "unchanged",
|
||||
opt_st ? format("{}", *opt_st) : "unchanged",
|
||||
opt_shard_count ? format("{}", *opt_shard_count) : "unchanged",
|
||||
lazy_backtrace());
|
||||
seastar::value_of([&] {
|
||||
return opt_id ? format("{}", *opt_id) : "unchanged";
|
||||
}),
|
||||
seastar::value_of([&] {
|
||||
return opt_dr ? format("{}", opt_dr->dc) : "unchanged";
|
||||
}),
|
||||
seastar::value_of([&] {
|
||||
return opt_dr ? format("{}", opt_dr->rack) : "unchanged";
|
||||
}),
|
||||
seastar::value_of([&] {
|
||||
return opt_st ? format("{}", *opt_st) : "unchanged";
|
||||
}),
|
||||
seastar::value_of([&] {
|
||||
return opt_shard_count ? format("{}", *opt_shard_count) : "unchanged";
|
||||
}),
|
||||
lazy_backtrace());
|
||||
|
||||
bool changed = false;
|
||||
if (opt_id) {
|
||||
@@ -257,7 +285,8 @@ void topology::update_node(node& node, std::optional<host_id> opt_id, std::optio
|
||||
on_internal_error(tlogger, seastar::format("This node host_id is already set: {}: new host_id={}", node_printer(&node), *opt_id));
|
||||
}
|
||||
if (_nodes_by_host_id.contains(*opt_id)) {
|
||||
on_internal_error(tlogger, seastar::format("Cannot update node host_id: {}: new host_id already exists: {}", node_printer(&node), node_printer(find_node(*opt_id))));
|
||||
on_internal_error(tlogger, seastar::format("Cannot update node host_id: {}: new host_id already exists: {}", node_printer(&node),
|
||||
node_printer(find_node(*opt_id))));
|
||||
}
|
||||
changed = true;
|
||||
} else {
|
||||
@@ -442,11 +471,11 @@ const node* topology::find_node(node::idx_type idx) const noexcept {
|
||||
return _nodes.at(idx).get();
|
||||
}
|
||||
|
||||
const node& topology::add_or_update_endpoint(host_id id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> shard_count)
|
||||
{
|
||||
tlogger.trace("topology[{}]: add_or_update_endpoint: host_id={} dc={} rack={} state={} shards={}, at {}", fmt::ptr(this),
|
||||
id, opt_dr.value_or(endpoint_dc_rack{}).dc, opt_dr.value_or(endpoint_dc_rack{}).rack, opt_st.value_or(node::state::none), shard_count,
|
||||
lazy_backtrace());
|
||||
const node& topology::add_or_update_endpoint(
|
||||
host_id id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> shard_count) {
|
||||
tlogger.trace("topology[{}]: add_or_update_endpoint: host_id={} dc={} rack={} state={} shards={}, at {}", fmt::ptr(this), id,
|
||||
opt_dr.value_or(endpoint_dc_rack{}).dc, opt_dr.value_or(endpoint_dc_rack{}).rack, opt_st.value_or(node::state::none), shard_count,
|
||||
lazy_backtrace());
|
||||
|
||||
auto* n = find_node(id);
|
||||
if (n) {
|
||||
@@ -454,14 +483,10 @@ const node& topology::add_or_update_endpoint(host_id id, std::optional<endpoint_
|
||||
return *n;
|
||||
}
|
||||
|
||||
return add_node(id,
|
||||
opt_dr.value_or(endpoint_dc_rack::default_location),
|
||||
opt_st.value_or(node::state::none),
|
||||
shard_count.value_or(0));
|
||||
return add_node(id, opt_dr.value_or(endpoint_dc_rack::default_location), opt_st.value_or(node::state::none), shard_count.value_or(0));
|
||||
}
|
||||
|
||||
bool topology::remove_endpoint(locator::host_id host_id)
|
||||
{
|
||||
bool topology::remove_endpoint(locator::host_id host_id) {
|
||||
auto node = find_node(host_id);
|
||||
tlogger.debug("topology[{}]: remove_endpoint: host_id={}: {}", fmt::ptr(this), host_id, node_printer(node));
|
||||
// Do not allow removing yourself from the topology
|
||||
@@ -502,7 +527,7 @@ void topology::do_sort_by_proximity(locator::host_id address, host_id_vector_rep
|
||||
locator::host_id id;
|
||||
int distance;
|
||||
};
|
||||
auto host_infos = addresses | std::views::transform([&] (locator::host_id id) {
|
||||
auto host_infos = addresses | std::views::transform([&](locator::host_id id) {
|
||||
const auto& loc1 = get_location(id);
|
||||
return info{id, distance(address, loc, id, loc1)};
|
||||
}) | std::ranges::to<utils::small_vector<info, host_id_vector_replica_set::internal_capacity()>>();
|
||||
@@ -564,11 +589,12 @@ std::unordered_set<locator::host_id> topology::get_all_host_ids() const {
|
||||
return ids;
|
||||
}
|
||||
|
||||
std::unordered_map<sstring, std::unordered_set<host_id>>
|
||||
topology::get_datacenter_host_ids() const {
|
||||
std::unordered_map<sstring, std::unordered_set<host_id>> topology::get_datacenter_host_ids() const {
|
||||
std::unordered_map<sstring, std::unordered_set<host_id>> ret;
|
||||
for (auto& [dc, nodes] : _dc_nodes) {
|
||||
ret[dc] = nodes | std::views::transform([] (const node& n) { return n.host_id(); }) | std::ranges::to<std::unordered_set>();
|
||||
ret[dc] = nodes | std::views::transform([](const node& n) {
|
||||
return n.host_id();
|
||||
}) | std::ranges::to<std::unordered_set>();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
11
main.cc
11
main.cc
@@ -19,8 +19,6 @@
|
||||
#include "gms/inet_address.hh"
|
||||
#include "auth/allow_all_authenticator.hh"
|
||||
#include "auth/allow_all_authorizer.hh"
|
||||
#include "auth/maintenance_socket_authenticator.hh"
|
||||
#include "auth/maintenance_socket_role_manager.hh"
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/signal.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
@@ -1964,6 +1962,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
checkpoint(stop_signal, "loading non-system sstables");
|
||||
replica::distributed_loader::init_non_system_keyspaces(db, proxy, sys_ks).get();
|
||||
|
||||
checkpoint(stop_signal, "recovering logstor");
|
||||
db.invoke_on_all([] (replica::database& db) {
|
||||
return db.recover_logstor();
|
||||
}).get();
|
||||
|
||||
// Depends on all keyspaces being initialized because after this call
|
||||
// we can be reloading schema.
|
||||
mm.local().register_feature_listeners();
|
||||
@@ -2102,7 +2105,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
if (cfg->maintenance_socket() != "ignore") {
|
||||
checkpoint(stop_signal, "starting maintenance auth service");
|
||||
maintenance_auth_service.start(std::ref(qp), std::ref(group0_client),
|
||||
auth::make_authorizer_factory(auth::allow_all_authorizer_name, qp),
|
||||
auth::make_maintenance_socket_authorizer_factory(qp),
|
||||
auth::make_maintenance_socket_authenticator_factory(qp, group0_client, mm, auth_cache),
|
||||
auth::make_maintenance_socket_role_manager_factory(qp, group0_client, mm, auth_cache),
|
||||
maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
|
||||
@@ -2236,7 +2239,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
return m.start();
|
||||
}).get();
|
||||
|
||||
api::set_server_storage_service(ctx, ss, group0_client).get();
|
||||
api::set_server_storage_service(ctx, ss, snapshot_ctl, group0_client).get();
|
||||
auto stop_ss_api = defer_verbose_shutdown("storage service API", [&ctx] {
|
||||
api::unset_server_storage_service(ctx).get();
|
||||
});
|
||||
|
||||
@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
|
||||
.entity = stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -8,9 +8,10 @@
|
||||
|
||||
"""exec_cql.py
|
||||
Execute CQL statements from a file where each non-empty, non-comment line is exactly one CQL statement.
|
||||
Connects via a Unix domain socket (maintenance socket), bypassing authentication.
|
||||
Requires python cassandra-driver. Stops at first failure.
|
||||
Usage:
|
||||
./exec_cql.py --file ./conf/auth.cql [--host 127.0.0.1 --port 9042]
|
||||
./exec_cql.py --file ./conf/auth.cql --socket /path/to/cql.m
|
||||
"""
|
||||
import argparse, os, sys
|
||||
from typing import Sequence
|
||||
@@ -26,18 +27,27 @@ def read_statements(path: str) -> list[tuple[int, str]]:
|
||||
stms.append((lineno, line))
|
||||
return stms
|
||||
|
||||
def exec_driver(statements: list[tuple[int, str]], host: str, port: int, timeout: float, username: str, password: str) -> int:
|
||||
def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout: float) -> int:
|
||||
"""Execute CQL statements via a Unix domain socket (maintenance socket).
|
||||
|
||||
The maintenance socket only starts listening after the auth subsystem is
|
||||
fully initialised, so a successful connect means the node is ready.
|
||||
"""
|
||||
from cassandra.cluster import Cluster
|
||||
from cassandra.connection import UnixSocketEndPoint # type: ignore
|
||||
from cassandra.policies import WhiteListRoundRobinPolicy # type: ignore
|
||||
|
||||
ep = UnixSocketEndPoint(socket_path)
|
||||
try:
|
||||
from cassandra.cluster import Cluster
|
||||
from cassandra.auth import PlainTextAuthProvider # type: ignore
|
||||
except Exception:
|
||||
print('ERROR: cassandra-driver not installed. Install with: pip install cassandra-driver', file=sys.stderr)
|
||||
cluster = Cluster(
|
||||
contact_points=[ep],
|
||||
load_balancing_policy=WhiteListRoundRobinPolicy([ep]),
|
||||
)
|
||||
session = cluster.connect()
|
||||
except Exception as e:
|
||||
print(f'ERROR: failed to connect to maintenance socket {socket_path}: {e}', file=sys.stderr)
|
||||
return 2
|
||||
auth_provider = None
|
||||
if username != "":
|
||||
auth_provider = PlainTextAuthProvider(username=username, password=password)
|
||||
cluster = Cluster([host], port=port, auth_provider=auth_provider)
|
||||
session = cluster.connect()
|
||||
|
||||
try:
|
||||
for _, (lineno, s) in enumerate(statements, 1):
|
||||
try:
|
||||
@@ -50,13 +60,11 @@ def exec_driver(statements: list[tuple[int, str]], host: str, port: int, timeout
|
||||
return 0
|
||||
|
||||
def main(argv: Sequence[str]) -> int:
|
||||
ap = argparse.ArgumentParser(description='Execute one-line CQL statements from file (driver only)')
|
||||
ap = argparse.ArgumentParser(description='Execute one-line CQL statements from file via maintenance socket')
|
||||
ap.add_argument('--file', required=True)
|
||||
ap.add_argument('--host', default='127.0.0.1')
|
||||
ap.add_argument('--port', type=int, default=9042)
|
||||
ap.add_argument('--socket', required=True,
|
||||
help='Path to the Unix domain maintenance socket (<workdir>/cql.m)')
|
||||
ap.add_argument('--timeout', type=float, default=30.0)
|
||||
ap.add_argument('--username', default='cassandra')
|
||||
ap.add_argument('--password', default='cassandra')
|
||||
args = ap.parse_args(argv)
|
||||
if not os.path.isfile(args.file):
|
||||
print(f"File not found: {args.file}", file=sys.stderr)
|
||||
@@ -65,7 +73,7 @@ def main(argv: Sequence[str]) -> int:
|
||||
if not stmts:
|
||||
print('No statements found', file=sys.stderr)
|
||||
return 1
|
||||
rc = exec_driver(stmts, args.host, args.port, args.timeout, args.username, args.password)
|
||||
rc = exec_statements(stmts, args.socket, args.timeout)
|
||||
if rc == 0:
|
||||
print('All statements executed successfully')
|
||||
return rc
|
||||
|
||||
58
pgo/pgo.py
58
pgo/pgo.py
@@ -15,6 +15,7 @@ from typing import Any, Optional
|
||||
import asyncio
|
||||
import contextlib
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -364,12 +365,14 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str,
|
||||
llvm_profile_file = f"{addr}-%m.profraw"
|
||||
scylla_workdir = f"{addr}"
|
||||
logfile = f"{addr}.log"
|
||||
socket = maintenance_socket_path(cluster_workdir, addr)
|
||||
command = [
|
||||
"env",
|
||||
f"LLVM_PROFILE_FILE={llvm_profile_file}",
|
||||
f"SCYLLA_HOME={os.path.realpath(os.getcwd())}", # We assume that the script has Scylla's `conf/` as its filesystem neighbour.
|
||||
os.path.realpath(executable),
|
||||
f"--workdir={scylla_workdir}",
|
||||
f"--maintenance-socket={socket}",
|
||||
"--ring-delay-ms=0",
|
||||
"--developer-mode=yes",
|
||||
"--memory=1G",
|
||||
@@ -391,6 +394,7 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str,
|
||||
f"--authenticator=PasswordAuthenticator",
|
||||
f"--authorizer=CassandraAuthorizer",
|
||||
] + list(extra_opts)
|
||||
training_logger.info(f"Using maintenance socket {socket}")
|
||||
return await run(['bash', '-c', fr"""exec {shlex.join(command)} >{q(logfile)} 2>&1"""], cwd=cluster_workdir)
|
||||
|
||||
async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optional[list[str]], workdir: PathLike, cluster_name: str, extra_opts: list[str]) -> list[Process]:
|
||||
@@ -433,16 +437,25 @@ async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optiona
|
||||
procs.append(proc)
|
||||
await wait_for_node(proc, addrs[i], timeout)
|
||||
except:
|
||||
await stop_cluster(procs, addrs)
|
||||
await stop_cluster(procs, addrs, cluster_workdir=workdir)
|
||||
raise
|
||||
return procs
|
||||
|
||||
async def stop_cluster(procs: list[Process], addrs: list[str]) -> None:
|
||||
async def stop_cluster(procs: list[Process], addrs: list[str], cluster_workdir: PathLike) -> None:
|
||||
"""Stops a Scylla cluster started with start_cluster().
|
||||
Doesn't return until all nodes exit, even if stop_cluster() is cancelled.
|
||||
|
||||
"""
|
||||
await clean_gather(*[cancel_process(p, timeout=60) for p in procs])
|
||||
_cleanup_short_sockets(cluster_workdir, addrs)
|
||||
|
||||
def _cleanup_short_sockets(cluster_workdir: PathLike, addrs: list[str]) -> None:
|
||||
"""Remove short maintenance socket files created in /tmp."""
|
||||
for addr in addrs:
|
||||
try:
|
||||
os.unlink(maintenance_socket_path(cluster_workdir, addr))
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
async def wait_for_port(addr: str, port: int) -> None:
|
||||
await bash(fr'until printf "" >>/dev/tcp/{addr}/{port}; do sleep 0.1; done 2>/dev/null')
|
||||
@@ -452,6 +465,33 @@ async def merge_profraw(directory: PathLike) -> None:
|
||||
if glob.glob(f"{directory}/*.profraw"):
|
||||
await bash(fr"llvm-profdata merge {q(directory)}/*.profraw -output {q(directory)}/prof.profdata")
|
||||
|
||||
def maintenance_socket_path(cluster_workdir: PathLike, addr: str) -> str:
|
||||
"""Return the maintenance socket path for a node.
|
||||
|
||||
Returns a short deterministic path in /tmp (derived from an MD5 hash of
|
||||
the natural ``<cluster_workdir>/<addr>/cql.m`` path) to stay within the
|
||||
Unix domain socket length limit.
|
||||
The same path is passed to Scylla via ``--maintenance-socket`` in
|
||||
``start_node()``.
|
||||
"""
|
||||
natural = os.path.realpath(f"{cluster_workdir}/{addr}/cql.m")
|
||||
path_hash = hashlib.md5(natural.encode()).hexdigest()[:12]
|
||||
return os.path.join(tempfile.gettempdir(), f'pgo-{path_hash}.m')
|
||||
|
||||
async def setup_cassandra_user(workdir: PathLike, addr: str) -> None:
|
||||
"""Create the ``cassandra`` superuser via the maintenance socket.
|
||||
|
||||
The default cassandra superuser is no longer seeded automatically, but
|
||||
``cassandra-stress`` hardcodes ``user=cassandra password=cassandra``.
|
||||
We create the role over the maintenance socket so that cassandra-stress
|
||||
and other tools that rely on the default credentials keep working.
|
||||
"""
|
||||
socket = maintenance_socket_path(workdir, addr)
|
||||
stmt = "CREATE ROLE cassandra WITH PASSWORD = 'cassandra' AND SUPERUSER = true AND LOGIN = true;"
|
||||
f = q(socket)
|
||||
# Write the statement to a temp file and execute it via exec_cql.py.
|
||||
await bash(fr"""tmpf=$(mktemp); echo {q(stmt)} > "$tmpf"; python3 ./exec_cql.py --file "$tmpf" --socket {f}; rc=$?; rm -f "$tmpf"; exit $rc""")
|
||||
|
||||
async def get_bolt_opts(executable: PathLike) -> list[str]:
|
||||
"""Returns the extra opts which have to be passed to a BOLT-instrumented Scylla
|
||||
to trigger a generation of a BOLT profile file.
|
||||
@@ -503,7 +543,7 @@ async def with_cluster(executable: PathLike, workdir: PathLike, cpusets: Optiona
|
||||
yield addrs, procs
|
||||
finally:
|
||||
training_logger.info(f"Stopping the cluster in {workdir}")
|
||||
await stop_cluster(procs, addrs)
|
||||
await stop_cluster(procs, addrs, cluster_workdir=workdir)
|
||||
training_logger.info(f"Stopped the cluster in {workdir}")
|
||||
|
||||
################################################################################
|
||||
@@ -557,8 +597,10 @@ def kw(**kwargs):
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def with_cs_populate(executable: PathLike, workdir: PathLike) -> AsyncIterator[str]:
|
||||
"""Provides a Scylla cluster and waits for compactions to end before stopping it."""
|
||||
"""Provides a Scylla cluster, creates the cassandra superuser, and waits
|
||||
for compactions to end before stopping it."""
|
||||
async with with_cluster(executable=executable, workdir=workdir) as (addrs, procs):
|
||||
await setup_cassandra_user(workdir, addrs[0])
|
||||
yield addrs[0]
|
||||
async with asyncio.timeout(3600):
|
||||
# Should it also flush memtables?
|
||||
@@ -667,9 +709,10 @@ populators["decommission_dataset"] = populate_decommission
|
||||
# AUTH CONNECTIONS STRESS ==================================================
|
||||
|
||||
async def populate_auth_conns(executable: PathLike, workdir: PathLike) -> None:
|
||||
# Create roles, table and permissions via CQL script.
|
||||
# Create roles, table and permissions via CQL script over the maintenance socket.
|
||||
async with with_cs_populate(executable=executable, workdir=workdir) as server:
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/auth.cql --host {server}")
|
||||
socket = maintenance_socket_path(workdir, server)
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/auth.cql --socket {q(socket)}")
|
||||
|
||||
async def train_auth_conns(executable: PathLike, workdir: PathLike) -> None:
|
||||
# Repeatedly connect as the reader user and perform simple reads to stress
|
||||
@@ -722,7 +765,8 @@ populators["si_dataset"] = populate_si
|
||||
|
||||
async def populate_counters(executable: PathLike, workdir: PathLike) -> None:
|
||||
async with with_cs_populate(executable=executable, workdir=workdir) as server:
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/counters.cql --host {server}")
|
||||
socket = maintenance_socket_path(workdir, server)
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/counters.cql --socket {q(socket)}")
|
||||
# Sleeps added in reaction to schema disagreement errors.
|
||||
# FIXME: get rid of this sleep and find a sane way to wait for schema
|
||||
# agreement.
|
||||
|
||||
@@ -371,7 +371,7 @@ public:
|
||||
}
|
||||
|
||||
void on_preemptive_aborted() {
|
||||
if (_state != reader_permit::state::waiting_for_admission && _state != reader_permit::state::waiting_for_memory) {
|
||||
if (_state != reader_permit::state::waiting_for_admission) {
|
||||
on_internal_error(rcslog, format("on_preemptive_aborted(): permit in invalid state {}", _state));
|
||||
}
|
||||
|
||||
@@ -1533,19 +1533,24 @@ void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
|
||||
// + permit.timeout() < db::no_timeout -- to avoid preemptively aborting reads without timeout.
|
||||
// Useful is tests when _preemptive_abort_factor is set to 1.0
|
||||
// to avoid additional sleeps to wait for the read to be shed.
|
||||
const auto time_budget = permit.timeout() - permit.created();
|
||||
const auto remaining_time = permit.timeout() - db::timeout_clock::now();
|
||||
if (remaining_time > db::timeout_clock::duration::zero() &&
|
||||
permit.timeout() < db::no_timeout &&
|
||||
remaining_time <= _preemptive_abort_factor() * time_budget) {
|
||||
permit.on_preemptive_aborted();
|
||||
using ms = std::chrono::milliseconds;
|
||||
tracing::trace(permit.trace_state(), "[reader concurrency semaphore {}] read shed as unlikely to finish (elapsed: {}, timeout: {}, preemptive_factor: {})",
|
||||
_name,
|
||||
std::chrono::duration_cast<ms>(time_budget - remaining_time),
|
||||
std::chrono::duration_cast<ms>(time_budget),
|
||||
_preemptive_abort_factor());
|
||||
continue;
|
||||
//
|
||||
// Only apply to permits waiting for admission -- permits waiting for memory are already
|
||||
// executing reads and should not be preemptively aborted.
|
||||
if (permit.get_state() == reader_permit::state::waiting_for_admission) {
|
||||
const auto time_budget = permit.timeout() - permit.created();
|
||||
const auto remaining_time = permit.timeout() - db::timeout_clock::now();
|
||||
if (remaining_time > db::timeout_clock::duration::zero() &&
|
||||
permit.timeout() < db::no_timeout &&
|
||||
remaining_time <= _preemptive_abort_factor() * time_budget) {
|
||||
permit.on_preemptive_aborted();
|
||||
using ms = std::chrono::milliseconds;
|
||||
tracing::trace(permit.trace_state(), "[reader concurrency semaphore {}] read shed as unlikely to finish (elapsed: {}, timeout: {}, preemptive_factor: {})",
|
||||
_name,
|
||||
std::chrono::duration_cast<ms>(time_budget - remaining_time),
|
||||
std::chrono::duration_cast<ms>(time_budget),
|
||||
_preemptive_abort_factor());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (permit.get_state() == reader_permit::state::waiting_for_memory) {
|
||||
|
||||
@@ -68,6 +68,7 @@ public:
|
||||
using resources = reader_resources;
|
||||
|
||||
friend class reader_permit;
|
||||
friend struct reader_concurrency_semaphore_tester;
|
||||
|
||||
enum class evict_reason {
|
||||
permit, // evicted due to permit shortage
|
||||
|
||||
1728
repair/repair.cc
1728
repair/repair.cc
File diff suppressed because it is too large
Load Diff
@@ -2362,6 +2362,15 @@ static future<> repair_get_row_diff_with_rpc_stream_process_op_slow_path(
|
||||
}
|
||||
}
|
||||
|
||||
static future<repair_rows_on_wire> clone_gently(const repair_rows_on_wire& rows) {
|
||||
repair_rows_on_wire cloned;
|
||||
for (const auto& row : rows) {
|
||||
cloned.push_back(row);
|
||||
co_await seastar::coroutine::maybe_yield();
|
||||
}
|
||||
co_return cloned;
|
||||
}
|
||||
|
||||
static future<> repair_put_row_diff_with_rpc_stream_process_op(
|
||||
sharded<repair_service>& repair,
|
||||
locator::host_id from,
|
||||
@@ -2388,7 +2397,9 @@ static future<> repair_put_row_diff_with_rpc_stream_process_op(
|
||||
co_await rm->put_row_diff_handler(std::move(*fp));
|
||||
rm->set_repair_state_for_local_node(repair_state::put_row_diff_with_rpc_stream_finished);
|
||||
} else {
|
||||
co_await rm->put_row_diff_handler(*fp);
|
||||
// Gently clone to avoid copy stall on destination shard
|
||||
repair_rows_on_wire local_rows = co_await clone_gently(*fp);
|
||||
co_await seastar::when_all_succeed(rm->put_row_diff_handler(std::move(local_rows)), utils::clear_gently(fp));
|
||||
rm->set_repair_state_for_local_node(repair_state::put_row_diff_with_rpc_stream_finished);
|
||||
}
|
||||
});
|
||||
@@ -3242,10 +3253,13 @@ private:
|
||||
// sequentially because the rows from repair follower 1 to
|
||||
// repair master might reduce the amount of missing data
|
||||
// between repair master and repair follower 2.
|
||||
repair_hash_set set_diff = get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get());
|
||||
auto working_hashes = master.working_row_hashes().get();
|
||||
repair_hash_set set_diff = get_set_diff(master.peer_row_hash_sets(node_idx), working_hashes);
|
||||
// Request missing sets from peer node
|
||||
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
|
||||
node, master.working_row_hashes().get().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
|
||||
if (rlogger.is_enabled(logging::log_level::debug)) {
|
||||
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
|
||||
node, working_hashes.size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
|
||||
}
|
||||
// If we need to pull all rows from the peer. We can avoid
|
||||
// sending the row hashes on wire by setting needs_all_rows flag.
|
||||
auto needs_all_rows = repair_meta::needs_all_rows_t(set_diff.size() == master.peer_row_hash_sets(node_idx).size());
|
||||
@@ -3258,7 +3272,9 @@ private:
|
||||
master.get_row_diff(std::move(set_diff), needs_all_rows, node, node_idx, dst_cpu_id);
|
||||
ns.state = repair_state::get_row_diff_finished;
|
||||
}
|
||||
rlogger.debug("After get_row_diff node {}, hash_sets={}", master.myhostid(), master.working_row_hashes().get().size());
|
||||
if (rlogger.is_enabled(logging::log_level::debug)) {
|
||||
rlogger.debug("After get_row_diff node {}, hash_sets={}", master.myhostid(), master.working_row_hashes().get().size());
|
||||
}
|
||||
} catch (...) {
|
||||
rlogger.warn("repair[{}]: get_row_diff: got error from node={}, keyspace={}, table={}, range={}, error={}",
|
||||
_shard_task.global_repair_id.uuid(), node, _shard_task.get_keyspace(), _cf_name, _range, std::current_exception());
|
||||
|
||||
@@ -9,6 +9,9 @@ target_sources(replica
|
||||
memtable.cc
|
||||
exceptions.cc
|
||||
dirty_memory_manager.cc
|
||||
logstor/segment_manager.cc
|
||||
logstor/logstor.cc
|
||||
logstor/write_buffer.cc
|
||||
multishard_query.cc
|
||||
mutation_dump.cc
|
||||
schema_describe_helper.cc
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
// FIXME: un-nest compaction_reenabler, so we can forward declare it and remove this include.
|
||||
#include "compaction/compaction_manager.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "replica/logstor/compaction.hh"
|
||||
#include "sstables/sstable_set.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
@@ -33,6 +34,10 @@ class effective_replication_map;
|
||||
|
||||
namespace replica {
|
||||
|
||||
namespace logstor {
|
||||
class primary_index;
|
||||
}
|
||||
|
||||
using enable_backlog_tracker = bool_class<class enable_backlog_tracker_tag>;
|
||||
|
||||
enum class repair_sstable_classification {
|
||||
@@ -91,6 +96,12 @@ class compaction_group {
|
||||
bool _tombstone_gc_enabled = true;
|
||||
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
|
||||
repair_classifier_func _repair_sstable_classifier;
|
||||
|
||||
lw_shared_ptr<logstor::segment_set> _logstor_segments;
|
||||
std::optional<logstor::separator_buffer> _logstor_separator;
|
||||
std::vector<future<>> _separator_flushes;
|
||||
seastar::semaphore _separator_flush_sem{1};
|
||||
|
||||
private:
|
||||
std::unique_ptr<compaction_group_view> make_compacting_view();
|
||||
std::unique_ptr<compaction_group_view> make_non_compacting_view();
|
||||
@@ -223,6 +234,7 @@ public:
|
||||
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept;
|
||||
// Triggers regular compaction.
|
||||
void trigger_compaction();
|
||||
void trigger_logstor_compaction();
|
||||
bool compaction_disabled() const;
|
||||
future<unsigned> estimate_pending_compactions() const;
|
||||
|
||||
@@ -231,6 +243,7 @@ public:
|
||||
|
||||
size_t live_sstable_count() const noexcept;
|
||||
uint64_t live_disk_space_used() const noexcept;
|
||||
size_t logstor_disk_space_used() const noexcept;
|
||||
sstables::file_size_stats live_disk_space_used_full_stats() const noexcept;
|
||||
uint64_t total_disk_space_used() const noexcept;
|
||||
sstables::file_size_stats total_disk_space_used_full_stats() const noexcept;
|
||||
@@ -262,12 +275,37 @@ public:
|
||||
compaction::compaction_manager& get_compaction_manager() noexcept;
|
||||
const compaction::compaction_manager& get_compaction_manager() const noexcept;
|
||||
|
||||
logstor::segment_manager& get_logstor_segment_manager() noexcept;
|
||||
const logstor::segment_manager& get_logstor_segment_manager() const noexcept;
|
||||
|
||||
logstor::compaction_manager& get_logstor_compaction_manager() noexcept;
|
||||
const logstor::compaction_manager& get_logstor_compaction_manager() const noexcept;
|
||||
|
||||
logstor::primary_index& get_logstor_index() noexcept;
|
||||
|
||||
future<> split(compaction::compaction_type_options::split opt, tasks::task_info tablet_split_task_info);
|
||||
|
||||
void set_repair_sstable_classifier(repair_classifier_func repair_sstable_classifier) {
|
||||
_repair_sstable_classifier = std::move(repair_sstable_classifier);
|
||||
}
|
||||
|
||||
void add_logstor_segment(logstor::segment_descriptor& desc) {
|
||||
_logstor_segments->add_segment(desc);
|
||||
}
|
||||
|
||||
future<> discard_logstor_segments();
|
||||
|
||||
future<> flush_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
logstor::separator_buffer& get_separator_buffer(size_t write_size);
|
||||
|
||||
logstor::segment_set& logstor_segments() noexcept {
|
||||
return *_logstor_segments;
|
||||
}
|
||||
|
||||
const logstor::segment_set& logstor_segments() const noexcept {
|
||||
return *_logstor_segments;
|
||||
}
|
||||
|
||||
friend class storage_group;
|
||||
};
|
||||
|
||||
@@ -312,7 +350,14 @@ public:
|
||||
|
||||
const compaction_group_ptr& main_compaction_group() const noexcept;
|
||||
const std::vector<compaction_group_ptr>& split_ready_compaction_groups() const;
|
||||
compaction_group_ptr& select_compaction_group(locator::tablet_range_side) noexcept;
|
||||
// Selects the compaction group for the given token. Computes the range side
|
||||
// from the token only when in splitting mode. This avoids the cost of computing
|
||||
// range side on the hot path when it's not needed.
|
||||
compaction_group_ptr& select_compaction_group(dht::token, const locator::tablet_map&) noexcept;
|
||||
// Selects the compaction group for an sstable spanning a token range.
|
||||
// If the first and last tokens fall on different sides of the split point,
|
||||
// the sstable belongs to the main compaction group.
|
||||
compaction_group_ptr& select_compaction_group(dht::token first, dht::token last, const locator::tablet_map&) noexcept;
|
||||
|
||||
uint64_t live_disk_space_used() const;
|
||||
|
||||
@@ -432,7 +477,9 @@ public:
|
||||
// refresh_mutation_source must be called when there are changes to data source
|
||||
// structures but logical state of data is not changed (e.g. when state for a
|
||||
// new tablet replica is allocated).
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
|
||||
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
|
||||
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
|
||||
|
||||
1596
replica/database.cc
1596
replica/database.cc
File diff suppressed because it is too large
Load Diff
@@ -16,6 +16,7 @@
|
||||
#include <seastar/core/execution_stage.hh>
|
||||
#include <seastar/core/when_all.hh>
|
||||
#include "replica/global_table_ptr.hh"
|
||||
#include "replica/logstor/compaction.hh"
|
||||
#include "types/user.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/hash.hh"
|
||||
@@ -35,6 +36,7 @@
|
||||
#include <seastar/core/gate.hh>
|
||||
#include "db/commitlog/replay_position.hh"
|
||||
#include "db/commitlog/commitlog_types.hh"
|
||||
#include "logstor/logstor.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "db/view/view.hh"
|
||||
#include "db/snapshot-ctl.hh"
|
||||
@@ -544,6 +546,9 @@ private:
|
||||
utils::phased_barrier _flush_barrier;
|
||||
std::vector<view_ptr> _views;
|
||||
|
||||
logstor::logstor* _logstor = nullptr;
|
||||
std::unique_ptr<logstor::primary_index> _logstor_index;
|
||||
|
||||
std::unique_ptr<cell_locker> _counter_cell_locks; // Memory-intensive; allocate only when needed.
|
||||
|
||||
// Labels used to identify writes and reads for this table in the rate_limiter structure.
|
||||
@@ -611,6 +616,10 @@ public:
|
||||
sstables::offstrategy offstrategy = sstables::offstrategy::no);
|
||||
future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
|
||||
|
||||
bool add_logstor_segment(logstor::segment_descriptor&, dht::token first_token, dht::token last_token);
|
||||
|
||||
logstor::separator_buffer& get_logstor_separator_buffer(dht::token token, size_t write_size);
|
||||
|
||||
// Restricted to new sstables produced by external processes such as repair.
|
||||
// The sstable might undergo split if table is in split mode.
|
||||
// If no need for split, the input sstable will only be attached to the sstable set.
|
||||
@@ -833,6 +842,21 @@ public:
|
||||
// to issue disk operations safely.
|
||||
void mark_ready_for_writes(db::commitlog* cl);
|
||||
|
||||
void init_logstor(logstor::logstor* ls);
|
||||
|
||||
bool uses_logstor() const {
|
||||
return _logstor != nullptr;
|
||||
}
|
||||
|
||||
logstor::primary_index& logstor_index() noexcept {
|
||||
return *_logstor_index;
|
||||
}
|
||||
const logstor::primary_index& logstor_index() const noexcept {
|
||||
return *_logstor_index;
|
||||
}
|
||||
|
||||
size_t get_logstor_memory_usage() const;
|
||||
|
||||
// Creates a mutation reader which covers all data sources for this column family.
|
||||
// Caller needs to ensure that column_family remains live (FIXME: relax this).
|
||||
// Note: for data queries use query() instead.
|
||||
@@ -858,6 +882,14 @@ public:
|
||||
return make_mutation_reader(std::move(schema), std::move(permit), range, full_slice);
|
||||
}
|
||||
|
||||
mutation_reader make_logstor_mutation_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) const;
|
||||
|
||||
// The streaming mutation reader differs from the regular mutation reader in that:
|
||||
// - Reflects all writes accepted by replica prior to creation of the
|
||||
// reader and a _bounded_ amount of writes which arrive later.
|
||||
@@ -1047,6 +1079,7 @@ public:
|
||||
bool needs_flush() const;
|
||||
future<> clear(); // discards memtable(s) without flushing them to disk.
|
||||
future<db::replay_position> discard_sstables(db_clock::time_point);
|
||||
future<> discard_logstor_segments();
|
||||
|
||||
bool can_flush() const;
|
||||
|
||||
@@ -1098,6 +1131,7 @@ public:
|
||||
void start_compaction();
|
||||
void trigger_compaction();
|
||||
void try_trigger_compaction(compaction_group& cg) noexcept;
|
||||
void trigger_logstor_compaction();
|
||||
// Triggers offstrategy compaction, if needed, in the background.
|
||||
void trigger_offstrategy_compaction();
|
||||
// Performs offstrategy compaction, if needed, returning
|
||||
@@ -1126,6 +1160,22 @@ public:
|
||||
return _compaction_manager;
|
||||
}
|
||||
|
||||
logstor::segment_manager& get_logstor_segment_manager() noexcept {
|
||||
return _logstor->get_segment_manager();
|
||||
}
|
||||
|
||||
const logstor::segment_manager& get_logstor_segment_manager() const noexcept {
|
||||
return _logstor->get_segment_manager();
|
||||
}
|
||||
|
||||
logstor::compaction_manager& get_logstor_compaction_manager() noexcept {
|
||||
return _logstor->get_compaction_manager();
|
||||
}
|
||||
|
||||
future<> flush_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
|
||||
future<logstor::table_segment_stats> get_logstor_segment_stats() const;
|
||||
|
||||
table_stats& get_stats() const {
|
||||
return _stats;
|
||||
}
|
||||
@@ -1613,6 +1663,8 @@ private:
|
||||
dirty_memory_manager _system_dirty_memory_manager;
|
||||
dirty_memory_manager _dirty_memory_manager;
|
||||
|
||||
timer<lowres_clock> _dirty_memory_threshold_controller;
|
||||
|
||||
database_config _dbcfg;
|
||||
flush_controller _memtable_controller;
|
||||
drain_progress _drain_progress {};
|
||||
@@ -1655,6 +1707,8 @@ private:
|
||||
bool _enable_autocompaction_toggle = false;
|
||||
querier_cache _querier_cache;
|
||||
|
||||
std::unique_ptr<logstor::logstor> _logstor;
|
||||
|
||||
std::unique_ptr<db::large_data_handler> _large_data_handler;
|
||||
std::unique_ptr<db::large_data_handler> _nop_large_data_handler;
|
||||
|
||||
@@ -1696,6 +1750,8 @@ public:
|
||||
std::shared_ptr<data_dictionary::user_types_storage> as_user_types_storage() const noexcept;
|
||||
const data_dictionary::user_types_storage& user_types() const noexcept;
|
||||
future<> init_commitlog();
|
||||
future<> init_logstor();
|
||||
future<> recover_logstor();
|
||||
const gms::feature_service& features() const { return _feat; }
|
||||
future<> apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::rp_handle&&, db::timeout_clock::time_point timeout);
|
||||
future<> apply_in_memory(const mutation& m, column_family& cf, db::rp_handle&&, db::timeout_clock::time_point timeout);
|
||||
@@ -1996,6 +2052,13 @@ public:
|
||||
// a wrapper around flush_all_tables, allowing the caller to express intent more clearly
|
||||
future<> flush_commitlog() { return flush_all_tables(); }
|
||||
|
||||
static future<> trigger_logstor_compaction_on_all_shards(sharded<database>& sharded_db, bool major);
|
||||
void trigger_logstor_compaction(bool major);
|
||||
static future<> flush_logstor_separator_on_all_shards(sharded<database>& sharded_db);
|
||||
future<> flush_logstor_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
future<logstor::table_segment_stats> get_logstor_table_segment_stats(table_id table) const;
|
||||
size_t get_logstor_memory_usage() const;
|
||||
|
||||
static future<db_clock::time_point> get_all_tables_flushed_at(sharded<database>& sharded_db);
|
||||
|
||||
static future<> drop_cache_for_table_on_all_shards(sharded<database>& sharded_db, table_id id);
|
||||
|
||||
@@ -142,6 +142,16 @@ void region_group::notify_unspooled_pressure_relieved() {
|
||||
_relief.signal();
|
||||
}
|
||||
|
||||
void region_group::update_limits(size_t unspooled_hard_limit, size_t unspooled_soft_limit, size_t real_hard_limit) {
|
||||
_cfg.unspooled_hard_limit = unspooled_hard_limit;
|
||||
_cfg.unspooled_soft_limit = unspooled_soft_limit;
|
||||
_cfg.real_hard_limit = real_hard_limit;
|
||||
|
||||
// check pressure with the new limits
|
||||
update_real(0);
|
||||
update_unspooled(0);
|
||||
}
|
||||
|
||||
bool region_group::do_update_real_and_check_relief(ssize_t delta) {
|
||||
_real_total_memory += delta;
|
||||
|
||||
@@ -211,9 +221,18 @@ dirty_memory_manager::dirty_memory_manager(replica::database& db, size_t thresho
|
||||
.real_hard_limit = threshold,
|
||||
.start_reclaiming = std::bind_front(&dirty_memory_manager::start_reclaiming, this)
|
||||
}, deferred_work_sg)
|
||||
, _threshold(threshold)
|
||||
, _soft_limit(soft_limit)
|
||||
, _flush_serializer(1)
|
||||
, _waiting_flush(flush_when_needed()) {}
|
||||
|
||||
void dirty_memory_manager::update_threshold(size_t threshold) {
|
||||
if (threshold != _threshold) {
|
||||
_threshold = threshold;
|
||||
_region_group.update_limits(threshold / 2, threshold * _soft_limit / 2, threshold);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dirty_memory_manager::setup_collectd(sstring namestr) {
|
||||
namespace sm = seastar::metrics;
|
||||
|
||||
@@ -268,6 +268,8 @@ public:
|
||||
}
|
||||
void update_unspooled(ssize_t delta);
|
||||
|
||||
void update_limits(size_t unspooled_hard_limit, size_t unspooled_soft_limit, size_t real_hard_limit);
|
||||
|
||||
void increase_usage(logalloc::region* r) { // Called by memtable's region_listener
|
||||
// It would be easier to call update, but it is unfortunately broken in boost versions up to at
|
||||
// least 1.59.
|
||||
@@ -395,6 +397,9 @@ class dirty_memory_manager {
|
||||
// memory usage minus bytes that were already written to disk.
|
||||
dirty_memory_manager_logalloc::region_group _region_group;
|
||||
|
||||
size_t _threshold;
|
||||
double _soft_limit;
|
||||
|
||||
// We would like to serialize the flushing of memtables. While flushing many memtables
|
||||
// simultaneously can sustain high levels of throughput, the memory is not freed until the
|
||||
// memtable is totally gone. That means that if we have throttled requests, they will stay
|
||||
@@ -483,6 +488,8 @@ public:
|
||||
return _region_group;
|
||||
}
|
||||
|
||||
void update_threshold(size_t threshold);
|
||||
|
||||
void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
|
||||
_region_group.update_real(-delta);
|
||||
_region_group.update_unspooled(delta);
|
||||
|
||||
177
replica/logstor/compaction.hh
Normal file
177
replica/logstor/compaction.hh
Normal file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "types.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "write_buffer.hh"
|
||||
#include "utils/log_heap.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
constexpr log_heap_options segment_descriptor_hist_options(4 * 1024, 3, 128 * 1024);
|
||||
|
||||
struct segment_set;
|
||||
|
||||
struct segment_descriptor : public log_heap_hook<segment_descriptor_hist_options> {
|
||||
// free_space = segment_size - net_data_size
|
||||
// initially set to segment_size
|
||||
// when writing records, decrease by total net data size
|
||||
// when freeing a record, increase by the record's net data size
|
||||
size_t free_space{0};
|
||||
size_t record_count{0};
|
||||
segment_generation seg_gen{1};
|
||||
segment_set* owner{nullptr}; // non-owning, set when added to a segment_set
|
||||
|
||||
void reset(size_t segment_size) noexcept {
|
||||
free_space = segment_size;
|
||||
record_count = 0;
|
||||
}
|
||||
|
||||
size_t net_data_size(size_t segment_size) const noexcept {
|
||||
return segment_size - free_space;
|
||||
}
|
||||
|
||||
void on_free_segment() noexcept {
|
||||
++seg_gen;
|
||||
}
|
||||
|
||||
void on_write(size_t net_data_size, size_t cnt = 1) noexcept {
|
||||
free_space -= net_data_size;
|
||||
record_count += cnt;
|
||||
}
|
||||
|
||||
void on_write(log_location loc) noexcept {
|
||||
on_write(loc.size);
|
||||
}
|
||||
|
||||
void on_free(size_t net_data_size, size_t cnt = 1) noexcept {
|
||||
free_space += net_data_size;
|
||||
record_count -= cnt;
|
||||
}
|
||||
|
||||
void on_free(log_location loc) noexcept {
|
||||
on_free(loc.size);
|
||||
}
|
||||
};
|
||||
|
||||
using segment_descriptor_hist = log_heap<segment_descriptor, segment_descriptor_hist_options>;
|
||||
|
||||
struct segment_set {
|
||||
segment_descriptor_hist _segments;
|
||||
size_t _segment_count{0};
|
||||
|
||||
void add_segment(segment_descriptor& desc) {
|
||||
desc.owner = this;
|
||||
_segments.push(desc);
|
||||
++_segment_count;
|
||||
}
|
||||
|
||||
void update_segment(segment_descriptor& desc) {
|
||||
_segments.adjust_up(desc);
|
||||
}
|
||||
|
||||
void remove_segment(segment_descriptor& desc) {
|
||||
_segments.erase(desc);
|
||||
desc.owner = nullptr;
|
||||
--_segment_count;
|
||||
}
|
||||
|
||||
size_t segment_count() const noexcept {
|
||||
return _segment_count;
|
||||
}
|
||||
};
|
||||
|
||||
class segment_ref {
|
||||
struct state {
|
||||
log_segment_id id;
|
||||
std::function<void()> on_last_release;
|
||||
std::function<void()> on_failure;
|
||||
bool flush_failure{false};
|
||||
~state() {
|
||||
if (!flush_failure) {
|
||||
if (on_last_release) on_last_release();
|
||||
} else {
|
||||
if (on_failure) on_failure();
|
||||
}
|
||||
}
|
||||
};
|
||||
lw_shared_ptr<state> _state;
|
||||
public:
|
||||
segment_ref() = default;
|
||||
|
||||
// Copyable: copying increments the shared ref count
|
||||
segment_ref(const segment_ref&) = default;
|
||||
segment_ref& operator=(const segment_ref&) = default;
|
||||
segment_ref(segment_ref&&) noexcept = default;
|
||||
segment_ref& operator=(segment_ref&&) noexcept = default;
|
||||
|
||||
log_segment_id id() const noexcept { return _state->id; }
|
||||
bool empty() const noexcept { return !_state; }
|
||||
|
||||
void set_flush_failure() noexcept { if (_state) _state->flush_failure = true; }
|
||||
|
||||
private:
|
||||
friend class segment_manager_impl;
|
||||
explicit segment_ref(log_segment_id id, std::function<void()> on_last_release, std::function<void()> on_failure)
|
||||
: _state(make_lw_shared<state>(id, std::move(on_last_release), std::move(on_failure)))
|
||||
{}
|
||||
};
|
||||
|
||||
struct separator_buffer {
|
||||
write_buffer* buf;
|
||||
utils::chunked_vector<future<>> pending_updates;
|
||||
utils::chunked_vector<segment_ref> held_segments;
|
||||
std::optional<size_t> min_seq_num;
|
||||
bool flushed{false};
|
||||
|
||||
separator_buffer(write_buffer* wb)
|
||||
: buf(wb)
|
||||
{}
|
||||
|
||||
~separator_buffer() {
|
||||
if (!flushed && buf && buf->has_data()) {
|
||||
for (auto& seg_ref : held_segments) {
|
||||
seg_ref.set_flush_failure();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
separator_buffer(const separator_buffer&) = delete;
|
||||
separator_buffer& operator=(const separator_buffer&) = delete;
|
||||
|
||||
separator_buffer(separator_buffer&&) noexcept = default;
|
||||
separator_buffer& operator=(separator_buffer&&) noexcept = default;
|
||||
|
||||
future<log_location_with_holder> write(log_record_writer writer) {
|
||||
return buf->write(std::move(writer));
|
||||
}
|
||||
|
||||
bool can_fit(const log_record_writer& writer) const noexcept {
|
||||
return buf->can_fit(writer);
|
||||
}
|
||||
|
||||
bool can_fit(size_t write_size) const noexcept {
|
||||
return buf->can_fit(write_size);
|
||||
}
|
||||
};
|
||||
|
||||
class compaction_manager {
|
||||
public:
|
||||
virtual ~compaction_manager() = default;
|
||||
|
||||
virtual separator_buffer allocate_separator_buffer() = 0;
|
||||
|
||||
virtual future<> flush_separator_buffer(separator_buffer, replica::compaction_group&) = 0;
|
||||
|
||||
virtual void submit(replica::compaction_group&) = 0;
|
||||
|
||||
virtual future<> stop_ongoing_compactions(replica::compaction_group&) = 0;
|
||||
};
|
||||
|
||||
}
|
||||
167
replica/logstor/index.hh
Normal file
167
replica/logstor/index.hh
Normal file
@@ -0,0 +1,167 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "dht/decorated_key.hh"
|
||||
#include "dht/ring_position.hh"
|
||||
#include "types.hh"
|
||||
#include "utils/bptree.hh"
|
||||
#include "utils/double-decker.hh"
|
||||
#include "utils/phased_barrier.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
class primary_index_entry {
|
||||
dht::decorated_key _key;
|
||||
index_entry _e;
|
||||
struct {
|
||||
bool _head : 1;
|
||||
bool _tail : 1;
|
||||
bool _train : 1;
|
||||
} _flags{};
|
||||
public:
|
||||
primary_index_entry(dht::decorated_key key, index_entry e)
|
||||
: _key(std::move(key))
|
||||
, _e(std::move(e))
|
||||
{ }
|
||||
|
||||
primary_index_entry(primary_index_entry&&) noexcept = default;
|
||||
|
||||
bool is_head() const noexcept { return _flags._head; }
|
||||
void set_head(bool v) noexcept { _flags._head = v; }
|
||||
bool is_tail() const noexcept { return _flags._tail; }
|
||||
void set_tail(bool v) noexcept { _flags._tail = v; }
|
||||
bool with_train() const noexcept { return _flags._train; }
|
||||
void set_train(bool v) noexcept { _flags._train = v; }
|
||||
|
||||
const dht::decorated_key& key() const noexcept { return _key; }
|
||||
const index_entry& entry() const noexcept { return _e; }
|
||||
|
||||
friend class primary_index;
|
||||
|
||||
friend dht::ring_position_view ring_position_view_to_compare(const primary_index_entry& e) { return e._key; }
|
||||
};
|
||||
|
||||
// In-memory primary index: decorated key -> index_entry (log location and
// generation of the partition's most recent record).
class primary_index final {
public:
    // Tree of token buckets (int64 raw token) whose entries are resolved
    // with full ring_position comparison on token collision; 16-entry nodes
    // searched linearly.
    using partitions_type = double_decker<int64_t, primary_index_entry,
        dht::raw_token_less_comparator, dht::ring_position_comparator,
        16, bplus::key_search::linear>;
private:
    partitions_type _partitions;
    schema_ptr _schema;     // used to build ring_position comparators for lookups
    size_t _key_count = 0;  // number of live entries

    // Tracks in-flight reads; mutable so const readers can register.
    mutable utils::phased_barrier _reads_phaser{"logstor_primary_index"};

public:
    explicit primary_index(schema_ptr schema)
        : _partitions(dht::raw_token_less_comparator{})
        , _schema(std::move(schema))
    {}

    // Swap in a new schema for subsequent key comparisons.
    void set_schema(schema_ptr s) {
        _schema = std::move(s);
    }

    // Drop every entry and reset accounting.
    void clear() {
        _partitions.clear();
        _key_count = 0;
    }

    // Register an in-flight read; the returned operation keeps the current
    // phase open until it is destroyed.
    utils::phased_barrier::operation start_read() const {
        return _reads_phaser.start();
    }

    // Wait until every read started before this call has completed.
    future<> await_pending_reads() {
        return _reads_phaser.advance_and_await();
    }

    // Look up a key; returns a copy of its entry, or nullopt if absent.
    std::optional<index_entry> get(const primary_index_key& key) const {
        auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
        if (it != _partitions.end()) {
            return it->_e;
        }
        return std::nullopt;
    }

    // Insert-or-replace: stores new_entry under key and returns the
    // previous entry if the key was already present (so the caller can
    // free its record).
    std::optional<index_entry> exchange(const primary_index_key& key, index_entry new_entry) {
        partitions_type::bound_hint hint;
        auto i = _partitions.lower_bound(key.dk, dht::ring_position_comparator(*_schema), hint);
        if (hint.match) {
            auto old_entry = i->_e;
            i->_e = std::move(new_entry);
            return old_entry;
        } else {
            // Not present: insert just before the lower bound, reusing the
            // hint from the lookup to avoid a second tree descent.
            _partitions.emplace_before(i, key.dk.token().raw(), hint, key.dk, std::move(new_entry));
            ++_key_count;
            return std::nullopt;
        }
    }

    // Compare-and-swap on the record location: retargets the entry only if
    // it still points at old_location. Returns whether the swap happened.
    bool update_record_location(const primary_index_key& key, log_location old_location, log_location new_location) {
        auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
        if (it != _partitions.end()) {
            if (it->_e.location == old_location) {
                it->_e.location = new_location;
                return true;
            }
        }
        return false;
    }

    // Insert new_entry, or replace the existing entry only when new_entry
    // has a strictly newer generation. Returns {stored?, previous entry}:
    // the previous entry when the key existed, nullopt on fresh insert.
    std::pair<bool, std::optional<index_entry>> insert_if_newer(const primary_index_key& key, index_entry new_entry) {
        partitions_type::bound_hint hint;
        auto i = _partitions.lower_bound(key.dk, dht::ring_position_comparator(*_schema), hint);
        if (hint.match) {
            if (i->_e.generation < new_entry.generation) {
                auto old_entry = i->_e;
                i->_e = std::move(new_entry);
                return {true, std::make_optional(old_entry)};
            } else {
                // Stale or equal generation: keep the current entry.
                return {false, std::make_optional(i->_e)};
            }
        } else {
            _partitions.emplace_before(i, key.dk.token().raw(), hint, key.dk, std::move(new_entry));
            ++_key_count;
            return {true, std::nullopt};
        }
    }

    // Remove the entry for key, but only if it still points at loc (guards
    // against erasing an entry that was concurrently rewritten).
    bool erase(const primary_index_key& key, log_location loc) {
        auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
        if (it != _partitions.end() && it->_e.location == loc) {
            it.erase(dht::raw_token_less_comparator{});
            --_key_count;
            return true;
        }
        return false;
    }

    auto begin() const noexcept { return _partitions.begin(); }
    auto end() const noexcept { return _partitions.end(); }

    bool empty() const noexcept { return _partitions.empty(); }

    size_t get_key_count() const noexcept { return _key_count; }

    // Rough accounting: counts only the index_entry payloads, not the keys
    // or the tree nodes themselves.
    size_t get_memory_usage() const noexcept { return _key_count * sizeof(index_entry); }

    // First entry with key >= pos (for positioning at range start)
    partitions_type::const_iterator lower_bound(const dht::ring_position_view& pos) const {
        return _partitions.lower_bound(pos, dht::ring_position_comparator(*_schema));
    }

    // First entry with key strictly > key (for advancing past a key after a yield)
    partitions_type::const_iterator upper_bound(const dht::decorated_key& key) const {
        return _partitions.upper_bound(key, dht::ring_position_comparator(*_schema));
    }

};
|
||||
|
||||
}
|
||||
297
replica/logstor/logstor.cc
Normal file
297
replica/logstor/logstor.cc
Normal file
@@ -0,0 +1,297 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#include "replica/logstor/logstor.hh"
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include "readers/from_mutations.hh"
|
||||
#include "keys/keys.hh"
|
||||
#include "replica/logstor/segment_manager.hh"
|
||||
#include "replica/logstor/types.hh"
|
||||
#include "utils/managed_bytes.hh"
|
||||
#include <openssl/ripemd.h>
|
||||
#include <openssl/evp.h>
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
seastar::logger logstor_logger("logstor");
|
||||
|
||||
// Wires the write path together: the buffered writer flushes into the
// segment manager under the configured flush scheduling group.
logstor::logstor(logstor_config config)
    : _segment_manager(config.segment_manager_cfg)
    , _write_buffer(_segment_manager, config.flush_sg) {
}

// Replay persisted segments after a restart; delegates entirely to the
// segment manager.
future<> logstor::do_recovery(replica::database& db) {
    co_await _segment_manager.do_recovery(db);
}
|
||||
|
||||
// Bring the storage layers up bottom-up: segments first, then the write
// buffer that feeds them.
future<> logstor::start() {
    logstor_logger.info("Starting logstor");
    return _segment_manager.start().then([this] {
        return _write_buffer.start();
    }).then([] {
        logstor_logger.info("logstor started");
    });
}

// Tear down in reverse order of start(): quiesce the write buffer before
// stopping the segment manager underneath it.
future<> logstor::stop() {
    logstor_logger.info("Stopping logstor");
    return _write_buffer.stop().then([this] {
        return _segment_manager.stop();
    }).then([] {
        logstor_logger.info("logstor stopped");
    });
}
|
||||
|
||||
// Memory accounting for this logstor instance; currently only the segment
// manager's footprint is counted (the write buffer is not included).
size_t logstor::get_memory_usage() const {
    return _segment_manager.get_memory_usage();
}
|
||||
|
||||
// Append a mutation to the log and publish it in the compaction group's
// primary index.
//
// The record generation is the successor of the key's current generation
// (1 for a brand-new key), letting consumers order records for the same
// key. cg_holder keeps the compaction group -- and therefore the index
// captured by reference below -- alive until the write completes.
future<> logstor::write(const mutation& m, compaction_group& cg, seastar::gate::holder cg_holder) {
    primary_index_key key(m.decorated_key());
    table_id table = m.schema()->id();
    auto& index = cg.get_logstor_index();

    // TODO ?
    record_generation gen = index.get(key)
        .transform([](const index_entry& entry) {
            return entry.generation + 1;
        }).value_or(record_generation(1));

    log_record record {
        .key = key,
        .generation = gen,
        .table = table,
        .mut = canonical_mutation(m)
    };

    // The continuation runs once the buffered write knows its final on-log
    // location; `op` is the write-buffer gate holder returned alongside it,
    // held until the index update below is done.
    return _write_buffer.write(std::move(record), &cg, std::move(cg_holder)).then_unpack([this, &index, gen, key = std::move(key)]
            (log_location location, seastar::gate::holder op) {
        index_entry new_entry {
            .location = location,
            .generation = gen,
        };

        auto old_entry = index.exchange(key, std::move(new_entry));

        // If overwriting, free old record
        if (old_entry) {
            _segment_manager.free_record(old_entry->location);
        }
    }).handle_exception([] (std::exception_ptr ep) {
        // Log and rethrow -- the caller still observes the failure.
        logstor_logger.error("Error writing mutation: {}", ep);
        return make_exception_future<>(ep);
    });
}
|
||||
|
||||
// Read the full log record for a key through the given index, or nullopt
// if the key is not indexed.
//
// The phased-barrier operation `op` is held across the asynchronous
// segment read so index writers can wait for in-flight reads to drain.
future<std::optional<log_record>> logstor::read(const primary_index& index, primary_index_key key) {
    auto op = index.start_read();

    auto entry_opt = index.get(key);
    if (!entry_opt.has_value()) {
        return make_ready_future<std::optional<log_record>>(std::nullopt);
    }

    const auto& entry = *entry_opt;

    // `key` is captured only to keep it alive across the read; the segment
    // manager locates the record by entry.location alone.
    return _segment_manager.read(entry.location).then([key = std::move(key), op = std::move(op)] (log_record record) {
        return std::optional<log_record>(std::move(record));
    }).handle_exception([] (std::exception_ptr ep) {
        logstor_logger.error("Error reading record: {}", ep);
        return make_exception_future<std::optional<log_record>>(ep);
    });
}
|
||||
|
||||
// Read the canonical mutation stored for decorated key `dk`, or nullopt if
// the key is absent from the index.
//
// Throws std::runtime_error when the record found at the indexed location
// belongs to a different key (index/log inconsistency).
future<std::optional<canonical_mutation>> logstor::read(const schema& s, const primary_index& index, const dht::decorated_key& dk) {
    primary_index_key key(dk);
    // Capture dk by value: the continuation runs after an asynchronous
    // segment read, by which time the caller's reference may already be
    // dangling. The decorated_key copy is cheap relative to the I/O.
    return read(index, key).then([dk] (std::optional<log_record> record_opt) -> std::optional<canonical_mutation> {
        if (!record_opt.has_value()) {
            return std::nullopt;
        }

        auto& record = *record_opt;

        // Sanity check: the record at the indexed location must carry the
        // key we looked up.
        if (record.mut.key() != dk.key()) [[unlikely]] {
            throw std::runtime_error(fmt::format(
                "Key mismatch reading log entry: expected {}, got {}",
                dk.key(), record.mut.key()
            ));
        }

        return std::optional<canonical_mutation>(std::move(record.mut));
    });
}
|
||||
|
||||
// Accessors exposing the underlying components to callers that need to
// talk to them directly.
segment_manager& logstor::get_segment_manager() noexcept {
    return _segment_manager;
}

const segment_manager& logstor::get_segment_manager() const noexcept {
    return _segment_manager;
}

// The compaction manager is owned by the segment manager; forward to it.
compaction_manager& logstor::get_compaction_manager() noexcept {
    return _segment_manager.get_compaction_manager();
}

const compaction_manager& logstor::get_compaction_manager() const noexcept {
    return _segment_manager.get_compaction_manager();
}
|
||||
|
||||
// Build a mutation reader that walks the primary index over `pr` in ring
// order, fetching each partition's record from the log and streaming it
// according to `slice`.
mutation_reader logstor::make_reader(schema_ptr schema,
        const primary_index& index,
        reader_permit permit,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state) {

    class logstor_range_reader : public mutation_reader::impl {
        logstor* _logstor;
        const primary_index& _index;
        dht::partition_range _pr;
        query::partition_slice _slice;
        tracing::trace_state_ptr _trace_state;
        std::optional<dht::decorated_key> _last_key; // owns the key, safe across yields
        mutation_reader_opt _current_partition_reader; // streams the partition currently being emitted
        dht::ring_position_comparator _cmp;

        // Finds the next iterator to process, safe to call after any co_await:
        // we re-seek from _last_key instead of holding an index iterator
        // across yields.
        primary_index::partitions_type::const_iterator find_next() const {
            auto it = _last_key
                ? _index.upper_bound(*_last_key) // strictly after last key
                : position_at_range_start();     // initial positioning
            // If start was exclusive and we haven't yet seen a key
            return it;
        }

        // Seek to the first index entry admitted by _pr's start bound.
        primary_index::partitions_type::const_iterator position_at_range_start() const {
            if (!_pr.start()) {
                return _index.begin();
            }
            auto it = _index.lower_bound(_pr.start()->value());
            if (!_pr.start()->is_inclusive() && it != _index.end()) {
                // Exclusive start: skip an exact match on the bound itself.
                if (_cmp(it->key(), _pr.start()->value()) == 0) {
                    ++it;
                }
            }
            return it;
        }

        // True when the entry lies past _pr's end bound.
        bool exceeds_range_end(const primary_index_entry& e) const {
            if (!_pr.end()) return false;
            auto c = _cmp(e.key(), _pr.end()->value());
            return _pr.end()->is_inclusive() ? c > 0 : c >= 0;
        }

    public:
        logstor_range_reader(schema_ptr s, const primary_index& idx, reader_permit p,
                logstor* ls, dht::partition_range pr,
                query::partition_slice slice, tracing::trace_state_ptr ts)
            : impl(std::move(s), std::move(p))
            , _logstor(ls), _index(idx), _pr(std::move(pr))
            , _slice(std::move(slice)), _trace_state(std::move(ts))
            , _cmp(*_schema)
        {}

        virtual future<> fill_buffer() override {
            while (!is_buffer_full() && !_end_of_stream) {
                // Drain current partition's reader first
                if (_current_partition_reader) {
                    co_await _current_partition_reader->fill_buffer();
                    _current_partition_reader->move_buffer_content_to(*this);
                    if (!_current_partition_reader->is_end_of_stream()) {
                        continue;
                    }
                    co_await _current_partition_reader->close();
                    _current_partition_reader = std::nullopt;
                    // _last_key was already set when we opened the reader
                }

                // Find next key in range (safe after co_await since we use _last_key)
                auto it = find_next();
                if (it == _index.end() || exceeds_range_end(*it)) {
                    _end_of_stream = true;
                    break;
                }

                // Snapshot the key before yielding
                auto current_key = it->key();

                auto guard = reader_permit::awaits_guard(_permit);
                auto cmut = co_await _logstor->read(*_schema, _index, current_key);

                _last_key = current_key; // mark as visited even if not found (tombstoned)

                if (!cmut) {
                    continue; // key was removed between index lookup and read
                }

                tracing::trace(_trace_state, "logstor_range_reader: fetched key {}", current_key);

                _current_partition_reader = make_mutation_reader_from_mutations(
                    _schema, _permit, cmut->to_mutation(_schema),
                    _slice, streamed_mutation::forwarding::no
                );
            }
        }

        virtual future<> next_partition() override {
            clear_buffer_to_next_partition();
            if (!is_buffer_empty()) return make_ready_future<>();
            _end_of_stream = false;
            if (_current_partition_reader) {
                auto fut = _current_partition_reader->close();
                _current_partition_reader = std::nullopt;
                return fut;
            }
            return make_ready_future<>();
        }

        virtual future<> fast_forward_to(const dht::partition_range& pr) override {
            clear_buffer();
            _end_of_stream = false;
            _pr = pr;
            _last_key = std::nullopt; // re-position from new range start
            if (_current_partition_reader) {
                auto fut = _current_partition_reader->close();
                _current_partition_reader = std::nullopt;
                return fut;
            }
            return make_ready_future<>();
        }

        virtual future<> fast_forward_to(position_range pr) override {
            // Forwards within the current partition only; a no-op when no
            // partition is open.
            if (_current_partition_reader) {
                clear_buffer();
                return _current_partition_reader->fast_forward_to(std::move(pr));
            }
            return make_ready_future<>();
        }

        virtual future<> close() noexcept override {
            if (_current_partition_reader) {
                return _current_partition_reader->close();
            }
            return make_ready_future<>();
        }
    };

    return make_mutation_reader<logstor_range_reader>(
        std::move(schema), index, std::move(permit), this, pr, slice, std::move(trace_state)
    );
}
|
||||
|
||||
// Hooks forwarded to the segment manager, fired when compaction (or a
// separator flush of the given size) should be triggered. NOTE(review):
// consumer of these hooks is not visible here -- confirm at call sites.
void logstor::set_trigger_compaction_hook(std::function<void()> fn) {
    _segment_manager.set_trigger_compaction_hook(std::move(fn));
}

void logstor::set_trigger_separator_flush_hook(std::function<void(size_t)> fn) {
    _segment_manager.set_trigger_separator_flush_hook(std::move(fn));
}
|
||||
|
||||
}
|
||||
81
replica/logstor/logstor.hh
Normal file
81
replica/logstor/logstor.hh
Normal file
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/temporary_buffer.hh>
|
||||
#include <optional>
|
||||
#include <seastar/core/scheduling.hh>
|
||||
#include "readers/mutation_reader.hh"
|
||||
#include "replica/compaction_group.hh"
|
||||
#include "types.hh"
|
||||
#include "index.hh"
|
||||
#include "segment_manager.hh"
|
||||
#include "write_buffer.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "dht/decorated_key.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class compaction_group;
|
||||
class database;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
extern seastar::logger logstor_logger;
|
||||
|
||||
// Top-level logstor configuration: segment-manager settings plus the
// scheduling group under which write-buffer flushes run.
struct logstor_config {
    segment_manager_config segment_manager_cfg;
    seastar::scheduling_group flush_sg;
};
|
||||
|
||||
// Log-structured store front-end: owns the segment manager (on-disk
// segments and compaction) and the buffered writer that batches records
// into aligned segment writes.
class logstor {

    segment_manager _segment_manager;
    buffered_writer _write_buffer;  // batches records before handing them to _segment_manager

public:

    explicit logstor(logstor_config);

    // Non-copyable.
    logstor(const logstor&) = delete;
    logstor& operator=(const logstor&) = delete;

    // Replay persisted segments after restart.
    future<> do_recovery(replica::database&);

    future<> start();
    future<> stop();

    size_t get_memory_usage() const;

    segment_manager& get_segment_manager() noexcept;
    const segment_manager& get_segment_manager() const noexcept;

    compaction_manager& get_compaction_manager() noexcept;
    const compaction_manager& get_compaction_manager() const noexcept;

    // Append a mutation; cg_holder keeps the compaction group alive for the
    // duration of the write.
    future<> write(const mutation&, compaction_group&, seastar::gate::holder cg_holder);

    // Read the full record for a key, or nullopt when absent.
    future<std::optional<log_record>> read(const primary_index&, primary_index_key);

    // Read just the stored mutation for a decorated key, or nullopt.
    future<std::optional<canonical_mutation>> read(const schema&, const primary_index&, const dht::decorated_key&);

    /// Create a mutation reader for a specific key
    mutation_reader make_reader(schema_ptr schema,
            const primary_index& index,
            reader_permit permit,
            const dht::partition_range& pr,
            const query::partition_slice& slice,
            tracing::trace_state_ptr trace_state = nullptr);

    void set_trigger_compaction_hook(std::function<void()> fn);
    void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);
};
|
||||
|
||||
} // namespace logstor
|
||||
} // namespace replica
|
||||
1940
replica/logstor/segment_manager.cc
Normal file
1940
replica/logstor/segment_manager.cc
Normal file
File diff suppressed because it is too large
Load Diff
128
replica/logstor/segment_manager.hh
Normal file
128
replica/logstor/segment_manager.hh
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <filesystem>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <seastar/core/file.hh>
|
||||
#include <seastar/core/rwlock.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/queue.hh>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include "bytes_fwd.hh"
|
||||
#include "replica/logstor/write_buffer.hh"
|
||||
#include "types.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class database;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
class compaction_manager;
|
||||
class segment_set;
|
||||
class primary_index;
|
||||
|
||||
// Default sizing knobs (overridable through segment_manager_config).
static constexpr size_t default_segment_size = 128 * 1024;
static constexpr size_t default_file_size = 32 * 1024 * 1024;

/// Configuration for the segment manager
struct segment_manager_config {
    std::filesystem::path base_dir;   // directory holding the segment files
    size_t segment_size = default_segment_size;
    size_t file_size = default_file_size;
    size_t disk_size;                 // NOTE(review): no default -- callers must set it
    bool compaction_enabled = true;
    size_t max_segments_per_compaction = 8;
    seastar::scheduling_group compaction_sg;
    utils::updateable_value<float> compaction_static_shares;
    seastar::scheduling_group separator_sg;
    uint32_t separator_delay_limit_ms; // NOTE(review): uninitialized -- confirm callers always set it
    size_t max_separator_memory = 1 * 1024 * 1024;
};
|
||||
|
||||
// One bucket of a per-table segment histogram.
struct table_segment_histogram_bucket {
    size_t count;          // number of segments falling into this bucket
    size_t max_data_size;  // largest data size observed in this bucket

    // Merge another bucket into this one: counts accumulate and the larger
    // max_data_size wins. The right-hand side is only read, so take it by
    // const reference (this also allows merging const/rvalue buckets).
    table_segment_histogram_bucket& operator+=(const table_segment_histogram_bucket& other) {
        count += other.count;
        max_data_size = std::max(max_data_size, other.max_data_size);
        return *this;
    }
};
|
||||
|
||||
// Per-table segment statistics, aggregated across compaction groups.
struct table_segment_stats {
    size_t compaction_group_count{0};
    size_t segment_count{0};
    std::vector<table_segment_histogram_bucket> histogram;

    // Merge `other` into this, widening the histogram as needed.
    // NOTE(review): `other` is only read and should be const& -- but that
    // also requires table_segment_histogram_bucket::operator+= to take
    // const&; change the two signatures together.
    table_segment_stats& operator+=(table_segment_stats& other) {
        compaction_group_count += other.compaction_group_count;
        segment_count += other.segment_count;
        histogram.resize(std::max(histogram.size(), other.histogram.size()));
        for (size_t i = 0; i < other.histogram.size(); i++) {
            histogram[i] += other.histogram[i];
        }
        return *this;
    }
};
|
||||
|
||||
class segment_manager_impl;
|
||||
class log_index;
|
||||
|
||||
// Pimpl facade over the segment manager: owns on-disk log segments, serves
// record reads/writes and drives compaction. The implementation lives in
// segment_manager_impl (segment_manager.cc).
class segment_manager {
    std::unique_ptr<segment_manager_impl> _impl;
private:
    segment_manager_impl& get_impl() noexcept;
    const segment_manager_impl& get_impl() const noexcept;
public:
    static constexpr size_t block_alignment = 4096;

    explicit segment_manager(segment_manager_config config);
    ~segment_manager();  // out-of-line: segment_manager_impl is incomplete here

    segment_manager(const segment_manager&) = delete;
    segment_manager& operator=(const segment_manager&) = delete;

    // Replay persisted segments after restart.
    future<> do_recovery(replica::database&);

    future<> start();
    future<> stop();

    // Flush a filled write buffer; resolves to the base location the
    // buffer was written at.
    future<log_location> write(write_buffer& wb);

    // Fetch the record stored at `location`.
    future<log_record> read(log_location location);

    // Mark the record at `location` as no longer referenced.
    void free_record(log_location location);

    // Invoke `callback` for every record in the given segments.
    future<> for_each_record(const std::vector<log_segment_id>& segments,
            std::function<future<>(log_location, log_record)> callback);

    compaction_manager& get_compaction_manager() noexcept;
    const compaction_manager& get_compaction_manager() const noexcept;

    // Hooks fired when compaction / a separator flush should be triggered.
    void set_trigger_compaction_hook(std::function<void()> fn);
    void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);

    size_t get_segment_size() const noexcept;

    future<> discard_segments(segment_set&);

    size_t get_memory_usage() const;

    future<> await_pending_writes();

    friend class segment_manager_impl;

};
|
||||
|
||||
}
|
||||
}
|
||||
80
replica/logstor/types.hh
Normal file
80
replica/logstor/types.hh
Normal file
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <fmt/format.h>
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "replica/logstor/utils.hh"
|
||||
#include "dht/decorated_key.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
struct log_segment_id {
|
||||
uint32_t value;
|
||||
|
||||
bool operator==(const log_segment_id& other) const noexcept = default;
|
||||
auto operator<=>(const log_segment_id& other) const noexcept = default;
|
||||
};
|
||||
|
||||
// Physical address of a record: the segment it lives in, the byte offset
// of its payload within that segment, and the payload size in bytes.
struct log_location {
    log_segment_id segment;
    uint32_t offset;
    uint32_t size;

    bool operator==(const log_location& other) const noexcept = default;
};

// Key type of the primary index -- currently just the decorated partition key.
struct primary_index_key {
    dht::decorated_key dk;
};
|
||||
|
||||
// Wraparound counters (see generation_base in utils.hh): per-record and
// per-segment versions.
using record_generation = generation_base<uint16_t>;
using segment_generation = generation_base<uint16_t>;

// What the primary index stores per key: where the latest record lives and
// that record's generation.
struct index_entry {
    log_location location;
    record_generation generation;

    bool operator==(const index_entry& other) const noexcept = default;
};

// A fully materialized log record: key, ordering generation, owning table,
// and the mutation payload in canonical (schema-version-independent) form.
struct log_record {
    primary_index_key key;
    record_generation generation;
    table_id table;
    canonical_mutation mut;
};
|
||||
|
||||
}
|
||||
|
||||
// Format specialization declarations and implementations
template <>
struct fmt::formatter<replica::logstor::log_segment_id> : fmt::formatter<string_view> {
    // Renders as "segment(<value>)".
    template <typename FormatContext>
    auto format(const replica::logstor::log_segment_id& id, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "segment({})", id.value);
    }
};

template <>
struct fmt::formatter<replica::logstor::log_location> : fmt::formatter<string_view> {
    // Renders as "{segment:..., offset:..., size:...}".
    template <typename FormatContext>
    auto format(const replica::logstor::log_location& loc, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{{segment:{}, offset:{}, size:{}}}",
                loc.segment, loc.offset, loc.size);
    }
};

template <>
struct fmt::formatter<replica::logstor::primary_index_key> : fmt::formatter<string_view> {
    // Formats the underlying decorated key directly.
    template <typename FormatContext>
    auto format(const replica::logstor::primary_index_key& key, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{}", key.dk);
    }
};
|
||||
104
replica/logstor/utils.hh
Normal file
104
replica/logstor/utils.hh
Normal file
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <concepts>
|
||||
#include "serializer.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// an unsigned integer that can be incremented and compared with wraparound semantics
|
||||
template <std::unsigned_integral T>
|
||||
class generation_base {
|
||||
T _value;
|
||||
|
||||
public:
|
||||
|
||||
using underlying = T;
|
||||
|
||||
constexpr generation_base() noexcept : _value(0) {}
|
||||
constexpr explicit generation_base(T value) noexcept : _value(value) {}
|
||||
|
||||
constexpr T value() const noexcept { return _value; }
|
||||
|
||||
constexpr generation_base& operator++() noexcept {
|
||||
++_value;
|
||||
return *this;
|
||||
}
|
||||
|
||||
constexpr generation_base operator++(int) noexcept {
|
||||
auto old = *this;
|
||||
++_value;
|
||||
return old;
|
||||
}
|
||||
|
||||
constexpr generation_base& operator+=(T delta) noexcept {
|
||||
_value += delta;
|
||||
return *this;
|
||||
}
|
||||
|
||||
constexpr generation_base operator+(T delta) const noexcept {
|
||||
return generation_base(_value + delta);
|
||||
}
|
||||
|
||||
constexpr bool operator==(const generation_base& other) const noexcept = default;
|
||||
|
||||
/// Comparison using wraparound semantics.
|
||||
/// Returns true if this generation is less than other, accounting for wraparound.
|
||||
/// Assumes generations are within half the value space of each other.
|
||||
constexpr bool operator<(const generation_base& other) const noexcept {
|
||||
// Use signed comparison after converting difference to signed type
|
||||
// This handles wraparound: if diff > max/2, it's treated as negative
|
||||
using signed_type = std::make_signed_t<T>;
|
||||
auto diff = static_cast<signed_type>(_value - other._value);
|
||||
return diff < 0;
|
||||
}
|
||||
|
||||
constexpr bool operator<=(const generation_base& other) const noexcept {
|
||||
return *this == other || *this < other;
|
||||
}
|
||||
|
||||
constexpr bool operator>(const generation_base& other) const noexcept {
|
||||
return other < *this;
|
||||
}
|
||||
|
||||
constexpr bool operator>=(const generation_base& other) const noexcept {
|
||||
return other <= *this;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// Let generation_base<T> format exactly like its underlying integer type.
template <std::unsigned_integral T>
struct fmt::formatter<replica::logstor::generation_base<T>> : fmt::formatter<T> {
    template <typename FormatContext>
    auto format(const replica::logstor::generation_base<T>& gen, FormatContext& ctx) const {
        return fmt::formatter<T>::format(gen.value(), ctx);
    }
};
|
||||
|
||||
namespace ser {

// Serialize generation_base<T> as its bare underlying integer, keeping the
// wire format identical to a plain T.
template <std::unsigned_integral T>
struct serializer<replica::logstor::generation_base<T>> {
    template <typename Output>
    static void write(Output& out, const replica::logstor::generation_base<T>& g) {
        serializer<typename replica::logstor::generation_base<T>::underlying>::write(out, g.value());
    }
    template <typename Input>
    static replica::logstor::generation_base<T> read(Input& in) {
        auto val = serializer<typename replica::logstor::generation_base<T>::underlying>::read(in);
        return replica::logstor::generation_base<T>(val);
    }
    template <typename Input>
    static void skip(Input& in) {
        serializer<typename replica::logstor::generation_base<T>::underlying>::skip(in);
    }
};

}
|
||||
278
replica/logstor/write_buffer.cc
Normal file
278
replica/logstor/write_buffer.cc
Normal file
@@ -0,0 +1,278 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#include "write_buffer.hh"
|
||||
#include "segment_manager.hh"
|
||||
#include "bytes_fwd.hh"
|
||||
#include "logstor.hh"
|
||||
#include "replica/logstor/types.hh"
|
||||
#include <seastar/core/simple-stream.hh>
|
||||
#include <seastar/core/with_scheduling_group.hh>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include "serializer_impl.hh"
|
||||
#include "idl/logstor.dist.hh"
|
||||
#include "idl/logstor.dist.impl.hh"
|
||||
#include <seastar/core/align.hh>
|
||||
#include <seastar/core/aligned_buffer.hh>
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// Compute the record's serialized size by dry-running the serializer
// through a measuring stream (counts bytes, writes nothing); cached in
// the mutable _size.
void log_record_writer::compute_size() const {
    seastar::measuring_output_stream ms;
    ser::serialize(ms, _record);
    _size = ms.size();
}

// Serialize the record into `out`; writes exactly the number of bytes
// reported by compute_size().
void log_record_writer::write(ostream& out) const {
    ser::serialize(out, _record);
}
|
||||
|
||||
// write_buffer
|
||||
|
||||
// Allocate one aligned flush buffer. When `with_record_copy` is set, the
// buffer also keeps a per-record copy (writer + future location) so the
// records can be re-emitted later.
// NOTE(review): the literal 4096 duplicates segment_manager::block_alignment
// -- consider referencing the constant.
write_buffer::write_buffer(size_t buffer_size, bool with_record_copy)
    : _buffer_size(buffer_size)
    , _buffer(seastar::allocate_aligned_buffer<char>(buffer_size, 4096))
    , _with_record_copy(with_record_copy)
{
    if (_with_record_copy) {
        // Capacity heuristic; assumes records average >= 100 bytes -- TODO confirm.
        _records_copy.reserve(_buffer_size / 100);
    }
    reset();
}
|
||||
|
||||
// Rearm the buffer for a fresh fill: rebuild the output stream over the raw
// buffer, carve out the header region at the front (filled in later by
// write_header()), and clear all per-fill state.
void write_buffer::reset() {
    _stream = seastar::simple_memory_output_stream(_buffer.get(), _buffer_size);
    _header_stream = _stream.write_substream(buffer_header_size);
    _buffer_header = {};
    _net_data_size = 0;
    _record_count = 0;
    _written = {};        // fresh shared promise for the flush location
    _records_copy.clear();
    _write_gate = {};     // NOTE(review): assumes the previous gate was already closed/drained
}
|
||||
|
||||
// Close the write gate, waiting for all outstanding holders (writers still
// referencing this buffer) to drain. The is_closed() check makes a second
// sequential close a no-op.
// NOTE(review): two *concurrent* close() calls would still race -- confirm
// callers serialize close.
future<> write_buffer::close() {
    if (!_write_gate.is_closed()) {
        co_await _write_gate.close();
    }
}
|
||||
|
||||
// Largest record payload an empty buffer can accept: the buffer minus the
// buffer header and one record header.
size_t write_buffer::get_max_write_size() const noexcept {
    return _buffer_size - (buffer_header_size + record_header_size);
}

// Whether a record of `data_size` bytes still fits in the remaining stream
// space, including its header and alignment padding.
bool write_buffer::can_fit(size_t data_size) const noexcept {
    // Calculate total space needed including header, data, and alignment padding
    auto total_size = record_header_size + data_size;
    auto aligned_size = align_up(total_size, record_alignment);
    return aligned_size <= _stream.size();
}

// True once anything has been appended past the reserved header region.
bool write_buffer::has_data() const noexcept {
    return offset_in_buffer() > buffer_header_size;
}
|
||||
|
||||
// Append one record to the buffer.
//
// Synchronously serializes the record header and payload, then returns a
// future that resolves -- once the buffer is flushed and complete_writes()
// supplies the base location -- to the record's final on-disk location plus
// a write-gate holder the caller keeps while doing follow-up work (e.g.
// index updates). Callers must check can_fit() first; an oversized record
// throws.
future<log_location_with_holder> write_buffer::write(log_record_writer writer, compaction_group* cg, seastar::gate::holder cg_holder) {
    const auto data_size = writer.size();

    if (!can_fit(data_size)) {
        throw std::runtime_error(fmt::format("Write size {} exceeds buffer size {}", data_size, _stream.size()));
    }

    auto rh = record_header {
        .data_size = data_size
    };
    ser::serialize(_stream, rh);

    // Write actual data
    size_t data_offset_in_buffer = offset_in_buffer();
    auto data_out = _stream.write_substream(data_size);
    writer.write(data_out);

    _net_data_size += data_size;
    _record_count++;

    // Add padding to align record
    pad_to_alignment(record_alignment);

    // Translates the buffer's eventual base location into this record's
    // absolute on-disk location.
    auto record_location = [data_offset_in_buffer, data_size] (log_location base_location) {
        return log_location {
            .segment = base_location.segment,
            .offset = base_location.offset + data_offset_in_buffer,
            .size = data_size
        };
    };

    if (_with_record_copy) {
        _records_copy.push_back(record_in_buffer {
            .writer = std::move(writer),
            .offset_in_buffer = data_offset_in_buffer,
            .data_size = data_size,
            .loc = _written.get_shared_future().then(record_location),
            .cg = cg,
            .cg_holder = std::move(cg_holder)
        });
    }

    // hold the write buffer until the write is complete, and pass the holder to the
    // caller for follow-up operations that should continue holding the buffer, such
    // as index updates.
    auto op = _write_gate.hold();

    return _written.get_shared_future().then([record_location, op = std::move(op)] (log_location base_location) mutable {
        return std::make_tuple(record_location(base_location), std::move(op));
    });
}
|
||||
|
||||
// Write a record and release the write-gate holder as soon as the write
// resolves. Use with care, only when no follow-up work (such as index
// updates) needs the buffer held open.
future<log_location> write_buffer::write_no_holder(log_record_writer writer) {
    auto [loc, op] = co_await write(std::move(writer));
    co_return loc;
}
|
||||
|
||||
// Zero-fill the stream up to the next multiple of `alignment`.
void write_buffer::pad_to_alignment(size_t alignment) {
    const auto pos = offset_in_buffer();
    if (const auto padding = align_up(pos, alignment) - pos; padding > 0) {
        _stream.fill('\0', padding);
    }
}
|
||||
|
||||
// Seal the buffer before flushing: record in the header how many bytes of
// record data follow it, then pad the tail to `alignment`.
void write_buffer::finalize(size_t alignment) {
    // Everything written past the buffer header is record data (headers,
    // payloads and per-record padding).
    _buffer_header.data_size = static_cast<uint32_t>(offset_in_buffer() - buffer_header_size);
    pad_to_alignment(alignment);
}
|
||||
|
||||
// Serialize the buffer header (magic + segment generation) into the
// dedicated header stream at the start of the buffer.
void write_buffer::write_header(segment_generation seg_gen) {
    _buffer_header.magic = buffer_header_magic;
    _buffer_header.seg_gen = seg_gen;
    ser::serialize<buffer_header>(_header_stream, _buffer_header);
}
|
||||
|
||||
// Resolve every pending write future with the buffer's on-disk base
// location, then wait for all write-gate holders to finish.
future<> write_buffer::complete_writes(log_location base_location) {
    _written.set_value(base_location);
    co_await close();
}
|
||||
|
||||
// Fail every pending write future with `ex` (unless the shared promise was
// already resolved), then wait for all write-gate holders to finish.
future<> write_buffer::abort_writes(std::exception_ptr ex) {
    if (!_written.available()) {
        _written.set_exception(std::move(ex));
    }
    co_await close();
}
|
||||
|
||||
// Access the per-record copies retained during writes.
// Only valid when the buffer was constructed with record copying enabled;
// calling it otherwise is an internal error.
std::vector<write_buffer::record_in_buffer>& write_buffer::records() {
    if (!_with_record_copy) {
        on_internal_error(logstor_logger, "requesting records but the write buffer has no record copy enabled");
    }
    return _records_copy;
}
|
||||
|
||||
// Estimate how many segments of `segment_size` are needed to hold
// `record_count` records totalling `net_data_size` payload bytes.
// Headers and alignment padding make the exact figure hard to compute,
// so the raw total is inflated by a 10% safety margin.
size_t write_buffer::estimate_required_segments(size_t net_data_size, size_t record_count, size_t segment_size) {
    const size_t raw_size = record_header_size * record_count + net_data_size;
    const auto with_margin = static_cast<size_t>(raw_size * 1.1);
    return align_up(with_margin, segment_size) / segment_size;
}
|
||||
|
||||
// buffered_writer
|
||||
|
||||
// Allocate one buffer more than the flushing set: one buffer is always
// active, the rest cycle through the available queue while flushing.
buffered_writer::buffered_writer(segment_manager& sm, seastar::scheduling_group flush_sg)
    : _sm(sm)
    , _available_buffers(num_flushing_buffers)
    , _flush_sg(flush_sg) {

    constexpr size_t total_buffers = num_flushing_buffers + 1;
    _buffers.reserve(total_buffers);
    while (_buffers.size() < total_buffers) {
        _buffers.emplace_back(_sm.get_segment_size(), true);
    }

    // Buffer 0 starts out active; all others are immediately available.
    _active_buffer = active_buffer {
        .buf = &_buffers[0],
    };
    for (size_t i = 1; i < total_buffers; ++i) {
        _available_buffers.push(&_buffers[i]);
    }
}
|
||||
|
||||
// Nothing to initialize asynchronously yet; present for symmetry with stop().
future<> buffered_writer::start() {
    logstor_logger.info("Starting write buffer");
    co_return;
}
|
||||
|
||||
// Close the gate and wait for all in-flight writes and background flushes
// to drain. Idempotent: returns immediately if already stopped.
future<> buffered_writer::stop() {
    if (_async_gate.is_closed()) {
        co_return;
    }
    logstor_logger.info("Stopping write buffer");

    co_await _async_gate.close();
    logstor_logger.info("Write buffer stopped");
}
|
||||
|
||||
// Serialize `record` into the active buffer. The first write to each
// buffer kicks off a detached background switch-and-flush chain.
// Returns the record's final log location (resolved after flush) plus a
// gate holder keeping the flushed buffer open for follow-up work.
future<log_location_with_holder> buffered_writer::write(log_record record, compaction_group* cg, seastar::gate::holder cg_holder) {
    // Keep the writer alive for the duration of this write.
    auto holder = _async_gate.hold();

    log_record_writer writer(std::move(record));

    // A record larger than a whole buffer can never be written.
    if (writer.size() > _active_buffer.buf->get_max_write_size()) {
        throw std::runtime_error(fmt::format("Write size {} exceeds buffer size {}", writer.size(), _active_buffer.buf->get_max_write_size()));
    }

    // Check if write fits in current buffer; if not, wait for the background
    // flush to install a fresh active buffer, then re-check.
    while (!_active_buffer.buf->can_fit(writer)) {
        co_await _buffer_switched.wait();
    }

    // Write to buffer at current position
    auto fut = _active_buffer.buf->write(std::move(writer), cg, std::move(cg_holder));

    // Trigger flush for the active buffer if not in progress.
    // flush_requested ensures only the first writer to this buffer starts the
    // (detached, gate-tracked) switch-and-flush chain.
    if (!std::exchange(_active_buffer.flush_requested, true)) {
        (void)with_gate(_async_gate, [this] {
            return switch_buffer().then([this] (write_buffer* old_buf) mutable {
                // Flush work runs in the dedicated flush scheduling group.
                return with_scheduling_group(_flush_sg, [this, old_buf] mutable {
                    return flush(old_buf);
                });
            });
        });
    }

    co_return co_await std::move(fut);
}
|
||||
|
||||
// Install a fresh buffer as the active one and hand back the previous
// active buffer for flushing. May wait until a flushed buffer is returned
// to the available queue.
future<write_buffer*> buffered_writer::switch_buffer() {
    write_buffer* fresh = co_await _available_buffers.pop_eventually();

    auto previous = std::exchange(_active_buffer, active_buffer {
        .buf = fresh,
    });

    // Wake writers that were blocked waiting for room in the active buffer.
    _buffer_switched.broadcast();

    co_return previous.buf;
}
|
||||
|
||||
// Flush `buf` to the segment manager, then reset it and recycle it into
// the available-buffer queue for reuse.
future<> buffered_writer::flush(write_buffer* buf) {
    co_await _sm.write(*buf);

    buf->reset();
    _available_buffers.push(buf);
}
|
||||
|
||||
}
|
||||
294
replica/logstor/write_buffer.hh
Normal file
294
replica/logstor/write_buffer.hh
Normal file
@@ -0,0 +1,294 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/temporary_buffer.hh>
|
||||
#include <seastar/core/aligned_buffer.hh>
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/core/scheduling.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/queue.hh>
|
||||
#include <seastar/core/simple-stream.hh>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include "types.hh"
|
||||
#include "serializer.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class compaction_group;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
class segment_manager;
|
||||
|
||||
// Writer for a log record: handles serialization to a memory stream and
// lazily computes (and caches) the record's serialized size.
class log_record_writer {

    using ostream = seastar::simple_memory_output_stream;

    log_record _record;
    // Cached serialized size; filled by compute_size() on first size() call.
    mutable std::optional<size_t> _size;

    void compute_size() const;

public:
    explicit log_record_writer(log_record record)
        : _record(std::move(record))
    {}

    // Get serialized size (computed lazily and cached).
    size_t size() const {
        if (!_size) {
            compute_size();
        }
        return *_size;
    }

    // Write the record to an output stream
    void write(ostream& out) const;

    // Read-only access to the wrapped record.
    const log_record& record() const {
        return _record;
    }
};
|
||||
|
||||
using log_location_with_holder = std::tuple<log_location, seastar::gate::holder>;
|
||||
|
||||
// Manages a single aligned buffer for accumulating records and writing
// them to the segment manager.
//
// usage:
//
// create write buffer with specified size:
//     write_buffer wb(buffer_size);
// write data to the buffer if it fits and get a future for the log location when flushed:
//     log_record_writer writer(record);
//     auto loc_fut = wb.write(writer);
// flush the buffer to the segment manager:
//     co_await sm.write(wb);
// await individual write locations:
//     auto record_loc = co_await std::move(loc_fut);
class write_buffer {
public:

    using ostream = seastar::simple_memory_output_stream;

    // On-disk layout:
    // buffer: buffer_header | record_1 | ... | record_n | 0-padding
    // record: record_header | record_data | 0-padding
    //
    // buffer_header and record are aligned by record_alignment
    // buffer_header and record_header have explicit sizes and serialization below

    // ASCII "LGSB" — identifies a serialized buffer.
    static constexpr uint32_t buffer_header_magic = 0x4c475342;
    static constexpr size_t record_alignment = 8;

    struct buffer_header {
        uint32_t magic;     // buffer_header_magic
        uint32_t data_size; // size of all records data following the buffer_header
        segment_generation seg_gen; // generation of the segment this buffer is flushed to
        uint16_t reserved1; // reserved for future use
        uint32_t reserved2; // reserved for future use
    };
    static constexpr size_t buffer_header_size = 3 * sizeof(uint32_t) + sizeof(uint16_t) + sizeof(segment_generation::underlying);

    static_assert(buffer_header_size % record_alignment == 0, "Buffer header size must be aligned by record_alignment");

    struct record_header {
        uint32_t data_size; // size of the record data following the record_header
    };
    static constexpr size_t record_header_size = sizeof(uint32_t);

private:

    using aligned_buffer_type = std::unique_ptr<char[], free_deleter>;

    size_t _buffer_size;
    aligned_buffer_type _buffer; // aligned backing memory
    seastar::simple_memory_output_stream _stream; // output cursor into _buffer
    buffer_header _buffer_header;
    seastar::simple_memory_output_stream _header_stream; // cursor for the header region

    size_t _net_data_size{0}; // total payload bytes written (excluding headers/padding)
    size_t _record_count{0};

    // Resolved with the buffer's base on-disk location once flushed
    // (or failed via abort_writes()).
    shared_promise<log_location> _written;

    // Held by writers (and their follow-up operations) until close().
    seastar::gate _write_gate;

    // Copy of a single written record, kept only when _with_record_copy is set.
    struct record_in_buffer {
        log_record_writer writer;
        size_t offset_in_buffer; // payload offset relative to buffer start
        size_t data_size;
        future<log_location> loc; // resolves to the record's final location
        compaction_group* cg;
        seastar::gate::holder cg_holder;
    };

    bool _with_record_copy;
    std::vector<record_in_buffer> _records_copy;

public:

    write_buffer(size_t buffer_size, bool with_record_copy);

    // Prepare the buffer for reuse after a flush.
    void reset();

    write_buffer(const write_buffer&) = delete;
    write_buffer& operator=(const write_buffer&) = delete;

    write_buffer(write_buffer&&) noexcept = default;
    write_buffer& operator=(write_buffer&&) noexcept = default;

    future<> close();

    size_t get_buffer_size() const noexcept { return _buffer_size; }
    // Bytes consumed so far, counted from the start of the buffer.
    size_t offset_in_buffer() const noexcept { return _buffer_size - _stream.size(); }

    bool can_fit(size_t data_size) const noexcept;

    bool can_fit(const log_record_writer& writer) const noexcept {
        return can_fit(writer.size());
    }

    bool has_data() const noexcept;

    size_t get_max_write_size() const noexcept;

    size_t get_net_data_size() const noexcept { return _net_data_size; }
    size_t get_record_count() const noexcept { return _record_count; }

    // Write a record to the buffer.
    // Returns a future that will be resolved with the log location once flushed and a gate holder
    // that keeps the write buffer open. The gate should be held for index updates after the write
    // is done.
    future<log_location_with_holder> write(log_record_writer, compaction_group*, seastar::gate::holder cg_holder);

    future<log_location_with_holder> write(log_record_writer writer) {
        return write(std::move(writer), nullptr, {});
    }

    // Write a record to the buffer.
    // Returns a future that will be resolved with the log location once flushed.
    // If there are follow-up operations to the write such as index updates then consider
    // using the holder-returning write() overload instead to keep the write buffer open
    // until those operations are complete.
    future<log_location> write_no_holder(log_record_writer);

    // Estimate the number of segments of segment_size needed for the given payload.
    static size_t estimate_required_segments(size_t net_data_size, size_t record_count, size_t segment_size);

private:

    const char* data() const noexcept { return _buffer.get(); }

    void write_header(segment_generation);

    // get all write records in the buffer.
    // with_record_copy must be set to true when creating the write_buffer.
    std::vector<record_in_buffer>& records();

    /// Complete all tracked writes with their locations when the buffer is flushed to base_location
    future<> complete_writes(log_location base_location);
    future<> abort_writes(std::exception_ptr);

    void pad_to_alignment(size_t alignment);
    void finalize(size_t alignment);

    friend class segment_manager_impl;
    friend class compaction_manager_impl;
};
|
||||
|
||||
// Manages multiple buffers, a single active buffer and multiple flushing buffers.
// When a switch is requested for the active buffer, it waits for a flushing
// buffer to become available, continuing to accumulate writes until then.
class buffered_writer {
    // Buffers that may be flushing concurrently; one extra buffer is
    // always the active one.
    static constexpr size_t num_flushing_buffers = 4;

    segment_manager& _sm;

    struct active_buffer {
        write_buffer* buf;
        // Set by the first write to this buffer so the switch-and-flush
        // chain is started only once per buffer.
        bool flush_requested{false};
    } _active_buffer;

    std::vector<write_buffer> _buffers; // backing storage for all buffers
    seastar::queue<write_buffer*> _available_buffers; // flushed buffers ready to become active
    seastar::gate _async_gate; // tracks writes and background flushes
    seastar::condition_variable _buffer_switched; // signaled when a new active buffer is installed
    seastar::scheduling_group _flush_sg; // scheduling group for flush work

public:
    explicit buffered_writer(segment_manager& sm, seastar::scheduling_group flush_sg);

    buffered_writer(const buffered_writer&) = delete;
    buffered_writer& operator=(const buffered_writer&) = delete;

    future<> start();
    future<> stop();

    // Write a record; resolves with its on-disk location once flushed, plus
    // a gate holder keeping the flushed buffer open for follow-up work.
    future<log_location_with_holder> write(log_record, compaction_group* cg = nullptr, seastar::gate::holder cg_holder = {});

private:
    future<write_buffer*> switch_buffer();
    future<> flush(write_buffer*);

};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
namespace ser {
|
||||
|
||||
// Fixed-layout serializer for write_buffer::buffer_header.
// The field order below defines the on-disk format; it must stay in sync
// with write_buffer::buffer_header_size.
template <>
struct serializer<replica::logstor::write_buffer::buffer_header> {
    template <typename Output>
    static void write(Output& out, const replica::logstor::write_buffer::buffer_header& h) {
        serializer<uint32_t>::write(out, h.magic);
        serializer<uint32_t>::write(out, h.data_size);
        serializer<replica::logstor::segment_generation>::write(out, h.seg_gen);
        serializer<uint16_t>::write(out, h.reserved1);
        serializer<uint32_t>::write(out, h.reserved2);
    }
    template <typename Input>
    static replica::logstor::write_buffer::buffer_header read(Input& in) {
        replica::logstor::write_buffer::buffer_header h;
        h.magic = serializer<uint32_t>::read(in);
        h.data_size = serializer<uint32_t>::read(in);
        h.seg_gen = serializer<replica::logstor::segment_generation>::read(in);
        h.reserved1 = serializer<uint16_t>::read(in);
        h.reserved2 = serializer<uint32_t>::read(in);
        return h;
    }
    template <typename Input>
    static void skip(Input& in) {
        // Skip fields in write order without materializing a header.
        serializer<uint32_t>::skip(in);
        serializer<uint32_t>::skip(in);
        serializer<replica::logstor::segment_generation>::skip(in);
        serializer<uint16_t>::skip(in);
        serializer<uint32_t>::skip(in);
    }
};
|
||||
|
||||
// Fixed-layout serializer for write_buffer::record_header — a single
// uint32 holding the record's payload size.
template <>
struct serializer<replica::logstor::write_buffer::record_header> {
    template <typename Output>
    static void write(Output& out, const replica::logstor::write_buffer::record_header& h) {
        serializer<uint32_t>::write(out, h.data_size);
    }
    template <typename Input>
    static replica::logstor::write_buffer::record_header read(Input& in) {
        replica::logstor::write_buffer::record_header h;
        h.data_size = serializer<uint32_t>::read(in);
        return h;
    }
    template <typename Input>
    static void skip(Input& in) {
        serializer<uint32_t>::skip(in);
    }
};
||||
} // namespace ser
|
||||
419
replica/table.cc
419
replica/table.cc
@@ -217,6 +217,17 @@ table::add_memtables_to_reader_list(std::vector<mutation_reader>& readers,
|
||||
}
|
||||
}
|
||||
|
||||
// Build a reader over the table's logstor storage.
// NOTE(review): the fwd/fwd_mr forwarding parameters are accepted but not
// passed to the logstor reader — confirm this is intentional.
mutation_reader
table::make_logstor_mutation_reader(schema_ptr s,
        reader_permit permit,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd,
        mutation_reader::forwarding fwd_mr) const {
    return _logstor->make_reader(std::move(s), logstor_index(), std::move(permit), pr, slice, std::move(trace_state));
}
|
||||
|
||||
mutation_reader
|
||||
table::make_mutation_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
@@ -229,6 +240,10 @@ table::make_mutation_reader(schema_ptr s,
|
||||
return (*_virtual_reader).make_mutation_reader(s, std::move(permit), range, slice, trace_state, fwd, fwd_mr);
|
||||
}
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return make_logstor_mutation_reader(s, std::move(permit), range, slice, std::move(trace_state), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
std::vector<mutation_reader> readers;
|
||||
|
||||
// We're assuming that cache and memtables are both read atomically
|
||||
@@ -716,7 +731,9 @@ public:
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override {
|
||||
return get_compaction_group();
|
||||
@@ -762,6 +779,11 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct background_merge_guard {
|
||||
compaction::compaction_reenabler compaction_guard;
|
||||
locator::effective_replication_map_ptr erm_guard;
|
||||
};
|
||||
|
||||
class tablet_storage_group_manager final : public storage_group_manager {
|
||||
replica::table& _t;
|
||||
locator::host_id _my_host_id;
|
||||
@@ -782,7 +804,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
|
||||
utils::phased_barrier _merge_fiber_barrier;
|
||||
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
|
||||
// Holds compaction reenabler which disables compaction temporarily during tablet merge
|
||||
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
|
||||
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
|
||||
private:
|
||||
const schema_ptr& schema() const {
|
||||
return _t.schema();
|
||||
@@ -806,7 +828,8 @@ private:
|
||||
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
|
||||
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
|
||||
// are merged into a new storage group with id (X >> 1).
|
||||
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
|
||||
// When merge completes, compaction groups of sibling tablets are added to same storage
|
||||
// group, but they're not merged yet into one, since the merge completion handler happens
|
||||
@@ -822,9 +845,8 @@ private:
|
||||
return tablet_map().get_tablet_id(t).value();
|
||||
}
|
||||
|
||||
std::pair<size_t, locator::tablet_range_side> storage_group_of(dht::token t) const {
|
||||
auto [id, side] = tablet_map().get_tablet_id_and_range_side(t);
|
||||
auto idx = id.value();
|
||||
size_t storage_group_of(dht::token t) const {
|
||||
auto idx = tablet_id_for_token(t);
|
||||
#ifndef SCYLLA_BUILD_MODE_RELEASE
|
||||
if (idx >= tablet_count()) {
|
||||
on_fatal_internal_error(tlogger, format("storage_group_of: index out of range: idx={} size_log2={} size={} token={}",
|
||||
@@ -836,7 +858,7 @@ private:
|
||||
idx, sg.token_range(), t));
|
||||
}
|
||||
#endif
|
||||
return { idx, side };
|
||||
return idx;
|
||||
}
|
||||
|
||||
repair_classifier_func make_repair_sstable_classifier_func() const {
|
||||
@@ -900,7 +922,9 @@ public:
|
||||
std::exchange(_stop_fut, make_ready_future())).discard_result();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override;
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override;
|
||||
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
|
||||
@@ -911,7 +935,7 @@ public:
|
||||
return log2ceil(tablet_map().tablet_count());
|
||||
}
|
||||
storage_group& storage_group_for_token(dht::token token) const override {
|
||||
return storage_group_for_id(storage_group_of(token).first);
|
||||
return storage_group_for_id(storage_group_of(token));
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats() const override;
|
||||
@@ -959,9 +983,20 @@ size_t storage_group::to_idx(locator::tablet_range_side side) const {
|
||||
return size_t(side);
|
||||
}
|
||||
|
||||
compaction_group_ptr& storage_group::select_compaction_group(locator::tablet_range_side side) noexcept {
|
||||
compaction_group_ptr& storage_group::select_compaction_group(dht::token token, const locator::tablet_map& tmap) noexcept {
|
||||
if (splitting_mode()) {
|
||||
return _split_ready_groups[to_idx(side)];
|
||||
return _split_ready_groups[to_idx(tmap.get_tablet_range_side(token))];
|
||||
}
|
||||
return _main_cg;
|
||||
}
|
||||
|
||||
compaction_group_ptr& storage_group::select_compaction_group(dht::token first, dht::token last, const locator::tablet_map& tmap) noexcept {
|
||||
if (splitting_mode()) {
|
||||
auto first_side = tmap.get_tablet_range_side(first);
|
||||
auto last_side = tmap.get_tablet_range_side(last);
|
||||
if (first_side == last_side) {
|
||||
return _split_ready_groups[to_idx(first_side)];
|
||||
}
|
||||
}
|
||||
return _main_cg;
|
||||
}
|
||||
@@ -1056,6 +1091,38 @@ future<> compaction_group::split(compaction::compaction_type_options::split opt,
|
||||
}
|
||||
}
|
||||
|
||||
// Release all logstor segments owned by this compaction group back to
// the segment manager.
future<> compaction_group::discard_logstor_segments() {
    auto& sm = get_logstor_segment_manager();
    co_await sm.discard_segments(*_logstor_segments);
}
|
||||
|
||||
// Flush this group's current separator buffer (if any) and wait for all
// previously-started separator flushes. When `seq_num` is given, the
// current buffer is flushed only if it holds records older than it.
future<> compaction_group::flush_separator(std::optional<size_t> seq_num) {
    // Serialize concurrent flushes of this group's separator state.
    auto units = co_await get_units(_separator_flush_sem, 1);
    auto pending = std::exchange(_separator_flushes, {});
    if (_logstor_separator && (!seq_num || _logstor_separator->min_seq_num < *seq_num)) {
        auto& cm = get_logstor_compaction_manager();
        auto b = std::move(*_logstor_separator);
        _logstor_separator.reset();
        pending.push_back(cm.flush_separator_buffer(std::move(b), *this));
    }
    co_await when_all(pending.begin(), pending.end());
}
|
||||
|
||||
// Return a separator buffer with room for `write_size` bytes. When the
// current buffer cannot fit the write, it is flushed in the background
// and a fresh buffer is allocated.
logstor::separator_buffer& compaction_group::get_separator_buffer(size_t write_size) {
    if (!_logstor_separator || !_logstor_separator->can_fit(write_size)) {
        auto& cm = get_logstor_compaction_manager();
        if (_logstor_separator) {
            auto b = std::move(*_logstor_separator);
            _logstor_separator.reset();

            // Drop already-resolved flushes so the pending list stays bounded.
            std::erase_if(_separator_flushes, [](future<>& f) { return f.available(); });
            _separator_flushes.push_back(cm.flush_separator_buffer(std::move(b), *this));
        }
        _logstor_separator.emplace(cm.allocate_separator_buffer());
    }
    return *_logstor_separator;
}
|
||||
|
||||
future<> storage_group::split(compaction::compaction_type_options::split opt, tasks::task_info tablet_split_task_info) {
|
||||
if (set_split_mode()) {
|
||||
co_return;
|
||||
@@ -1222,9 +1289,9 @@ storage_group& table::storage_group_for_id(size_t i) const {
|
||||
}
|
||||
|
||||
compaction_group& tablet_storage_group_manager::compaction_group_for_token(dht::token token) const {
|
||||
auto [idx, range_side] = storage_group_of(token);
|
||||
auto idx = storage_group_of(token);
|
||||
auto& sg = storage_group_for_id(idx);
|
||||
return *sg.select_compaction_group(range_side);
|
||||
return *sg.select_compaction_group(token, tablet_map());
|
||||
}
|
||||
|
||||
compaction_group& table::compaction_group_for_token(dht::token token) const {
|
||||
@@ -1265,8 +1332,8 @@ compaction_group& table::compaction_group_for_key(partition_key_view key, const
|
||||
}
|
||||
|
||||
compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(const sstables::shared_sstable& sst) const {
|
||||
auto [first_id, first_range_side] = storage_group_of(sst->get_first_decorated_key().token());
|
||||
auto [last_id, last_range_side] = storage_group_of(sst->get_last_decorated_key().token());
|
||||
auto first_id = storage_group_of(sst->get_first_decorated_key().token());
|
||||
auto last_id = storage_group_of(sst->get_last_decorated_key().token());
|
||||
|
||||
auto sstable_desc = [] (const sstables::shared_sstable& sst) {
|
||||
auto& identifier_opt = sst->sstable_identifier();
|
||||
@@ -1289,12 +1356,10 @@ compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(con
|
||||
|
||||
try {
|
||||
auto& sg = storage_group_for_id(first_id);
|
||||
|
||||
if (first_range_side != last_range_side) {
|
||||
return *sg.main_compaction_group();
|
||||
}
|
||||
|
||||
return *sg.select_compaction_group(first_range_side);
|
||||
return *sg.select_compaction_group(
|
||||
sst->get_first_decorated_key().token(),
|
||||
sst->get_last_decorated_key().token(),
|
||||
tablet_map());
|
||||
} catch (std::out_of_range& e) {
|
||||
on_internal_error(tlogger, format("Unable to load SSTable {} of tablet {}, due to {}",
|
||||
sstable_desc(sst),
|
||||
@@ -1465,6 +1530,7 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
sstables::offstrategy offstrategy) {
|
||||
std::vector<sstables::shared_sstable> ret, ssts;
|
||||
std::exception_ptr ex;
|
||||
log_level failure_log_level = log_level::error;
|
||||
try {
|
||||
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
@@ -1486,6 +1552,9 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
|
||||
sst = nullptr;
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
@@ -1493,13 +1562,13 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
if (ex) {
|
||||
// on failed split, input sstable is unlinked here.
|
||||
if (new_sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
co_await new_sst->unlink();
|
||||
}
|
||||
// on failure after successful split, sstables not attached yet will be unlinked
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1513,6 +1582,7 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::function<future<>(sstables::shared_sstable)> on_add) {
|
||||
std::exception_ptr ex;
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
log_level failure_log_level = log_level::error;
|
||||
|
||||
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
|
||||
// so the exception handling below will only have to unlink sstables not processed yet.
|
||||
@@ -1522,14 +1592,17 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::ranges::move(ssts, std::back_inserter(ret));
|
||||
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1568,6 +1641,19 @@ table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector
|
||||
}
|
||||
}
|
||||
|
||||
// Attach a logstor segment to the compaction group covering its token range.
// Returns false (without attaching) when the range spans more than one
// compaction group.
bool table::add_logstor_segment(logstor::segment_descriptor& seg_desc, dht::token first_token, dht::token last_token) {
    auto& cg = compaction_group_for_token(first_token);
    if (&cg != &compaction_group_for_token(last_token)) {
        return false;
    }
    cg.add_logstor_segment(seg_desc);
    return true;
}
|
||||
|
||||
// Route a separator-buffer request to the compaction group owning `token`.
logstor::separator_buffer& table::get_logstor_separator_buffer(dht::token token, size_t write_size) {
    return compaction_group_for_token(token).get_separator_buffer(write_size);
}
|
||||
|
||||
// Handles permit management only, used for situations where we don't want to inform
|
||||
// the compaction manager about backlogs (i.e., tests)
|
||||
class permit_monitor : public sstables::write_monitor {
|
||||
@@ -1765,7 +1851,9 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
|
||||
utils::get_local_injector().inject("table_seal_active_memtable_try_flush", []() {
|
||||
throw std::system_error(ENOSPC, std::system_category(), "Injected error");
|
||||
});
|
||||
co_return co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
// signal a memtable was sealed
|
||||
utils::get_local_injector().receive_message("table_seal_post_flush_waiters");
|
||||
});
|
||||
|
||||
undo_stats.reset();
|
||||
@@ -2021,8 +2109,15 @@ size_t compaction_group::live_sstable_count() const noexcept {
|
||||
return _main_sstables->size() + _maintenance_sstables->size();
|
||||
}
|
||||
|
||||
size_t compaction_group::logstor_disk_space_used() const noexcept {
|
||||
if (!_logstor_segments || !_t.uses_logstor()) {
|
||||
return 0;
|
||||
}
|
||||
return _logstor_segments->segment_count() * _t.get_logstor_segment_manager().get_segment_size();
|
||||
}
|
||||
|
||||
uint64_t compaction_group::live_disk_space_used() const noexcept {
|
||||
return _main_sstables->bytes_on_disk() + _maintenance_sstables->bytes_on_disk();
|
||||
return _main_sstables->bytes_on_disk() + _maintenance_sstables->bytes_on_disk() + logstor_disk_space_used();
|
||||
}
|
||||
|
||||
sstables::file_size_stats compaction_group::live_disk_space_used_full_stats() const noexcept {
|
||||
@@ -2372,6 +2467,12 @@ void table::trigger_compaction() {
|
||||
});
|
||||
}
|
||||
|
||||
void table::trigger_logstor_compaction() {
|
||||
for_each_compaction_group([] (compaction_group& cg) {
|
||||
cg.trigger_logstor_compaction();
|
||||
});
|
||||
}
|
||||
|
||||
void table::try_trigger_compaction(compaction_group& cg) noexcept {
|
||||
try {
|
||||
cg.trigger_compaction();
|
||||
@@ -2380,6 +2481,51 @@ void table::try_trigger_compaction(compaction_group& cg) noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
future<> table::flush_separator(std::optional<size_t> seq_num) {
|
||||
if (!uses_logstor()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
// wait for all previous writes to be written to a separator buffer
|
||||
co_await get_logstor_segment_manager().await_pending_writes();
|
||||
|
||||
// flush separator buffers
|
||||
co_await parallel_foreach_compaction_group([seq_num] (compaction_group& cg) {
|
||||
return cg.flush_separator(seq_num);
|
||||
});
|
||||
}
|
||||
|
||||
future<logstor::table_segment_stats> table::get_logstor_segment_stats() const {
|
||||
logstor::table_segment_stats result;
|
||||
if (!uses_logstor()) {
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
const auto segment_size = get_logstor_segment_manager().get_segment_size();
|
||||
const auto bucket_count = 32;
|
||||
const auto bucket_size = segment_size / bucket_count;
|
||||
|
||||
result.histogram.resize(bucket_count);
|
||||
|
||||
co_await const_cast<table*>(this)->parallel_foreach_compaction_group([&] (const compaction_group& cg) -> future<> {
|
||||
const auto& cg_segments = cg.logstor_segments();
|
||||
|
||||
result.compaction_group_count++;
|
||||
result.segment_count += cg_segments.segment_count();
|
||||
|
||||
for (const auto& desc : cg_segments._segments) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto data_size = desc.net_data_size(segment_size);
|
||||
auto bucket_index = std::min<size_t>(data_size / bucket_size, bucket_count - 1);
|
||||
auto& bucket = result.histogram[bucket_index];
|
||||
bucket.count++;
|
||||
bucket.max_data_size = std::max(bucket.max_data_size, data_size);
|
||||
}
|
||||
});
|
||||
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
void compaction_group::trigger_compaction() {
|
||||
// But not if we're locked out or stopping
|
||||
if (!_async_gate.is_closed()) {
|
||||
@@ -2390,6 +2536,14 @@ void compaction_group::trigger_compaction() {
|
||||
}
|
||||
}
|
||||
|
||||
void compaction_group::trigger_logstor_compaction() {
|
||||
if (!_async_gate.is_closed() && !_t.is_auto_compaction_disabled_by_user()) {
|
||||
if (_logstor_segments) {
|
||||
get_logstor_compaction_manager().submit(*this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void table::trigger_offstrategy_compaction() {
|
||||
// Run in background.
|
||||
// This is safe since the the compaction task is tracked
|
||||
@@ -2846,6 +3000,7 @@ compaction_group::compaction_group(table& t, size_t group_id, dht::token_range t
|
||||
, _async_gate(format("[compaction_group {}.{} {}]", t.schema()->ks_name(), t.schema()->cf_name(), group_id))
|
||||
, _backlog_tracker(t.get_compaction_strategy().make_backlog_tracker())
|
||||
, _repair_sstable_classifier(std::move(repair_classifier))
|
||||
, _logstor_segments(make_lw_shared<logstor::segment_set>())
|
||||
{
|
||||
}
|
||||
|
||||
@@ -2879,9 +3034,13 @@ future<> compaction_group::stop(sstring reason) noexcept {
|
||||
for (auto view : all_views()) {
|
||||
co_await _t._compaction_manager.stop_ongoing_compactions(reason, view);
|
||||
}
|
||||
if (_t.uses_logstor()) {
|
||||
co_await get_logstor_compaction_manager().stop_ongoing_compactions(*this);
|
||||
}
|
||||
co_await _async_gate.close();
|
||||
auto flush_future = co_await seastar::coroutine::as_future(flush());
|
||||
|
||||
co_await flush_separator();
|
||||
co_await _flush_gate.close();
|
||||
co_await _sstable_add_gate.close();
|
||||
// FIXME: indentation
|
||||
@@ -3198,7 +3357,9 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
|
||||
}
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap,
|
||||
const locator::tablet_map& new_tmap) {
|
||||
auto table_id = schema()->id();
|
||||
size_t old_tablet_count = old_tmap.tablet_count();
|
||||
size_t new_tablet_count = new_tmap.tablet_count();
|
||||
@@ -3222,7 +3383,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
|
||||
for (auto& view : new_cg->all_views()) {
|
||||
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
|
||||
_compaction_reenablers_for_merging.push_back(std::move(cre));
|
||||
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
|
||||
}
|
||||
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
|
||||
|
||||
@@ -3255,7 +3416,11 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
_merge_completion_event.signal();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
|
||||
void tablet_storage_group_manager::update_effective_replication_map(
|
||||
const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source)
|
||||
{
|
||||
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
|
||||
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
|
||||
|
||||
@@ -3271,7 +3436,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
|
||||
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
|
||||
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
|
||||
}
|
||||
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
|
||||
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
|
||||
}
|
||||
|
||||
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
|
||||
@@ -3357,7 +3522,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
|
||||
};
|
||||
|
||||
if (uses_tablets()) {
|
||||
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
|
||||
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
|
||||
}
|
||||
if (old_erm) {
|
||||
old_erm->invalidate();
|
||||
@@ -4002,6 +4167,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
auto close_lister = deferred_close(lister);
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
@@ -4009,6 +4175,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in get_snapshot_details");
|
||||
}).get();
|
||||
}
|
||||
}
|
||||
return all_snapshots;
|
||||
@@ -4028,53 +4197,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
|
||||
}).get();
|
||||
|
||||
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await lister.close();
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
|
||||
co_return details;
|
||||
@@ -4261,6 +4443,18 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
|
||||
co_return rp;
|
||||
}
|
||||
|
||||
future<> table::discard_logstor_segments() {
|
||||
if (!uses_logstor()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
_logstor_index->clear();
|
||||
|
||||
co_await parallel_foreach_compaction_group([] (compaction_group& cg) {
|
||||
return cg.discard_logstor_segments();
|
||||
});
|
||||
}
|
||||
|
||||
void table::mark_ready_for_writes(db::commitlog* cl) {
|
||||
if (!_readonly) {
|
||||
on_internal_error(dblog, ::format("table {}.{} is already writable", _schema->ks_name(), _schema->cf_name()));
|
||||
@@ -4271,6 +4465,19 @@ void table::mark_ready_for_writes(db::commitlog* cl) {
|
||||
_readonly = false;
|
||||
}
|
||||
|
||||
void table::init_logstor(logstor::logstor* ls) {
|
||||
_logstor = ls;
|
||||
_logstor_index = std::make_unique<logstor::primary_index>(_schema);
|
||||
}
|
||||
|
||||
size_t table::get_logstor_memory_usage() const {
|
||||
size_t m = 0;
|
||||
if (_logstor_index) {
|
||||
m += _logstor_index->get_memory_usage();
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
db::commitlog* table::commitlog() const {
|
||||
if (_readonly) [[unlikely]] {
|
||||
on_internal_error(dblog, ::format("table {}.{} is readonly", _schema->ks_name(), _schema->cf_name()));
|
||||
@@ -4295,6 +4502,9 @@ void table::set_schema(schema_ptr s) {
|
||||
if (_counter_cell_locks) {
|
||||
_counter_cell_locks->set_schema(s);
|
||||
}
|
||||
if (_logstor_index) {
|
||||
_logstor_index->set_schema(s);
|
||||
}
|
||||
_schema = std::move(s);
|
||||
|
||||
for (auto&& v : _views) {
|
||||
@@ -4522,6 +4732,11 @@ future<> table::apply(const mutation& m, db::rp_handle&& h, db::timeout_clock::t
|
||||
|
||||
auto& cg = compaction_group_for_token(m.token());
|
||||
auto holder = cg.async_gate().hold();
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return _logstor->write(m, cg, std::move(holder));
|
||||
}
|
||||
|
||||
return dirty_memory_region_group().run_when_memory_available([this, &m, h = std::move(h), &cg, holder = std::move(holder)] () mutable {
|
||||
do_apply(cg, std::move(h), m);
|
||||
}, timeout);
|
||||
@@ -4537,6 +4752,10 @@ future<> table::apply(const frozen_mutation& m, schema_ptr m_schema, db::rp_hand
|
||||
auto& cg = compaction_group_for_key(m.key(), m_schema);
|
||||
auto holder = cg.async_gate().hold();
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return _logstor->write(m.unfreeze(m_schema), cg, std::move(holder));
|
||||
}
|
||||
|
||||
return dirty_memory_region_group().run_when_memory_available([this, &m, m_schema = std::move(m_schema), h = std::move(h), &cg, holder = std::move(holder)]() mutable {
|
||||
do_apply(cg, std::move(h), m, m_schema);
|
||||
}, timeout);
|
||||
@@ -4641,13 +4860,14 @@ table::query(schema_ptr query_schema,
|
||||
}
|
||||
|
||||
std::optional<full_position> last_pos;
|
||||
if (querier_opt && querier_opt->current_position()) {
|
||||
last_pos.emplace(*querier_opt->current_position());
|
||||
}
|
||||
|
||||
if (!saved_querier || (querier_opt && !querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
|
||||
co_await querier_opt->close();
|
||||
querier_opt = {};
|
||||
if (querier_opt) {
|
||||
if (querier_opt->current_position()) {
|
||||
last_pos.emplace(*querier_opt->current_position());
|
||||
}
|
||||
if (!saved_querier || (!querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
|
||||
co_await querier_opt->close();
|
||||
querier_opt = {};
|
||||
}
|
||||
}
|
||||
if (saved_querier) {
|
||||
*saved_querier = std::move(querier_opt);
|
||||
@@ -4737,6 +4957,10 @@ table::enable_auto_compaction() {
|
||||
// see table::disable_auto_compaction() notes.
|
||||
_compaction_disabled_by_user = false;
|
||||
trigger_compaction();
|
||||
|
||||
if (uses_logstor()) {
|
||||
trigger_logstor_compaction();
|
||||
}
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -4768,11 +4992,18 @@ table::disable_auto_compaction() {
|
||||
// - it will break computation of major compaction descriptor
|
||||
// for new submissions
|
||||
_compaction_disabled_by_user = true;
|
||||
return with_gate(_async_gate, [this] {
|
||||
return parallel_foreach_compaction_group_view([this] (compaction::compaction_group_view& view) {
|
||||
return _compaction_manager.stop_ongoing_compactions("disable auto-compaction", &view, compaction::compaction_type::Compaction);
|
||||
});
|
||||
|
||||
auto holder = _async_gate.hold();
|
||||
|
||||
co_await parallel_foreach_compaction_group_view([this] (compaction::compaction_group_view& view) {
|
||||
return _compaction_manager.stop_ongoing_compactions("disable auto-compaction", &view, compaction::compaction_type::Compaction);
|
||||
});
|
||||
|
||||
if (uses_logstor()) {
|
||||
co_await parallel_foreach_compaction_group([this] (compaction_group& cg) {
|
||||
return get_logstor_compaction_manager().stop_ongoing_compactions(cg);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void table::set_tombstone_gc_enabled(bool tombstone_gc_enabled) noexcept {
|
||||
@@ -4985,6 +5216,26 @@ const compaction::compaction_manager& compaction_group::get_compaction_manager()
|
||||
return _t.get_compaction_manager();
|
||||
}
|
||||
|
||||
logstor::segment_manager& compaction_group::get_logstor_segment_manager() noexcept {
|
||||
return _t.get_logstor_segment_manager();
|
||||
}
|
||||
|
||||
const logstor::segment_manager& compaction_group::get_logstor_segment_manager() const noexcept {
|
||||
return _t.get_logstor_segment_manager();
|
||||
}
|
||||
|
||||
logstor::compaction_manager& compaction_group::get_logstor_compaction_manager() noexcept {
|
||||
return _t.get_logstor_compaction_manager();
|
||||
}
|
||||
|
||||
const logstor::compaction_manager& compaction_group::get_logstor_compaction_manager() const noexcept {
|
||||
return _t.get_logstor_compaction_manager();
|
||||
}
|
||||
|
||||
logstor::primary_index& compaction_group::get_logstor_index() noexcept {
|
||||
return _t.logstor_index();
|
||||
}
|
||||
|
||||
compaction::compaction_group_view& compaction_group::as_view_for_static_sharding() const {
|
||||
return view_for_unrepaired_data();
|
||||
}
|
||||
|
||||
@@ -592,6 +592,7 @@ bool operator==(const schema::user_properties& lhs, const schema::user_propertie
|
||||
&& lhs.compaction_strategy == rhs.compaction_strategy
|
||||
&& lhs.compaction_strategy_options == rhs.compaction_strategy_options
|
||||
&& lhs.compaction_enabled == rhs.compaction_enabled
|
||||
&& lhs.storage_engine == rhs.storage_engine
|
||||
&& lhs.caching_options == rhs.caching_options
|
||||
&& lhs.tablet_options == rhs.tablet_options
|
||||
&& lhs.get_paxos_grace_seconds() == rhs.get_paxos_grace_seconds()
|
||||
@@ -698,6 +699,7 @@ table_schema_version schema::calculate_digest(const schema::raw_schema& r) {
|
||||
feed_hash(h, r._view_info);
|
||||
feed_hash(h, r._indices_by_name);
|
||||
feed_hash(h, r._is_counter);
|
||||
feed_hash(h, r._props.storage_engine);
|
||||
|
||||
for (auto&& [name, ext] : r._props.extensions) {
|
||||
feed_hash(h, name);
|
||||
@@ -874,6 +876,9 @@ auto fmt::formatter<schema>::format(const schema& s, fmt::format_context& ctx) c
|
||||
out = fmt::format_to(out, ",minIndexInterval={}", s._raw._props.min_index_interval);
|
||||
out = fmt::format_to(out, ",maxIndexInterval={}", s._raw._props.max_index_interval);
|
||||
out = fmt::format_to(out, ",speculativeRetry={}", s._raw._props.speculative_retry.to_sstring());
|
||||
if (s.storage_engine() != storage_engine_type::normal) {
|
||||
out = fmt::format_to(out, ",storage_engine={}", storage_engine_type_to_sstring(s.storage_engine()));
|
||||
}
|
||||
out = fmt::format_to(out, ",tablets={{");
|
||||
if (s._raw._props.tablet_options) {
|
||||
n = 0;
|
||||
@@ -1210,6 +1215,9 @@ fragmented_ostringstream& schema::schema_properties(const schema_describe_helper
|
||||
os << "\n AND memtable_flush_period_in_ms = " << fmt::to_string(memtable_flush_period());
|
||||
os << "\n AND min_index_interval = " << fmt::to_string(min_index_interval());
|
||||
os << "\n AND speculative_retry = '" << speculative_retry().to_sstring() << "'";
|
||||
if (storage_engine() != storage_engine_type::normal) {
|
||||
os << "\n AND storage_engine = '" << storage_engine_type_to_sstring(storage_engine()) << "'";
|
||||
}
|
||||
|
||||
if (has_tablet_options()) {
|
||||
os << "\n AND tablets = {";
|
||||
|
||||
@@ -175,6 +175,21 @@ public:
|
||||
bool operator==(const speculative_retry& other) const = default;
|
||||
};
|
||||
|
||||
enum class storage_engine_type {
|
||||
normal,
|
||||
logstor,
|
||||
};
|
||||
|
||||
inline sstring storage_engine_type_to_sstring(storage_engine_type t) {
|
||||
switch (t) {
|
||||
case storage_engine_type::normal:
|
||||
return "normal";
|
||||
case storage_engine_type::logstor:
|
||||
return "logstor";
|
||||
}
|
||||
throw std::invalid_argument(format("unknown storage engine type: {:d}\n", uint8_t(t)));
|
||||
}
|
||||
|
||||
using index_options_map = std::unordered_map<sstring, sstring>;
|
||||
|
||||
enum class index_metadata_kind {
|
||||
@@ -561,6 +576,7 @@ public:
|
||||
compaction::compaction_strategy_type compaction_strategy = compaction::compaction_strategy_type::incremental;
|
||||
std::map<sstring, sstring> compaction_strategy_options;
|
||||
bool compaction_enabled = true;
|
||||
storage_engine_type storage_engine = storage_engine_type::normal;
|
||||
::caching_options caching_options;
|
||||
std::optional<std::map<sstring, sstring>> tablet_options;
|
||||
|
||||
@@ -776,6 +792,14 @@ public:
|
||||
return _raw._props.compaction_enabled;
|
||||
}
|
||||
|
||||
storage_engine_type storage_engine() const {
|
||||
return _raw._props.storage_engine;
|
||||
}
|
||||
|
||||
bool logstor_enabled() const {
|
||||
return _raw._props.storage_engine == storage_engine_type::logstor;
|
||||
}
|
||||
|
||||
const cdc::options& cdc_options() const {
|
||||
return _raw._props.get_cdc_options();
|
||||
}
|
||||
|
||||
@@ -269,6 +269,11 @@ public:
|
||||
enable_schema_commitlog();
|
||||
}
|
||||
|
||||
schema_builder& set_logstor() {
|
||||
_raw._props.storage_engine = storage_engine_type::logstor;
|
||||
return *this;
|
||||
}
|
||||
|
||||
class default_names {
|
||||
public:
|
||||
default_names(const schema_builder&);
|
||||
|
||||
@@ -22,12 +22,12 @@ static logging::logger slogger("schema_registry");
|
||||
static thread_local schema_registry registry;
|
||||
|
||||
schema_version_not_found::schema_version_not_found(table_schema_version v)
|
||||
: std::runtime_error{format("Schema version {} not found", v)}
|
||||
{ }
|
||||
: std::runtime_error{format("Schema version {} not found", v)} {
|
||||
}
|
||||
|
||||
schema_version_loading_failed::schema_version_loading_failed(table_schema_version v)
|
||||
: std::runtime_error{format("Failed to load schema version {}", v)}
|
||||
{ }
|
||||
: std::runtime_error{format("Failed to load schema version {}", v)} {
|
||||
}
|
||||
|
||||
schema_registry_entry::~schema_registry_entry() {
|
||||
if (_schema) {
|
||||
@@ -39,8 +39,7 @@ schema_registry_entry::schema_registry_entry(table_schema_version v, schema_regi
|
||||
: _state(state::INITIAL)
|
||||
, _version(v)
|
||||
, _registry(r)
|
||||
, _sync_state(sync_state::NOT_SYNCED)
|
||||
{
|
||||
, _sync_state(sync_state::NOT_SYNCED) {
|
||||
_erase_timer.set_callback([this] {
|
||||
slogger.debug("Dropping {}", _version);
|
||||
SCYLLA_ASSERT(!_schema);
|
||||
@@ -71,8 +70,8 @@ void schema_registry::attach_table(schema_registry_entry& e) noexcept {
|
||||
e.set_table(table.weak_from_this());
|
||||
} catch (const replica::no_such_column_family&) {
|
||||
if (slogger.is_enabled(seastar::log_level::debug)) {
|
||||
slogger.debug("No table for schema version {} of {}.{}: {}", e._version,
|
||||
e.get_schema()->ks_name(), e.get_schema()->cf_name(), seastar::current_backtrace());
|
||||
slogger.debug("No table for schema version {} of {}.{}: {}", e._version, e.get_schema()->ks_name(), e.get_schema()->cf_name(),
|
||||
seastar::current_backtrace());
|
||||
}
|
||||
// ignore
|
||||
}
|
||||
@@ -221,7 +220,7 @@ future<schema_ptr> schema_registry_entry::start_loading(async_schema_loader load
|
||||
_state = state::LOADING;
|
||||
slogger.trace("Loading {}", _version);
|
||||
// Move to background.
|
||||
(void)f.then_wrapped([self = shared_from_this(), this] (future<extended_frozen_schema>&& f) {
|
||||
(void)f.then_wrapped([self = shared_from_this(), this](future<extended_frozen_schema>&& f) {
|
||||
_loader = {};
|
||||
if (_state != state::LOADING) {
|
||||
slogger.trace("Loading of {} aborted", _version);
|
||||
@@ -294,8 +293,8 @@ schema_registry& local_schema_registry() {
|
||||
}
|
||||
|
||||
global_schema_ptr::global_schema_ptr(const global_schema_ptr& o)
|
||||
: global_schema_ptr(o.get())
|
||||
{ }
|
||||
: global_schema_ptr(o.get()) {
|
||||
}
|
||||
|
||||
global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
|
||||
auto current = this_shard_id();
|
||||
@@ -332,15 +331,15 @@ schema_ptr global_schema_ptr::get() const {
|
||||
}
|
||||
|
||||
global_schema_ptr::global_schema_ptr(const schema_ptr& ptr)
|
||||
: _cpu_of_origin(this_shard_id()) {
|
||||
: _cpu_of_origin(this_shard_id()) {
|
||||
// _ptr must always have an associated registry entry,
|
||||
// if ptr doesn't, we need to load it into the registry.
|
||||
auto ensure_registry_entry = [] (const schema_ptr& s) {
|
||||
auto ensure_registry_entry = [](const schema_ptr& s) {
|
||||
schema_registry_entry* e = s->registry_entry();
|
||||
if (e) {
|
||||
return s;
|
||||
} else {
|
||||
return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) -> extended_frozen_schema {
|
||||
return local_schema_registry().get_or_load(s->version(), [&s](table_schema_version) -> extended_frozen_schema {
|
||||
return extended_frozen_schema(s);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -952,6 +952,8 @@ class sstring:
|
||||
|
||||
@staticmethod
|
||||
def to_hex(data, size):
|
||||
if size == 0:
|
||||
return ''
|
||||
inf = gdb.selected_inferior()
|
||||
return bytes(inf.read_memory(data, size)).hex()
|
||||
|
||||
@@ -974,6 +976,8 @@ class sstring:
|
||||
return self.ref['u']['external']['str']
|
||||
|
||||
def as_bytes(self):
|
||||
if len(self) == 0:
|
||||
return b''
|
||||
inf = gdb.selected_inferior()
|
||||
return bytes(inf.read_memory(self.data(), len(self)))
|
||||
|
||||
@@ -5636,6 +5640,8 @@ class scylla_sstable_summary(gdb.Command):
|
||||
self.inf = gdb.selected_inferior()
|
||||
|
||||
def to_hex(self, data, size):
|
||||
if size == 0:
|
||||
return ''
|
||||
return bytes(self.inf.read_memory(data, size)).hex()
|
||||
|
||||
def invoke(self, arg, for_tty):
|
||||
@@ -5647,6 +5653,10 @@ class scylla_sstable_summary(gdb.Command):
|
||||
sst = seastar_lw_shared_ptr(arg).get().dereference()
|
||||
else:
|
||||
sst = arg
|
||||
ms_version = int(gdb.parse_and_eval('sstables::sstable_version_types::ms'))
|
||||
if int(sst['_version']) >= ms_version:
|
||||
gdb.write("sstable uses ms format (trie-based index); summary is not populated.\n")
|
||||
return
|
||||
summary = seastar_lw_shared_ptr(sst['_components']['_value']).get().dereference()['summary']
|
||||
|
||||
gdb.write("header: {}\n".format(summary['header']))
|
||||
|
||||
@@ -227,8 +227,6 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
for (const auto& m : modules.entries) {
|
||||
if (m.table == db::system_keyspace::service_levels_v2()->id()) {
|
||||
update_service_levels_cache = true;
|
||||
} else if (m.table == db::system_keyspace::role_members()->id() || m.table == db::system_keyspace::role_attributes()->id()) {
|
||||
update_service_levels_effective_cache = true;
|
||||
} else if (m.table == db::system_keyspace::dicts()->id()) {
|
||||
auto pk_type = db::system_keyspace::dicts()->partition_key_type();
|
||||
auto name_value = pk_type->deserialize_value(m.pk.representation());
|
||||
@@ -247,6 +245,11 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
auto cdc_log_table_id = table_id(value_cast<utils::UUID>(uuid_type->deserialize_value(elements.front())));
|
||||
update_cdc_streams.insert(cdc_log_table_id);
|
||||
} else if (auth::cache::includes_table(m.table)) {
|
||||
if (m.table == db::system_keyspace::role_members()->id() ||
|
||||
m.table == db::system_keyspace::role_attributes()->id()) {
|
||||
update_service_levels_effective_cache = true;
|
||||
}
|
||||
|
||||
auto schema = _ss.get_database().find_schema(m.table);
|
||||
const auto elements = m.pk.explode(*schema);
|
||||
auto role = value_cast<sstring>(schema->partition_key_type()->
|
||||
@@ -255,6 +258,9 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
}
|
||||
}
|
||||
|
||||
if (update_auth_cache_roles.size()) {
|
||||
co_await _ss.auth_cache().load_roles(std::move(update_auth_cache_roles));
|
||||
}
|
||||
if (update_service_levels_cache || update_service_levels_effective_cache) { // this also updates SL effective cache
|
||||
co_await _ss.update_service_levels_cache(qos::update_both_cache_levels(update_service_levels_cache), qos::query_context::group0);
|
||||
}
|
||||
@@ -264,9 +270,6 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
if (update_cdc_streams.size()) {
|
||||
co_await _ss.load_cdc_streams(std::move(update_cdc_streams));
|
||||
}
|
||||
if (update_auth_cache_roles.size()) {
|
||||
co_await _ss.auth_cache().load_roles(std::move(update_auth_cache_roles));
|
||||
}
|
||||
}
|
||||
|
||||
future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merger) {
|
||||
|
||||
@@ -4653,6 +4653,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
auto& stats = handler_ptr->stats();
|
||||
auto& handler = *handler_ptr;
|
||||
auto& global_stats = handler._proxy->_global_stats;
|
||||
auto schema = handler_ptr->get_schema();
|
||||
|
||||
if (handler.get_targets().size() == 0) {
|
||||
// Usually we remove the response handler when receiving responses from all targets.
|
||||
@@ -4748,7 +4749,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
}
|
||||
|
||||
// Waited on indirectly.
|
||||
(void)f.handle_exception([response_id, forward_size, coordinator, handler_ptr, p = shared_from_this(), &stats] (std::exception_ptr eptr) {
|
||||
(void)f.handle_exception([response_id, forward_size, coordinator, handler_ptr, p = shared_from_this(), &stats, schema] (std::exception_ptr eptr) {
|
||||
++stats.writes_errors.get_ep_stat(handler_ptr->_effective_replication_map_ptr->get_topology(), coordinator);
|
||||
error err = error::FAILURE;
|
||||
std::optional<sstring> msg;
|
||||
@@ -4762,8 +4763,8 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
// ignore, disconnect will be logged by gossiper
|
||||
} else if (const auto* e = try_catch_nested<seastar::gate_closed_exception>(eptr)) {
|
||||
// may happen during shutdown, log and ignore it
|
||||
slogger.warn("gate_closed_exception during mutation write to {}: {}",
|
||||
coordinator, e->what());
|
||||
slogger.warn("gate_closed_exception during mutation write to {}.{} on {}: {}",
|
||||
schema->ks_name(), schema->cf_name(), coordinator, e->what());
|
||||
} else if (try_catch<timed_out_error>(eptr)) {
|
||||
// from lmutate(). Ignore so that logs are not flooded
|
||||
// database total_writes_timedout counter was incremented.
|
||||
@@ -4774,7 +4775,8 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
} else if (auto* e = try_catch<replica::critical_disk_utilization_exception>(eptr)) {
|
||||
msg = e->what();
|
||||
} else {
|
||||
slogger.error("exception during mutation write to {}: {}", coordinator, eptr);
|
||||
slogger.error("exception during mutation write to {}.{} on {}: {}",
|
||||
schema->ks_name(), schema->cf_name(), coordinator, eptr);
|
||||
}
|
||||
p->got_failure_response(response_id, coordinator, forward_size + 1, std::nullopt, err, std::move(msg));
|
||||
});
|
||||
|
||||
@@ -910,7 +910,7 @@ future<> storage_service::merge_topology_snapshot(raft_snapshot snp) {
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
|
||||
} else {
|
||||
co_await for_each_split_mutation(std::move(mut), max_size, [&] (mutation m) -> future<> {
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(m));
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -2794,12 +2794,18 @@ future<> storage_service::raft_decommission() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::decommission() {
|
||||
return run_with_api_lock(sstring("decommission"), [] (storage_service& ss) {
|
||||
return seastar::async([&ss] {
|
||||
future<> storage_service::decommission(sharded<db::snapshot_ctl>& snapshot_ctl) {
|
||||
return run_with_api_lock(sstring("decommission"), [&] (storage_service& ss) {
|
||||
return seastar::async([&] {
|
||||
if (ss._operation_mode != mode::NORMAL) {
|
||||
throw std::runtime_error(::format("Node in {} state; wait for status to become normal or restart", ss._operation_mode));
|
||||
}
|
||||
|
||||
snapshot_ctl.invoke_on_all([](auto& sctl) {
|
||||
return sctl.disable_all_operations();
|
||||
}).get();
|
||||
slogger.info("DECOMMISSIONING: disabled backup and snapshots");
|
||||
|
||||
ss.raft_decommission().get();
|
||||
|
||||
ss.stop_transport().get();
|
||||
@@ -3020,6 +3026,8 @@ future<> storage_service::drain() {
|
||||
}
|
||||
|
||||
future<> storage_service::do_drain() {
|
||||
co_await utils::get_local_injector().inject("storage_service_drain_wait", utils::wait_for_message(60s));
|
||||
|
||||
// Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found.
|
||||
co_await stop_transport();
|
||||
|
||||
@@ -4010,6 +4018,9 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
|
||||
} catch (raft::request_aborted& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (seastar::gate_closed_exception& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (...) {
|
||||
slogger.error("Failed to complete splitting of table {} due to {}, retrying after {} seconds",
|
||||
table, std::current_exception(), split_retry.sleep_time());
|
||||
@@ -4076,6 +4087,58 @@ future<> storage_service::snitch_reconfigured() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::local_topology_barrier() {
|
||||
if (this_shard_id() != 0) {
|
||||
co_await container().invoke_on(0, [] (storage_service& ss) {
|
||||
return ss.local_topology_barrier();
|
||||
});
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto version = _topology_state_machine._topology.version;
|
||||
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
|
||||
"current version {}, stale versions (version: use_count): {}",
|
||||
version, current_version, ss._shared_token_metadata.describe_stale_versions());
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
}
|
||||
|
||||
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, raft_topology_cmd cmd) {
|
||||
raft_topology_cmd_result result;
|
||||
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
|
||||
@@ -4103,12 +4166,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
state.last_index = cmd_index;
|
||||
}
|
||||
|
||||
// We capture the topology version right after the checks
|
||||
// above, before any yields. This is crucial since _topology_state_machine._topology
|
||||
// might be altered concurrently while this method is running,
|
||||
// which can cause the fence command to apply an invalid fence version.
|
||||
const auto version = _topology_state_machine._topology.version;
|
||||
|
||||
switch (cmd.cmd) {
|
||||
case raft_topology_cmd::command::barrier: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_fail",
|
||||
@@ -4147,44 +4204,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
break;
|
||||
case raft_topology_cmd::command::barrier_and_drain: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
|
||||
"current version {}, stale versions (version: use_count): {}",
|
||||
version, current_version, ss._shared_token_metadata.describe_stale_versions());
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
co_await local_topology_barrier();
|
||||
|
||||
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
|
||||
auto ks = handler.get("keyspace");
|
||||
|
||||
@@ -77,6 +77,7 @@ namespace db {
|
||||
class system_distributed_keyspace;
|
||||
class system_keyspace;
|
||||
class batchlog_manager;
|
||||
class snapshot_ctl;
|
||||
namespace view {
|
||||
class view_builder;
|
||||
class view_building_worker;
|
||||
@@ -666,7 +667,7 @@ private:
|
||||
inet_address_vector_replica_set get_natural_endpoints(const sstring& keyspace, const schema_ptr& schema, const replica::column_family& cf, const partition_key& pk) const;
|
||||
|
||||
public:
|
||||
future<> decommission();
|
||||
future<> decommission(sharded<db::snapshot_ctl>&);
|
||||
|
||||
private:
|
||||
future<> unbootstrap();
|
||||
@@ -812,6 +813,9 @@ public:
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
|
||||
// In particular, waits for non-latest local erms to go die.
|
||||
future<> local_topology_barrier();
|
||||
private:
|
||||
// State machine that is responsible for topology change
|
||||
topology_state_machine& _topology_state_machine;
|
||||
|
||||
@@ -195,9 +195,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
} else if (is_resize_task(task_type)) {
|
||||
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
|
||||
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
} else {
|
||||
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
}
|
||||
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
|
||||
co_return res->status;
|
||||
@@ -312,7 +312,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
}
|
||||
return make_ready_future();
|
||||
});
|
||||
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
} else if (is_migration_task(task_type)) { // Migration task.
|
||||
auto tablet_id = hint.get_tablet_id();
|
||||
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
|
||||
@@ -326,7 +326,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.status.state = tasks::task_manager::task_state::running;
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
co_return res;
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user