Compare commits
266 Commits
copilot/fi
...
copilot/im
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b549a9b8f2 | ||
|
|
965bc9e5d0 | ||
|
|
179c8ac67f | ||
|
|
f0d159abb0 | ||
|
|
62313a6264 | ||
|
|
2642636ada | ||
|
|
5f79d93102 | ||
|
|
4ba3e90f33 | ||
|
|
3af5183633 | ||
|
|
a51cb3dad9 | ||
|
|
b546315edf | ||
|
|
4c9c3aae23 | ||
|
|
384e394ff0 | ||
|
|
e4da0afb8d | ||
|
|
375479d96c | ||
|
|
1f658bb2e2 | ||
|
|
b8afb173a6 | ||
|
|
ec329f85b0 | ||
|
|
08386ea959 | ||
|
|
0df85c8ae8 | ||
|
|
1ef6ac5439 | ||
|
|
4e41b6f106 | ||
|
|
e03d24e3f3 | ||
|
|
c4a9d7eb3e | ||
|
|
77a934e5b9 | ||
|
|
4d46674d03 | ||
|
|
2d2177d2c9 | ||
|
|
240b32a87a | ||
|
|
1a08ef2062 | ||
|
|
8d00266f88 | ||
|
|
c0b06a7fc6 | ||
|
|
115bd51873 | ||
|
|
12aa33106f | ||
|
|
b03d520aff | ||
|
|
5b2b8d596a | ||
|
|
b72df3ae27 | ||
|
|
2bedefbb85 | ||
|
|
6c8ddfc018 | ||
|
|
40ada3f187 | ||
|
|
76b84b71d1 | ||
|
|
624869de86 | ||
|
|
69d6e63a58 | ||
|
|
d6e2d3d34c | ||
|
|
e28df9b3d0 | ||
|
|
597d300527 | ||
|
|
b690ddb9e5 | ||
|
|
3abda7d15e | ||
|
|
3e9b071838 | ||
|
|
0264ec3c1d | ||
|
|
ffcce1ffc8 | ||
|
|
c9eab7fbd4 | ||
|
|
80e5860a8c | ||
|
|
853f3dadda | ||
|
|
4e63e74438 | ||
|
|
567c28dd0d | ||
|
|
9927c6a3d4 | ||
|
|
bbf9ce18ef | ||
|
|
d892140655 | ||
|
|
4a5292e815 | ||
|
|
fb4d89f789 | ||
|
|
ba5c70d5ab | ||
|
|
8df9cfcde8 | ||
|
|
f60033db63 | ||
|
|
df20f178aa | ||
|
|
a31c8762ca | ||
|
|
5e1254eef0 | ||
|
|
a86b782d3f | ||
|
|
1bd855a650 | ||
|
|
6a26381f4f | ||
|
|
a532fc73bc | ||
|
|
e246abec4d | ||
|
|
dfa600fb8f | ||
|
|
2e33234e91 | ||
|
|
63e3a22f2e | ||
|
|
e963a8d603 | ||
|
|
1ee89c9682 | ||
|
|
6d3c720a08 | ||
|
|
b7ebd73e53 | ||
|
|
10eb364821 | ||
|
|
cc9e125f12 | ||
|
|
0c9b93905e | ||
|
|
621cb19045 | ||
|
|
1c9ec9a76d | ||
|
|
bda1709734 | ||
|
|
712cc8b8f1 | ||
|
|
9e189da23a | ||
|
|
32cf358f44 | ||
|
|
8e496a2f2f | ||
|
|
9c50d29a00 | ||
|
|
92996ce9fa | ||
|
|
50a3460441 | ||
|
|
55c7bc746e | ||
|
|
ebb101f8ae | ||
|
|
f769e52877 | ||
|
|
51433b838a | ||
|
|
0e27ee67d2 | ||
|
|
186c91233b | ||
|
|
27bf65e77a | ||
|
|
c66275e05c | ||
|
|
9c5b4e74c3 | ||
|
|
ccc03d0026 | ||
|
|
8df5189f9c | ||
|
|
b036a461b7 | ||
|
|
3071ccd54a | ||
|
|
4ae45eb367 | ||
|
|
da00401b7d | ||
|
|
95d4c73eb1 | ||
|
|
12dcf79c60 | ||
|
|
74a57d2872 | ||
|
|
632ff66897 | ||
|
|
04976875cc | ||
|
|
377c3ac072 | ||
|
|
d6edad4117 | ||
|
|
3c1e1f867d | ||
|
|
f3a4af199f | ||
|
|
1bb897c7ca | ||
|
|
954f2cbd2f | ||
|
|
e75c75f8cd | ||
|
|
d671ca9f53 | ||
|
|
fc81983d42 | ||
|
|
cf70250a5c | ||
|
|
54f3e69fdc | ||
|
|
9ed820cbf5 | ||
|
|
71bc1886ee | ||
|
|
b24001b5e7 | ||
|
|
f4efdf18a5 | ||
|
|
6bdbd91cf7 | ||
|
|
ce3320a3ff | ||
|
|
caa0cbe328 | ||
|
|
bad2fe72b6 | ||
|
|
ec15a1b602 | ||
|
|
ecef158345 | ||
|
|
53abf93bd8 | ||
|
|
bbe64e0e2a | ||
|
|
7198191aa9 | ||
|
|
d5f72cd5fc | ||
|
|
afde5f668a | ||
|
|
140858fc22 | ||
|
|
132aa753da | ||
|
|
f902eb1632 | ||
|
|
e0cddc8c99 | ||
|
|
e31b72c61f | ||
|
|
48b1ceefaf | ||
|
|
a21aa5bdf6 | ||
|
|
d0812c951e | ||
|
|
fe8923bdc7 | ||
|
|
9fee06d3bc | ||
|
|
7a298788c0 | ||
|
|
db6a5aa20b | ||
|
|
8e247b06a2 | ||
|
|
37746ba814 | ||
|
|
976bcef5d0 | ||
|
|
8a4daf3ef1 | ||
|
|
6975234d1c | ||
|
|
4a88f15465 | ||
|
|
41ed11cdbe | ||
|
|
07708acebd | ||
|
|
ef3b651e1b | ||
|
|
0c45a7df00 | ||
|
|
2e2cd2aa39 | ||
|
|
58b5d43538 | ||
|
|
bfdd4f7776 | ||
|
|
bf9640457e | ||
|
|
cd2568ad00 | ||
|
|
7586c5ccbd | ||
|
|
d60b908a8e | ||
|
|
20ff2fcc18 | ||
|
|
6ffdada0ea | ||
|
|
4c247a5d08 | ||
|
|
288d4b49e9 | ||
|
|
e304d912b4 | ||
|
|
846a6e700b | ||
|
|
af5e73def9 | ||
|
|
8e462d06be | ||
|
|
9ac82d63e9 | ||
|
|
9793a45288 | ||
|
|
afb96b6387 | ||
|
|
033579ad6f | ||
|
|
c1da552fa4 | ||
|
|
cb3b96b8f4 | ||
|
|
b105ad8379 | ||
|
|
addac8b3f7 | ||
|
|
ea95cdaaec | ||
|
|
28cbaef110 | ||
|
|
85adf6bdb1 | ||
|
|
3a54bab193 | ||
|
|
f65db4e8eb | ||
|
|
df2ac0f257 | ||
|
|
093e97a539 | ||
|
|
fa6e5d0754 | ||
|
|
08518b2c12 | ||
|
|
2a75b1374e | ||
|
|
2cb9bb8f3a | ||
|
|
c899c117c7 | ||
|
|
db9f9e1b7e | ||
|
|
4c6e508811 | ||
|
|
f1d63d014c | ||
|
|
33f7bc28da | ||
|
|
0ef8ca4c57 | ||
|
|
95d0782f89 | ||
|
|
f831ca5ab5 | ||
|
|
1fe0509a9b | ||
|
|
e7d76fd8f3 | ||
|
|
700853740d | ||
|
|
3c5dd5e5ae | ||
|
|
5971b2ad97 | ||
|
|
f89315d02f | ||
|
|
d5c205194b | ||
|
|
6ad10b141a | ||
|
|
8cf8e6c87d | ||
|
|
1642c686c2 | ||
|
|
9431826c52 | ||
|
|
ba6fabfc88 | ||
|
|
cfd91545f9 | ||
|
|
1382b47d45 | ||
|
|
798714183e | ||
|
|
f5ca3657e2 | ||
|
|
dc00461adf | ||
|
|
be6d87648c | ||
|
|
004c08f525 | ||
|
|
4e106b9820 | ||
|
|
4fa4f40712 | ||
|
|
07867a9a0d | ||
|
|
a0a7941eb1 | ||
|
|
e3b9abdb30 | ||
|
|
bc772b791d | ||
|
|
77a4f95eb8 | ||
|
|
992bfb9f63 | ||
|
|
ee3a743dc4 | ||
|
|
48d243f32f | ||
|
|
d9d58780e2 | ||
|
|
ddb27488fa | ||
|
|
10225ee434 | ||
|
|
a72025bbf6 | ||
|
|
3f8363300a | ||
|
|
63d1d6c39b | ||
|
|
27d460758f | ||
|
|
794e03856a | ||
|
|
2dae0a7380 | ||
|
|
1fdc410e24 | ||
|
|
1a077a80f1 | ||
|
|
c5e840e460 | ||
|
|
c10486a5e9 | ||
|
|
ab82428228 | ||
|
|
b1be4ba2fc | ||
|
|
7c8ab3d3d3 | ||
|
|
cb7f2e4953 | ||
|
|
dd5b6770c8 | ||
|
|
3d73a9781e | ||
|
|
982339e73f | ||
|
|
d1a04b3913 | ||
|
|
a63508a6f7 | ||
|
|
ca4564e41c | ||
|
|
f93cafac51 | ||
|
|
a3ca4fccef | ||
|
|
5953a89822 | ||
|
|
932b008107 | ||
|
|
e47f0c6284 | ||
|
|
f12adfc292 | ||
|
|
aa908ba99c | ||
|
|
529cd25c51 | ||
|
|
4fc5fcaec4 | ||
|
|
3253b05ec9 | ||
|
|
597a2ce5f9 | ||
|
|
a5f19af050 | ||
|
|
b4fe565f07 |
@@ -1,12 +0,0 @@
|
||||
name: Call Jira Status In Progress
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
|
||||
jobs:
|
||||
call-jira-status-in-progress:
|
||||
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_progress.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
12
.github/workflows/call_jira_status_in_review.yml
vendored
12
.github/workflows/call_jira_status_in_review.yml
vendored
@@ -1,12 +0,0 @@
|
||||
name: Call Jira Status In Review
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [ready_for_review, review_requested]
|
||||
|
||||
jobs:
|
||||
call-jira-status-in-review:
|
||||
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_review.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
name: Call Jira Status Ready For Merge
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [labeled]
|
||||
|
||||
jobs:
|
||||
call-jira-status-update:
|
||||
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_ready_for_merge.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
41
.github/workflows/call_jira_sync.yml
vendored
Normal file
41
.github/workflows/call_jira_sync.yml
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
name: Sync Jira Based on PR Events
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, ready_for_review, review_requested, labeled, unlabeled, closed]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
issues: write
|
||||
|
||||
jobs:
|
||||
jira-sync-pr-opened:
|
||||
if: github.event.action == 'opened'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_opened.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
jira-sync-in-review:
|
||||
if: github.event.action == 'ready_for_review' || github.event.action == 'review_requested'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_in_review.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
jira-sync-add-label:
|
||||
if: github.event.action == 'labeled'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_add_label.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
jira-status-remove-label:
|
||||
if: github.event.action == 'unlabeled'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_remove_label.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
|
||||
jira-status-pr-closed:
|
||||
if: github.event.action == 'closed'
|
||||
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_closed.yml@main
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
13
.github/workflows/call_validate_pr_author_email.yml
vendored
Normal file
13
.github/workflows/call_validate_pr_author_email.yml
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
name: validate_pr_author_email
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types:
|
||||
- opened
|
||||
- synchronize
|
||||
- reopened
|
||||
|
||||
jobs:
|
||||
validate_pr_author_email:
|
||||
uses: scylladb/github-automation/.github/workflows/validate_pr_author_email.yml@main
|
||||
|
||||
2
.github/workflows/codespell.yaml
vendored
2
.github/workflows/codespell.yaml
vendored
@@ -13,5 +13,5 @@ jobs:
|
||||
- uses: codespell-project/actions-codespell@master
|
||||
with:
|
||||
only_warn: 1
|
||||
ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison"
|
||||
ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison,iif,tread"
|
||||
skip: "./.git,./build,./tools,*.js,*.lock,./test,./licenses,./redis/lolwut.cc,*.svg"
|
||||
|
||||
@@ -18,6 +18,7 @@ target_sources(alternator
|
||||
consumed_capacity.cc
|
||||
ttl.cc
|
||||
parsed_expression_cache.cc
|
||||
http_compression.cc
|
||||
${cql_grammar_srcs})
|
||||
target_include_directories(alternator
|
||||
PUBLIC
|
||||
|
||||
@@ -28,6 +28,7 @@ static logging::logger logger("alternator_controller");
|
||||
controller::controller(
|
||||
sharded<gms::gossiper>& gossiper,
|
||||
sharded<service::storage_proxy>& proxy,
|
||||
sharded<service::storage_service>& ss,
|
||||
sharded<service::migration_manager>& mm,
|
||||
sharded<db::system_distributed_keyspace>& sys_dist_ks,
|
||||
sharded<cdc::generation_service>& cdc_gen_svc,
|
||||
@@ -39,6 +40,7 @@ controller::controller(
|
||||
: protocol_server(sg)
|
||||
, _gossiper(gossiper)
|
||||
, _proxy(proxy)
|
||||
, _ss(ss)
|
||||
, _mm(mm)
|
||||
, _sys_dist_ks(sys_dist_ks)
|
||||
, _cdc_gen_svc(cdc_gen_svc)
|
||||
@@ -89,7 +91,7 @@ future<> controller::start_server() {
|
||||
auto get_timeout_in_ms = [] (const db::config& cfg) -> utils::updateable_value<uint32_t> {
|
||||
return cfg.alternator_timeout_in_ms;
|
||||
};
|
||||
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks),
|
||||
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_ss), std::ref(_mm), std::ref(_sys_dist_ks),
|
||||
sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value(),
|
||||
sharded_parameter(get_timeout_in_ms, std::ref(_config))).get();
|
||||
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get();
|
||||
@@ -169,7 +171,7 @@ future<> controller::request_stop_server() {
|
||||
});
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<client_data>> controller::get_client_data() {
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> controller::get_client_data() {
|
||||
return _server.local().get_client_data();
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
namespace service {
|
||||
class storage_proxy;
|
||||
class storage_service;
|
||||
class migration_manager;
|
||||
class memory_limiter;
|
||||
}
|
||||
@@ -57,6 +58,7 @@ class server;
|
||||
class controller : public protocol_server {
|
||||
sharded<gms::gossiper>& _gossiper;
|
||||
sharded<service::storage_proxy>& _proxy;
|
||||
sharded<service::storage_service>& _ss;
|
||||
sharded<service::migration_manager>& _mm;
|
||||
sharded<db::system_distributed_keyspace>& _sys_dist_ks;
|
||||
sharded<cdc::generation_service>& _cdc_gen_svc;
|
||||
@@ -74,6 +76,7 @@ public:
|
||||
controller(
|
||||
sharded<gms::gossiper>& gossiper,
|
||||
sharded<service::storage_proxy>& proxy,
|
||||
sharded<service::storage_service>& ss,
|
||||
sharded<service::migration_manager>& mm,
|
||||
sharded<db::system_distributed_keyspace>& sys_dist_ks,
|
||||
sharded<cdc::generation_service>& cdc_gen_svc,
|
||||
@@ -93,7 +96,7 @@ public:
|
||||
// This virtual function is called (on each shard separately) when the
|
||||
// virtual table "system.clients" is read. It is expected to generate a
|
||||
// list of clients connected to this server (on this shard).
|
||||
virtual future<utils::chunked_vector<client_data>> get_client_data() override;
|
||||
virtual future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data() override;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -67,6 +67,14 @@ using namespace std::chrono_literals;
|
||||
|
||||
logging::logger elogger("alternator-executor");
|
||||
|
||||
namespace std {
|
||||
template <> struct hash<std::pair<sstring, sstring>> {
|
||||
size_t operator () (const std::pair<sstring, sstring>& p) const {
|
||||
return std::hash<sstring>()(p.first) * 1009 + std::hash<sstring>()(p.second) * 3;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace alternator {
|
||||
|
||||
// Alternator-specific table properties stored as hidden table tags:
|
||||
@@ -248,14 +256,66 @@ static const rjson::value::Member& get_single_member(const rjson::value& v, cons
|
||||
return *(v.MemberBegin());
|
||||
}
|
||||
|
||||
class executor::describe_table_info_manager : public service::migration_listener::empty_listener {
|
||||
executor &_executor;
|
||||
|
||||
struct table_info {
|
||||
utils::simple_value_with_expiry<std::uint64_t> size_in_bytes;
|
||||
};
|
||||
std::unordered_map<std::pair<sstring, sstring>, table_info> info_for_tables;
|
||||
bool active = false;
|
||||
|
||||
public:
|
||||
describe_table_info_manager(executor& executor) : _executor(executor) {
|
||||
_executor._proxy.data_dictionary().real_database_ptr()->get_notifier().register_listener(this);
|
||||
active = true;
|
||||
}
|
||||
describe_table_info_manager(const describe_table_info_manager &) = delete;
|
||||
describe_table_info_manager(describe_table_info_manager&&) = delete;
|
||||
~describe_table_info_manager() {
|
||||
if (active) {
|
||||
on_fatal_internal_error(elogger, "describe_table_info_manager was not stopped before destruction");
|
||||
}
|
||||
}
|
||||
|
||||
describe_table_info_manager &operator = (const describe_table_info_manager &) = delete;
|
||||
describe_table_info_manager &operator = (describe_table_info_manager&&) = delete;
|
||||
|
||||
static std::chrono::high_resolution_clock::time_point now() {
|
||||
return std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
|
||||
std::optional<std::uint64_t> get_cached_size_in_bytes(const sstring &ks_name, const sstring &cf_name) const {
|
||||
auto it = info_for_tables.find({ks_name, cf_name});
|
||||
if (it != info_for_tables.end()) {
|
||||
return it->second.size_in_bytes.get();
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
void cache_size_in_bytes(sstring ks_name, sstring cf_name, std::uint64_t size_in_bytes, std::chrono::high_resolution_clock::time_point expiry) {
|
||||
info_for_tables[{std::move(ks_name), std::move(cf_name)}].size_in_bytes.set_if_longer_expiry(size_in_bytes, expiry);
|
||||
}
|
||||
future<> stop() {
|
||||
co_await _executor._proxy.data_dictionary().real_database_ptr()->get_notifier().unregister_listener(this);
|
||||
active = false;
|
||||
co_return;
|
||||
}
|
||||
void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
|
||||
if (!ks_name.starts_with(executor::KEYSPACE_NAME_PREFIX)) return;
|
||||
info_for_tables.erase({ks_name, cf_name});
|
||||
}
|
||||
};
|
||||
|
||||
executor::executor(gms::gossiper& gossiper,
|
||||
service::storage_proxy& proxy,
|
||||
service::storage_service& ss,
|
||||
service::migration_manager& mm,
|
||||
db::system_distributed_keyspace& sdks,
|
||||
cdc::metadata& cdc_metadata,
|
||||
smp_service_group ssg,
|
||||
utils::updateable_value<uint32_t> default_timeout_in_ms)
|
||||
: _gossiper(gossiper),
|
||||
_ss(ss),
|
||||
_proxy(proxy),
|
||||
_mm(mm),
|
||||
_sdks(sdks),
|
||||
@@ -268,6 +328,7 @@ executor::executor(gms::gossiper& gossiper,
|
||||
_stats))
|
||||
{
|
||||
s_default_timeout_in_ms = std::move(default_timeout_in_ms);
|
||||
_describe_table_info_manager = std::make_unique<describe_table_info_manager>(*this);
|
||||
register_metrics(_metrics, _stats);
|
||||
}
|
||||
|
||||
@@ -752,12 +813,44 @@ static future<bool> is_view_built(
|
||||
|
||||
}
|
||||
|
||||
static future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::storage_proxy& proxy, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
|
||||
future<> executor::cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl) {
|
||||
auto expiry = describe_table_info_manager::now() + ttl;
|
||||
return container().invoke_on_all(
|
||||
[schema, size_in_bytes, expiry] (executor& exec) {
|
||||
exec._describe_table_info_manager->cache_size_in_bytes(schema->ks_name(), schema->cf_name(), size_in_bytes, expiry);
|
||||
});
|
||||
}
|
||||
|
||||
future<> executor::fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting) {
|
||||
auto cached_size = _describe_table_info_manager->get_cached_size_in_bytes(schema->ks_name(), schema->cf_name());
|
||||
std::uint64_t total_size = 0;
|
||||
if (cached_size) {
|
||||
total_size = *cached_size;
|
||||
} else {
|
||||
// there's no point in trying to estimate value of table that is being deleted, as other nodes more often than not might
|
||||
// move forward with deletion faster than we calculate the size
|
||||
if (!deleting) {
|
||||
total_size = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
|
||||
const auto expiry = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
|
||||
// Note: we don't care when the notification of other shards will finish, as long as it will be done
|
||||
// it's possible to get into race condition (next DescribeTable comes to other shard, that new shard doesn't have
|
||||
// the size yet, so it will calculate it again) - this is not a problem, because it will call cache_newly_calculated_size_on_all_shards
|
||||
// with expiry, which is extremely unlikely to be exactly the same as the previous one, all shards will keep the size coming with expiry that is further into the future.
|
||||
// In case of the same expiry, some shards will have different size, which means DescribeTable will return different values depending on the shard
|
||||
// which is also fine, as the specification doesn't give precision guarantees of any kind.
|
||||
co_await cache_newly_calculated_size_on_all_shards(schema, total_size, expiry);
|
||||
}
|
||||
}
|
||||
rjson::add(table_description, "TableSizeBytes", total_size);
|
||||
}
|
||||
|
||||
future<rjson::value> executor::fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
|
||||
{
|
||||
rjson::value table_description = rjson::empty_object();
|
||||
auto tags_ptr = db::get_tags_of_table(schema);
|
||||
|
||||
rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
|
||||
co_await fill_table_size(table_description, schema, tbl_status == table_status::deleting);
|
||||
|
||||
auto creation_timestamp = get_table_creation_time(*schema);
|
||||
|
||||
@@ -801,9 +894,7 @@ static future<rjson::value> fill_table_description(schema_ptr schema, table_stat
|
||||
rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", wcu);
|
||||
rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
|
||||
|
||||
|
||||
|
||||
data_dictionary::table t = proxy.data_dictionary().find_column_family(schema);
|
||||
data_dictionary::table t = _proxy.data_dictionary().find_column_family(schema);
|
||||
|
||||
if (tbl_status != table_status::deleting) {
|
||||
rjson::add(table_description, "CreationDateTime", rjson::value(creation_timestamp));
|
||||
@@ -840,7 +931,7 @@ static future<rjson::value> fill_table_description(schema_ptr schema, table_stat
|
||||
// (for a built view) or CREATING+Backfilling (if view building
|
||||
// is in progress).
|
||||
if (!is_lsi) {
|
||||
if (co_await is_view_built(vptr, proxy, client_state, trace_state, permit)) {
|
||||
if (co_await is_view_built(vptr, _proxy, client_state, trace_state, permit)) {
|
||||
rjson::add(view_entry, "IndexStatus", "ACTIVE");
|
||||
} else {
|
||||
rjson::add(view_entry, "IndexStatus", "CREATING");
|
||||
@@ -868,9 +959,8 @@ static future<rjson::value> fill_table_description(schema_ptr schema, table_stat
|
||||
}
|
||||
rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
|
||||
}
|
||||
executor::supplement_table_stream_info(table_description, *schema, proxy);
|
||||
executor::supplement_table_stream_info(table_description, *schema, _proxy);
|
||||
|
||||
// FIXME: still missing some response fields (issue #5026)
|
||||
co_return table_description;
|
||||
}
|
||||
|
||||
@@ -890,7 +980,7 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
|
||||
get_stats_from_schema(_proxy, *schema)->api_operations.describe_table++;
|
||||
tracing::add_alternator_table_name(trace_state, schema->cf_name());
|
||||
|
||||
rjson::value table_description = co_await fill_table_description(schema, table_status::active, _proxy, client_state, trace_state, permit);
|
||||
rjson::value table_description = co_await fill_table_description(schema, table_status::active, client_state, trace_state, permit);
|
||||
rjson::value response = rjson::empty_object();
|
||||
rjson::add(response, "Table", std::move(table_description));
|
||||
elogger.trace("returning {}", response);
|
||||
@@ -993,7 +1083,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
|
||||
auto& p = _proxy.container();
|
||||
|
||||
schema_ptr schema = get_table(_proxy, request);
|
||||
rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, _proxy, client_state, trace_state, permit);
|
||||
rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, client_state, trace_state, permit);
|
||||
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::DROP, _stats);
|
||||
co_await _mm.container().invoke_on(0, [&, cs = client_state.move_to_other_shard()] (service::migration_manager& mm) -> future<> {
|
||||
size_t retries = mm.get_concurrent_ddl_retries();
|
||||
@@ -1557,8 +1647,7 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out,
|
||||
}
|
||||
}
|
||||
|
||||
static future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request,
|
||||
service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, bool enforce_authorization, bool warn_authorization, stats& stats, const db::tablets_mode_t::mode tablets_mode) {
|
||||
future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode) {
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
|
||||
// We begin by parsing and validating the content of the CreateTable
|
||||
@@ -1745,7 +1834,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
|
||||
|
||||
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
|
||||
if (stream_specification && stream_specification->IsObject()) {
|
||||
if (executor::add_stream_options(*stream_specification, builder, sp)) {
|
||||
if (executor::add_stream_options(*stream_specification, builder, _proxy)) {
|
||||
validate_cdc_log_name_length(builder.cf_name());
|
||||
}
|
||||
}
|
||||
@@ -1764,7 +1853,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
|
||||
set_table_creation_time(tags_map, db_clock::now());
|
||||
builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
|
||||
|
||||
co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, stats);
|
||||
co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, _stats);
|
||||
|
||||
schema_ptr schema = builder.build();
|
||||
for (auto& view_builder : view_builders) {
|
||||
@@ -1780,18 +1869,18 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
|
||||
view_builder.with_view_info(schema, include_all_columns, ""/*where clause*/);
|
||||
}
|
||||
|
||||
size_t retries = mm.get_concurrent_ddl_retries();
|
||||
size_t retries = _mm.get_concurrent_ddl_retries();
|
||||
for (;;) {
|
||||
auto group0_guard = co_await mm.start_group0_operation();
|
||||
auto group0_guard = co_await _mm.start_group0_operation();
|
||||
auto ts = group0_guard.write_timestamp();
|
||||
utils::chunked_vector<mutation> schema_mutations;
|
||||
auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts, tags_map, sp.features(), tablets_mode);
|
||||
auto ksm = create_keyspace_metadata(keyspace_name, _proxy, _gossiper, ts, tags_map, _proxy.features(), tablets_mode);
|
||||
// Alternator Streams doesn't yet work when the table uses tablets (#23838)
|
||||
if (stream_specification && stream_specification->IsObject()) {
|
||||
auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
|
||||
if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
|
||||
locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
|
||||
const auto& topo = sp.local_db().get_token_metadata().get_topology();
|
||||
const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
|
||||
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
|
||||
if (rs->uses_tablets()) {
|
||||
co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
|
||||
@@ -1801,17 +1890,17 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
|
||||
}
|
||||
// Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
|
||||
// GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
|
||||
if (!view_builders.empty() && ksm->uses_tablets() && !sp.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
|
||||
if (!view_builders.empty() && ksm->uses_tablets() && !_proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
|
||||
co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
|
||||
}
|
||||
try {
|
||||
schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
|
||||
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
|
||||
} catch (exceptions::already_exists_exception&) {
|
||||
if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
|
||||
if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
|
||||
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
|
||||
}
|
||||
}
|
||||
if (sp.data_dictionary().try_find_table(schema->id())) {
|
||||
if (_proxy.data_dictionary().try_find_table(schema->id())) {
|
||||
// This should never happen, the ID is supposed to be unique
|
||||
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
|
||||
}
|
||||
@@ -1820,9 +1909,9 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
|
||||
for (schema_builder& view_builder : view_builders) {
|
||||
schemas.push_back(view_builder.build());
|
||||
}
|
||||
co_await service::prepare_new_column_families_announcement(schema_mutations, sp, *ksm, schemas, ts);
|
||||
co_await service::prepare_new_column_families_announcement(schema_mutations, _proxy, *ksm, schemas, ts);
|
||||
if (ksm->uses_tablets()) {
|
||||
co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, sp);
|
||||
co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, _proxy);
|
||||
}
|
||||
|
||||
// If a role is allowed to create a table, we must give it permissions to
|
||||
@@ -1847,7 +1936,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
|
||||
}
|
||||
std::tie(schema_mutations, group0_guard) = co_await std::move(mc).extract();
|
||||
try {
|
||||
co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
|
||||
co_await _mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
|
||||
break;
|
||||
} catch (const service::group0_concurrent_modification& ex) {
|
||||
elogger.info("Failed to execute CreateTable {} due to concurrent schema modifications. {}.",
|
||||
@@ -1859,9 +1948,9 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
|
||||
}
|
||||
}
|
||||
|
||||
co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
rjson::value status = rjson::empty_object();
|
||||
executor::supplement_table_info(request, *schema, sp);
|
||||
executor::supplement_table_info(request, *schema, _proxy);
|
||||
rjson::add(status, "TableDescription", std::move(request));
|
||||
co_return rjson::print(std::move(status));
|
||||
}
|
||||
@@ -1870,10 +1959,11 @@ future<executor::request_return_type> executor::create_table(client_state& clien
|
||||
_stats.api_operations.create_table++;
|
||||
elogger.trace("Creating table {}", request);
|
||||
|
||||
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
|
||||
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
|
||||
(service::migration_manager& mm) mutable -> future<executor::request_return_type> {
|
||||
const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
|
||||
co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, e.local()._stats, std::move(tablets_mode));
|
||||
// `invoke_on` hopped us to shard 0, but `this` points to `executor` is from 'old' shard, we need to hop it too.
|
||||
co_return co_await e.local().create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), enforce_authorization, warn_authorization, std::move(tablets_mode));
|
||||
});
|
||||
}
|
||||
|
||||
@@ -6087,9 +6177,10 @@ future<> executor::start() {
|
||||
}
|
||||
|
||||
future<> executor::stop() {
|
||||
co_await _describe_table_info_manager->stop();
|
||||
// disconnect from the value source, but keep the value unchanged.
|
||||
s_default_timeout_in_ms = utils::updateable_value<uint32_t>{s_default_timeout_in_ms()};
|
||||
return _parsed_expression_cache->stop();
|
||||
co_await _parsed_expression_cache->stop();
|
||||
}
|
||||
|
||||
} // namespace alternator
|
||||
|
||||
@@ -17,11 +17,13 @@
|
||||
#include "service/client_state.hh"
|
||||
#include "service_permit.hh"
|
||||
#include "db/timeout_clock.hh"
|
||||
#include "db/config.hh"
|
||||
|
||||
#include "alternator/error.hh"
|
||||
#include "stats.hh"
|
||||
#include "utils/rjson.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include "utils/simple_value_with_expiry.hh"
|
||||
|
||||
#include "tracing/trace_state.hh"
|
||||
|
||||
@@ -41,6 +43,7 @@ namespace cql3::selection {
|
||||
namespace service {
|
||||
class storage_proxy;
|
||||
class cas_shard;
|
||||
class storage_service;
|
||||
}
|
||||
|
||||
namespace cdc {
|
||||
@@ -57,6 +60,7 @@ class schema_builder;
|
||||
|
||||
namespace alternator {
|
||||
|
||||
enum class table_status;
|
||||
class rmw_operation;
|
||||
class put_or_delete_item;
|
||||
|
||||
@@ -136,6 +140,7 @@ class expression_cache;
|
||||
|
||||
class executor : public peering_sharded_service<executor> {
|
||||
gms::gossiper& _gossiper;
|
||||
service::storage_service& _ss;
|
||||
service::storage_proxy& _proxy;
|
||||
service::migration_manager& _mm;
|
||||
db::system_distributed_keyspace& _sdks;
|
||||
@@ -148,6 +153,11 @@ class executor : public peering_sharded_service<executor> {
|
||||
|
||||
std::unique_ptr<parsed::expression_cache> _parsed_expression_cache;
|
||||
|
||||
struct describe_table_info_manager;
|
||||
std::unique_ptr<describe_table_info_manager> _describe_table_info_manager;
|
||||
|
||||
future<> cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl);
|
||||
future<> fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting);
|
||||
public:
|
||||
using client_state = service::client_state;
|
||||
// request_return_type is the return type of the executor methods, which
|
||||
@@ -173,6 +183,7 @@ public:
|
||||
|
||||
executor(gms::gossiper& gossiper,
|
||||
service::storage_proxy& proxy,
|
||||
service::storage_service& ss,
|
||||
service::migration_manager& mm,
|
||||
db::system_distributed_keyspace& sdks,
|
||||
cdc::metadata& cdc_metadata,
|
||||
@@ -220,6 +231,8 @@ private:
|
||||
friend class rmw_operation;
|
||||
|
||||
static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr, const std::map<sstring, sstring> *tags = nullptr);
|
||||
future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit);
|
||||
future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode);
|
||||
|
||||
future<> do_batch_write(
|
||||
std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
|
||||
|
||||
301
alternator/http_compression.cc
Normal file
301
alternator/http_compression.cc
Normal file
@@ -0,0 +1,301 @@
|
||||
/*
|
||||
* Copyright 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "alternator/http_compression.hh"
|
||||
#include "alternator/server.hh"
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <zlib.h>
|
||||
|
||||
static logging::logger slogger("alternator-http-compression");
|
||||
|
||||
namespace alternator {
|
||||
|
||||
|
||||
static constexpr size_t compressed_buffer_size = 1024;
|
||||
class zlib_compressor {
|
||||
z_stream _zs;
|
||||
temporary_buffer<char> _output_buf;
|
||||
noncopyable_function<future<>(temporary_buffer<char>&&)> _write_func;
|
||||
public:
|
||||
zlib_compressor(bool gzip, int compression_level, noncopyable_function<future<>(temporary_buffer<char>&&)> write_func)
|
||||
: _write_func(std::move(write_func)) {
|
||||
memset(&_zs, 0, sizeof(_zs));
|
||||
if (deflateInit2(&_zs, std::clamp(compression_level, Z_NO_COMPRESSION, Z_BEST_COMPRESSION), Z_DEFLATED,
|
||||
(gzip ? 16 : 0) + MAX_WBITS, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
|
||||
// Should only happen if memory allocation fails
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
}
|
||||
~zlib_compressor() {
|
||||
deflateEnd(&_zs);
|
||||
}
|
||||
future<> close() {
|
||||
return compress(nullptr, 0, true);
|
||||
}
|
||||
|
||||
future<> compress(const char* buf, size_t len, bool is_last_chunk = false) {
|
||||
_zs.next_in = reinterpret_cast<unsigned char*>(const_cast<char*>(buf));
|
||||
_zs.avail_in = (uInt) len;
|
||||
int mode = is_last_chunk ? Z_FINISH : Z_NO_FLUSH;
|
||||
while(_zs.avail_in > 0 || is_last_chunk) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (_output_buf.empty()) {
|
||||
if (is_last_chunk) {
|
||||
uint32_t max_buffer_size = 0;
|
||||
deflatePending(&_zs, &max_buffer_size, nullptr);
|
||||
max_buffer_size += deflateBound(&_zs, _zs.avail_in) + 1;
|
||||
_output_buf = temporary_buffer<char>(std::min(compressed_buffer_size, (size_t) max_buffer_size));
|
||||
} else {
|
||||
_output_buf = temporary_buffer<char>(compressed_buffer_size);
|
||||
}
|
||||
_zs.next_out = reinterpret_cast<unsigned char*>(_output_buf.get_write());
|
||||
_zs.avail_out = compressed_buffer_size;
|
||||
}
|
||||
int e = deflate(&_zs, mode);
|
||||
if (e < Z_OK) {
|
||||
throw api_error::internal("Error during compression of response body");
|
||||
}
|
||||
if (e == Z_STREAM_END || _zs.avail_out < compressed_buffer_size / 4) {
|
||||
_output_buf.trim(compressed_buffer_size - _zs.avail_out);
|
||||
co_await _write_func(std::move(_output_buf));
|
||||
if (e == Z_STREAM_END) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Helper string_view functions for parsing Accept-Encoding header
|
||||
struct case_insensitive_cmp_sv {
|
||||
bool operator()(std::string_view s1, std::string_view s2) const {
|
||||
return std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
|
||||
[](char a, char b) { return ::tolower(a) == ::tolower(b); });
|
||||
}
|
||||
};
|
||||
static inline std::string_view trim_left(std::string_view sv) {
|
||||
while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.front())))
|
||||
sv.remove_prefix(1);
|
||||
return sv;
|
||||
}
|
||||
static inline std::string_view trim_right(std::string_view sv) {
|
||||
while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back())))
|
||||
sv.remove_suffix(1);
|
||||
return sv;
|
||||
}
|
||||
static inline std::string_view trim(std::string_view sv) {
|
||||
return trim_left(trim_right(sv));
|
||||
}
|
||||
|
||||
inline std::vector<std::string_view> split(std::string_view text, char separator) {
|
||||
std::vector<std::string_view> tokens;
|
||||
if (text == "") {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
auto pos = text.find_first_of(separator);
|
||||
if (pos != std::string_view::npos) {
|
||||
tokens.emplace_back(text.data(), pos);
|
||||
text.remove_prefix(pos + 1);
|
||||
} else {
|
||||
tokens.emplace_back(text);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
constexpr response_compressor::compression_type response_compressor::get_compression_type(std::string_view encoding) {
|
||||
for (size_t i = 0; i < static_cast<size_t>(compression_type::count); ++i) {
|
||||
if (case_insensitive_cmp_sv{}(encoding, compression_names[i])) {
|
||||
return static_cast<compression_type>(i);
|
||||
}
|
||||
}
|
||||
return compression_type::unknown;
|
||||
}
|
||||
|
||||
response_compressor::compression_type response_compressor::find_compression(std::string_view accept_encoding, size_t response_size) {
|
||||
std::optional<float> ct_q[static_cast<size_t>(compression_type::count)];
|
||||
ct_q[static_cast<size_t>(compression_type::none)] = std::numeric_limits<float>::min(); // enabled, but lowest priority
|
||||
compression_type selected_ct = compression_type::none;
|
||||
|
||||
std::vector<std::string_view> entries = split(accept_encoding, ',');
|
||||
for (auto& e : entries) {
|
||||
std::vector<std::string_view> params = split(e, ';');
|
||||
if (params.size() == 0) {
|
||||
continue;
|
||||
}
|
||||
compression_type ct = get_compression_type(trim(params[0]));
|
||||
if (ct == compression_type::unknown) {
|
||||
continue; // ignore unknown encoding types
|
||||
}
|
||||
if (ct_q[static_cast<size_t>(ct)].has_value() && ct_q[static_cast<size_t>(ct)] != 0.0f) {
|
||||
continue; // already processed this encoding
|
||||
}
|
||||
if (response_size < _threshold[static_cast<size_t>(ct)]) {
|
||||
continue; // below threshold treat as unknown
|
||||
}
|
||||
for (size_t i = 1; i < params.size(); ++i) { // find "q=" parameter
|
||||
auto pos = params[i].find("q=");
|
||||
if (pos == std::string_view::npos) {
|
||||
continue;
|
||||
}
|
||||
std::string_view param = params[i].substr(pos + 2);
|
||||
param = trim(param);
|
||||
// parse quality value
|
||||
float q_value = 1.0f;
|
||||
auto [ptr, ec] = std::from_chars(param.data(), param.data() + param.size(), q_value);
|
||||
if (ec != std::errc() || ptr != param.data() + param.size()) {
|
||||
continue;
|
||||
}
|
||||
if (q_value < 0.0) {
|
||||
q_value = 0.0;
|
||||
} else if (q_value > 1.0) {
|
||||
q_value = 1.0;
|
||||
}
|
||||
ct_q[static_cast<size_t>(ct)] = q_value;
|
||||
break; // we parsed quality value
|
||||
}
|
||||
if (!ct_q[static_cast<size_t>(ct)].has_value()) {
|
||||
ct_q[static_cast<size_t>(ct)] = 1.0f; // default quality value
|
||||
}
|
||||
// keep the highest encoding (in the order, unless 'any')
|
||||
if (selected_ct == compression_type::any) {
|
||||
if (ct_q[static_cast<size_t>(ct)] >= ct_q[static_cast<size_t>(selected_ct)]) {
|
||||
selected_ct = ct;
|
||||
}
|
||||
} else {
|
||||
if (ct_q[static_cast<size_t>(ct)] > ct_q[static_cast<size_t>(selected_ct)]) {
|
||||
selected_ct = ct;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (selected_ct == compression_type::any) {
|
||||
// select any not mentioned or highest quality
|
||||
selected_ct = compression_type::none;
|
||||
for (size_t i = 0; i < static_cast<size_t>(compression_type::compressions_count); ++i) {
|
||||
if (!ct_q[i].has_value()) {
|
||||
return static_cast<compression_type>(i);
|
||||
}
|
||||
if (ct_q[i] > ct_q[static_cast<size_t>(selected_ct)]) {
|
||||
selected_ct = static_cast<compression_type>(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
return selected_ct;
|
||||
}
|
||||
|
||||
static future<chunked_content> compress(response_compressor::compression_type ct, const db::config& cfg, std::string str) {
|
||||
chunked_content compressed;
|
||||
auto write = [&compressed](temporary_buffer<char>&& buf) -> future<> {
|
||||
compressed.push_back(std::move(buf));
|
||||
return make_ready_future<>();
|
||||
};
|
||||
zlib_compressor compressor(ct != response_compressor::compression_type::deflate,
|
||||
cfg.alternator_response_gzip_compression_level(), std::move(write));
|
||||
co_await compressor.compress(str.data(), str.size(), true);
|
||||
co_return compressed;
|
||||
}
|
||||
|
||||
static sstring flatten(chunked_content&& cc) {
|
||||
size_t total_size = 0;
|
||||
for (const auto& chunk : cc) {
|
||||
total_size += chunk.size();
|
||||
}
|
||||
sstring result = sstring{ sstring::initialized_later{}, total_size };
|
||||
size_t offset = 0;
|
||||
for (const auto& chunk : cc) {
|
||||
std::copy(chunk.begin(), chunk.end(), result.begin() + offset);
|
||||
offset += chunk.size();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
future<std::unique_ptr<http::reply>> response_compressor::generate_reply(std::unique_ptr<http::reply> rep, sstring accept_encoding, const char* content_type, std::string&& response_body) {
|
||||
response_compressor::compression_type ct = find_compression(accept_encoding, response_body.size());
|
||||
if (ct != response_compressor::compression_type::none) {
|
||||
rep->add_header("Content-Encoding", get_encoding_name(ct));
|
||||
rep->set_content_type(content_type);
|
||||
return compress(ct, cfg, std::move(response_body)).then([rep = std::move(rep)] (chunked_content compressed) mutable {
|
||||
rep->_content = flatten(std::move(compressed));
|
||||
return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
|
||||
});
|
||||
} else {
|
||||
// Note that despite the move, there is a copy here -
|
||||
// as str is std::string and rep->_content is sstring.
|
||||
rep->_content = std::move(response_body);
|
||||
rep->set_content_type(content_type);
|
||||
}
|
||||
return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
|
||||
}
|
||||
|
||||
template<typename Compressor>
|
||||
class compressed_data_sink_impl : public data_sink_impl {
|
||||
output_stream<char> _out;
|
||||
Compressor _compressor;
|
||||
public:
|
||||
template<typename... Args>
|
||||
compressed_data_sink_impl(output_stream<char>&& out, Args&&... args)
|
||||
: _out(std::move(out)), _compressor(std::forward<Args>(args)..., [this](temporary_buffer<char>&& buf) {
|
||||
return _out.write(std::move(buf));
|
||||
}) { }
|
||||
|
||||
future<> put(std::span<temporary_buffer<char>> data) override {
|
||||
return data_sink_impl::fallback_put(data, [this] (temporary_buffer<char>&& buf) {
|
||||
return do_put(std::move(buf));
|
||||
});
|
||||
}
|
||||
|
||||
private:
|
||||
future<> do_put(temporary_buffer<char> buf) {
|
||||
co_return co_await _compressor.compress(buf.get(), buf.size());
|
||||
|
||||
}
|
||||
future<> close() override {
|
||||
return _compressor.close().then([this] {
|
||||
return _out.close();
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
executor::body_writer compress(response_compressor::compression_type ct, const db::config& cfg, executor::body_writer&& bw) {
|
||||
return [bw = std::move(bw), ct, level = cfg.alternator_response_gzip_compression_level()](output_stream<char>&& out) mutable -> future<> {
|
||||
output_stream_options opts;
|
||||
opts.trim_to_size = true;
|
||||
std::unique_ptr<data_sink_impl> data_sink_impl;
|
||||
switch (ct) {
|
||||
case response_compressor::compression_type::gzip:
|
||||
data_sink_impl = std::make_unique<compressed_data_sink_impl<zlib_compressor>>(std::move(out), true, level);
|
||||
break;
|
||||
case response_compressor::compression_type::deflate:
|
||||
data_sink_impl = std::make_unique<compressed_data_sink_impl<zlib_compressor>>(std::move(out), false, level);
|
||||
break;
|
||||
case response_compressor::compression_type::none:
|
||||
case response_compressor::compression_type::any:
|
||||
case response_compressor::compression_type::unknown:
|
||||
on_internal_error(slogger,"Compression not selected");
|
||||
default:
|
||||
on_internal_error(slogger, "Unsupported compression type for data sink");
|
||||
}
|
||||
return bw(output_stream<char>(data_sink(std::move(data_sink_impl)), compressed_buffer_size, opts));
|
||||
};
|
||||
}
|
||||
|
||||
future<std::unique_ptr<http::reply>> response_compressor::generate_reply(std::unique_ptr<http::reply> rep, sstring accept_encoding, const char* content_type, executor::body_writer&& body_writer) {
|
||||
response_compressor::compression_type ct = find_compression(accept_encoding, std::numeric_limits<size_t>::max());
|
||||
if (ct != response_compressor::compression_type::none) {
|
||||
rep->add_header("Content-Encoding", get_encoding_name(ct));
|
||||
rep->write_body(content_type, compress(ct, cfg, std::move(body_writer)));
|
||||
} else {
|
||||
rep->write_body(content_type, std::move(body_writer));
|
||||
}
|
||||
return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
|
||||
}
|
||||
|
||||
} // namespace alternator
|
||||
91
alternator/http_compression.hh
Normal file
91
alternator/http_compression.hh
Normal file
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "alternator/executor.hh"
|
||||
#include <seastar/http/httpd.hh>
|
||||
#include "db/config.hh"
|
||||
|
||||
namespace alternator {
|
||||
|
||||
class response_compressor {
|
||||
public:
|
||||
enum class compression_type {
|
||||
gzip,
|
||||
deflate,
|
||||
compressions_count,
|
||||
any = compressions_count,
|
||||
none,
|
||||
count,
|
||||
unknown = count
|
||||
};
|
||||
static constexpr std::string_view compression_names[] = {
|
||||
"gzip",
|
||||
"deflate",
|
||||
"*",
|
||||
"identity"
|
||||
};
|
||||
|
||||
static sstring get_encoding_name(compression_type ct) {
|
||||
return sstring(compression_names[static_cast<size_t>(ct)]);
|
||||
}
|
||||
static constexpr compression_type get_compression_type(std::string_view encoding);
|
||||
|
||||
sstring get_accepted_encoding(const http::request& req) {
|
||||
if (get_threshold() == 0) {
|
||||
return "";
|
||||
}
|
||||
return req.get_header("Accept-Encoding");
|
||||
}
|
||||
compression_type find_compression(std::string_view accept_encoding, size_t response_size);
|
||||
|
||||
response_compressor(const db::config& cfg)
|
||||
: cfg(cfg)
|
||||
,_gzip_level_observer(
|
||||
cfg.alternator_response_gzip_compression_level.observe([this](int v) {
|
||||
update_threshold();
|
||||
}))
|
||||
,_gzip_threshold_observer(
|
||||
cfg.alternator_response_compression_threshold_in_bytes.observe([this](uint32_t v) {
|
||||
update_threshold();
|
||||
}))
|
||||
{
|
||||
update_threshold();
|
||||
}
|
||||
response_compressor(const response_compressor& rhs) : response_compressor(rhs.cfg) {}
|
||||
|
||||
private:
|
||||
const db::config& cfg;
|
||||
utils::observable<int>::observer _gzip_level_observer;
|
||||
utils::observable<uint32_t>::observer _gzip_threshold_observer;
|
||||
uint32_t _threshold[static_cast<size_t>(compression_type::count)];
|
||||
|
||||
size_t get_threshold() { return _threshold[static_cast<size_t>(compression_type::any)]; }
|
||||
void update_threshold() {
|
||||
_threshold[static_cast<size_t>(compression_type::none)] = std::numeric_limits<uint32_t>::max();
|
||||
_threshold[static_cast<size_t>(compression_type::any)] = std::numeric_limits<uint32_t>::max();
|
||||
uint32_t gzip = cfg.alternator_response_gzip_compression_level() <= 0 ? std::numeric_limits<uint32_t>::max()
|
||||
: cfg.alternator_response_compression_threshold_in_bytes();
|
||||
_threshold[static_cast<size_t>(compression_type::gzip)] = gzip;
|
||||
_threshold[static_cast<size_t>(compression_type::deflate)] = gzip;
|
||||
for (size_t i = 0; i < static_cast<size_t>(compression_type::compressions_count); ++i) {
|
||||
if (_threshold[i] < _threshold[static_cast<size_t>(compression_type::any)]) {
|
||||
_threshold[static_cast<size_t>(compression_type::any)] = _threshold[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
future<std::unique_ptr<http::reply>> generate_reply(std::unique_ptr<http::reply> rep,
|
||||
sstring accept_encoding, const char* content_type, std::string&& response_body);
|
||||
future<std::unique_ptr<http::reply>> generate_reply(std::unique_ptr<http::reply> rep,
|
||||
sstring accept_encoding, const char* content_type, executor::body_writer&& body_writer);
|
||||
};
|
||||
|
||||
}
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "client_data.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include <zlib.h>
|
||||
#include "alternator/http_compression.hh"
|
||||
|
||||
static logging::logger slogger("alternator-server");
|
||||
|
||||
@@ -111,9 +112,12 @@ class api_handler : public handler_base {
|
||||
// type applies to all replies, both success and error.
|
||||
static constexpr const char* REPLY_CONTENT_TYPE = "application/x-amz-json-1.0";
|
||||
public:
|
||||
api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
|
||||
api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle,
|
||||
const db::config& config) : _response_compressor(config), _f_handle(
|
||||
[this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
|
||||
return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
|
||||
sstring accept_encoding = _response_compressor.get_accepted_encoding(*req);
|
||||
return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped(
|
||||
[this, rep = std::move(rep), accept_encoding=std::move(accept_encoding)](future<executor::request_return_type> resf) mutable {
|
||||
if (resf.failed()) {
|
||||
// Exceptions of type api_error are wrapped as JSON and
|
||||
// returned to the client as expected. Other types of
|
||||
@@ -133,22 +137,20 @@ public:
|
||||
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
|
||||
}
|
||||
auto res = resf.get();
|
||||
std::visit(overloaded_functor {
|
||||
return std::visit(overloaded_functor {
|
||||
[&] (std::string&& str) {
|
||||
// Note that despite the move, there is a copy here -
|
||||
// as str is std::string and rep->_content is sstring.
|
||||
rep->_content = std::move(str);
|
||||
rep->set_content_type(REPLY_CONTENT_TYPE);
|
||||
return _response_compressor.generate_reply(std::move(rep), std::move(accept_encoding),
|
||||
REPLY_CONTENT_TYPE, std::move(str));
|
||||
},
|
||||
[&] (executor::body_writer&& body_writer) {
|
||||
rep->write_body(REPLY_CONTENT_TYPE, std::move(body_writer));
|
||||
return _response_compressor.generate_reply(std::move(rep), std::move(accept_encoding),
|
||||
REPLY_CONTENT_TYPE, std::move(body_writer));
|
||||
},
|
||||
[&] (const api_error& err) {
|
||||
generate_error_reply(*rep, err);
|
||||
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
|
||||
}
|
||||
}, std::move(res));
|
||||
|
||||
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
|
||||
});
|
||||
}) { }
|
||||
|
||||
@@ -177,6 +179,7 @@ protected:
|
||||
slogger.trace("api_handler error case: {}", rep._content);
|
||||
}
|
||||
|
||||
response_compressor _response_compressor;
|
||||
future_handler_function _f_handle;
|
||||
};
|
||||
|
||||
@@ -708,8 +711,12 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
|
||||
// As long as the system_clients_entry object is alive, this request will
|
||||
// be visible in the "system.clients" virtual table. When requested, this
|
||||
// entry will be formatted by server::ongoing_request::make_client_data().
|
||||
auto user_agent_header = co_await _connection_options_keys_and_values.get_or_load(req->get_header("User-Agent"), [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
|
||||
auto system_clients_entry = _ongoing_requests.emplace(
|
||||
req->get_client_address(), req->get_header("User-Agent"),
|
||||
req->get_client_address(), std::move(user_agent_header),
|
||||
username, current_scheduling_group(),
|
||||
req->get_protocol_name() == "https");
|
||||
|
||||
@@ -754,7 +761,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
|
||||
void server::set_routes(routes& r) {
|
||||
api_handler* req_handler = new api_handler([this] (std::unique_ptr<request> req) mutable {
|
||||
return handle_api_request(std::move(req));
|
||||
});
|
||||
}, _proxy.data_dictionary().get_config());
|
||||
|
||||
r.put(operation_type::POST, "/", req_handler);
|
||||
r.put(operation_type::GET, "/", new health_handler(_pending_requests));
|
||||
@@ -985,10 +992,10 @@ client_data server::ongoing_request::make_client_data() const {
|
||||
return cd;
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<client_data>> server::get_client_data() {
|
||||
utils::chunked_vector<client_data> ret;
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> server::get_client_data() {
|
||||
utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>> ret;
|
||||
co_await _ongoing_requests.for_each_gently([&ret] (const ongoing_request& r) {
|
||||
ret.emplace_back(r.make_client_data());
|
||||
ret.emplace_back(make_foreign(std::make_unique<client_data>(r.make_client_data())));
|
||||
});
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
@@ -55,6 +55,7 @@ class server : public peering_sharded_service<server> {
|
||||
// though it isn't really relevant for Alternator which defines its own
|
||||
// timeouts separately. We can create this object only once.
|
||||
updateable_timeout_config _timeout_config;
|
||||
client_options_cache_type _connection_options_keys_and_values;
|
||||
|
||||
alternator_callbacks_map _callbacks;
|
||||
|
||||
@@ -88,7 +89,7 @@ class server : public peering_sharded_service<server> {
|
||||
// is called when reading the "system.clients" virtual table.
|
||||
struct ongoing_request {
|
||||
socket_address _client_address;
|
||||
sstring _user_agent;
|
||||
client_options_cache_entry_type _user_agent;
|
||||
sstring _username;
|
||||
scheduling_group _scheduling_group;
|
||||
bool _is_https;
|
||||
@@ -107,7 +108,7 @@ public:
|
||||
// table "system.clients" is read. It is expected to generate a list of
|
||||
// clients connected to this server (on this shard). This function is
|
||||
// called by alternator::controller::get_client_data().
|
||||
future<utils::chunked_vector<client_data>> get_client_data();
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data();
|
||||
private:
|
||||
void set_routes(seastar::httpd::routes& r);
|
||||
// If verification succeeds, returns the authenticated user's username
|
||||
|
||||
@@ -100,9 +100,8 @@ rest_set_client_routes(http_context& ctx, sharded<service::client_routes_service
|
||||
rapidjson::Document root;
|
||||
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
|
||||
root.Parse(content.c_str());
|
||||
const auto route_entries = parse_set_client_array(root);
|
||||
|
||||
co_await cr.local().set_client_routes(route_entries);
|
||||
co_await cr.local().set_client_routes(parse_set_client_array(root));
|
||||
co_return seastar::json::json_void();
|
||||
}
|
||||
|
||||
@@ -132,8 +131,7 @@ rest_delete_client_routes(http_context& ctx, sharded<service::client_routes_serv
|
||||
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
|
||||
root.Parse(content.c_str());
|
||||
|
||||
const auto route_keys = parse_delete_client_array(root);
|
||||
co_await cr.local().delete_client_routes(route_keys);
|
||||
co_await cr.local().delete_client_routes(parse_delete_client_array(root));
|
||||
co_return seastar::json::json_void();
|
||||
}
|
||||
|
||||
|
||||
@@ -547,17 +547,13 @@ void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_build
|
||||
vp.insert(b.second);
|
||||
}
|
||||
}
|
||||
std::vector<sstring> res;
|
||||
replica::database& db = vb.local().get_db();
|
||||
auto uuid = validate_table(db, ks, cf_name);
|
||||
replica::column_family& cf = db.find_column_family(uuid);
|
||||
res.reserve(cf.get_index_manager().list_indexes().size());
|
||||
for (auto&& i : cf.get_index_manager().list_indexes()) {
|
||||
if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
|
||||
res.emplace_back(i.metadata().name());
|
||||
}
|
||||
}
|
||||
co_return res;
|
||||
co_return cf.get_index_manager().list_indexes()
|
||||
| std::views::transform([] (const auto& i) { return i.metadata().name(); })
|
||||
| std::views::filter([&vp] (const auto& n) { return vp.contains(secondary_index::index_table_name(n)); })
|
||||
| std::ranges::to<std::vector>();
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include <iterator>
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <seastar/core/format.hh>
|
||||
|
||||
@@ -22,9 +23,11 @@ namespace auth {
|
||||
|
||||
logging::logger logger("auth-cache");
|
||||
|
||||
cache::cache(cql3::query_processor& qp) noexcept
|
||||
cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
|
||||
: _current_version(0)
|
||||
, _qp(qp) {
|
||||
, _qp(qp)
|
||||
, _loading_sem(1)
|
||||
, _as(as) {
|
||||
}
|
||||
|
||||
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
|
||||
@@ -116,6 +119,8 @@ future<> cache::load_all() {
|
||||
co_return;
|
||||
}
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
auto units = co_await get_units(_loading_sem, 1, _as);
|
||||
|
||||
++_current_version;
|
||||
|
||||
logger.info("Loading all roles");
|
||||
@@ -146,6 +151,9 @@ future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
|
||||
if (legacy_mode(_qp)) {
|
||||
co_return;
|
||||
}
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
auto units = co_await get_units(_loading_sem, 1, _as);
|
||||
|
||||
for (const auto& name : roles) {
|
||||
logger.info("Loading role {}", name);
|
||||
auto role = co_await fetch_role(name);
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
|
||||
@@ -15,6 +16,7 @@
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
|
||||
@@ -41,7 +43,7 @@ public:
|
||||
version_tag_t version; // used for seamless cache reloads
|
||||
};
|
||||
|
||||
explicit cache(cql3::query_processor& qp) noexcept;
|
||||
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
|
||||
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
|
||||
future<> load_all();
|
||||
future<> load_roles(std::unordered_set<role_name_t> roles);
|
||||
@@ -52,6 +54,8 @@ private:
|
||||
roles_map _roles;
|
||||
version_tag_t _current_version;
|
||||
cql3::query_processor& _qp;
|
||||
semaphore _loading_sem;
|
||||
abort_source& _as;
|
||||
|
||||
future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
|
||||
future<> prune_all() noexcept;
|
||||
|
||||
@@ -10,7 +10,9 @@
|
||||
#include <seastar/net/inet_address.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include "seastarx.hh"
|
||||
#include "utils/loading_shared_values.hh"
|
||||
|
||||
#include <list>
|
||||
#include <optional>
|
||||
|
||||
enum class client_type {
|
||||
@@ -27,6 +29,20 @@ enum class client_connection_stage {
|
||||
ready,
|
||||
};
|
||||
|
||||
// We implement a keys cache using a map-like utils::loading_shared_values container by storing empty values.
|
||||
struct options_cache_value_type {};
|
||||
using client_options_cache_type = utils::loading_shared_values<sstring, options_cache_value_type>;
|
||||
using client_options_cache_entry_type = client_options_cache_type::entry_ptr;
|
||||
using client_options_cache_key_type = client_options_cache_type::key_type;
|
||||
|
||||
// This struct represents a single OPTION key-value pair from the client's connection options.
|
||||
// Both key and value are represented by corresponding "references" to their cached values.
|
||||
// Each "reference" is effectively a lw_shared_ptr value.
|
||||
struct client_option_key_value_cached_entry {
|
||||
client_options_cache_entry_type key;
|
||||
client_options_cache_entry_type value;
|
||||
};
|
||||
|
||||
sstring to_string(client_connection_stage ct);
|
||||
|
||||
// Representation of a row in `system.clients'. std::optionals are for nullable cells.
|
||||
@@ -37,8 +53,8 @@ struct client_data {
|
||||
client_connection_stage connection_stage = client_connection_stage::established;
|
||||
int32_t shard_id; /// ID of server-side shard which is processing the connection.
|
||||
|
||||
std::optional<sstring> driver_name;
|
||||
std::optional<sstring> driver_version;
|
||||
std::optional<client_options_cache_entry_type> driver_name;
|
||||
std::optional<client_options_cache_entry_type> driver_version;
|
||||
std::optional<sstring> hostname;
|
||||
std::optional<int32_t> protocol_version;
|
||||
std::optional<sstring> ssl_cipher_suite;
|
||||
@@ -46,6 +62,7 @@ struct client_data {
|
||||
std::optional<sstring> ssl_protocol;
|
||||
std::optional<sstring> username;
|
||||
std::optional<sstring> scheduling_group_name;
|
||||
std::list<client_option_key_value_cached_entry> client_options;
|
||||
|
||||
sstring stage_str() const { return to_string(connection_stage); }
|
||||
sstring client_type_str() const { return to_string(ct); }
|
||||
|
||||
@@ -125,10 +125,6 @@ if(target_arch)
|
||||
add_compile_options("-march=${target_arch}")
|
||||
endif()
|
||||
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
add_compile_options("SHELL:-Xclang -fexperimental-assignment-tracking=disabled")
|
||||
endif()
|
||||
|
||||
function(maybe_limit_stack_usage_in_KB stack_usage_threshold_in_KB config)
|
||||
math(EXPR _stack_usage_threshold_in_bytes "${stack_usage_threshold_in_KB} * 1024")
|
||||
set(_stack_usage_threshold_flag "-Wstack-usage=${_stack_usage_threshold_in_bytes}")
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "sstables/open_info.hh"
|
||||
#include "compaction_descriptor.hh"
|
||||
|
||||
class reader_permit;
|
||||
@@ -44,7 +45,7 @@ public:
|
||||
virtual compaction_strategy_state& get_compaction_strategy_state() noexcept = 0;
|
||||
virtual reader_permit make_compaction_reader_permit() const = 0;
|
||||
virtual sstables::sstables_manager& get_sstables_manager() noexcept = 0;
|
||||
virtual sstables::shared_sstable make_sstable() const = 0;
|
||||
virtual sstables::shared_sstable make_sstable(sstables::sstable_state) const = 0;
|
||||
virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
|
||||
virtual api::timestamp_type min_memtable_timestamp() const = 0;
|
||||
virtual api::timestamp_type min_memtable_live_timestamp() const = 0;
|
||||
|
||||
@@ -416,7 +416,9 @@ future<compaction_result> compaction_task_executor::compact_sstables(compaction_
|
||||
descriptor.enable_garbage_collection(co_await sstable_set_for_tombstone_gc(t));
|
||||
}
|
||||
descriptor.creator = [&t] (shard_id) {
|
||||
return t.make_sstable();
|
||||
// All compaction types going through this path will work on normal input sstables only.
|
||||
// Off-strategy, for example, waits until the sstables move out of staging state.
|
||||
return t.make_sstable(sstables::sstable_state::normal);
|
||||
};
|
||||
descriptor.replacer = [this, &t, &on_replace, offstrategy] (compaction_completion_desc desc) {
|
||||
t.get_compaction_strategy().notify_completion(t, desc.old_sstables, desc.new_sstables);
|
||||
@@ -1847,6 +1849,10 @@ protected:
|
||||
throw make_compaction_stopped_exception();
|
||||
}
|
||||
}, false);
|
||||
if (utils::get_local_injector().is_enabled("split_sstable_force_stop_exception")) {
|
||||
throw make_compaction_stopped_exception();
|
||||
}
|
||||
|
||||
co_return co_await do_rewrite_sstable(std::move(sst));
|
||||
}
|
||||
};
|
||||
@@ -2284,12 +2290,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
|
||||
co_return std::vector<sstables::shared_sstable>{sst};
|
||||
}
|
||||
if (!can_proceed(&t)) {
|
||||
co_return std::vector<sstables::shared_sstable>{sst};
|
||||
// Throw an error if split cannot be performed due to e.g. out of space prevention.
|
||||
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
|
||||
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
|
||||
if (is_disabled()) {
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
|
||||
"reason might be out of space prevention", sst->get_filename()))));
|
||||
}
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
@@ -2297,8 +2307,11 @@ compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, compaction
|
||||
compaction_progress_monitor monitor;
|
||||
compaction_data info = create_compaction_data();
|
||||
compaction_descriptor desc = split_compaction_task_executor::make_descriptor(sst, opt);
|
||||
desc.creator = [&t] (shard_id _) {
|
||||
return t.make_sstable();
|
||||
desc.creator = [&t, sst] (shard_id _) {
|
||||
// NOTE: preserves the sstable state, since we want the output to be on the same state as the original.
|
||||
// For example, if base table has views, it's important that sstable produced by repair will be
|
||||
// in the staging state.
|
||||
return t.make_sstable(sst->state());
|
||||
};
|
||||
desc.replacer = [&] (compaction_completion_desc d) {
|
||||
std::move(d.new_sstables.begin(), d.new_sstables.end(), std::back_inserter(ret));
|
||||
|
||||
@@ -376,7 +376,8 @@ public:
|
||||
// Splits a single SSTable by segregating all its data according to the classifier.
|
||||
// If SSTable doesn't need split, the same input SSTable is returned as output.
|
||||
// If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
|
||||
// Exception is thrown if the input sstable cannot be split due to e.g. out of space prevention.
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
|
||||
|
||||
// Run a custom job for a given table, defined by a function
|
||||
// it completes when future returned by job is ready or returns immediately
|
||||
|
||||
@@ -571,10 +571,10 @@ commitlog_total_space_in_mb: -1
|
||||
# - "none": auditing is disabled (default)
|
||||
# - "table": save audited events in audit.audit_log column family
|
||||
# - "syslog": send audited events via syslog (depends on OS, but usually to /dev/log)
|
||||
# audit: "none"
|
||||
audit: "table"
|
||||
#
|
||||
# List of statement categories that should be audited.
|
||||
# audit_categories: "DCL,DDL,AUTH"
|
||||
audit_categories: "DCL,DDL,AUTH,ADMIN"
|
||||
#
|
||||
# List of tables that should be audited.
|
||||
# audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"
|
||||
|
||||
157
configure.py
157
configure.py
@@ -368,6 +368,87 @@ def find_ninja():
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def find_compiler(name):
|
||||
"""
|
||||
Find a compiler by name, skipping ccache wrapper directories.
|
||||
|
||||
This is useful when using sccache to avoid double-caching through ccache.
|
||||
|
||||
Args:
|
||||
name: The compiler name (e.g., 'clang++', 'clang', 'gcc')
|
||||
|
||||
Returns:
|
||||
Path to the compiler, skipping ccache directories, or None if not found.
|
||||
"""
|
||||
ccache_dirs = {'/usr/lib/ccache', '/usr/lib64/ccache'}
|
||||
for path_dir in os.environ.get('PATH', '').split(os.pathsep):
|
||||
# Skip ccache wrapper directories
|
||||
if os.path.realpath(path_dir) in ccache_dirs or path_dir in ccache_dirs:
|
||||
continue
|
||||
candidate = os.path.join(path_dir, name)
|
||||
if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
def resolve_compilers_for_compiler_cache(args, compiler_cache):
|
||||
"""
|
||||
When using a compiler cache, resolve compiler paths to avoid ccache directories.
|
||||
|
||||
This prevents double-caching when ccache symlinks are in PATH.
|
||||
|
||||
Args:
|
||||
args: The argument namespace with cc and cxx attributes.
|
||||
compiler_cache: Path to the compiler cache binary, or None.
|
||||
"""
|
||||
if not compiler_cache:
|
||||
return
|
||||
if not os.path.isabs(args.cxx):
|
||||
real_cxx = find_compiler(args.cxx)
|
||||
if real_cxx:
|
||||
args.cxx = real_cxx
|
||||
if not os.path.isabs(args.cc):
|
||||
real_cc = find_compiler(args.cc)
|
||||
if real_cc:
|
||||
args.cc = real_cc
|
||||
|
||||
|
||||
def find_compiler_cache(preference):
|
||||
"""
|
||||
Find a compiler cache based on the preference.
|
||||
|
||||
Args:
|
||||
preference: One of 'auto', 'sccache', 'ccache', 'none', or a path to a binary.
|
||||
|
||||
Returns:
|
||||
Path to the compiler cache binary, or None if not found/disabled.
|
||||
"""
|
||||
if preference == 'none':
|
||||
return None
|
||||
|
||||
if preference == 'auto':
|
||||
# Prefer sccache over ccache
|
||||
for cache in ['sccache', 'ccache']:
|
||||
path = which(cache)
|
||||
if path:
|
||||
return path
|
||||
return None
|
||||
|
||||
if preference in ('sccache', 'ccache'):
|
||||
path = which(preference)
|
||||
if path:
|
||||
return path
|
||||
print(f"Warning: {preference} not found on PATH, disabling compiler cache")
|
||||
return None
|
||||
|
||||
# Assume it's a path to a binary
|
||||
if os.path.isfile(preference) and os.access(preference, os.X_OK):
|
||||
return preference
|
||||
|
||||
print(f"Warning: compiler cache '{preference}' not found or not executable, disabling compiler cache")
|
||||
return None
|
||||
|
||||
|
||||
modes = {
|
||||
'debug': {
|
||||
'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
|
||||
@@ -732,6 +813,8 @@ arg_parser.add_argument('--compiler', action='store', dest='cxx', default='clang
|
||||
help='C++ compiler path')
|
||||
arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clang',
|
||||
help='C compiler path')
|
||||
arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
|
||||
help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
|
||||
add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
|
||||
help='Use dpdk (from seastar dpdk sources)')
|
||||
arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
|
||||
@@ -951,6 +1034,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'cql3/functions/aggregate_fcts.cc',
|
||||
'cql3/functions/castas_fcts.cc',
|
||||
'cql3/functions/error_injection_fcts.cc',
|
||||
'cql3/functions/vector_similarity_fcts.cc',
|
||||
'cql3/statements/cf_prop_defs.cc',
|
||||
'cql3/statements/cf_statement.cc',
|
||||
'cql3/statements/authentication_statement.cc',
|
||||
@@ -1370,6 +1454,7 @@ alternator = [
|
||||
'alternator/auth.cc',
|
||||
'alternator/streams.cc',
|
||||
'alternator/ttl.cc',
|
||||
'alternator/http_compression.cc'
|
||||
]
|
||||
|
||||
idls = ['idl/gossip_digest.idl.hh',
|
||||
@@ -1615,6 +1700,7 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/schema_registry_test.cc',
|
||||
'test/boost/secondary_index_test.cc',
|
||||
'test/boost/sessions_test.cc',
|
||||
'test/boost/simple_value_with_expiry_test.cc',
|
||||
'test/boost/sstable_compaction_test.cc',
|
||||
'test/boost/sstable_compressor_factory_test.cc',
|
||||
'test/boost/sstable_compression_config_test.cc',
|
||||
@@ -1698,6 +1784,18 @@ deps['test/vector_search/vector_store_client_test'] = ['test/vector_search/vect
|
||||
deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
|
||||
deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies
|
||||
|
||||
boost_tests_prefixes = ["test/boost/", "test/vector_search/", "test/raft/", "test/manual/", "test/ldap/"]
|
||||
|
||||
# We need to link these files to all Boost tests to make sure that
|
||||
# we can execute `--list_json_content` on them. That will produce
|
||||
# a similar result as calling `--list_content={HRF,DOT}`.
|
||||
# Unfortunately, to be able to do that, we're forced to link the
|
||||
# relevant code by hand.
|
||||
for key in deps.keys():
|
||||
for prefix in boost_tests_prefixes:
|
||||
if key.startswith(prefix):
|
||||
deps[key] += ["test/lib/boost_tree_lister_injector.cc", "test/lib/boost_test_tree_lister.cc"]
|
||||
|
||||
wasm_deps = {}
|
||||
|
||||
wasm_deps['wasm/return_input.wat'] = 'test/resource/wasm/rust/return_input.rs'
|
||||
@@ -2002,7 +2100,7 @@ def semicolon_separated(*flags):
|
||||
def real_relpath(path, start):
|
||||
return os.path.relpath(os.path.realpath(path), os.path.realpath(start))
|
||||
|
||||
def configure_seastar(build_dir, mode, mode_config):
|
||||
def configure_seastar(build_dir, mode, mode_config, compiler_cache=None):
|
||||
seastar_cxx_ld_flags = mode_config['cxx_ld_flags']
|
||||
# We want to "undo" coverage for seastar if we have it enabled.
|
||||
if args.coverage:
|
||||
@@ -2049,6 +2147,10 @@ def configure_seastar(build_dir, mode, mode_config):
|
||||
'-DSeastar_IO_URING=ON',
|
||||
]
|
||||
|
||||
if compiler_cache:
|
||||
seastar_cmake_args += [f'-DCMAKE_CXX_COMPILER_LAUNCHER={compiler_cache}',
|
||||
f'-DCMAKE_C_COMPILER_LAUNCHER={compiler_cache}']
|
||||
|
||||
if args.stack_guards is not None:
|
||||
stack_guards = 'ON' if args.stack_guards else 'OFF'
|
||||
seastar_cmake_args += ['-DSeastar_STACK_GUARDS={}'.format(stack_guards)]
|
||||
@@ -2080,7 +2182,7 @@ def configure_seastar(build_dir, mode, mode_config):
|
||||
subprocess.check_call(seastar_cmd, shell=False, cwd=cmake_dir)
|
||||
|
||||
|
||||
def configure_abseil(build_dir, mode, mode_config):
|
||||
def configure_abseil(build_dir, mode, mode_config, compiler_cache=None):
|
||||
abseil_cflags = mode_config['lib_cflags']
|
||||
cxx_flags = mode_config['cxxflags']
|
||||
if '-DSANITIZE' in cxx_flags:
|
||||
@@ -2106,6 +2208,10 @@ def configure_abseil(build_dir, mode, mode_config):
|
||||
'-DABSL_PROPAGATE_CXX_STD=ON',
|
||||
]
|
||||
|
||||
if compiler_cache:
|
||||
abseil_cmake_args += [f'-DCMAKE_CXX_COMPILER_LAUNCHER={compiler_cache}',
|
||||
f'-DCMAKE_C_COMPILER_LAUNCHER={compiler_cache}']
|
||||
|
||||
cmake_args = abseil_cmake_args[:]
|
||||
abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
|
||||
abseil_cmd = ['cmake', '-G', 'Ninja', real_relpath('abseil', abseil_build_dir)] + cmake_args
|
||||
@@ -2251,15 +2357,6 @@ def get_extra_cxxflags(mode, mode_config, cxx, debuginfo):
|
||||
if debuginfo and mode_config['can_have_debug_info']:
|
||||
cxxflags += ['-g', '-gz']
|
||||
|
||||
if 'clang' in cxx:
|
||||
# Since AssignmentTracking was enabled by default in clang
|
||||
# (llvm/llvm-project@de6da6ad55d3ca945195d1cb109cb8efdf40a52a)
|
||||
# coroutine frame debugging info (`coro_frame_ty`) is broken.
|
||||
#
|
||||
# It seems that we aren't losing much by disabling AssigmentTracking,
|
||||
# so for now we choose to disable it to get `coro_frame_ty` back.
|
||||
cxxflags.append('-Xclang -fexperimental-assignment-tracking=disabled')
|
||||
|
||||
return cxxflags
|
||||
|
||||
|
||||
@@ -2287,10 +2384,15 @@ def write_build_file(f,
|
||||
scylla_product,
|
||||
scylla_version,
|
||||
scylla_release,
|
||||
compiler_cache,
|
||||
args):
|
||||
use_precompiled_header = not args.disable_precompiled_header
|
||||
warnings = get_warning_options(args.cxx)
|
||||
rustc_target = pick_rustc_target('wasm32-wasi', 'wasm32-wasip1')
|
||||
# If compiler cache is available, prefix the compiler with it
|
||||
cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
|
||||
# For Rust, sccache is used via RUSTC_WRAPPER environment variable
|
||||
rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
|
||||
f.write(textwrap.dedent('''\
|
||||
configure_args = {configure_args}
|
||||
builddir = {outdir}
|
||||
@@ -2353,7 +2455,7 @@ def write_build_file(f,
|
||||
command = clang --target=wasm32 --no-standard-libraries -Wl,--export-all -Wl,--no-entry $in -o $out
|
||||
description = C2WASM $out
|
||||
rule rust2wasm
|
||||
command = cargo build --target={rustc_target} --example=$example --locked --manifest-path=test/resource/wasm/rust/Cargo.toml --target-dir=$builddir/wasm/ $
|
||||
command = {rustc_wrapper}cargo build --target={rustc_target} --example=$example --locked --manifest-path=test/resource/wasm/rust/Cargo.toml --target-dir=$builddir/wasm/ $
|
||||
&& wasm-opt -Oz $builddir/wasm/{rustc_target}/debug/examples/$example.wasm -o $builddir/wasm/$example.wasm $
|
||||
&& wasm-strip $builddir/wasm/$example.wasm
|
||||
description = RUST2WASM $out
|
||||
@@ -2369,7 +2471,7 @@ def write_build_file(f,
|
||||
command = llvm-profdata merge $in -output=$out
|
||||
''').format(configure_args=configure_args,
|
||||
outdir=outdir,
|
||||
cxx=args.cxx,
|
||||
cxx=cxx_with_cache,
|
||||
user_cflags=user_cflags,
|
||||
warnings=warnings,
|
||||
defines=defines,
|
||||
@@ -2377,6 +2479,7 @@ def write_build_file(f,
|
||||
user_ldflags=user_ldflags,
|
||||
libs=libs,
|
||||
rustc_target=rustc_target,
|
||||
rustc_wrapper=rustc_wrapper,
|
||||
link_pool_depth=link_pool_depth,
|
||||
seastar_path=args.seastar_path,
|
||||
ninja=ninja,
|
||||
@@ -2461,10 +2564,10 @@ def write_build_file(f,
|
||||
description = TEST {mode}
|
||||
# This rule is unused for PGO stages. They use the rust lib from the parent mode.
|
||||
rule rust_lib.{mode}
|
||||
command = CARGO_BUILD_DEP_INFO_BASEDIR='.' cargo build --locked --manifest-path=rust/Cargo.toml --target-dir=$builddir/{mode} --profile=rust-{mode} $
|
||||
command = CARGO_BUILD_DEP_INFO_BASEDIR='.' {rustc_wrapper}cargo build --locked --manifest-path=rust/Cargo.toml --target-dir=$builddir/{mode} --profile=rust-{mode} $
|
||||
&& touch $out
|
||||
description = RUST_LIB $out
|
||||
''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, **modeval))
|
||||
''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
|
||||
f.write(
|
||||
'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
|
||||
mode=mode,
|
||||
@@ -2528,7 +2631,7 @@ def write_build_file(f,
|
||||
# In debug/sanitize modes, we compile with fsanitizers,
|
||||
# so must use the same options during the link:
|
||||
if '-DSANITIZE' in modes[mode]['cxxflags']:
|
||||
f.write(' libs = -fsanitize=address -fsanitize=undefined\n')
|
||||
f.write(' libs = -fsanitize=address -fsanitize=undefined -lubsan\n')
|
||||
else:
|
||||
f.write(' libs =\n')
|
||||
f.write(f'build $builddir/{mode}/{binary}.stripped: strip $builddir/{mode}/{binary}\n')
|
||||
@@ -2924,6 +3027,9 @@ def create_build_system(args):
|
||||
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
|
||||
compiler_cache = find_compiler_cache(args.compiler_cache)
|
||||
resolve_compilers_for_compiler_cache(args, compiler_cache)
|
||||
|
||||
scylla_product, scylla_version, scylla_release = generate_version(args.date_stamp)
|
||||
|
||||
for mode, mode_config in build_modes.items():
|
||||
@@ -2940,8 +3046,8 @@ def create_build_system(args):
|
||||
# {outdir}/{mode}/seastar/build.ninja, and
|
||||
# {outdir}/{mode}/seastar/seastar.pc is queried for building flags
|
||||
for mode, mode_config in build_modes.items():
|
||||
configure_seastar(outdir, mode, mode_config)
|
||||
configure_abseil(outdir, mode, mode_config)
|
||||
configure_seastar(outdir, mode, mode_config, compiler_cache)
|
||||
configure_abseil(outdir, mode, mode_config, compiler_cache)
|
||||
user_cflags += ' -isystem abseil'
|
||||
|
||||
for mode, mode_config in build_modes.items():
|
||||
@@ -2964,6 +3070,7 @@ def create_build_system(args):
|
||||
scylla_product,
|
||||
scylla_version,
|
||||
scylla_release,
|
||||
compiler_cache,
|
||||
args)
|
||||
generate_compdb('compile_commands.json', ninja, args.buildfile, selected_modes)
|
||||
|
||||
@@ -3006,6 +3113,10 @@ def configure_using_cmake(args):
|
||||
selected_modes = args.selected_modes or default_modes
|
||||
selected_configs = ';'.join(build_modes[mode].cmake_build_type for mode
|
||||
in selected_modes)
|
||||
|
||||
compiler_cache = find_compiler_cache(args.compiler_cache)
|
||||
resolve_compilers_for_compiler_cache(args, compiler_cache)
|
||||
|
||||
settings = {
|
||||
'CMAKE_CONFIGURATION_TYPES': selected_configs,
|
||||
'CMAKE_CROSS_CONFIGS': selected_configs,
|
||||
@@ -3023,6 +3134,14 @@ def configure_using_cmake(args):
|
||||
'Scylla_WITH_DEBUG_INFO' : 'ON' if args.debuginfo else 'OFF',
|
||||
'Scylla_USE_PRECOMPILED_HEADER': 'OFF' if args.disable_precompiled_header else 'ON',
|
||||
}
|
||||
|
||||
if compiler_cache:
|
||||
settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
|
||||
settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
|
||||
# For Rust, sccache is used via RUSTC_WRAPPER
|
||||
if 'sccache' in compiler_cache:
|
||||
settings['Scylla_RUSTC_WRAPPER'] = compiler_cache
|
||||
|
||||
if args.date_stamp:
|
||||
settings['Scylla_DATE_STAMP'] = args.date_stamp
|
||||
if args.staticboost:
|
||||
@@ -3054,7 +3173,7 @@ def configure_using_cmake(args):
|
||||
|
||||
if not args.dist_only:
|
||||
for mode in selected_modes:
|
||||
configure_seastar(build_dir, build_modes[mode].cmake_build_type, modes[mode])
|
||||
configure_seastar(build_dir, build_modes[mode].cmake_build_type, modes[mode], compiler_cache)
|
||||
|
||||
cmake_command = ['cmake']
|
||||
cmake_command += [f'-D{var}={value}' for var, value in settings.items()]
|
||||
|
||||
@@ -47,6 +47,7 @@ target_sources(cql3
|
||||
functions/aggregate_fcts.cc
|
||||
functions/castas_fcts.cc
|
||||
functions/error_injection_fcts.cc
|
||||
functions/vector_similarity_fcts.cc
|
||||
statements/cf_prop_defs.cc
|
||||
statements/cf_statement.cc
|
||||
statements/authentication_statement.cc
|
||||
|
||||
26
cql3/Cql.g
26
cql3/Cql.g
@@ -431,6 +431,7 @@ unaliasedSelector returns [uexpression tmp]
|
||||
| K_TTL '(' c=cident ')' { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::ttl,
|
||||
unresolved_identifier{std::move(c)}}; }
|
||||
| f=functionName args=selectionFunctionArgs { tmp = function_call{std::move(f), std::move(args)}; }
|
||||
| f=similarityFunctionName args=vectorSimilarityArgs { tmp = function_call{std::move(f), std::move(args)}; }
|
||||
| K_CAST '(' arg=unaliasedSelector K_AS t=native_type ')' { tmp = cast{.style = cast::cast_style::sql, .arg = std::move(arg), .type = std::move(t)}; }
|
||||
)
|
||||
( '.' fi=cident { tmp = field_selection{std::move(tmp), std::move(fi)}; }
|
||||
@@ -445,6 +446,18 @@ selectionFunctionArgs returns [std::vector<expression> a]
|
||||
')'
|
||||
;
|
||||
|
||||
vectorSimilarityArgs returns [std::vector<expression> a]
|
||||
: '(' ')'
|
||||
| '(' v1=vectorSimilarityArg { a.push_back(std::move(v1)); }
|
||||
( ',' vn=vectorSimilarityArg { a.push_back(std::move(vn)); } )*
|
||||
')'
|
||||
;
|
||||
|
||||
vectorSimilarityArg returns [uexpression a]
|
||||
: s=unaliasedSelector { a = std::move(s); }
|
||||
| v=value { a = std::move(v); }
|
||||
;
|
||||
|
||||
countArgument
|
||||
: '*'
|
||||
| i=INTEGER { if (i->getText() != "1") {
|
||||
@@ -1683,6 +1696,10 @@ functionName returns [cql3::functions::function_name s]
|
||||
: (ks=keyspaceName '.')? f=allowedFunctionName { $s.keyspace = std::move(ks); $s.name = std::move(f); }
|
||||
;
|
||||
|
||||
similarityFunctionName returns [cql3::functions::function_name s]
|
||||
: f=allowedSimilarityFunctionName { $s = cql3::functions::function_name::native_function(std::move(f)); }
|
||||
;
|
||||
|
||||
allowedFunctionName returns [sstring s]
|
||||
: f=IDENT { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
|
||||
| f=QUOTED_NAME { $s = $f.text; }
|
||||
@@ -1691,6 +1708,11 @@ allowedFunctionName returns [sstring s]
|
||||
| K_COUNT { $s = "count"; }
|
||||
;
|
||||
|
||||
allowedSimilarityFunctionName returns [sstring s]
|
||||
: f=(K_SIMILARITY_COSINE | K_SIMILARITY_EUCLIDEAN | K_SIMILARITY_DOT_PRODUCT)
|
||||
{ $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
|
||||
;
|
||||
|
||||
functionArgs returns [std::vector<expression> a]
|
||||
: '(' ')'
|
||||
| '(' t1=term { a.push_back(std::move(t1)); }
|
||||
@@ -2387,6 +2409,10 @@ K_MUTATION_FRAGMENTS: M U T A T I O N '_' F R A G M E N T S;
|
||||
|
||||
K_VECTOR_SEARCH_INDEXING: V E C T O R '_' S E A R C H '_' I N D E X I N G;
|
||||
|
||||
K_SIMILARITY_EUCLIDEAN: S I M I L A R I T Y '_' E U C L I D E A N;
|
||||
K_SIMILARITY_COSINE: S I M I L A R I T Y '_' C O S I N E;
|
||||
K_SIMILARITY_DOT_PRODUCT: S I M I L A R I T Y '_' D O T '_' P R O D U C T;
|
||||
|
||||
// Case-insensitive alpha characters
|
||||
fragment A: ('a'|'A');
|
||||
fragment B: ('b'|'B');
|
||||
|
||||
@@ -25,6 +25,11 @@ public:
|
||||
NOT_ASSIGNABLE,
|
||||
};
|
||||
|
||||
struct vector_test_result {
|
||||
test_result result;
|
||||
std::optional<size_t> dimension_opt;
|
||||
};
|
||||
|
||||
static bool is_assignable(test_result tr) {
|
||||
return tr != test_result::NOT_ASSIGNABLE;
|
||||
}
|
||||
@@ -44,6 +49,8 @@ public:
|
||||
*/
|
||||
virtual test_result test_assignment(data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, const column_specification& receiver) const = 0;
|
||||
|
||||
virtual vector_test_result test_assignment_any_size_float_vector() const = 0;
|
||||
|
||||
virtual std::optional<data_type> assignment_testable_type_opt() const = 0;
|
||||
|
||||
// for error reporting
|
||||
|
||||
@@ -1434,6 +1434,112 @@ test_assignment(const expression& expr, data_dictionary::database db, const sstr
|
||||
}, expr);
|
||||
}
|
||||
|
||||
template <cql3_type::kind... Kinds>
|
||||
assignment_testable::vector_test_result
|
||||
test_assignment_any_size_float_vector(const expression& expr) {
|
||||
using test_result = assignment_testable::vector_test_result;
|
||||
const test_result NOT_ASSIGNABLE = {assignment_testable::test_result::NOT_ASSIGNABLE, std::nullopt};
|
||||
const test_result WEAKLY_ASSIGNABLE = {assignment_testable::test_result::WEAKLY_ASSIGNABLE, std::nullopt};
|
||||
auto is_float_or_bind = [] (const expression& e) {
|
||||
return expr::visit(overloaded_functor{
|
||||
[] (const bind_variable&) {
|
||||
return true;
|
||||
},
|
||||
[] (const untyped_constant& uc) {
|
||||
return uc.partial_type == untyped_constant::type_class::floating_point
|
||||
|| uc.partial_type == untyped_constant::type_class::integer;
|
||||
},
|
||||
[] (const constant& value) {
|
||||
auto kind = value.type->as_cql3_type().get_kind();
|
||||
return cql3_type::kind_enum_set::frozen<Kinds...>().contains(kind);
|
||||
},
|
||||
[] (const auto&) {
|
||||
return false;
|
||||
},
|
||||
}, e);
|
||||
};
|
||||
auto validate_assignment = [&] (const data_type& dt) -> test_result {
|
||||
auto vt = dynamic_pointer_cast<const vector_type_impl>(dt->underlying_type());
|
||||
if (!vt) {
|
||||
return NOT_ASSIGNABLE;
|
||||
}
|
||||
auto elem_kind = vt->get_elements_type()->as_cql3_type().get_kind();
|
||||
if (cql3_type::kind_enum_set::frozen<Kinds...>().contains(elem_kind)) {
|
||||
return {assignment_testable::test_result::WEAKLY_ASSIGNABLE, vt->get_dimension()};
|
||||
}
|
||||
return NOT_ASSIGNABLE;
|
||||
};
|
||||
return expr::visit(overloaded_functor{
|
||||
[&] (const constant& value) -> test_result {
|
||||
return validate_assignment(value.type);
|
||||
},
|
||||
[&] (const binary_operator&) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const conjunction&) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const column_value& col_val) -> test_result {
|
||||
return validate_assignment(col_val.col->type);
|
||||
},
|
||||
[&] (const subscript&) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const unresolved_identifier& ui) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const column_mutation_attribute& cma) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const function_call& fc) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const cast& c) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const field_selection& fs) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const bind_variable& bv) -> test_result {
|
||||
return WEAKLY_ASSIGNABLE;
|
||||
},
|
||||
[&] (const untyped_constant& uc) -> test_result {
|
||||
return uc.partial_type == untyped_constant::type_class::null
|
||||
? WEAKLY_ASSIGNABLE
|
||||
: NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const tuple_constructor& tc) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const collection_constructor& c) -> test_result {
|
||||
switch (c.style) {
|
||||
case collection_constructor::style_type::list_or_vector: {
|
||||
if(std::ranges::all_of(c.elements, is_float_or_bind)) {
|
||||
return {assignment_testable::test_result::WEAKLY_ASSIGNABLE, c.elements.size()};
|
||||
}
|
||||
return NOT_ASSIGNABLE;
|
||||
}
|
||||
case collection_constructor::style_type::set: return NOT_ASSIGNABLE;
|
||||
case collection_constructor::style_type::map: return NOT_ASSIGNABLE;
|
||||
case collection_constructor::style_type::vector:
|
||||
on_internal_error(expr_logger, "vector style type found in test_assignment, should have been introduced post-prepare");
|
||||
}
|
||||
on_internal_error(expr_logger, fmt::format("unexpected collection_constructor style {}", static_cast<unsigned>(c.style)));
|
||||
},
|
||||
[&] (const usertype_constructor& uc) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
[&] (const temporary& t) -> test_result {
|
||||
return NOT_ASSIGNABLE;
|
||||
},
|
||||
}, expr);
|
||||
}
|
||||
|
||||
assignment_testable::vector_test_result
|
||||
test_assignment_any_size_float_vector(const expression& expr) {
|
||||
return test_assignment_any_size_float_vector<cql3_type::kind::FLOAT, cql3_type::kind::DOUBLE>(expr);
|
||||
}
|
||||
|
||||
expression
|
||||
prepare_expression(const expression& expr, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
|
||||
auto e_opt = try_prepare_expression(expr, db, keyspace, schema_opt, std::move(receiver));
|
||||
@@ -1467,6 +1573,9 @@ public:
|
||||
virtual test_result test_assignment(data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, const column_specification& receiver) const override {
|
||||
return expr::test_assignment(_e, db, keyspace, schema_opt, receiver);
|
||||
}
|
||||
virtual vector_test_result test_assignment_any_size_float_vector() const override {
|
||||
return expr::test_assignment_any_size_float_vector(_e);
|
||||
}
|
||||
virtual sstring assignment_testable_source_context() const override {
|
||||
return fmt::format("{}", _e);
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "cql3/functions/user_function.hh"
|
||||
#include "cql3/functions/user_aggregate.hh"
|
||||
#include "cql3/functions/uuid_fcts.hh"
|
||||
#include "cql3/functions/vector_similarity_fcts.hh"
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "as_json_function.hh"
|
||||
#include "cql3/prepare_context.hh"
|
||||
@@ -398,6 +399,14 @@ functions::get(data_dictionary::database db,
|
||||
}
|
||||
});
|
||||
|
||||
const auto func_name = name.has_keyspace() ? name : name.as_native_function();
|
||||
if (SIMILARITY_FUNCTIONS.contains(func_name)) {
|
||||
auto arg_types = retrieve_vector_arg_types(func_name, provided_args);
|
||||
auto fun = ::make_shared<vector_similarity_fct>(func_name.name, arg_types);
|
||||
validate_types(db, keyspace, schema.get(), fun, provided_args, receiver_ks, receiver_cf);
|
||||
return fun;
|
||||
}
|
||||
|
||||
if (name.has_keyspace()
|
||||
? name == TOKEN_FUNCTION_NAME
|
||||
: name.name == TOKEN_FUNCTION_NAME.name) {
|
||||
|
||||
150
cql3/functions/vector_similarity_fcts.cc
Normal file
150
cql3/functions/vector_similarity_fcts.cc
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "vector_similarity_fcts.hh"
|
||||
#include "types/types.hh"
|
||||
#include "types/vector.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
|
||||
namespace cql3 {
|
||||
namespace functions {
|
||||
namespace {
|
||||
|
||||
// The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
|
||||
// There exist tests checking the compliance of the results.
|
||||
// Reference:
|
||||
// https://github.com/datastax/jvector/blob/f967f1c9249035b63b55a566fac7d4dc38380349/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorSimilarityFunction.java#L36-L69
|
||||
|
||||
// You should only use this function if you need to preserve the original vectors and cannot normalize
|
||||
// them in advance.
|
||||
float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
|
||||
double dot_product = 0.0;
|
||||
double squared_norm_a = 0.0;
|
||||
double squared_norm_b = 0.0;
|
||||
|
||||
for (size_t i = 0; i < v1.size(); ++i) {
|
||||
double a = value_cast<float>(v1[i]);
|
||||
double b = value_cast<float>(v2[i]);
|
||||
|
||||
dot_product += a * b;
|
||||
squared_norm_a += a * a;
|
||||
squared_norm_b += b * b;
|
||||
}
|
||||
|
||||
if (squared_norm_a == 0 || squared_norm_b == 0) {
|
||||
throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
|
||||
}
|
||||
|
||||
// The cosine similarity is in the range [-1, 1].
|
||||
// It is mapped to a similarity score in the range [0, 1] (-1 -> 0, 1 -> 1)
|
||||
// for consistency with other similarity functions.
|
||||
return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
|
||||
}
|
||||
|
||||
float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
|
||||
double sum = 0.0;
|
||||
|
||||
for (size_t i = 0; i < v1.size(); ++i) {
|
||||
double a = value_cast<float>(v1[i]);
|
||||
double b = value_cast<float>(v2[i]);
|
||||
|
||||
double diff = a - b;
|
||||
sum += diff * diff;
|
||||
}
|
||||
|
||||
// The squared Euclidean (L2) distance is of range [0, inf).
|
||||
// It is mapped to a similarity score in the range (0, 1] (0 -> 1, inf -> 0)
|
||||
// for consistency with other similarity functions.
|
||||
return (1 / (1 + sum));
|
||||
}
|
||||
|
||||
// Assumes that both vectors are L2-normalized.
|
||||
// This similarity is intended as an optimized way to perform cosine similarity calculation.
|
||||
float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
|
||||
double dot_product = 0.0;
|
||||
|
||||
for (size_t i = 0; i < v1.size(); ++i) {
|
||||
double a = value_cast<float>(v1[i]);
|
||||
double b = value_cast<float>(v2[i]);
|
||||
dot_product += a * b;
|
||||
}
|
||||
|
||||
// The dot product is in the range [-1, 1] for L2-normalized vectors.
|
||||
// It is mapped to a similarity score in the range [0, 1] (-1 -> 0, 1 -> 1)
|
||||
// for consistency with other similarity functions.
|
||||
return ((1 + dot_product) / 2);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Dispatch table mapping each supported similarity function name to the
// kernel that computes its score. Looked up by
// vector_similarity_fct::execute() via the function's own name.
// NOTE(review): declared thread_local — presumably to fit the per-shard
// data model used elsewhere in the codebase; confirm it is needed for a
// const map of plain function pointers.
thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS = {
    {SIMILARITY_COSINE_FUNCTION_NAME, compute_cosine_similarity},
    {SIMILARITY_EUCLIDEAN_FUNCTION_NAME, compute_euclidean_similarity},
    {SIMILARITY_DOT_PRODUCT_FUNCTION_NAME, compute_dot_product_similarity},
};
|
||||
|
||||
std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args) {
|
||||
if (provided_args.size() != 2) {
|
||||
throw exceptions::invalid_request_exception(fmt::format("Invalid number of arguments for function {}(vector<float, n>, vector<float, n>)", name));
|
||||
}
|
||||
|
||||
auto [first_result, first_dim_opt] = provided_args[0]->test_assignment_any_size_float_vector();
|
||||
auto [second_result, second_dim_opt] = provided_args[1]->test_assignment_any_size_float_vector();
|
||||
|
||||
auto invalid_type_error_message = [&name](const shared_ptr<assignment_testable>& arg) {
|
||||
auto type = arg->assignment_testable_type_opt();
|
||||
const auto& source_context = arg->assignment_testable_source_context();
|
||||
if (type) {
|
||||
return fmt::format("Function {} requires a float vector argument, but found {} of type {}", name, source_context, type.value()->cql3_type_name());
|
||||
} else {
|
||||
return fmt::format("Function {} requires a float vector argument, but found {}", name, source_context);
|
||||
}
|
||||
};
|
||||
|
||||
if (!is_assignable(first_result)) {
|
||||
throw exceptions::invalid_request_exception(invalid_type_error_message(provided_args[0]));
|
||||
}
|
||||
if (!is_assignable(second_result)) {
|
||||
throw exceptions::invalid_request_exception(invalid_type_error_message(provided_args[1]));
|
||||
}
|
||||
|
||||
if (!first_dim_opt && !second_dim_opt) {
|
||||
throw exceptions::invalid_request_exception(fmt::format("Cannot infer type of argument {} for function {}(vector<float, n>, vector<float, n>)",
|
||||
provided_args[0]->assignment_testable_source_context(), name));
|
||||
}
|
||||
if (first_dim_opt && second_dim_opt) {
|
||||
if (*first_dim_opt != *second_dim_opt) {
|
||||
throw exceptions::invalid_request_exception(fmt::format(
|
||||
"All arguments must have the same vector dimensions, but found vector<float, {}> and vector<float, {}>", *first_dim_opt, *second_dim_opt));
|
||||
}
|
||||
}
|
||||
|
||||
size_t dimension = first_dim_opt ? *first_dim_opt : *second_dim_opt;
|
||||
auto type = vector_type_impl::get_instance(float_type, dimension);
|
||||
return {type, type};
|
||||
}
|
||||
|
||||
// Entry point for the SIMILARITY_* scalar functions: deserializes the two
// vector arguments and dispatches to the kernel registered in
// SIMILARITY_FUNCTIONS under this function's name.
// Returns null when any argument is null (null propagation).
bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters) {
    const bool has_null = std::any_of(parameters.begin(), parameters.end(), [](const bytes_opt& param) {
        return !param;
    });
    if (has_null) {
        return std::nullopt;
    }

    // Both arguments have the same vector type, so the first one's type can
    // deserialize either parameter.
    const auto& vec_type = arg_types()[0];
    data_value lhs = vec_type->deserialize(*parameters[0]);
    data_value rhs = vec_type->deserialize(*parameters[1]);
    const auto& lhs_elements = value_cast<std::vector<data_value>>(lhs);
    const auto& rhs_elements = value_cast<std::vector<data_value>>(rhs);

    const float score = SIMILARITY_FUNCTIONS.at(_name)(lhs_elements, rhs_elements);
    return float_type->decompose(score);
}
|
||||
|
||||
} // namespace functions
|
||||
} // namespace cql3
|
||||
37
cql3/functions/vector_similarity_fcts.hh
Normal file
37
cql3/functions/vector_similarity_fcts.hh
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "native_scalar_function.hh"
|
||||
#include "cql3/assignment_testable.hh"
|
||||
#include "cql3/functions/function_name.hh"
|
||||
|
||||
namespace cql3 {
|
||||
namespace functions {
|
||||
|
||||
static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::native_function("similarity_cosine");
|
||||
static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
|
||||
static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");
|
||||
|
||||
using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
|
||||
extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;
|
||||
|
||||
std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
|
||||
|
||||
// Scalar function implementing the SIMILARITY_* built-ins. One class serves
// all variants: the concrete kernel is looked up at execute() time in
// SIMILARITY_FUNCTIONS, keyed by the function's name. Always returns a
// float score.
class vector_similarity_fct : public native_scalar_function {
public:
    // \param name one of the similarity_* native function names (must be a
    //        key of SIMILARITY_FUNCTIONS)
    // \param arg_types the two vector<float, n> argument types; both entries
    //        are identical as produced by retrieve_vector_arg_types()
    vector_similarity_fct(const sstring& name, const std::vector<data_type>& arg_types)
        : native_scalar_function(name, float_type, arg_types) {
    }

    // Computes the similarity score of the two serialized vector arguments;
    // returns null if any argument is null.
    virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
};
|
||||
|
||||
} // namespace functions
|
||||
} // namespace cql3
|
||||
@@ -32,7 +32,7 @@ bool
|
||||
selectable_processes_selection(const expr::expression& selectable) {
|
||||
return expr::visit(overloaded_functor{
|
||||
[&] (const expr::constant&) -> bool {
|
||||
on_internal_error(slogger, "no way to express SELECT constant in the grammar yet");
|
||||
return true;
|
||||
},
|
||||
[&] (const expr::conjunction& conj) -> bool {
|
||||
on_internal_error(slogger, "no way to express 'SELECT a AND b' in the grammar yet");
|
||||
|
||||
@@ -190,7 +190,7 @@ future<utils::chunked_vector<mutation>> batch_statement::get_mutations(query_pro
|
||||
co_return vresult;
|
||||
}
|
||||
|
||||
void batch_statement::verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) {
|
||||
void batch_statement::verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) const {
|
||||
if (mutations.size() <= 1) {
|
||||
return; // We only warn for batch spanning multiple mutations
|
||||
}
|
||||
@@ -209,8 +209,9 @@ void batch_statement::verify_batch_size(query_processor& qp, const utils::chunke
|
||||
for (auto&& m : mutations) {
|
||||
ks_cf_pairs.insert(m.schema()->ks_name() + "." + m.schema()->cf_name());
|
||||
}
|
||||
return seastar::format("Batch modifying {:d} partitions in {} is of size {:d} bytes, exceeding specified {} threshold of {:d} by {:d}.",
|
||||
mutations.size(), fmt::join(ks_cf_pairs, ", "), size, type, threshold, size - threshold);
|
||||
const auto batch_type = _type == type::LOGGED ? "Logged" : "Unlogged";
|
||||
return seastar::format("{} batch modifying {:d} partitions in {} is of size {:d} bytes, exceeding specified {} threshold of {:d} by {:d}.",
|
||||
batch_type, mutations.size(), fmt::join(ks_cf_pairs, ", "), size, type, threshold, size - threshold);
|
||||
};
|
||||
if (size > fail_threshold) {
|
||||
_logger.error("{}", error("FAIL", fail_threshold).c_str());
|
||||
|
||||
@@ -116,7 +116,7 @@ public:
|
||||
* Checks batch size to ensure threshold is met. If not, a warning is logged.
|
||||
* @param cfs ColumnFamilies that will store the batch's mutations.
|
||||
*/
|
||||
static void verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations);
|
||||
void verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) const;
|
||||
|
||||
virtual future<shared_ptr<cql_transport::messages::result_message>> execute(
|
||||
query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const override;
|
||||
|
||||
@@ -710,11 +710,12 @@ std::vector<lw_shared_ptr<column_specification>> listing_describe_statement::get
|
||||
|
||||
future<std::vector<std::vector<managed_bytes_opt>>> listing_describe_statement::describe(cql3::query_processor& qp, const service::client_state& client_state) const {
|
||||
auto db = qp.db();
|
||||
auto raw_ks = client_state.get_raw_keyspace();
|
||||
|
||||
std::vector<sstring> keyspaces;
|
||||
if (!raw_ks.empty()) {
|
||||
keyspaces.push_back(raw_ks);
|
||||
// For most describe statements we should limit the results to the USEd
|
||||
// keyspace (client_state.get_raw_keyspace()), if any. However for DESC
|
||||
// KEYSPACES we must list all keyspaces, not just the USEd one.
|
||||
if (_element != element_type::keyspace && !client_state.get_raw_keyspace().empty()) {
|
||||
keyspaces.push_back(client_state.get_raw_keyspace());
|
||||
} else {
|
||||
keyspaces = db.get_all_keyspaces();
|
||||
std::ranges::sort(keyspaces);
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <seastar/core/metrics.hh>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/core/sleep.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
|
||||
#include "batchlog_manager.hh"
|
||||
#include "batchlog.hh"
|
||||
@@ -318,8 +319,8 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
|
||||
|
||||
std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
|
||||
|
||||
// Use a stable `now` accross all batches, so skip/replay decisions are the
|
||||
// same accross a while prefix of written_at (accross all ids).
|
||||
// Use a stable `now` across all batches, so skip/replay decisions are the
|
||||
// same across a while prefix of written_at (across all ids).
|
||||
const auto now = db_clock::now();
|
||||
|
||||
auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
|
||||
23
db/config.cc
23
db/config.cc
@@ -1105,6 +1105,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Like native_transport_port, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
|
||||
, native_shard_aware_transport_port_ssl(this, "native_shard_aware_transport_port_ssl", value_status::Used, 19142,
|
||||
"Like native_transport_port_ssl, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
|
||||
, native_transport_port_proxy_protocol(this, "native_transport_port_proxy_protocol", value_status::Used, 0,
|
||||
"Port on which the CQL native transport listens for clients using proxy protocol v2. Disabled (0) by default.")
|
||||
, native_transport_port_ssl_proxy_protocol(this, "native_transport_port_ssl_proxy_protocol", value_status::Used, 0,
|
||||
"Port on which the CQL TLS native transport listens for clients using proxy protocol v2. Disabled (0) by default.")
|
||||
, native_shard_aware_transport_port_proxy_protocol(this, "native_shard_aware_transport_port_proxy_protocol", value_status::Used, 0,
|
||||
"Like native_transport_port_proxy_protocol, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
|
||||
, native_shard_aware_transport_port_ssl_proxy_protocol(this, "native_shard_aware_transport_port_ssl_proxy_protocol", value_status::Used, 0,
|
||||
"Like native_transport_port_ssl_proxy_protocol, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
|
||||
, native_transport_max_threads(this, "native_transport_max_threads", value_status::Invalid, 128,
|
||||
"The maximum number of thread handling requests. The meaning is the same as rpc_max_threads.\n"
|
||||
"Default is different (128 versus unlimited).\n"
|
||||
@@ -1470,6 +1478,15 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, alternator_max_expression_cache_entries_per_shard(this, "alternator_max_expression_cache_entries_per_shard", liveness::LiveUpdate, value_status::Used, 2000, "Maximum number of cached parsed request expressions, per shard.")
|
||||
, alternator_max_users_query_size_in_trace_output(this, "alternator_max_users_query_size_in_trace_output", liveness::LiveUpdate, value_status::Used, uint64_t(4096),
|
||||
"Maximum size of user's command in trace output (`alternator_op` entry). Larger traces will be truncated and have `<truncated>` message appended - which doesn't count to the maximum limit.")
|
||||
, alternator_describe_table_info_cache_validity_in_seconds(this, "alternator_describe_table_info_cache_validity_in_seconds", liveness::LiveUpdate, value_status::Used, 60 * 60 * 6,
|
||||
"The validity of DescribeTable information - table size in bytes. This is how long calculated value will be reused before recalculation.")
|
||||
, alternator_response_gzip_compression_level(this, "alternator_response_gzip_compression_level", liveness::LiveUpdate, value_status::Used, int8_t(6),
|
||||
"Controls gzip and deflate compression level for Alternator response bodies (if the client requests it via Accept-Encoding header) Default of 6 is a compromise between speed and compression.\n"
|
||||
"Valid values:\n"
|
||||
"\t0 : No compression (disables gzip/deflate)\n"
|
||||
"\t1-9: Compression levels (1 = fastest, 9 = best compression)")
|
||||
, alternator_response_compression_threshold_in_bytes(this, "alternator_response_compression_threshold_in_bytes", liveness::LiveUpdate, value_status::Used, uint64_t(4096),
|
||||
"When the compression is enabled, this value indicates the minimum size of data to compress. Smaller responses will not be compressed.")
|
||||
, abort_on_ebadf(this, "abort_on_ebadf", value_status::Used, true, "Abort the server on incorrect file descriptor access. Throws exception when disabled.")
|
||||
, sanitizer_report_backtrace(this, "sanitizer_report_backtrace", value_status::Used, false,
|
||||
"In debug mode, report log-structured allocator sanitizer violations with a backtrace. Slow.")
|
||||
@@ -1566,6 +1583,12 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
// Bigger tables will take longer to be resized. similar-sized tables can be batched into same iteration.
|
||||
, tablet_load_stats_refresh_interval_in_seconds(this, "tablet_load_stats_refresh_interval_in_seconds", liveness::LiveUpdate, value_status::Used, 60,
|
||||
"Tablet load stats refresh rate in seconds.")
|
||||
, force_capacity_based_balancing(this, "force_capacity_based_balancing", liveness::LiveUpdate, value_status::Used, false,
|
||||
"Forces the load balancer to perform capacity based balancing, instead of size based balancing.")
|
||||
, size_based_balance_threshold_percentage(this, "size_based_balance_threshold_percentage", liveness::LiveUpdate, value_status::Used, 1.0,
|
||||
"Sets the maximum difference in percentages between the most loaded and least loaded nodes, below which the load balancer considers nodes balanced.")
|
||||
, minimal_tablet_size_for_balancing(this, "minimal_tablet_size_for_balancing", liveness::LiveUpdate, value_status::Used, service::default_target_tablet_size / 100,
|
||||
"Sets the minimal tablet size for the load balancer. For any tablet smaller than this, the balancer will use this size instead of the actual tablet size.")
|
||||
, default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
|
||||
, logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
|
||||
, log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")
|
||||
|
||||
10
db/config.hh
10
db/config.hh
@@ -324,6 +324,10 @@ public:
|
||||
named_value<uint16_t> native_transport_port_ssl;
|
||||
named_value<uint16_t> native_shard_aware_transport_port;
|
||||
named_value<uint16_t> native_shard_aware_transport_port_ssl;
|
||||
named_value<uint16_t> native_transport_port_proxy_protocol;
|
||||
named_value<uint16_t> native_transport_port_ssl_proxy_protocol;
|
||||
named_value<uint16_t> native_shard_aware_transport_port_proxy_protocol;
|
||||
named_value<uint16_t> native_shard_aware_transport_port_ssl_proxy_protocol;
|
||||
named_value<uint32_t> native_transport_max_threads;
|
||||
named_value<uint32_t> native_transport_max_frame_size_in_mb;
|
||||
named_value<sstring> broadcast_rpc_address;
|
||||
@@ -473,6 +477,9 @@ public:
|
||||
named_value<bool> alternator_allow_system_table_write;
|
||||
named_value<uint32_t> alternator_max_expression_cache_entries_per_shard;
|
||||
named_value<uint64_t> alternator_max_users_query_size_in_trace_output;
|
||||
named_value<uint32_t> alternator_describe_table_info_cache_validity_in_seconds;
|
||||
named_value<int> alternator_response_gzip_compression_level;
|
||||
named_value<uint32_t> alternator_response_compression_threshold_in_bytes;
|
||||
|
||||
named_value<bool> abort_on_ebadf;
|
||||
|
||||
@@ -590,6 +597,9 @@ public:
|
||||
named_value<bool> rf_rack_valid_keyspaces;
|
||||
|
||||
named_value<uint32_t> tablet_load_stats_refresh_interval_in_seconds;
|
||||
named_value<bool> force_capacity_based_balancing;
|
||||
named_value<float> size_based_balance_threshold_percentage;
|
||||
named_value<uint64_t> minimal_tablet_size_for_balancing;
|
||||
|
||||
static const sstring default_tls_priority;
|
||||
private:
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include <seastar/core/smp.hh>
|
||||
#include <seastar/coroutine/exception.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/util/file.hh>
|
||||
|
||||
// Boost features.
|
||||
|
||||
@@ -643,6 +644,12 @@ future<> manager::drain_for(endpoint_id host_id, gms::inet_address ip) noexcept
|
||||
co_return;
|
||||
}
|
||||
|
||||
if (!replay_allowed()) {
|
||||
auto reason = seastar::format("Precondition violdated while trying to drain {} / {}: "
|
||||
"hint replay is not allowed", host_id, ip);
|
||||
on_internal_error(manager_logger, std::move(reason));
|
||||
}
|
||||
|
||||
manager_logger.info("Draining starts for {}", host_id);
|
||||
|
||||
const auto holder = seastar::gate::holder{_draining_eps_gate};
|
||||
@@ -899,7 +906,7 @@ future<> manager::migrate_ip_directories() {
|
||||
co_await coroutine::parallel_for_each(dirs_to_remove, [] (auto& directory) -> future<> {
|
||||
try {
|
||||
manager_logger.warn("Removing hint directory {}", directory.native());
|
||||
co_await lister::rmdir(directory);
|
||||
co_await seastar::recursive_remove_directory(directory);
|
||||
} catch (...) {
|
||||
on_internal_error(manager_logger,
|
||||
seastar::format("Removing a hint directory has failed. Reason: {}", std::current_exception()));
|
||||
|
||||
@@ -318,6 +318,10 @@ public:
|
||||
/// In both cases - removes the corresponding hints' directories after all hints have been drained and erases the
|
||||
/// corresponding hint_endpoint_manager objects.
|
||||
///
|
||||
/// Preconditions:
|
||||
/// * Hint replay must be allowed (i.e. `replay_allowed()` must be true) throughout
|
||||
/// the execution of this function.
|
||||
///
|
||||
/// \param host_id host ID of the node that left the cluster
|
||||
/// \param ip the IP of the node that left the cluster
|
||||
future<> drain_for(endpoint_id host_id, gms::inet_address ip) noexcept;
|
||||
@@ -342,15 +346,15 @@ public:
|
||||
return _state.contains(state::started);
|
||||
}
|
||||
|
||||
bool replay_allowed() const noexcept {
|
||||
return _state.contains(state::replay_allowed);
|
||||
}
|
||||
|
||||
private:
|
||||
void set_started() noexcept {
|
||||
_state.set(state::started);
|
||||
}
|
||||
|
||||
bool replay_allowed() const noexcept {
|
||||
return _state.contains(state::replay_allowed);
|
||||
}
|
||||
|
||||
void set_draining_all() noexcept {
|
||||
_state.set(state::draining_all);
|
||||
}
|
||||
|
||||
@@ -152,7 +152,8 @@ future<> backup_task_impl::do_backup() {
|
||||
}
|
||||
|
||||
future<> backup_task_impl::process_snapshot_dir() {
|
||||
auto snapshot_dir_lister = directory_lister(_snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
auto directory = co_await io_check(open_directory, _snapshot_dir.native());
|
||||
auto snapshot_dir_lister = directory_lister(directory, _snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
size_t num_sstable_comps = 0;
|
||||
|
||||
try {
|
||||
@@ -161,7 +162,7 @@ future<> backup_task_impl::process_snapshot_dir() {
|
||||
while (auto component_ent = co_await snapshot_dir_lister.get()) {
|
||||
const auto& name = component_ent->name;
|
||||
auto file_path = _snapshot_dir / name;
|
||||
auto st = co_await file_stat(file_path.native());
|
||||
auto st = co_await file_stat(directory, name);
|
||||
total += st.size;
|
||||
try {
|
||||
auto desc = sstables::parse_path(file_path, "", "");
|
||||
|
||||
@@ -55,6 +55,7 @@
|
||||
#include "message/shared_dict.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "db/compaction_history_entry.hh"
|
||||
#include "mutation/async_utils.hh"
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
@@ -2999,7 +3000,9 @@ future<mutation> system_keyspace::get_group0_history(sharded<replica::database>&
|
||||
SCYLLA_ASSERT(rs);
|
||||
auto& ps = rs->partitions();
|
||||
for (auto& p: ps) {
|
||||
auto mut = p.mut().unfreeze(s);
|
||||
// Note: we could decorate the frozen_mutation's key to check if it's the expected one
|
||||
// but since this is a single partition table, we can just check after unfreezing the whole mutation.
|
||||
auto mut = co_await unfreeze_gently(p.mut(), s);
|
||||
auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
|
||||
if (partition_key == GROUP0_HISTORY_KEY) {
|
||||
co_return mut;
|
||||
@@ -3157,7 +3160,10 @@ static bool must_have_tokens(service::node_state nst) {
|
||||
// A decommissioning node doesn't have tokens at the end, they are
|
||||
// removed during transition to the left_token_ring state.
|
||||
case service::node_state::decommissioning: return false;
|
||||
case service::node_state::removing: return true;
|
||||
// A removing node might or might not have tokens depending on whether
|
||||
// REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled. To support both
|
||||
// cases, we allow removing nodes to not have tokens.
|
||||
case service::node_state::removing: return false;
|
||||
case service::node_state::rebuilding: return true;
|
||||
case service::node_state::normal: return true;
|
||||
case service::node_state::left: return false;
|
||||
|
||||
@@ -200,6 +200,7 @@ public:
|
||||
static constexpr auto DICTS = "dicts";
|
||||
static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
|
||||
static constexpr auto CLIENT_ROUTES = "client_routes";
|
||||
static constexpr auto VERSIONS = "versions";
|
||||
|
||||
// auth
|
||||
static constexpr auto ROLES = "roles";
|
||||
|
||||
@@ -198,6 +198,7 @@ future<> view_building_worker::register_staging_sstable_tasks(std::vector<sstabl
|
||||
|
||||
future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
while (!_as.abort_requested()) {
|
||||
bool sleep = false;
|
||||
try {
|
||||
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
co_await create_staging_sstable_tasks();
|
||||
@@ -214,6 +215,14 @@ future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
vbw_logger.warn("Got group0_concurrent_modification while creating staging sstable tasks");
|
||||
} catch (raft::request_aborted&) {
|
||||
vbw_logger.warn("Got raft::request_aborted while creating staging sstable tasks");
|
||||
} catch (...) {
|
||||
vbw_logger.error("Exception while creating staging sstable tasks: {}", std::current_exception());
|
||||
sleep = true;
|
||||
}
|
||||
|
||||
if (sleep) {
|
||||
vbw_logger.debug("Sleeping after exception.");
|
||||
co_await seastar::sleep_abortable(1s, _as).handle_exception([] (auto x) { return make_ready_future<>(); });
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -417,9 +426,12 @@ future<> view_building_worker::check_for_aborted_tasks() {
|
||||
|
||||
auto my_host_id = vbw._db.get_token_metadata().get_topology().my_host_id();
|
||||
auto my_replica = locator::tablet_replica{my_host_id, this_shard_id()};
|
||||
auto tasks_map = vbw._state._batch->tasks; // Potentially, we'll remove elements from the map, so we need a copy to iterate over it
|
||||
for (auto& [id, t]: tasks_map) {
|
||||
auto task_opt = building_state.get_task(t.base_id, my_replica, id);
|
||||
auto it = vbw._state._batch->tasks.begin();
|
||||
while (it != vbw._state._batch->tasks.end()) {
|
||||
auto id = it->first;
|
||||
auto task_opt = building_state.get_task(it->second.base_id, my_replica, id);
|
||||
|
||||
++it; // Advance the iterator before potentially removing the entry from the map.
|
||||
if (!task_opt || task_opt->get().aborted) {
|
||||
co_await vbw._state._batch->abort_task(id);
|
||||
}
|
||||
@@ -449,7 +461,7 @@ static std::unordered_set<table_id> get_ids_of_all_views(replica::database& db,
|
||||
}) | std::ranges::to<std::unordered_set>();;
|
||||
}
|
||||
|
||||
// If `state::processing_base_table` is diffrent that the `view_building_state::currently_processed_base_table`,
|
||||
// If `state::processing_base_table` is different that the `view_building_state::currently_processed_base_table`,
|
||||
// clear the state, save and flush new base table
|
||||
future<> view_building_worker::state::update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as) {
|
||||
if (processing_base_table != building_state.currently_processed_base_table) {
|
||||
@@ -571,8 +583,6 @@ future<> view_building_worker::batch::do_work() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
_vbw.local()._vb_state_machine.event.broadcast();
|
||||
}
|
||||
|
||||
future<> view_building_worker::do_build_range(table_id base_id, std::vector<table_id> views_ids, dht::token last_token, abort_source& as) {
|
||||
@@ -774,13 +784,15 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
|
||||
tasks.insert({id, *task_opt});
|
||||
}
|
||||
#ifdef SEASTAR_DEBUG
|
||||
auto& some_task = tasks.begin()->second;
|
||||
for (auto& [_, t]: tasks) {
|
||||
SCYLLA_ASSERT(t.base_id == some_task.base_id);
|
||||
SCYLLA_ASSERT(t.last_token == some_task.last_token);
|
||||
SCYLLA_ASSERT(t.replica == some_task.replica);
|
||||
SCYLLA_ASSERT(t.type == some_task.type);
|
||||
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
|
||||
{
|
||||
auto& some_task = tasks.begin()->second;
|
||||
for (auto& [_, t]: tasks) {
|
||||
SCYLLA_ASSERT(t.base_id == some_task.base_id);
|
||||
SCYLLA_ASSERT(t.last_token == some_task.last_token);
|
||||
SCYLLA_ASSERT(t.replica == some_task.replica);
|
||||
SCYLLA_ASSERT(t.type == some_task.type);
|
||||
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -811,25 +823,6 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
|
||||
co_return collect_completed_tasks();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -605,8 +605,8 @@ public:
|
||||
}
|
||||
|
||||
static schema_ptr build_schema() {
|
||||
auto id = generate_legacy_id(system_keyspace::NAME, "versions");
|
||||
return schema_builder(system_keyspace::NAME, "versions", std::make_optional(id))
|
||||
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::VERSIONS);
|
||||
return schema_builder(system_keyspace::NAME, system_keyspace::VERSIONS, std::make_optional(id))
|
||||
.with_column("key", utf8_type, column_kind::partition_key)
|
||||
.with_column("version", utf8_type)
|
||||
.with_column("build_mode", utf8_type)
|
||||
@@ -749,6 +749,7 @@ class clients_table : public streaming_virtual_table {
|
||||
.with_column("ssl_protocol", utf8_type)
|
||||
.with_column("username", utf8_type)
|
||||
.with_column("scheduling_group", utf8_type)
|
||||
.with_column("client_options", map_type_impl::get_instance(utf8_type, utf8_type, false))
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}
|
||||
@@ -766,7 +767,7 @@ class clients_table : public streaming_virtual_table {
|
||||
|
||||
future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
|
||||
// Collect
|
||||
using client_data_vec = utils::chunked_vector<client_data>;
|
||||
using client_data_vec = utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>;
|
||||
using shard_client_data = std::vector<client_data_vec>;
|
||||
std::vector<foreign_ptr<std::unique_ptr<shard_client_data>>> cd_vec;
|
||||
cd_vec.resize(smp::count);
|
||||
@@ -806,13 +807,13 @@ class clients_table : public streaming_virtual_table {
|
||||
for (unsigned i = 0; i < smp::count; i++) {
|
||||
for (auto&& ps_cdc : *cd_vec[i]) {
|
||||
for (auto&& cd : ps_cdc) {
|
||||
if (cd_map.contains(cd.ip)) {
|
||||
cd_map[cd.ip].emplace_back(std::move(cd));
|
||||
if (cd_map.contains(cd->ip)) {
|
||||
cd_map[cd->ip].emplace_back(std::move(cd));
|
||||
} else {
|
||||
dht::decorated_key key = make_partition_key(cd.ip);
|
||||
dht::decorated_key key = make_partition_key(cd->ip);
|
||||
if (this_shard_owns(key) && contains_key(qr.partition_range(), key)) {
|
||||
ips.insert(decorated_ip{std::move(key), cd.ip});
|
||||
cd_map[cd.ip].emplace_back(std::move(cd));
|
||||
ips.insert(decorated_ip{std::move(key), cd->ip});
|
||||
cd_map[cd->ip].emplace_back(std::move(cd));
|
||||
}
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
@@ -825,39 +826,58 @@ class clients_table : public streaming_virtual_table {
|
||||
co_await result.emit_partition_start(dip.key);
|
||||
auto& clients = cd_map[dip.ip];
|
||||
|
||||
std::ranges::sort(clients, [] (const client_data& a, const client_data& b) {
|
||||
return a.port < b.port || a.client_type_str() < b.client_type_str();
|
||||
std::ranges::sort(clients, [] (const foreign_ptr<std::unique_ptr<client_data>>& a, const foreign_ptr<std::unique_ptr<client_data>>& b) {
|
||||
return a->port < b->port || a->client_type_str() < b->client_type_str();
|
||||
});
|
||||
|
||||
for (const auto& cd : clients) {
|
||||
clustering_row cr(make_clustering_key(cd.port, cd.client_type_str()));
|
||||
set_cell(cr.cells(), "shard_id", cd.shard_id);
|
||||
set_cell(cr.cells(), "connection_stage", cd.stage_str());
|
||||
if (cd.driver_name) {
|
||||
set_cell(cr.cells(), "driver_name", *cd.driver_name);
|
||||
clustering_row cr(make_clustering_key(cd->port, cd->client_type_str()));
|
||||
set_cell(cr.cells(), "shard_id", cd->shard_id);
|
||||
set_cell(cr.cells(), "connection_stage", cd->stage_str());
|
||||
if (cd->driver_name) {
|
||||
set_cell(cr.cells(), "driver_name", cd->driver_name->key());
|
||||
}
|
||||
if (cd.driver_version) {
|
||||
set_cell(cr.cells(), "driver_version", *cd.driver_version);
|
||||
if (cd->driver_version) {
|
||||
set_cell(cr.cells(), "driver_version", cd->driver_version->key());
|
||||
}
|
||||
if (cd.hostname) {
|
||||
set_cell(cr.cells(), "hostname", *cd.hostname);
|
||||
if (cd->hostname) {
|
||||
set_cell(cr.cells(), "hostname", *cd->hostname);
|
||||
}
|
||||
if (cd.protocol_version) {
|
||||
set_cell(cr.cells(), "protocol_version", *cd.protocol_version);
|
||||
if (cd->protocol_version) {
|
||||
set_cell(cr.cells(), "protocol_version", *cd->protocol_version);
|
||||
}
|
||||
if (cd.ssl_cipher_suite) {
|
||||
set_cell(cr.cells(), "ssl_cipher_suite", *cd.ssl_cipher_suite);
|
||||
if (cd->ssl_cipher_suite) {
|
||||
set_cell(cr.cells(), "ssl_cipher_suite", *cd->ssl_cipher_suite);
|
||||
}
|
||||
if (cd.ssl_enabled) {
|
||||
set_cell(cr.cells(), "ssl_enabled", *cd.ssl_enabled);
|
||||
if (cd->ssl_enabled) {
|
||||
set_cell(cr.cells(), "ssl_enabled", *cd->ssl_enabled);
|
||||
}
|
||||
if (cd.ssl_protocol) {
|
||||
set_cell(cr.cells(), "ssl_protocol", *cd.ssl_protocol);
|
||||
if (cd->ssl_protocol) {
|
||||
set_cell(cr.cells(), "ssl_protocol", *cd->ssl_protocol);
|
||||
}
|
||||
set_cell(cr.cells(), "username", cd.username ? *cd.username : sstring("anonymous"));
|
||||
if (cd.scheduling_group_name) {
|
||||
set_cell(cr.cells(), "scheduling_group", *cd.scheduling_group_name);
|
||||
set_cell(cr.cells(), "username", cd->username ? *cd->username : sstring("anonymous"));
|
||||
if (cd->scheduling_group_name) {
|
||||
set_cell(cr.cells(), "scheduling_group", *cd->scheduling_group_name);
|
||||
}
|
||||
|
||||
auto map_type = map_type_impl::get_instance(
|
||||
utf8_type,
|
||||
utf8_type,
|
||||
false
|
||||
);
|
||||
|
||||
auto prepare_client_options = [] (const auto& client_options) {
|
||||
map_type_impl::native_type tmp;
|
||||
for (auto& co: client_options) {
|
||||
auto map_element = std::make_pair(data_value(co.key.key()), data_value(co.value.key()));
|
||||
tmp.push_back(std::move(map_element));
|
||||
}
|
||||
return tmp;
|
||||
};
|
||||
|
||||
set_cell(cr.cells(), "client_options",
|
||||
make_map_value(map_type, prepare_client_options(cd->client_options)));
|
||||
|
||||
co_await result.emit_row(std::move(cr));
|
||||
}
|
||||
co_await result.emit_partition_end();
|
||||
@@ -1100,9 +1120,10 @@ public:
|
||||
}
|
||||
|
||||
auto tm = _db.local().get_token_metadata_ptr();
|
||||
auto target_tablet_size = _db.local().get_config().target_tablet_size_in_bytes();
|
||||
|
||||
locator::load_sketch load(tm);
|
||||
const uint64_t default_tablet_size = _db.local().get_config().target_tablet_size_in_bytes();
|
||||
|
||||
locator::load_sketch load(tm, stats, default_tablet_size);
|
||||
co_await load.populate();
|
||||
|
||||
tm->get_topology().for_each_node([&] (const auto& node) {
|
||||
@@ -1116,18 +1137,23 @@ public:
|
||||
if (auto ip = _gossiper.local().get_address_map().find(host)) {
|
||||
set_cell(r.cells(), "ip", data_value(inet_address(*ip)));
|
||||
}
|
||||
set_cell(r.cells(), "tablets_allocated", load.get_load(host));
|
||||
set_cell(r.cells(), "tablets_allocated_per_shard", data_value(double(load.get_real_avg_shard_load(host))));
|
||||
set_cell(r.cells(), "storage_allocated_load", data_value(int64_t(load.get_load(host) * target_tablet_size)));
|
||||
set_cell(r.cells(), "tablets_allocated", int64_t(load.get_tablet_count(host)));
|
||||
set_cell(r.cells(), "tablets_allocated_per_shard", data_value(double(load.get_real_avg_tablet_count(host))));
|
||||
set_cell(r.cells(), "storage_allocated_load", data_value(int64_t(load.get_tablet_count(host) * default_tablet_size)));
|
||||
|
||||
if (stats && stats->capacity.contains(host)) {
|
||||
auto capacity = stats->capacity.at(host);
|
||||
set_cell(r.cells(), "storage_capacity", data_value(int64_t(capacity)));
|
||||
|
||||
auto utilization = load.get_allocated_utilization(host, *stats, target_tablet_size);
|
||||
if (utilization) {
|
||||
if (auto utilization = load.get_allocated_utilization(host)) {
|
||||
set_cell(r.cells(), "storage_allocated_utilization", data_value(double(*utilization)));
|
||||
}
|
||||
if (load.has_complete_data(host)) {
|
||||
if (auto utilization = load.get_storage_utilization(host)) {
|
||||
set_cell(r.cells(), "storage_utilization", data_value(double(*utilization)));
|
||||
}
|
||||
set_cell(r.cells(), "storage_load", data_value(int64_t(load.get_disk_used(host))));
|
||||
}
|
||||
}
|
||||
mutation_sink(m);
|
||||
});
|
||||
@@ -1147,6 +1173,8 @@ private:
|
||||
.with_column("storage_capacity", long_type)
|
||||
.with_column("storage_allocated_load", long_type)
|
||||
.with_column("storage_allocated_utilization", double_type)
|
||||
.with_column("storage_load", long_type)
|
||||
.with_column("storage_utilization", double_type)
|
||||
.with_sharder(1, 0) // shard0-only
|
||||
.with_hash_version()
|
||||
.build();
|
||||
|
||||
@@ -271,6 +271,12 @@ is different, or can be configured in Alternator:
|
||||
So for example, if you create a table whose name is 192 characters, you
|
||||
can't create a GSI whose name is longer than 29 characters.
|
||||
|
||||
* DynamoDB's DescribeTable will return information about the table. According to
|
||||
AWS documentation, fields TableSizeBytes, IndexSizeBytes and ItemCount can
|
||||
lag behind by up to 6 hours.
|
||||
The `alternator_describe_table_info_cache_validity_in_seconds` parameter allows
|
||||
users to change this timeout - the default value in seconds is set to 21600 (6 hours).
|
||||
|
||||
## Experimental API features
|
||||
|
||||
Some DynamoDB API features are supported by Alternator, but considered
|
||||
@@ -290,6 +296,14 @@ experimental:
|
||||
considered experimental so needs to be enabled explicitly with the
|
||||
`--experimental-features=alternator-streams` configuration option.
|
||||
|
||||
In this version, Alternator Streams is only supported if the base table
|
||||
uses vnodes instead of tablets. However, by default new tables use tablets
|
||||
so to create a table that can be used with Streams, you must set the tag
|
||||
`system:initial_tablets` set to `none` during CreateTable - so that the
|
||||
new table will use vnodes. Streams cannot be enabled on an already-existing
|
||||
table that uses tablets.
|
||||
See <https://github.com/scylladb/scylla/issues/23838>.
|
||||
|
||||
Alternator streams also differ in some respects from DynamoDB Streams:
|
||||
* The number of separate "shards" in Alternator's streams is significantly
|
||||
larger than is typical on DynamoDB.
|
||||
@@ -375,11 +389,11 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
another cache in front of the it. We wrote more about this here:
|
||||
<https://www.scylladb.com/2017/07/31/database-caches-not-good/>
|
||||
|
||||
* The DescribeTable is missing information about size estimates, and
|
||||
also part of the information about indexes enabled on the table.
|
||||
* The DescribeTable is missing some information about size estimates
|
||||
(IndexSizeBytes and ItemCount - TableSizeBytes is available), and also
|
||||
part of the information about indexes enabled on the table.
|
||||
<https://github.com/scylladb/scylla/issues/5320>
|
||||
<https://github.com/scylladb/scylla/issues/7550>
|
||||
<https://github.com/scylladb/scylla/issues/7551>
|
||||
|
||||
* The PartiQL syntax (SQL-like SELECT/UPDATE/INSERT/DELETE expressions)
|
||||
and the operations ExecuteStatement, BatchExecuteStatement and
|
||||
|
||||
@@ -365,7 +365,7 @@ Modifying a keyspace with tablets enabled is possible and doesn't require any sp
|
||||
|
||||
- The replication factor (RF) can be increased or decreased by at most 1 at a time. To reach the desired RF value, modify the RF repeatedly.
|
||||
- The ``ALTER`` statement rejects the ``replication_factor`` tag. List the DCs explicitly when altering a keyspace. See :ref:`NetworkTopologyStrategy <replication-strategy>`.
|
||||
- If there's any other ongoing global topology operation, executing the ``ALTER`` statement will fail (with an explicit and specific error) and needs to be repeated.
|
||||
- An RF change cannot be requested while another RF change is pending for the same keyspace. Attempting to execute an ``ALTER`` statement in this scenario will fail with an explicit error. Wait for the ongoing RF change to complete before issuing another ``ALTER`` statement.
|
||||
- The ``ALTER`` statement may take longer than the regular query timeout, and even if it times out, it will continue to execute in the background.
|
||||
- The replication strategy cannot be modified, as keyspaces with tablets only support ``NetworkTopologyStrategy``.
|
||||
- The ``ALTER`` statement will fail if it would make the keyspace :term:`RF-rack-invalid <RF-rack-valid keyspace>`.
|
||||
@@ -1043,6 +1043,8 @@ The following modes are available:
|
||||
* - ``immediate``
|
||||
- Tombstone GC is immediately performed. There is no wait time or repair requirement. This mode is useful for a table that uses the TWCS compaction strategy with no user deletes. After data is expired after TTL, ScyllaDB can perform compaction to drop the expired data immediately.
|
||||
|
||||
.. warning:: The ``repair`` mode is not supported for :term:`Colocated Tables <Colocated Table>` in this version.
|
||||
|
||||
.. _cql-per-table-tablet-options:
|
||||
|
||||
Per-table tablet options
|
||||
|
||||
@@ -272,9 +272,17 @@ For example::
|
||||
This query returns up to 5 rows with the closest distance of ``embedding`` vector to the provided query vector,
|
||||
in this case ``[0.1, 0.2, 0.3, 0.4]``.
|
||||
|
||||
There's also possibility to return the similarity score along with the results by using the :ref:`similarity functions <vector-similarity-functions>`.
|
||||
|
||||
For example::
|
||||
|
||||
SELECT image_id, similarity_cosine(embedding, [0.1, 0.2, 0.3, 0.4])
|
||||
FROM ImageEmbeddings
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
.. warning::
|
||||
|
||||
Currently, vector queries do not support filtering with ``WHERE`` clause, returning similarity distances,
|
||||
Currently, vector queries do not support filtering with ``WHERE`` clause,
|
||||
grouping with ``GROUP BY`` and paging. This will be added in the future releases.
|
||||
|
||||
|
||||
|
||||
@@ -227,6 +227,39 @@ A number of functions are provided to “convert” the native types into binary
|
||||
takes a 64-bit ``blob`` argument and converts it to a ``bigint`` value. For example, ``bigintAsBlob(3)`` is
|
||||
``0x0000000000000003`` and ``blobAsBigint(0x0000000000000003)`` is ``3``.
|
||||
|
||||
|
||||
.. _vector-similarity-functions:
|
||||
|
||||
Vector similarity functions
|
||||
```````````````````````````
|
||||
|
||||
To obtain the similarity of the given vectors, use a ``SELECT`` query::
|
||||
|
||||
SELECT comment, similarity_cosine(comment_vector, [0.1, 0.15, 0.3, 0.12, 0.05])
|
||||
FROM cycling.comments_vs;
|
||||
|
||||
The supported functions for this type of query are:
|
||||
|
||||
- ``similarity_cosine``
|
||||
- ``similarity_euclidean``
|
||||
- ``similarity_dot_product``
|
||||
|
||||
with the parameters of (``<vector>``, ``<vector>``).
|
||||
|
||||
The ``vector`` is either the name of the float vector column or :ref:`vector of floats <vectors>`.
|
||||
Both arguments must be of the same dimension.
|
||||
|
||||
These functions return a ``float`` value representing the similarity between the given vectors for each row.
|
||||
The similarity value is a floating-point number in a range of [0, 1] that describes how similar two vectors are.
|
||||
Values closer to 1 indicate greater similarity.
|
||||
The ``similarity_euclidean`` and ``similarity_dot_product`` functions do not perform vector normalization prior to computing similarity.
|
||||
|
||||
.. note::
|
||||
The ``similarity_dot_product`` function assumes that all input vectors are L2-normalized.
|
||||
Supplying non-normalized vectors will result in dot product values that do not represent cosine similarity and therefore are not meaningful for similarity comparison.
|
||||
If the input vectors are not normalized, consider using the ``similarity_cosine`` function instead.
|
||||
|
||||
|
||||
.. _udfs:
|
||||
|
||||
User-defined functions :label-caution:`Experimental`
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Introduction
|
||||
|
||||
Similar to the approach described in CASSANDRA-14471, we add the
|
||||
Similar to the approach described in CASSANDRA-12151, we add the
|
||||
concept of an audit specification. An audit has a target (syslog or a
|
||||
table) and a set of events/actions that it wants recorded. We
|
||||
introduce new CQL syntax for Scylla users to describe and manipulate
|
||||
|
||||
@@ -2,8 +2,11 @@
|
||||
|
||||
## What is ScyllaDB?
|
||||
|
||||
ScyllaDB is a high-performance NoSQL database system, fully compatible with Apache Cassandra.
|
||||
ScyllaDB is released under the GNU Affero General Public License version 3 and the Apache License, ScyllaDB is free and open-source software.
|
||||
ScyllaDB is a high-performance NoSQL database optimized for speed and scalability.
|
||||
It is designed to efficiently handle large volumes of data with minimal latency,
|
||||
making it ideal for data-intensive applications.
|
||||
|
||||
ScyllaDB is distributed under the [ScyllaDB Source Available License](https://github.com/scylladb/scylladb/blob/master/LICENSE-ScyllaDB-Source-Available.md).
|
||||
|
||||
> [ScyllaDB](http://www.scylladb.com/)
|
||||
|
||||
|
||||
@@ -74,6 +74,8 @@ The keys and values are:
|
||||
as an indicator to which shard client wants to connect. The desired shard number
|
||||
is calculated as: `desired_shard_no = client_port % SCYLLA_NR_SHARDS`.
|
||||
Its value is a decimal representation of type `uint16_t`, by default `19142`.
|
||||
- `CLIENT_OPTIONS` is a string containing a JSON object representation that
|
||||
contains CQL Driver configuration, e.g. load balancing policy, retry policy, timeouts, etc.
|
||||
|
||||
Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
|
||||
`biased-token-round-robin`. To apply the algorithm,
|
||||
|
||||
@@ -372,6 +372,8 @@ Columns:
|
||||
* `storage_allocated_load` - Disk space allocated for tablets, assuming each tablet has a fixed size (target_tablet_size).
|
||||
* `storage_allocated_utilization` - Fraction of node's disk capacity taken for `storage_allocated_load`, where 1.0 means full utilization.
|
||||
* `storage_capacity` - Total disk capacity in bytes. Used to compute `storage_allocated_utilization`. By default equal to file system's capacity.
|
||||
* `storage_load` - Disk space allocated for tablets, computed with actual tablet sizes. Can be null if some of the tablet sizes are not known.
|
||||
* `storage_utilization` - Fraction of node's disk capacity taken for `storage_load` (with actual tablet sizes), where 1.0 means full utilization. Can be null if some of the tablet sizes are not known.
|
||||
* `tablets_allocated` - Number of tablet replicas on the node. Migrating tablets are accounted as if migration already finished.
|
||||
* `tablets_allocated_per_shard` - `tablets_allocated` divided by shard count on the node.
|
||||
|
||||
|
||||
@@ -86,6 +86,7 @@ stateDiagram-v2
|
||||
de_left_token_ring --> [*]
|
||||
}
|
||||
state removing {
|
||||
re_left_token_ring : left_token_ring
|
||||
re_tablet_draining : tablet_draining
|
||||
re_tablet_migration : tablet_migration
|
||||
re_write_both_read_old : write_both_read_old
|
||||
@@ -98,7 +99,8 @@ stateDiagram-v2
|
||||
re_tablet_draining --> re_write_both_read_old
|
||||
re_write_both_read_old --> re_write_both_read_new: streaming completed
|
||||
re_write_both_read_old --> re_rollback_to_normal: rollback
|
||||
re_write_both_read_new --> [*]
|
||||
re_write_both_read_new --> re_left_token_ring
|
||||
re_left_token_ring --> [*]
|
||||
}
|
||||
rebuilding --> normal: streaming completed
|
||||
decommissioning --> left: operation succeeded
|
||||
@@ -122,9 +124,10 @@ Note that these are not all states, as there are other states specific to tablet
|
||||
Writes to vnodes-based tables are going to both new and old replicas (new replicas means calculated according
|
||||
to modified token ring), reads are using old replicas.
|
||||
- `write_both_read_new` - as above, but reads are using new replicas.
|
||||
- `left_token_ring` - the decommissioning node left the token ring, but we still need to wait until other
|
||||
nodes observe it and stop sending writes to this node. Then, we tell the node to shut down and remove
|
||||
it from group 0. We also use this state to rollback a failed bootstrap or decommission.
|
||||
- `left_token_ring` - the decommissioning or removing node left the token ring, but we still need to wait until other
|
||||
nodes observe it and stop sending writes to this node. For decommission, we tell the node to shut down,
|
||||
then remove it from group 0. For removenode, the node is already down, so we skip the shutdown step.
|
||||
We also use this state to rollback a failed bootstrap or decommission.
|
||||
- `rollback_to_normal` - the decommission or removenode operation failed. Rollback the operation by
|
||||
moving the node we tried to decommission/remove back to the normal state.
|
||||
- `lock` - the topology stays in this state until externally changed (to null state), preventing topology
|
||||
@@ -141,7 +144,9 @@ reads that started before this point exist in the system. Finally we remove the
|
||||
transitioning state.
|
||||
|
||||
Decommission, removenode and replace work similarly, except they don't go through
|
||||
`commit_cdc_generation`.
|
||||
`commit_cdc_generation`. Both decommission and removenode go through the
|
||||
`left_token_ring` state to run a global barrier ensuring all nodes are aware
|
||||
of the topology change before the operation completes.
|
||||
|
||||
The state machine may also go only through the `commit_cdc_generation` state
|
||||
after getting a request from the user to create a new CDC generation if the
|
||||
|
||||
@@ -41,12 +41,12 @@ Unless the task was aborted, the worker will eventually reply that the task was
|
||||
it temporarily saves list of ids of finished tasks and removes those tasks from group0 state (pernamently marking them as finished) in 200ms intervals. (*)
|
||||
This batching of removing finished tasks is done in order to reduce number of generated group0 operations.
|
||||
|
||||
On the other hand, view buildind tasks can can also be aborted due to 2 main reasons:
|
||||
On the other hand, view building tasks can can also be aborted due to 2 main reasons:
|
||||
- a keyspace/view was dropped
|
||||
- tablet operations (see [tablet operations section](#tablet-operations))
|
||||
In the first case we simply delete relevant view building tasks as they are no longer needed.
|
||||
But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task informations
|
||||
to created a new adjusted tasks (if the operation succeeded) or rollback them (if the operation failed).
|
||||
But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task information
|
||||
to create new adjusted tasks (if the operation succeeded) or rollback them (if the operation failed).
|
||||
Once a task is aborted by setting the flag, this cannot be revoked, so rolling back a task means creating its duplicate and removing the original task.
|
||||
|
||||
(*) - Because there is a time gap between when the coordinator learns that a task is finished (from the RPC response) and when the task is marked as completed,
|
||||
|
||||
@@ -17,6 +17,7 @@ This document highlights ScyllaDB's key data modeling features.
|
||||
Workload Prioritization </features/workload-prioritization>
|
||||
Backup and Restore </features/backup-and-restore>
|
||||
Incremental Repair </features/incremental-repair/>
|
||||
Vector Search </features/vector-search/>
|
||||
|
||||
.. panel-box::
|
||||
:title: ScyllaDB Features
|
||||
@@ -43,3 +44,5 @@ This document highlights ScyllaDB's key data modeling features.
|
||||
* :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
|
||||
efficient and lightweight approach to maintaining data consistency by
|
||||
repairing only the data that has changed since the last repair.
|
||||
* :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
|
||||
similarity-based queries on vector embeddings.
|
||||
|
||||
55
docs/features/vector-search.rst
Normal file
55
docs/features/vector-search.rst
Normal file
@@ -0,0 +1,55 @@
|
||||
=================================
|
||||
Vector Search in ScyllaDB
|
||||
=================================
|
||||
|
||||
.. note::
|
||||
|
||||
This feature is currently available only in `ScyllaDB Cloud <https://cloud.docs.scylladb.com/>`_.
|
||||
|
||||
What Is Vector Search
|
||||
-------------------------
|
||||
|
||||
Vector Search enables similarity-based queries over high-dimensional data,
|
||||
such as text, images, audio, or user behavior. Instead of searching for exact
|
||||
matches, it allows applications to find items that are semantically similar to
|
||||
a given input.
|
||||
|
||||
To do this, Vector Search works on vector embeddings, which are numerical
|
||||
representations of data that capture semantic meaning. This enables queries
|
||||
such as:
|
||||
|
||||
* “Find documents similar to this paragraph”
|
||||
* “Find products similar to what the user just viewed”
|
||||
* “Find previous tickets related to this support request”
|
||||
|
||||
Rather than relying on exact values or keywords, Vector Search returns results
|
||||
based on distance or similarity between vectors. This capability is
|
||||
increasingly used in modern workloads such as AI-powered search, recommendation
|
||||
systems, and retrieval-augmented generation (RAG).
|
||||
|
||||
Why Vector Search Matters
|
||||
------------------------------------
|
||||
|
||||
Many applications already rely on ScyllaDB for high throughput, low and
|
||||
predictable latency, and large-scale data storage.
|
||||
|
||||
Vector Search complements these strengths by enabling new classes of workloads,
|
||||
including:
|
||||
|
||||
* Semantic search over text or documents
|
||||
* Recommendations based on user or item similarity
|
||||
* AI and ML applications, including RAG pipelines
|
||||
* Anomaly and pattern detection
|
||||
|
||||
With Vector Search, ScyllaDB can serve as the similarity search backend for
|
||||
AI-driven applications.
|
||||
|
||||
Availability
|
||||
--------------
|
||||
|
||||
Vector Search is currently available only in ScyllaDB Cloud, the fully managed
|
||||
ScyllaDB service.
|
||||
|
||||
|
||||
👉 For details on using Vector Search, refer to the
|
||||
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/index.html>`_.
|
||||
@@ -45,10 +45,3 @@ Run cqlsh:
|
||||
|
||||
cqlsh
|
||||
|
||||
Run cassandra-stress:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
cassandra-stress write -mode cql3 native
|
||||
|
||||
|
||||
|
||||
@@ -20,7 +20,10 @@ You can run your ScyllaDB workloads on AWS, GCE, and Azure using a ScyllaDB imag
|
||||
Amazon Web Services (AWS)
|
||||
-----------------------------
|
||||
|
||||
The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`, :ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`, and :ref:`i7ie <system-requirements-i7ie-instances>`.
|
||||
The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`,
|
||||
:ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`,
|
||||
:ref:`i7ie <system-requirements-i7ie-instances>`, :ref:`i8g<system-requirements-i8g-instances>`,
|
||||
and :ref:`i8ge <system-requirements-i8ge-instances>`.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -195,6 +198,118 @@ All i7i instances have the following specs:
|
||||
|
||||
See `Amazon EC2 I7i Instances <https://aws.amazon.com/ec2/instance-types/i7i/>`_ for details.
|
||||
|
||||
|
||||
.. _system-requirements-i8g-instances:
|
||||
|
||||
i8g instances
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
The following i8g instances are supported.
|
||||
|
||||
.. list-table::
|
||||
:widths: 30 20 20 30
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- vCPU
|
||||
- Mem (GiB)
|
||||
- Storage (GB)
|
||||
* - i8g.large
|
||||
- 2
|
||||
- 16
|
||||
- 1 x 468 GB
|
||||
* - i8g.xlarge
|
||||
- 4
|
||||
- 32
|
||||
- 1 x 937 GB
|
||||
* - i8g.2xlarge
|
||||
- 8
|
||||
- 64
|
||||
- 1 x 1,875 GB
|
||||
* - i8g.4xlarge
|
||||
- 16
|
||||
- 128
|
||||
- 1 x 3,750 GB
|
||||
* - i8g.8xlarge
|
||||
- 32
|
||||
- 256
|
||||
- 2 x 3,750 GB
|
||||
* - i8g.12xlarge
|
||||
- 48
|
||||
- 384
|
||||
- 3 x 3,750 GB
|
||||
* - i8g.16xlarge
|
||||
- 64
|
||||
- 512
|
||||
- 4 x 3,750 GB
|
||||
|
||||
All i8g instances have the following specs:
|
||||
|
||||
* Powered by AWS Graviton4 processors
|
||||
* 3rd generation AWS Nitro SSD storage
|
||||
* DDR5-5600 memory for improved throughput
|
||||
* Up to 100 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
|
||||
Amazon Elastic Block Store (EBS)
|
||||
* Instance sizes offer up to 45 TB of total local NVMe instance storage
|
||||
|
||||
See `Amazon EC2 I8g Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
|
||||
|
||||
.. _system-requirements-i8ge-instances:
|
||||
|
||||
i8ge instances
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
The following i8ge instances are supported.
|
||||
|
||||
.. list-table::
|
||||
:widths: 30 20 20 30
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- vCPU
|
||||
- Mem (GiB)
|
||||
- Storage (GB)
|
||||
* - i8ge.large
|
||||
- 2
|
||||
- 16
|
||||
- 1 x 1,250 GB
|
||||
* - i8ge.xlarge
|
||||
- 4
|
||||
- 32
|
||||
- 1 x 2,500 GB
|
||||
* - i8ge.2xlarge
|
||||
- 8
|
||||
- 64
|
||||
- 2 x 2,500 GB
|
||||
* - i8ge.3xlarge
|
||||
- 12
|
||||
- 96
|
||||
- 1 x 7,500 GB
|
||||
* - i8ge.6xlarge
|
||||
- 24
|
||||
- 192
|
||||
- 2 x 7,500 GB
|
||||
* - i8ge.12xlarge
|
||||
- 48
|
||||
- 384
|
||||
- 4 x 7,500 GB
|
||||
* - i8ge.18xlarge
|
||||
- 72
|
||||
- 576
|
||||
- 6 x 7,500 GB
|
||||
|
||||
All i8ge instances have the following specs:
|
||||
|
||||
* Powered by AWS Graviton4 processors
|
||||
* 3rd generation AWS Nitro SSD storage
|
||||
* DDR5-5600 memory for improved throughput
|
||||
* Up to 300 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
|
||||
Amazon Elastic Block Store (EBS)
|
||||
* Instance sizes offer up to 120 TB of total local NVMe instance storage
|
||||
|
||||
See `Amazon EC2 I8g Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
|
||||
|
||||
|
||||
Im4gn and Is4gen instances
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
ScyllaDB supports Arm-based Im4gn and Is4gen instances. See `Amazon EC2 Im4gn and Is4gen instances <https://aws.amazon.com/ec2/instance-types/i4g/>`_ for specification details.
|
||||
|
||||
@@ -25,8 +25,7 @@ Getting Started
|
||||
:id: "getting-started"
|
||||
:class: my-panel
|
||||
|
||||
* `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
|
||||
|
||||
* :doc:`Install ScyllaDB </getting-started/install-scylla/index/>`
|
||||
* :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
|
||||
* :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
|
||||
* :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
|
||||
@@ -60,4 +59,5 @@ Getting Started
|
||||
|
||||
* `Build an IoT App with sensor simulator and a REST API <https://iot.scylladb.com/stable/>`_ - ScyllaDB Tutorial
|
||||
* `Implement CRUD operations with a TODO App <https://github.com/scylladb/scylla-cloud-getting-started/>`_ - ScyllaDB Cloud Tutorial
|
||||
* `Build a machine learning (ML) feature store with ScyllaDB <https://feature-store.scylladb.com/stable/>`_ - ScyllaDB Cloud Tutorial ` <>`_
|
||||
* `Build a machine learning (ML) feature store with ScyllaDB <https://feature-store.scylladb.com/stable/>`_ - ScyllaDB Cloud Tutorial
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
ScyllaDB Housekeeping and how to disable it
|
||||
============================================
|
||||
|
||||
It is always recommended to run the latest version of ScyllaDB.
|
||||
The latest stable release version is always available from the `Download Center <https://www.scylladb.com/download/>`_.
|
||||
It is always recommended to run the latest stable version of ScyllaDB.
|
||||
|
||||
When you install ScyllaDB, it installs by default two services: **scylla-housekeeping-restart** and **scylla-housekeeping-daily**. These services check for the latest ScyllaDB version and prompt the user if they are using a version that is older than what is publicly available.
|
||||
Information about your ScyllaDB deployment, including the ScyllaDB version currently used, as well as unique user and server identifiers, are collected by a centralized service.
|
||||
|
||||
@@ -83,7 +83,7 @@ Additional References
|
||||
|
||||
* `Jepsen and ScyllaDB: Putting Consistency to the Test blog post <https://www.scylladb.com/2020/12/23/jepsen-and-scylla-putting-consistency-to-the-test/>`_
|
||||
* `Nauto: Achieving Consistency in an Eventually Consistent Environment blog post <https://www.scylladb.com/2020/02/20/nauto-achieving-consistency-in-an-eventually-consistent-environment/>`_
|
||||
* `Consistency Levels documentation <https://docs.scylladb.com/manual/stable/cql/consistency.html>`_
|
||||
* :doc:`Consistency Levels documentation </cql/consistency/>`
|
||||
* `High Availability lesson on ScyllaDB University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/high-availability/>`_
|
||||
* `Lightweight Transactions lesson on ScyllaDB University <https://university.scylladb.com/courses/data-modeling/lessons/lightweight-transactions/>`_
|
||||
* `Getting the Most out of Lightweight Transactions in ScyllaDB blog post <https://www.scylladb.com/2020/07/15/getting-the-most-out-of-lightweight-transactions-in-scylla/>`_
|
||||
|
||||
@@ -38,7 +38,7 @@ Steps:
|
||||
|
||||
4. Run compaction (this will remove big partitions with tombstones from specified table)
|
||||
|
||||
.. note:: By default, major compaction runs on all the keyspaces and tables, so if we want to specify e.g. only one table, we should point at it using arguments: ``<keyspace>.<mytable>``. For more information, please refer to `this article <https://docs.scylladb.com/operating-scylla/nodetool-commands/compact/>`_.
|
||||
.. note:: By default, major compaction runs on all the keyspaces and tables, so if we want to specify e.g. only one table, we should point at it using arguments: ``<keyspace>.<mytable>``. For more information, please see :doc:`Nodetool compact </operating-scylla/nodetool-commands/compact/>`.
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ Admin Tools
|
||||
Admin REST API </operating-scylla/rest>
|
||||
Tracing </using-scylla/tracing>
|
||||
ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>
|
||||
ScyllaDB SStable Script API </operating-scylla/admin-tools/scylla-sstable-script-api/>
|
||||
ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>
|
||||
sstableloader
|
||||
cassandra-stress </operating-scylla/admin-tools/cassandra-stress/>
|
||||
|
||||
530
docs/operating-scylla/admin-tools/scylla-sstable-script-api.rst
Normal file
530
docs/operating-scylla/admin-tools/scylla-sstable-script-api.rst
Normal file
@@ -0,0 +1,530 @@
|
||||
ScyllaDB SStable Script API
|
||||
---------------------------
|
||||
|
||||
The script API consists of two parts:
|
||||
|
||||
* `ScyllaDB Consume API <scylla-consume-api_>`_ - Hook methods implemented by the script to consume a :ref:`mutation fragment stream <scylla-sstable-sstable-content>`;
|
||||
* `ScyllaDB Lua API <scylla-script-lua-api_>`_ - types and methods exposed to the script to work with ScyllaDB types and values.
|
||||
|
||||
.. _scylla-consume-api:
|
||||
|
||||
ScyllaDB Consume API
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
These methods represent the glue code between scylla-sstable's C++ code and the Lua script.
|
||||
Conceptually a script is an implementation of a consumer interface. The script has to implement only the methods it is interested in. Each method has a default implementation in the interface, which simply drops the respective :ref:`mutation fragment <scylla-sstable-sstable-content>`.
|
||||
For example, a script only interested in partitions can define only :ref:`consume_partition_start() <scylla-consume-partition-start-method>` and nothing else.
|
||||
Therefore a completely empty script is also valid, although not very useful.
|
||||
Below you will find the listing of the API methods.
|
||||
These methods (if provided by the script) will be called by the scylla-sstable runtime for the appropriate events and fragment types.
|
||||
|
||||
.. _scylla-consume-stream-start-method:
|
||||
|
||||
consume_stream_start(args)
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API. Called on the very start of the stream.
|
||||
* Parameter is a Lua table containing command line arguments for the script, passed via ``--script-arg``.
|
||||
* Can be used to initialize global state.
|
||||
|
||||
.. _scylla-consume-sstable-start-method:
|
||||
|
||||
consume_sstable_start(sst)
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called on the start of each sstable.
|
||||
* The parameter is of type `ScyllaDB.sstable <scylla-sstable-type_>`_.
|
||||
* When SStables are merged (``--merge``), the parameter is ``nil``.
|
||||
|
||||
Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, skipping the content of the sstable (or that of the entire stream if ``--merge`` is used). If ``false``, consumption follows with the content of the sstable.
|
||||
|
||||
.. _scylla-consume-partition-start-method:
|
||||
|
||||
consume_partition_start(ps)
|
||||
"""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API. Called on the start of each partition.
|
||||
* The parameter is of type `ScyllaDB.partition_start <scylla-partition-start-type_>`_.
|
||||
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, skipping the content of the partition. If ``false``, consumption follows with the content of the partition.
|
||||
|
||||
consume_static_row(sr)
|
||||
""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called if the partition has a static row.
|
||||
* The parameter is of type `ScyllaDB.static_row <scylla-static-row-type_>`_.
|
||||
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, and the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
|
||||
|
||||
consume_clustering_row(cr)
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called for each clustering row.
|
||||
* The parameter is of type `ScyllaDB.clustering_row <scylla-clustering-row-type_>`_.
|
||||
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
|
||||
|
||||
consume_range_tombstone_change(crt)
|
||||
"""""""""""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called for each range tombstone change.
|
||||
* The parameter is of type `ScyllaDB.range_tombstone_change <scylla-range-tombstone-change-type_>`_.
|
||||
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
|
||||
|
||||
.. _scylla-consume-partition-end-method:
|
||||
|
||||
consume_partition_end()
|
||||
"""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called at the end of the partition.
|
||||
* Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, the remaining content of the SStable is skipped. If ``false``, consumption follows with the remaining content of the SStable.
|
||||
|
||||
.. _scylla-consume-sstable-end-method:
|
||||
|
||||
consume_sstable_end()
|
||||
"""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called at the end of the SStable.
|
||||
* Returns whether to stop. If ``true``, `consume_stream_end() <scylla-consume-stream-end-method_>`_ is called, the remaining content of the stream is skipped. If ``false``, consumption follows with the remaining content of the stream.
|
||||
|
||||
.. _scylla-consume-stream-end-method:
|
||||
|
||||
consume_stream_end()
|
||||
""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called at the very end of the stream.
|
||||
|
||||
.. _scylla-script-lua-api:
|
||||
|
||||
ScyllaDB LUA API
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
In addition to the `ScyllaDB Consume API <scylla-consume-api_>`_, the Lua bindings expose various types and methods that allow you to work with ScyllaDB types and values.
|
||||
The listing uses the following terminology:
|
||||
|
||||
* Attribute - a simple attribute accessible via ``obj.attribute_name``;
|
||||
* Method - a method operating on an instance of said type, invocable as ``obj:method()``;
|
||||
* Magic method - magic methods defined in the metatable which define behaviour of these objects w.r.t. `Lua operators and more <http://www.lua.org/manual/5.4/manual.html#2.4>`_;
|
||||
|
||||
The format of an attribute description is the following:
|
||||
|
||||
.. code-block:: none
|
||||
:class: hide-copy-button
|
||||
|
||||
attribute_name (type) - description
|
||||
|
||||
and that of a method:
|
||||
|
||||
.. code-block:: none
|
||||
:class: hide-copy-button
|
||||
|
||||
method_name(arg1_type, arg2_type...) (return_type) - description
|
||||
|
||||
Magic methods have their signature defined by Lua and so that is not described here (these methods are not used directly anyway).
|
||||
|
||||
.. _scylla-atomic-cell-type:
|
||||
|
||||
ScyllaDB.atomic_cell
|
||||
""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* timestamp (integer)
|
||||
* is_live (boolean) - is the cell live?
|
||||
* type (string) - one of: ``regular``, ``counter-update``, ``counter-shards``, ``frozen-collection`` or ``collection``.
|
||||
* has_ttl (boolean) - is the cell expiring?
|
||||
* ttl (integer) - time to live in seconds, ``nil`` if cell is not expiring.
|
||||
* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell expires, ``nil`` if cell is not expiring.
|
||||
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell was deleted, ``nil`` unless cell is dead or expiring.
|
||||
* value:
|
||||
|
||||
- ``nil`` if cell is dead.
|
||||
- appropriate Lua native type if type == ``regular``.
|
||||
- integer if type == ``counter-update``.
|
||||
- `ScyllaDB.counter_shards_value <scylla-counter-shards-value-type_>`_ if type == ``counter-shards``.
|
||||
|
||||
A counter-shard table has the following keys:
|
||||
|
||||
* id (string)
|
||||
* value (integer)
|
||||
* clock (integer)
|
||||
|
||||
.. _scylla-clustering-key-type:
|
||||
|
||||
ScyllaDB.clustering_key
|
||||
"""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite clustering key.
|
||||
|
||||
Methods:
|
||||
|
||||
* to_hex - convert the key to its serialized format, encoded in hex.
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
|
||||
|
||||
.. _scylla-clustering-row-type:
|
||||
|
||||
ScyllaDB.clustering_row
|
||||
"""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* key - the clustering key's value as the appropriate Lua native type.
|
||||
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - row tombstone, ``nil`` if no tombstone.
|
||||
* shadowable_tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - shadowable tombstone of the row tombstone, ``nil`` if no tombstone.
|
||||
* marker (`ScyllaDB.row_marker <scylla-row-marker-type_>`_) - the row marker, ``nil`` if row doesn't have one.
|
||||
* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
|
||||
|
||||
See also:
|
||||
|
||||
* `ScyllaDB.unserialize_clustering_key() <scylla-unserialize-clustering-key-method_>`_.
|
||||
|
||||
.. _scylla-collection-type:
|
||||
|
||||
ScyllaDB.collection
|
||||
"""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* type (string) - always ``collection`` for collection.
|
||||
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - ``nil`` if no tombstone.
|
||||
* cells (table) - the collection cells, each collection cell is a table, with a ``key`` and ``value`` attribute. The key entry is the key of the collection cell for actual collections (list, set and map) and is of type `ScyllaDB.data_value <scylla-data-value-type_>`_. For tuples and UDT this is just an empty string. The value entry is the value of the collection cell and is of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_.
|
||||
|
||||
.. _scylla-collection-cell-value-type:
|
||||
|
||||
ScyllaDB.collection_cell_value
|
||||
""""""""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* key (string) - collection cell key in human-readable form.
|
||||
* value (`ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_) - collection cell value.
|
||||
|
||||
.. _scylla-column-definition-type:
|
||||
|
||||
ScyllaDB.column_definition
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* id (integer) - the id of the column.
|
||||
* name (string) - the name of the column.
|
||||
* kind (string) - the kind of the column, one of ``partition_key``, ``clustering_key``, ``static_column`` or ``regular_column``.
|
||||
|
||||
.. _scylla-counter-shards-value-type:
|
||||
|
||||
ScyllaDB.counter_shards_value
|
||||
"""""""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* value (integer) - the total value of the counter (the sum of all the shards).
|
||||
* shards (table) - the shards making up this counter, a lua list containing tables, representing shards, with the following key/values:
|
||||
|
||||
- id (string) - the shard's id (UUID).
|
||||
- value (integer) - the shard's value.
|
||||
- clock (integer) - the shard's logical clock.
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __tostring - can be converted to string with tostring().
|
||||
|
||||
.. _scylla-data-value-type:
|
||||
|
||||
ScyllaDB.data_value
|
||||
"""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* value - the value represented as the appropriate Lua type
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __tostring - can be converted to string with tostring().
|
||||
|
||||
.. _scylla-gc-clock-time-point-type:
|
||||
|
||||
ScyllaDB.gc_clock_time_point
|
||||
""""""""""""""""""""""""""""
|
||||
|
||||
A time point belonging to the gc_clock, in UTC.
|
||||
|
||||
Attributes:
|
||||
|
||||
* year (integer) - [1900, +inf).
|
||||
* month (integer) - [1, 12].
|
||||
* day (integer) - [1, 31].
|
||||
* hour (integer) - [0, 23].
|
||||
* min (integer) - [0, 59].
|
||||
* sec (integer) - [0, 59].
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __eq - can be equal compared.
|
||||
* __lt - can be less compared.
|
||||
* __le - can be less-or-equal compared.
|
||||
* __tostring - can be converted to string with tostring().
|
||||
|
||||
See also:
|
||||
|
||||
* `ScyllaDB.now() <scylla-now-method_>`_.
|
||||
* `ScyllaDB.time_point_from_string() <scylla-time-point-from-string-method_>`_.
|
||||
|
||||
.. _scylla-json-writer-type:
|
||||
|
||||
ScyllaDB.json_writer
|
||||
""""""""""""""""""""
|
||||
|
||||
A JSON writer object, with both low-level and high-level APIs.
|
||||
The low-level API allows you to write custom JSON and it loosely follows the API of `rapidjson::Writer <https://rapidjson.org/classrapidjson_1_1_writer.html>`_ (upon which it is implemented).
|
||||
The high-level API is for writing :ref:`mutation fragments <scylla-sstable-sstable-content>` as JSON directly, using the built-in JSON conversion logic that is used by :ref:`dump-data <scylla-sstable-dump-data-operation>` operation.
|
||||
|
||||
Low level API Methods:
|
||||
|
||||
* null() - write a null json value.
|
||||
* bool(boolean) - write a bool json value.
|
||||
* int(integer) - write an integer json value.
|
||||
* double(number) - write a double json value.
|
||||
* string(string) - write a string json value.
|
||||
* start_object() - start a json object.
|
||||
* key(string) - write the key of a json object.
|
||||
* end_object() - write the end of a json object.
|
||||
* start_array() - write the start of a json array.
|
||||
* end_array() - write the end of a json array.
|
||||
|
||||
High level API Methods:
|
||||
|
||||
* start_stream() - start the stream, call at the very beginning.
|
||||
* start_sstable() - start an sstable.
|
||||
* start_partition() - start a partition.
|
||||
* static_row() - write a static row to the stream.
|
||||
* clustering_row() - write a clustering row to the stream.
|
||||
* range_tombstone_change() - write a range tombstone change to the stream.
|
||||
* end_partition() - end the current partition.
|
||||
* end_sstable() - end the current sstable.
|
||||
* end_stream() - end the stream, call at the very end.
|
||||
|
||||
.. _scylla-new-json-writer-method:
|
||||
|
||||
ScyllaDB.new_json_writer()
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.json_writer <scylla-json-writer-type_>`_ instance.
|
||||
|
||||
.. _scylla-new-position-in-partition-method:
|
||||
|
||||
ScyllaDB.new_position_in_partition()
|
||||
""""""""""""""""""""""""""""""""""""
|
||||
|
||||
Creates a `ScyllaDB.position_in_partition <scylla-position-in-partition-type_>`_ instance.
|
||||
|
||||
Arguments:
|
||||
|
||||
* weight (integer) - the weight of the key.
|
||||
* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, optional.
|
||||
|
||||
.. _scylla-new-ring-position-method:
|
||||
|
||||
ScyllaDB.new_ring_position()
|
||||
""""""""""""""""""""""""""""
|
||||
|
||||
Creates a `ScyllaDB.ring_position <scylla-ring-position-type_>`_ instance.
|
||||
|
||||
Has several overloads:
|
||||
|
||||
* ``ScyllaDB.new_ring_position(weight, key)``.
|
||||
* ``ScyllaDB.new_ring_position(weight, token)``.
|
||||
* ``ScyllaDB.new_ring_position(weight, key, token)``.
|
||||
|
||||
Where:
|
||||
|
||||
* weight (integer) - the weight of the key.
|
||||
* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key.
|
||||
* token (integer) - the token (of the key if a key is provided).
|
||||
|
||||
.. _scylla-now-method:
|
||||
|
||||
ScyllaDB.now()
|
||||
""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance, representing the current time.
|
||||
|
||||
.. _scylla-partition-key-type:
|
||||
|
||||
ScyllaDB.partition_key
|
||||
""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite partition key.
|
||||
|
||||
Methods:
|
||||
|
||||
* to_hex - convert the key to its serialized format, encoded in hex.
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
|
||||
|
||||
See also:
|
||||
|
||||
* :ref:`ScyllaDB.unserialize_partition_key() <scylla-unserialize-partition-key-method>`.
|
||||
* :ref:`ScyllaDB.token_of() <scylla-token-of-method>`.
|
||||
|
||||
.. _scylla-partition-start-type:
|
||||
|
||||
ScyllaDB.partition_start
|
||||
""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* key - the partition key's value as the appropriate Lua native type.
|
||||
* token (integer) - the partition key's token.
|
||||
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - the partition tombstone, ``nil`` if no tombstone.
|
||||
|
||||
.. _scylla-position-in-partition-type:
|
||||
|
||||
ScyllaDB.position_in_partition
|
||||
""""""""""""""""""""""""""""""
|
||||
|
||||
Currently used only for clustering positions.
|
||||
|
||||
Attributes:
|
||||
|
||||
* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, ``nil`` if the position in partition represents the min or max clustering positions.
|
||||
* weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key). If key attribute is ``nil``, the weight is never 0.
|
||||
|
||||
Methods:
|
||||
|
||||
* tri_cmp - compare this position in partition to another position in partition, returns -1 (``<``), 0 (``==``) or 1 (``>``).
|
||||
|
||||
See also:
|
||||
|
||||
* `ScyllaDB.new_position_in_partition() <scylla-new-position-in-partition-method_>`_.
|
||||
|
||||
.. _scylla-range-tombstone-change-type:
|
||||
|
||||
ScyllaDB.range_tombstone_change
|
||||
"""""""""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* key - the clustering key's value as the appropriate Lua native type.
|
||||
* key_weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key).
|
||||
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - tombstone, ``nil`` if no tombstone.
|
||||
|
||||
.. _scylla-ring-position-type:
|
||||
|
||||
ScyllaDB.ring_position
|
||||
""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* token (integer) - the token, ``nil`` if the ring position represents the min or max ring positions.
|
||||
* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key, ``nil`` if the ring position represents a position before/after a token.
|
||||
* weight (integer) - weight of the position, either -1 (before key/token), 0 (at key) or 1 (after key/token). If key attribute is ``nil``, the weight is never 0.
|
||||
|
||||
Methods:
|
||||
|
||||
* tri_cmp - compare this ring position to another ring position, returns -1 (``<``), 0 (``==``) or 1 (``>``).
|
||||
|
||||
See also:
|
||||
|
||||
* `ScyllaDB.new_ring_position() <scylla-new-ring-position-method_>`_.
|
||||
|
||||
.. _scylla-row-marker-type:
|
||||
|
||||
ScyllaDB.row_marker
|
||||
"""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* timestamp (integer).
|
||||
* is_live (boolean) - is the marker live?
|
||||
* has_ttl (boolean) - is the marker expiring?
|
||||
* ttl (integer) - time to live in seconds, ``nil`` if marker is not expiring.
|
||||
* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker expires, ``nil`` if marker is not expiring.
|
||||
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker was deleted, ``nil`` unless marker is dead or expiring.
|
||||
|
||||
.. _scylla-schema-type:
|
||||
|
||||
ScyllaDB.schema
|
||||
"""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* partition_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the partition key.
|
||||
* clustering_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the clustering key.
|
||||
* static_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the static columns.
|
||||
* regular_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the regular columns.
|
||||
* all_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of all columns.
|
||||
|
||||
.. _scylla-sstable-type:
|
||||
|
||||
ScyllaDB.sstable
|
||||
""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* filename (string) - the full path of the sstable Data component file;
|
||||
|
||||
.. _scylla-static-row-type:
|
||||
|
||||
ScyllaDB.static_row
|
||||
"""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
|
||||
|
||||
.. _scylla-time-point-from-string-method:
|
||||
|
||||
ScyllaDB.time_point_from_string()
|
||||
"""""""""""""""""""""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance from the passed in string.
|
||||
Argument is string, using the same format as the CQL timestamp type, see https://en.wikipedia.org/wiki/ISO_8601.
|
||||
|
||||
.. _scylla-token-of-method:
|
||||
|
||||
ScyllaDB.token_of()
|
||||
"""""""""""""""""""
|
||||
|
||||
Compute and return the token (integer) for a `ScyllaDB.partition_key <scylla-partition-key-type_>`_.
|
||||
|
||||
.. _scylla-tombstone-type:
|
||||
|
||||
ScyllaDB.tombstone
|
||||
""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* timestamp (integer)
|
||||
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - the point in time at which the tombstone was deleted.
|
||||
|
||||
.. _scylla-unserialize-clustering-key-method:
|
||||
|
||||
ScyllaDB.unserialize_clustering_key()
|
||||
"""""""""""""""""""""""""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.clustering_key <scylla-clustering-key-type_>`_ instance.
|
||||
|
||||
Argument is a string representing serialized clustering key in hex format.
|
||||
|
||||
.. _scylla-unserialize-partition-key-method:
|
||||
|
||||
ScyllaDB.unserialize_partition_key()
|
||||
""""""""""""""""""""""""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.partition_key <scylla-partition-key-type_>`_ instance.
|
||||
|
||||
Argument is a string representing serialized partition key in hex format.
|
||||
|
||||
@@ -667,7 +667,7 @@ write
|
||||
Writes an SStable based on a description of the content.
|
||||
The description can be provided in two formats: ``CQL`` and ``JSON``.
|
||||
The input format can be selected with the ``--input-format`` flag. Default is ``cql``.
|
||||
In both cases the input is expected to be provided via the file whose path is passed to ``--input-file``.
|
||||
In both cases the input is expected to be provided via the file whose path is passed to ``--input-file``.
|
||||
|
||||
CQL input format
|
||||
~~~~~~~~~~~~~~~~
|
||||
@@ -858,527 +858,9 @@ Alternatively, you can provide each key-value pair via a separate ``--script-arg
|
||||
|
||||
--script-arg $key1=$value1 --script-arg $key2=$value2
|
||||
|
||||
Command line arguments will be received by the `consume_stream_start() <scylla-consume-stream-start-method_>`_ API method.
|
||||
Command line arguments will be received by the :ref:`consume_stream_start() <scylla-consume-stream-start-method>` API method.
|
||||
|
||||
.. _scylla-consume-api:
|
||||
|
||||
ScyllaDB Consume API
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
These methods represent the glue code between scylla-sstable's C++ code and the Lua script.
|
||||
Conceptually a script is an implementation of a consumer interface. The script has to implement only the methods it is interested in. Each method has a default implementation in the interface, which simply drops the respective `mutation fragment <scylla-sstable-sstable-content_>`_.
|
||||
For example, a script only interested in partitions can define only `consume_partition_start() <scylla-consume-partition-start-method_>`_ and nothing else.
|
||||
Therefore a completely empty script is also valid, although not very useful.
|
||||
Below you will find the listing of the API methods.
|
||||
These methods (if provided by the script) will be called by the scylla-sstable runtime for the appropriate events and fragment types.
|
||||
|
||||
.. _scylla-consume-stream-start-method:
|
||||
|
||||
consume_stream_start(args)
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API. Called on the very start of the stream.
|
||||
* Parameter is a Lua table containing command line arguments for the script, passed via ``--script-arg``.
|
||||
* Can be used to initialize global state.
|
||||
|
||||
.. _scylla-consume-sstable-start-method:
|
||||
|
||||
consume_sstable_start(sst)
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called on the start of each stable.
|
||||
* The parameter is of type `ScyllaDB.sstable <scylla-sstable-type_>`_.
|
||||
* When SStables are merged (``--merge``), the parameter is ``nil``.
|
||||
|
||||
Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, skipping the content of the sstable (or that of the entire stream if ``--merge`` is used). If ``false``, consumption follows with the content of the sstable.
|
||||
|
||||
.. _scylla-consume-partition-start-method:
|
||||
|
||||
consume_partition_start(ps)
|
||||
"""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API. Called on the start of each partition.
|
||||
* The parameter is of type `ScyllaDB.partition_start <scylla-partition-start-type_>`_.
|
||||
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, skipping the content of the partition. If ``false``, consumption follows with the content of the partition.
|
||||
|
||||
consume_static_row(sr)
|
||||
""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called if the partition has a static row.
|
||||
* The parameter is of type `ScyllaDB.static_row <scylla-static-row-type_>`_.
|
||||
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, and the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
|
||||
|
||||
consume_clustering_row(cr)
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called for each clustering row.
|
||||
* The parameter is of type `ScyllaDB.clustering_row <scylla-clustering-row-type_>`_.
|
||||
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
|
||||
|
||||
consume_range_tombstone_change(crt)
|
||||
"""""""""""""""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called for each range tombstone change.
|
||||
* The parameter is of type `ScyllaDB.range_tombstone_change <scylla-range-tombstone-change-type_>`_.
|
||||
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
|
||||
|
||||
.. _scylla-consume-partition-end-method:
|
||||
|
||||
consume_partition_end()
|
||||
"""""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called at the end of the partition.
|
||||
* Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, the remaining content of the SStable is skipped. If ``false``, consumption follows with the remaining content of the SStable.
|
||||
|
||||
.. _scylla-consume-sstable-end-method:
|
||||
|
||||
consume_sstable_end()
|
||||
"""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called at the end of the SStable.
|
||||
* Returns whether to stop. If true, `consume_stream_end() <scylla-consume-stream-end-method_>`_ is called, the remaining content of the stream is skipped. If false, consumption follows with the remaining content of the stream.
|
||||
|
||||
.. _scylla-consume-stream-end-method:
|
||||
|
||||
consume_stream_end()
|
||||
""""""""""""""""""""
|
||||
|
||||
* Part of the Consume API.
|
||||
* Called at the very end of the stream.
|
||||
|
||||
ScyllaDB LUA API
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
In addition to the `ScyllaDB Consume API <scylla-consume-api_>`_, the Lua bindings expose various types and methods that allow you to work with ScyllaDB types and values.
|
||||
The listing uses the following terminology:
|
||||
|
||||
* Attribute - a simple attribute accessible via ``obj.attribute_name``;
|
||||
* Method - a method operating on an instance of said type, invocable as ``obj:method()``;
|
||||
* Magic method - magic methods defined in the metatable which define behaviour of these objects w.r.t. `Lua operators and more <http://www.lua.org/manual/5.4/manual.html#2.4>`_;
|
||||
|
||||
The format of an attribute description is the following:
|
||||
|
||||
.. code-block:: none
|
||||
:class: hide-copy-button
|
||||
|
||||
attribute_name (type) - description
|
||||
|
||||
and that of a method:
|
||||
|
||||
.. code-block:: none
|
||||
:class: hide-copy-button
|
||||
|
||||
method_name(arg1_type, arg2_type...) (return_type) - description
|
||||
|
||||
Magic methods have their signature defined by Lua and so that is not described here (these methods are not used directly anyway).
|
||||
|
||||
.. _scylla-atomic-cell-type:
|
||||
|
||||
ScyllaDB.atomic_cell
|
||||
""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* timestamp (integer)
|
||||
* is_live (boolean) - is the cell live?
|
||||
* type (string) - one of: ``regular``, ``counter-update``, ``counter-shards``, ``frozen-collection`` or ``collection``.
|
||||
* has_ttl (boolean) - is the cell expiring?
|
||||
* ttl (integer) - time to live in seconds, ``nil`` if cell is not expiring.
|
||||
* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell expires, ``nil`` if cell is not expiring.
|
||||
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell was deleted, ``nil`` unless cell is dead or expiring.
|
||||
* value:
|
||||
|
||||
- ``nil`` if cell is dead.
|
||||
- appropriate Lua native type if type == ``regular``.
|
||||
- integer if type == ``counter-update``.
|
||||
- `ScyllaDB.counter_shards_value <scylla-counter-shards-value-type_>`_ if type == ``counter-shards``.
|
||||
|
||||
A counter-shard table has the following keys:
|
||||
|
||||
* id (string)
|
||||
* value (integer)
|
||||
* clock (integer)
|
||||
|
||||
.. _scylla-clustering-key-type:
|
||||
|
||||
ScyllaDB.clustering_key
|
||||
"""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite clustering key.
|
||||
|
||||
Methods:
|
||||
|
||||
* to_hex - convert the key to its serialized format, encoded in hex.
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
|
||||
|
||||
.. _scylla-clustering-row-type:
|
||||
|
||||
ScyllaDB.clustering_row
|
||||
"""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
|
||||
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - row tombstone, ``nil`` if no tombstone.
|
||||
* shadowable_tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - shadowable tombstone of the row tombstone, ``nil`` if no tombstone.
|
||||
* marker (`ScyllaDB.row_marker <scylla-row-marker-type_>`_) - the row marker, ``nil`` if row doesn't have one.
|
||||
* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
|
||||
|
||||
See also:
|
||||
|
||||
* `ScyllaDB.unserialize_clustering_key() <scylla-unserialize-clustering-key-method_>`_.
|
||||
|
||||
.. _scylla-collection-type:
|
||||
|
||||
ScyllaDB.collection
|
||||
"""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* type (string) - always ``collection`` for collection.
|
||||
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - ``nil`` if no tombstone.
|
||||
* cells (table) - the collection cells, each collection cell is a table, with a ``key`` and ``value`` attribute. The key entry is the key of the collection cell for actual collections (list, set and map) and is of type `ScyllaDB.data-value <scylla-data-value-type_>`_. For tuples and UDT this is just an empty string. The value entry is the value of the collection cell and is of type `ScyllaDB.atomic-cell <scylla-atomic-cell-type_>`_.
|
||||
|
||||
.. _scylla-collection-cell-value-type:
|
||||
|
||||
ScyllaDB.collection_cell_value
|
||||
""""""""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* key (sstring) - collection cell key in human readable form.
|
||||
* value (`ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_) - collection cell value.
|
||||
|
||||
.. _scylla-column-definition-type:
|
||||
|
||||
ScyllaDB.column_definition
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* id (integer) - the id of the column.
|
||||
* name (string) - the name of the column.
|
||||
* kind (string) - the kind of the column, one of ``partition_key``, ``clustering_key``, ``static_column`` or ``regular_column``.
|
||||
|
||||
.. _scylla-counter-shards-value-type:
|
||||
|
||||
ScyllaDB.counter_shards_value
|
||||
"""""""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* value (integer) - the total value of the counter (the sum of all the shards).
|
||||
* shards (table) - the shards making up this counter, a lua list containing tables, representing shards, with the following key/values:
|
||||
|
||||
- id (string) - the shard's id (UUID).
|
||||
- value (integer) - the shard's value.
|
||||
- clock (integer) - the shard's logical clock.
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __tostring - can be converted to string with tostring().
|
||||
|
||||
.. _scylla-data-value-type:
|
||||
|
||||
ScyllaDB.data_value
|
||||
"""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* value - the value represented as the appropriate Lua type
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __tostring - can be converted to string with tostring().
|
||||
|
||||
.. _scylla-gc-clock-time-point-type:
|
||||
|
||||
ScyllaDB.gc_clock_time_point
|
||||
""""""""""""""""""""""""""""
|
||||
|
||||
A time point belonging to the gc_clock, in UTC.
|
||||
|
||||
Attributes:
|
||||
|
||||
* year (integer) - [1900, +inf).
|
||||
* month (integer) - [1, 12].
|
||||
* day (integer) - [1, 31].
|
||||
* hour (integer) - [0, 23].
|
||||
* min (integer) - [0, 59].
|
||||
* sec (integer) - [0, 59].
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __eq - can be equal compared.
|
||||
* __lt - can be less compared.
|
||||
* __le - can be less-or-equal compared.
|
||||
* __tostring - can be converted to string with tostring().
|
||||
|
||||
See also:
|
||||
|
||||
* `ScyllaDB.now() <scylla-now-method_>`_.
|
||||
* `ScyllaDB.time_point_from_string() <scylla-time-point-from-string-method_>`_.
|
||||
|
||||
.. _scylla-json-writer-type:
|
||||
|
||||
ScyllaDB.json_writer
|
||||
""""""""""""""""""""
|
||||
|
||||
A JSON writer object, with both low-level and high-level APIs.
|
||||
The low-level API allows you to write custom JSON and it loosely follows the API of `rapidjson::Writer <https://rapidjson.org/classrapidjson_1_1_writer.html>`_ (upon which it is implemented).
|
||||
The high-level API is for writing `mutation fragments <scylla-sstable-sstable-content_>`_ as JSON directly, using the built-in JSON conversion logic that is used by `dump-data <dump-data_>`_ operation.
|
||||
|
||||
Low level API Methods:
|
||||
|
||||
* null() - write a null json value.
|
||||
* bool(boolean) - write a bool json value.
|
||||
* int(integer) - write an integer json value.
|
||||
* double(number) - write a double json value.
|
||||
* string(string) - write a string json value.
|
||||
* start_object() - start a json object.
|
||||
* key(string) - write the key of a json object.
|
||||
* end_object() - write the end of a json object.
|
||||
* start_array() - write the start of a json array.
|
||||
* end_array() - write the end of a json array.
|
||||
|
||||
High level API Methods:
|
||||
|
||||
* start_stream() - start the stream, call at the very beginning.
|
||||
* start_sstable() - start an sstable.
|
||||
* start_partition() - start a partition.
|
||||
* static_row() - write a static row to the stream.
|
||||
* clustering_row() - write a clustering row to the stream.
|
||||
* range_tombstone_change() - write a range tombstone change to the stream.
|
||||
* end_partition() - end the current partition.
|
||||
* end_sstable() - end the current sstable.
|
||||
* end_stream() - end the stream, call at the very end.
|
||||
|
||||
.. _scylla-new-json-writer-method:
|
||||
|
||||
ScyllaDB.new_json_writer()
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.json_writer <scylla-json-writer-type_>`_ instance.
|
||||
|
||||
.. _scylla-new-position-in-partition-method:
|
||||
|
||||
ScyllaDB.new_position_in_partition()
|
||||
""""""""""""""""""""""""""""""""""""
|
||||
|
||||
Creates a `ScyllaDB.position_in_partition <scylla-position-in-partition-type_>`_ instance.
|
||||
|
||||
Arguments:
|
||||
|
||||
* weight (integer) - the weight of the key.
|
||||
* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, optional.
|
||||
|
||||
.. _scylla-new-ring-position-method:
|
||||
|
||||
ScyllaDB.new_ring_position()
|
||||
""""""""""""""""""""""""""""
|
||||
|
||||
Creates a `ScyllaDB.ring_position <scylla-ring-position-type_>`_ instance.
|
||||
|
||||
Has several overloads:
|
||||
|
||||
* ``ScyllaDB.new_ring_position(weight, key)``.
|
||||
* ``ScyllaDB.new_ring_position(weight, token)``.
|
||||
* ``ScyllaDB.new_ring_position(weight, key, token)``.
|
||||
|
||||
Where:
|
||||
|
||||
* weight (integer) - the weight of the key.
|
||||
* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key.
|
||||
* token (integer) - the token (of the key if a key is provided).
|
||||
|
||||
.. _scylla-now-method:
|
||||
|
||||
ScyllaDB.now()
|
||||
""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance, representing the current time.
|
||||
|
||||
.. _scylla-partition-key-type:
|
||||
|
||||
ScyllaDB.partition_key
|
||||
""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite partition key.
|
||||
|
||||
Methods:
|
||||
|
||||
* to_hex - convert the key to its serialized format, encoded in hex.
|
||||
|
||||
Magic methods:
|
||||
|
||||
* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
|
||||
|
||||
See also:
|
||||
|
||||
* :ref:`ScyllaDB.unserialize_partition_key() <scylla-unserialize-partition-key-method>`.
|
||||
* :ref:`ScyllaDB.token_of() <scylla-token-of-method>`.
|
||||
|
||||
.. _scylla-partition-start-type:
|
||||
|
||||
ScyllaDB.partition_start
|
||||
""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* key - the partition key's value as the appropriate Lua native type.
|
||||
* token (integer) - the partition key's token.
|
||||
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - the partition tombstone, ``nil`` if no tombstone.
|
||||
|
||||
.. _scylla-position-in-partition-type:
|
||||
|
||||
ScyllaDB.position_in_partition
|
||||
""""""""""""""""""""""""""""""
|
||||
|
||||
Currently used only for clustering positions.
|
||||
|
||||
Attributes:
|
||||
|
||||
* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, ``nil`` if the position in partition represents the min or max clustering positions.
|
||||
* weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key). If key attribute is ``nil``, the weight is never 0.
|
||||
|
||||
Methods:
|
||||
|
||||
* tri_cmp - compare this position in partition to another position in partition, returns -1 (``<``), 0 (``==``) or 1 (``>``).
|
||||
|
||||
See also:
|
||||
|
||||
* `ScyllaDB.new_position_in_partition() <scylla-new-position-in-partition-method_>`_.
|
||||
|
||||
.. _scylla-range-tombstone-change-type:
|
||||
|
||||
ScyllaDB.range_tombstone_change
|
||||
"""""""""""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
|
||||
* key_weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key).
|
||||
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - tombstone, ``nil`` if no tombstone.
|
||||
|
||||
.. _scylla-ring-position-type:
|
||||
|
||||
ScyllaDB.ring_position
|
||||
""""""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* token (integer) - the token, ``nil`` if the ring position represents the min or max ring positions.
|
||||
* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key, ``nil`` if the ring position represents a position before/after a token.
|
||||
* weight (integer) - weight of the position, either -1 (before key/token), 0 (at key) or 1 (after key/token). If key attribute is ``nil``, the weight is never 0.
|
||||
|
||||
Methods:
|
||||
|
||||
* tri_cmp - compare this ring position to another ring position, returns -1 (``<``), 0 (``==``) or 1 (``>``).
|
||||
|
||||
See also:
|
||||
|
||||
* `ScyllaDB.new_ring_position() <scylla-new-ring-position-method_>`_.
|
||||
|
||||
.. _scylla-row-marker-type:
|
||||
|
||||
ScyllaDB.row_marker
|
||||
"""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* timestamp (integer).
|
||||
* is_live (boolean) - is the marker live?
|
||||
* has_ttl (boolean) - is the marker expiring?
|
||||
* ttl (integer) - time to live in seconds, ``nil`` if marker is not expiring.
|
||||
* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker expires, ``nil`` if marker is not expiring.
|
||||
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker was deleted, ``nil`` unless marker is dead or expiring.
|
||||
|
||||
.. _scylla-schema-type:
|
||||
|
||||
ScyllaDB.schema
|
||||
"""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* partition_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the partition key.
|
||||
* clustering_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the clustering key.
|
||||
* static_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the static columns.
|
||||
* regular_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the regular columns.
|
||||
* all_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of all columns.
|
||||
|
||||
.. _scylla-sstable-type:
|
||||
|
||||
ScyllaDB.sstable
|
||||
""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* filename (string) - the full path of the sstable Data component file;
|
||||
|
||||
.. _scylla-static-row-type:
|
||||
|
||||
ScyllaDB.static_row
|
||||
"""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
|
||||
|
||||
.. _scylla-time-point-from-string-method:
|
||||
|
||||
ScyllaDB.time_point_from_string()
|
||||
"""""""""""""""""""""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance from the passed in string.
|
||||
Argument is string, using the same format as the CQL timestamp type, see https://en.wikipedia.org/wiki/ISO_8601.
|
||||
|
||||
.. _scylla-token-of-method:
|
||||
|
||||
ScyllaDB.token_of()
|
||||
"""""""""""""""""""
|
||||
|
||||
Compute and return the token (integer) for a `ScyllaDB.partition_key <scylla-partition-key-type_>`_.
|
||||
|
||||
.. _scylla-tombstone-type:
|
||||
|
||||
ScyllaDB.tombstone
|
||||
""""""""""""""""""
|
||||
|
||||
Attributes:
|
||||
|
||||
* timestamp (integer)
|
||||
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - the point in time at which the tombstone was deleted.
|
||||
|
||||
.. _scylla-unserialize-clustering-key-method:
|
||||
|
||||
ScyllaDB.unserialize_clustering_key()
|
||||
"""""""""""""""""""""""""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.clustering_key <scylla-clustering-key-type_>`_ instance.
|
||||
|
||||
Argument is a string representing serialized clustering key in hex format.
|
||||
|
||||
.. _scylla-unserialize-partition-key-method:
|
||||
|
||||
ScyllaDB.unserialize_partition_key()
|
||||
""""""""""""""""""""""""""""""""""""
|
||||
|
||||
Create a `ScyllaDB.partition_key <scylla-partition-key-type_>`_ instance.
|
||||
|
||||
Argument is a string representing serialized partition key in hex format.
|
||||
See the :doc:`scripting API </operating-scylla/admin-tools/scylla-sstable-script-api/>` for more details.
|
||||
|
||||
Examples
|
||||
~~~~~~~~
|
||||
@@ -1388,7 +870,7 @@ You can find example scripts at https://github.com/scylladb/scylladb/tree/master
|
||||
upgrade
|
||||
^^^^^^^
|
||||
|
||||
Offline, scylla-sstable variant of `nodetool upgradesstables </operating-scylla/nodetool-commands/upgradesstables/>`_.
|
||||
Offline, scylla-sstable variant of :doc:`nodetool upgradesstables </operating-scylla/nodetool-commands/upgradesstables/>`.
|
||||
Rewrites the input SSTable(s) to the latest supported version and latest schema version.
|
||||
The SSTable version to be used can be overridden with the ``--version`` flag, allowing for switching sstables between all versions supported for writing (some SSTable versions are supported for reading only).
|
||||
|
||||
@@ -1397,7 +879,7 @@ SSTables which are already on the designated version are skipped. To force rewri
|
||||
Output SSTables are written to the path provided by the ``--output-dir`` flag, or to the current directory if not specified.
|
||||
This directory is expected to exist and be empty. If not empty the tool will refuse to run. This can be overridden with the ``--unsafe-accept-nonempty-output-dir`` flag.
|
||||
|
||||
It is strongly recommended to use the system schema tables as the schema source for this command, see the `schema options <scylla-sstable-schema_>`_ for more details.
|
||||
It is strongly recommended to use the system schema tables as the schema source for this command, see the :ref:`schema options <scylla-sstable-schema>` for more details.
|
||||
A schema which is good enough to read the SSTable and dump its content, may not be good enough to write its content back verbatim.
|
||||
An incomplete or incorrect schema can lead to the tool crashing or even data loss.
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ Logs
|
||||
|
||||
The most obvious source of information to find out more about why ScyllaDB is misbehaving.
|
||||
On production systems, ScyllaDB logs to syslog; thus logs can usually be viewed via ``journalctl``.
|
||||
See `Logging </getting-started/logging/>`_ on more information on how to access the logs.
|
||||
See :doc:`Logging </getting-started/logging/>` on more information on how to access the logs.
|
||||
|
||||
|
||||
ScyllaDB has the following log levels: ``trace``, ``debug``, ``info``, ``warn``, ``error``.
|
||||
@@ -64,21 +64,21 @@ Tracing
|
||||
Tracing allows you to retrieve the internal log of events happening in the context of a single query.
|
||||
Therefore, tracing is only useful to diagnose problems related to a certain query and cannot be used to diagnose generic problems.
|
||||
That said, when it comes to diagnosing problems with a certain query, tracing is an excellent tool, allowing you to have a peek at what happens when that query is processed, including the timestamp of each event.
|
||||
For more details, see `Tracing </using-scylla/tracing>`_.
|
||||
For more details, see :doc:`Tracing </using-scylla/tracing>`.
|
||||
|
||||
Nodetool
|
||||
--------
|
||||
|
||||
Although ``nodetool`` is primarily an administration tool, it has various commands that retrieve and display useful information about the state of a certain ScyllaDB node.
|
||||
Look for commands with "stats", "info", "describe", "get", "histogram" in their names.
|
||||
For a comprehensive list of all available nodetool commands, see the `Nodetool Reference </operating-scylla/nodetool>`_.
|
||||
For a comprehensive list of all available nodetool commands, see the :doc:`Nodetool Reference </operating-scylla/nodetool>`.
|
||||
|
||||
REST API
|
||||
--------
|
||||
|
||||
ScyllaDB has a REST API which is a superset of all ``nodetool`` commands, in the sense that it is the backend serving all of them.
|
||||
It has many more endpoints, many of which can supply valuable information about the internal state of ScyllaDB.
|
||||
For more information, see `REST API </operating-scylla/rest>`_.
|
||||
For more information, see :doc:`REST API </operating-scylla/rest>`.
|
||||
|
||||
System Tables
|
||||
-------------
|
||||
@@ -102,9 +102,9 @@ Other Tools
|
||||
ScyllaDB has various other tools, mainly to work with sstables.
|
||||
If you are diagnosing a problem that is related to sstables misbehaving or being corrupt, you may find these useful:
|
||||
|
||||
* `sstabledump </operating-scylla/admin-tools/sstabledump/>`_
|
||||
* `ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>`_
|
||||
* `ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>`_
|
||||
* :doc:`sstabledump </operating-scylla/admin-tools/sstabledump/>`
|
||||
* :doc:`ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>`
|
||||
* :doc:`ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>`
|
||||
|
||||
GDB
|
||||
---
|
||||
|
||||
@@ -9,6 +9,8 @@ Running ``cluster repair`` on a **single node** synchronizes all data on all nod
|
||||
To synchronize all data in clusters that have both tablets-based and vnodes-based keyspaces, run :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair/>` on **all**
|
||||
of the nodes in the cluster, and :doc:`nodetool cluster repair </operating-scylla/nodetool-commands/cluster/repair/>` on **any** of the nodes in the cluster.
|
||||
|
||||
.. warning:: :term:`Colocated Tables <Colocated Table>` cannot be synchronized using cluster repair in this version.
|
||||
|
||||
To check if a keyspace enables tablets, use:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
@@ -14,12 +14,13 @@ Enable ScyllaDB :doc:`Authentication </operating-scylla/security/authentication>
|
||||
Enabling Audit
|
||||
---------------
|
||||
|
||||
By default, auditing is **disabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
|
||||
By default, auditing is **enabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
|
||||
You can set the following options:
|
||||
|
||||
* ``none`` - Audit is disabled (default).
|
||||
* ``table`` - Audit is enabled, and messages are stored in a Scylla table.
|
||||
* ``syslog`` - Audit is enabled, and messages are sent to Syslog.
|
||||
* ``syslog,table`` - Audit is enabled, and messages are stored in a Scylla table and sent to Syslog.
|
||||
|
||||
Configuring any other value results in an error at Scylla startup.
|
||||
|
||||
|
||||
@@ -202,3 +202,7 @@ Glossary
|
||||
The name comes from two basic operations, multiply (MU) and rotate (R), used in its inner loop.
|
||||
The MurmurHash3 version used in ScyllaDB originated from `Apache Cassandra <https://commons.apache.org/proper/commons-codec/apidocs/org/apache/commons/codec/digest/MurmurHash3.html>`_, and is **not** identical to the `official MurmurHash3 calculation <https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java#L31-L33>`_. More `here <https://github.com/russss/murmur3-cassandra>`_.
|
||||
|
||||
Colocated Table
|
||||
An internal table of a special type in a :doc:`tablets </architecture/tablets>` enabled keyspace that is colocated with another base table, meaning it always has the same tablet replicas as the base table.
|
||||
Current types of colocated tables include CDC log tables, local indexes, and materialized views that have the same partition key as their base table.
|
||||
|
||||
|
||||
@@ -177,6 +177,8 @@ public:
|
||||
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
|
||||
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
|
||||
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
|
||||
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
|
||||
gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
@@ -8,76 +8,120 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "service/tablet_allocator_fwd.hh"
|
||||
#include "locator/topology.hh"
|
||||
#include "locator/token_metadata.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
#include "utils/extremum_tracking.hh"
|
||||
#include "utils/div_ceil.hh"
|
||||
#include "utils/pretty_printers.hh"
|
||||
|
||||
#include <absl/container/btree_set.h>
|
||||
#include <seastar/util/defer.hh>
|
||||
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
namespace locator {
|
||||
|
||||
struct disk_usage {
|
||||
using load_type = double; // Disk usage factor (0.0 to 1.0)
|
||||
|
||||
uint64_t capacity = 0;
|
||||
uint64_t used = 0;
|
||||
|
||||
load_type get_load() const {
|
||||
if (capacity == 0) {
|
||||
return 0;
|
||||
}
|
||||
return load_type(used) / capacity;
|
||||
}
|
||||
};
|
||||
|
||||
/// A data structure which keeps track of load associated with data ownership
|
||||
/// on shards of the whole cluster.
|
||||
class load_sketch {
|
||||
using shard_id = seastar::shard_id;
|
||||
using load_type = ssize_t; // In tablets.
|
||||
using load_type = disk_usage::load_type;
|
||||
|
||||
struct shard_load {
|
||||
shard_id id;
|
||||
load_type load;
|
||||
disk_usage du;
|
||||
size_t tablet_count = 0;
|
||||
|
||||
// Returns storage utilization for the shard
|
||||
load_type get_load() const {
|
||||
return du.get_load();
|
||||
}
|
||||
};
|
||||
|
||||
// Less-comparator which orders by load first (ascending), and then by shard id (ascending).
|
||||
struct shard_load_cmp {
|
||||
bool operator()(const shard_load& a, const shard_load& b) const {
|
||||
return a.load == b.load ? a.id < b.id : a.load < b.load;
|
||||
bool operator()(const shard_load& shard_a, const shard_load& shard_b) const {
|
||||
auto load_a = shard_a.get_load();
|
||||
auto load_b = shard_b.get_load();
|
||||
return load_a == load_b ? shard_a.id < shard_b.id : load_a < load_b;
|
||||
}
|
||||
};
|
||||
|
||||
struct node_load {
|
||||
std::vector<shard_load> _shards;
|
||||
absl::btree_set<shard_load, shard_load_cmp> _shards_by_load;
|
||||
std::vector<load_type> _shards;
|
||||
load_type _load = 0;
|
||||
disk_usage _du;
|
||||
size_t _tablet_count = 0;
|
||||
|
||||
node_load(size_t shard_count) : _shards(shard_count) {
|
||||
// These can be false only when _load_stats != nullptr
|
||||
bool _has_valid_disk_capacity = true;
|
||||
bool _has_all_tablet_sizes = true;
|
||||
|
||||
node_load(size_t shard_count, uint64_t capacity)
|
||||
: _shards(shard_count) {
|
||||
_du.capacity = capacity;
|
||||
uint64_t shard_capacity = capacity / shard_count;
|
||||
for (shard_id i = 0; i < shard_count; ++i) {
|
||||
_shards[i] = 0;
|
||||
_shards[i].id = i;
|
||||
_shards[i].du.capacity = shard_capacity;
|
||||
}
|
||||
}
|
||||
|
||||
void update_shard_load(shard_id shard, load_type load_delta) {
|
||||
_load += load_delta;
|
||||
|
||||
auto old_load = _shards[shard];
|
||||
auto new_load = old_load + load_delta;
|
||||
_shards_by_load.erase(shard_load{shard, old_load});
|
||||
_shards[shard] = new_load;
|
||||
_shards_by_load.insert(shard_load{shard, new_load});
|
||||
void update_shard_load(shard_id shard, ssize_t tablet_count_delta, int64_t tablet_size_delta) {
|
||||
_shards_by_load.erase(_shards[shard]);
|
||||
_shards[shard].tablet_count += tablet_count_delta;
|
||||
_shards[shard].du.used += tablet_size_delta;
|
||||
_shards_by_load.insert(_shards[shard]);
|
||||
_du.used += tablet_size_delta;
|
||||
_tablet_count += tablet_count_delta;
|
||||
}
|
||||
|
||||
void populate_shards_by_load() {
|
||||
_shards_by_load.clear();
|
||||
_shards_by_load.insert(_shards.begin(), _shards.end());
|
||||
}
|
||||
|
||||
void normalize(load_type factor) {
|
||||
_du.used /= factor;
|
||||
for (shard_id i = 0; i < _shards.size(); ++i) {
|
||||
_shards_by_load.insert(shard_load{i, _shards[i]});
|
||||
_shards[i].du.used /= factor;
|
||||
}
|
||||
populate_shards_by_load();
|
||||
}
|
||||
|
||||
load_type& load() noexcept {
|
||||
return _load;
|
||||
}
|
||||
|
||||
const load_type& load() const noexcept {
|
||||
return _load;
|
||||
// Returns storage utilization for the node
|
||||
load_type get_load() const noexcept {
|
||||
return _du.get_load();
|
||||
}
|
||||
};
|
||||
std::unordered_map<host_id, node_load> _nodes;
|
||||
token_metadata_ptr _tm;
|
||||
load_stats_ptr _load_stats;
|
||||
uint64_t _default_tablet_size = service::default_target_tablet_size;
|
||||
uint64_t _minimal_tablet_size = 0;
|
||||
|
||||
// When set to true, it will use gross disk capacity instead of effective_capacity and
|
||||
// treat all tablet as having the same size: _default_tablet_size
|
||||
bool _force_capacity_based_load = false;
|
||||
|
||||
private:
|
||||
tablet_replica_set get_replicas_for_tablet_load(const tablet_info& ti, const tablet_transition_info* trinfo) const {
|
||||
// We reflect migrations in the load as if they already happened,
|
||||
@@ -85,10 +129,34 @@ private:
|
||||
return trinfo ? trinfo->next : ti.replicas;
|
||||
}
|
||||
|
||||
future<> populate_table(const tablet_map& tmap, std::optional<host_id> host, std::optional<sstring> only_dc) {
|
||||
std::optional<uint64_t> get_disk_capacity_for_node(host_id node) {
|
||||
if (_load_stats) {
|
||||
if (_load_stats->tablet_stats.contains(node) && !_force_capacity_based_load) {
|
||||
return _load_stats->tablet_stats.at(node).effective_capacity;
|
||||
} else if (_load_stats->capacity.contains(node)) {
|
||||
return _load_stats->capacity.at(node);
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<uint64_t> get_tablet_size(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const {
|
||||
if (_force_capacity_based_load) {
|
||||
return _default_tablet_size;
|
||||
}
|
||||
|
||||
std::optional<uint64_t> tablet_size_opt;
|
||||
if (_load_stats) {
|
||||
tablet_size_opt = _load_stats->get_tablet_size_in_transition(host, rb_tid, ti, trinfo);
|
||||
}
|
||||
return tablet_size_opt;
|
||||
}
|
||||
|
||||
future<> populate_table(table_id table, const tablet_map& tmap, std::optional<host_id> host, std::optional<sstring> only_dc) {
|
||||
const topology& topo = _tm->get_topology();
|
||||
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> {
|
||||
for (auto&& replica : get_replicas_for_tablet_load(ti, tmap.get_tablet_transition_info(tid))) {
|
||||
auto trinfo = tmap.get_tablet_transition_info(tid);
|
||||
for (auto&& replica : get_replicas_for_tablet_load(ti, trinfo)) {
|
||||
if (host && *host != replica.host) {
|
||||
continue;
|
||||
}
|
||||
@@ -97,28 +165,50 @@ private:
|
||||
if (only_dc && node->dc_rack().dc != *only_dc) {
|
||||
continue;
|
||||
}
|
||||
_nodes.emplace(replica.host, node_load{node->get_shard_count()});
|
||||
auto disk_capacity_opt = get_disk_capacity_for_node(replica.host);
|
||||
auto [i, _] = _nodes.emplace(replica.host, node_load{node->get_shard_count(), disk_capacity_opt.value_or(_default_tablet_size)});
|
||||
if (!disk_capacity_opt && _load_stats) {
|
||||
i->second._has_valid_disk_capacity = false;
|
||||
}
|
||||
}
|
||||
node_load& n = _nodes.at(replica.host);
|
||||
if (replica.shard < n._shards.size()) {
|
||||
n.load() += 1;
|
||||
n._shards[replica.shard] += 1;
|
||||
const range_based_tablet_id rb_tid {table, tmap.get_token_range(tid)};
|
||||
auto tablet_size_opt = get_tablet_size(replica.host, rb_tid, ti, trinfo);
|
||||
if (!tablet_size_opt && _load_stats) {
|
||||
n._has_all_tablet_sizes = false;
|
||||
}
|
||||
const uint64_t tablet_size = std::max(tablet_size_opt.value_or(_default_tablet_size), _minimal_tablet_size);
|
||||
n._du.used += tablet_size;
|
||||
n._tablet_count++;
|
||||
n._shards[replica.shard].du.used += tablet_size;
|
||||
n._shards[replica.shard].tablet_count++;
|
||||
// Note: as an optimization, _shards_by_load is populated later in populate_shards_by_load()
|
||||
}
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
void throw_on_incomplete_data(host_id host, bool only_check_disk_capacity = false) const {
|
||||
if (!has_complete_data(host, only_check_disk_capacity)) {
|
||||
throw std::runtime_error(format("Can't provide accurate load computation with incomplete load_stats for host: {}", host));
|
||||
}
|
||||
}
|
||||
public:
|
||||
load_sketch(token_metadata_ptr tm)
|
||||
: _tm(std::move(tm)) {
|
||||
load_sketch(token_metadata_ptr tm, load_stats_ptr load_stats = {}, uint64_t default_tablet_size = service::default_target_tablet_size)
|
||||
: _tm(std::move(tm))
|
||||
, _load_stats(std::move(load_stats))
|
||||
, _default_tablet_size(default_tablet_size) {
|
||||
}
|
||||
|
||||
future<> clear() {
|
||||
return utils::clear_gently(_nodes);
|
||||
}
|
||||
|
||||
future<> populate(std::optional<host_id> host = std::nullopt,
|
||||
std::optional<table_id> only_table = std::nullopt,
|
||||
std::optional<sstring> only_dc = std::nullopt) {
|
||||
co_await utils::clear_gently(_nodes);
|
||||
|
||||
if (host) {
|
||||
ensure_node(*host);
|
||||
} else {
|
||||
@@ -132,11 +222,11 @@ public:
|
||||
if (only_table) {
|
||||
if (_tm->tablets().has_tablet_map(*only_table)) {
|
||||
auto& tmap = _tm->tablets().get_tablet_map(*only_table);
|
||||
co_await populate_table(tmap, host, only_dc);
|
||||
co_await populate_table(*only_table, tmap, host, only_dc);
|
||||
}
|
||||
} else {
|
||||
for (const auto& [table, tmap] : _tm->tablets().all_tables_ungrouped()) {
|
||||
co_await populate_table(*tmap, host, only_dc);
|
||||
co_await populate_table(table, *tmap, host, only_dc);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,12 +239,52 @@ public:
|
||||
return populate(std::nullopt, std::nullopt, dc);
|
||||
}
|
||||
|
||||
shard_id next_shard(host_id node) {
|
||||
|
||||
future<> populate_with_normalized_load() {
|
||||
co_await populate();
|
||||
|
||||
min_max_tracker<load_type> minmax;
|
||||
minmax.update(1);
|
||||
for (auto&& id : _nodes | std::views::keys) {
|
||||
minmax.update(get_shard_minmax(id).max());
|
||||
}
|
||||
|
||||
for (auto&& n : _nodes | std::views::values) {
|
||||
n.normalize(minmax.max());
|
||||
}
|
||||
}
|
||||
|
||||
shard_id next_shard(host_id node, size_t tablet_count, uint64_t tablet_size_sum) {
|
||||
auto shard = get_least_loaded_shard(node);
|
||||
pick(node, shard);
|
||||
pick(node, shard, tablet_count, tablet_size_sum);
|
||||
return shard;
|
||||
}
|
||||
|
||||
bool has_complete_data(host_id node, bool only_check_disk_capacity = false) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return false;
|
||||
}
|
||||
auto& n = _nodes.at(node);
|
||||
return n._has_valid_disk_capacity && (only_check_disk_capacity || n._has_all_tablet_sizes);
|
||||
}
|
||||
|
||||
void ignore_incomplete_data(host_id node) {
|
||||
if (!_nodes.contains(node)) {
|
||||
return;
|
||||
}
|
||||
auto& n = _nodes.at(node);
|
||||
n._has_valid_disk_capacity = true;
|
||||
n._has_all_tablet_sizes = true;
|
||||
}
|
||||
|
||||
void set_minimal_tablet_size(uint64_t min_ts) {
|
||||
_minimal_tablet_size = min_ts;
|
||||
}
|
||||
|
||||
void set_force_capacity_based_load(bool force_capacity_based_load) {
|
||||
_force_capacity_based_load = force_capacity_based_load;
|
||||
}
|
||||
|
||||
node_load& ensure_node(host_id node) {
|
||||
if (!_nodes.contains(node)) {
|
||||
const topology& topo = _tm->get_topology();
|
||||
@@ -162,55 +292,69 @@ public:
|
||||
if (shard_count == 0) {
|
||||
throw std::runtime_error(format("Shard count not known for node {}", node));
|
||||
}
|
||||
auto [i, _] = _nodes.emplace(node, node_load{shard_count});
|
||||
auto disk_capacity_opt = get_disk_capacity_for_node(node);
|
||||
auto [i, _] = _nodes.emplace(node, node_load{shard_count, disk_capacity_opt.value_or(_default_tablet_size)});
|
||||
i->second.populate_shards_by_load();
|
||||
if (!disk_capacity_opt && _load_stats) {
|
||||
i->second._has_valid_disk_capacity = false;
|
||||
}
|
||||
}
|
||||
return _nodes.at(node);
|
||||
}
|
||||
|
||||
shard_id get_least_loaded_shard(host_id node) {
|
||||
auto& n = ensure_node(node);
|
||||
const shard_load& s = *n._shards_by_load.begin();
|
||||
return s.id;
|
||||
throw_on_incomplete_data(node);
|
||||
return n._shards_by_load.begin()->id;
|
||||
}
|
||||
|
||||
shard_id get_most_loaded_shard(host_id node) {
|
||||
auto& n = ensure_node(node);
|
||||
const shard_load& s = *std::prev(n._shards_by_load.end());
|
||||
return s.id;
|
||||
throw_on_incomplete_data(node);
|
||||
return std::prev(n._shards_by_load.end())->id;
|
||||
}
|
||||
|
||||
void unload(host_id node, shard_id shard) {
|
||||
void unload(host_id node, shard_id shard, size_t tablet_count_delta, uint64_t tablet_sizes_delta) {
|
||||
throw_on_incomplete_data(node);
|
||||
auto& n = _nodes.at(node);
|
||||
n.update_shard_load(shard, -1);
|
||||
n.update_shard_load(shard, -ssize_t(tablet_count_delta), -int64_t(tablet_sizes_delta));
|
||||
}
|
||||
|
||||
void pick(host_id node, shard_id shard) {
|
||||
void pick(host_id node, shard_id shard, size_t tablet_count_delta, uint64_t tablet_sizes_delta) {
|
||||
throw_on_incomplete_data(node);
|
||||
auto& n = _nodes.at(node);
|
||||
n.update_shard_load(shard, 1);
|
||||
n.update_shard_load(shard, tablet_count_delta, tablet_sizes_delta);
|
||||
}
|
||||
|
||||
load_type get_load(host_id node) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return 0;
|
||||
}
|
||||
return _nodes.at(node).load();
|
||||
throw_on_incomplete_data(node);
|
||||
return _nodes.at(node).get_load();
|
||||
}
|
||||
|
||||
load_type total_load() const {
|
||||
load_type total = 0;
|
||||
for (auto&& n : _nodes) {
|
||||
total += n.second.load();
|
||||
uint64_t get_tablet_count(host_id node) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return 0;
|
||||
}
|
||||
return total;
|
||||
return _nodes.at(node)._tablet_count;
|
||||
}
|
||||
|
||||
load_type get_avg_shard_load(host_id node) const {
|
||||
uint64_t get_avg_tablet_count(host_id node) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return 0;
|
||||
}
|
||||
auto& n = _nodes.at(node);
|
||||
return div_ceil(n.load(), n._shards.size());
|
||||
return div_ceil(n._tablet_count, n._shards.size());
|
||||
}
|
||||
|
||||
double get_real_avg_tablet_count(host_id node) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return 0;
|
||||
}
|
||||
auto& n = _nodes.at(node);
|
||||
return double(n._tablet_count) / n._shards.size();
|
||||
}
|
||||
|
||||
double get_real_avg_shard_load(host_id node) const {
|
||||
@@ -218,7 +362,23 @@ public:
|
||||
return 0;
|
||||
}
|
||||
auto& n = _nodes.at(node);
|
||||
return double(n.load()) / n._shards.size();
|
||||
return double(n.get_load()) / n._shards.size();
|
||||
}
|
||||
|
||||
uint64_t get_disk_used(host_id node) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return 0;
|
||||
}
|
||||
throw_on_incomplete_data(node);
|
||||
return _nodes.at(node)._du.used;
|
||||
}
|
||||
|
||||
uint64_t get_capacity(host_id node) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return 0;
|
||||
}
|
||||
throw_on_incomplete_data(node, true);
|
||||
return _nodes.at(node)._du.capacity;
|
||||
}
|
||||
|
||||
shard_id get_shard_count(host_id node) const {
|
||||
@@ -231,17 +391,18 @@ public:
|
||||
// Returns the difference in tablet count between highest-loaded shard and lowest-loaded shard.
|
||||
// Returns 0 when shards are perfectly balanced.
|
||||
// Returns 1 when shards are imbalanced, but it's not possible to balance them.
|
||||
load_type get_shard_imbalance(host_id node) const {
|
||||
auto minmax = get_shard_minmax(node);
|
||||
return minmax.max() - minmax.max();
|
||||
size_t get_shard_tablet_count_imbalance(host_id node) const {
|
||||
auto minmax = get_shard_minmax_tablet_count(node);
|
||||
return minmax.max() - minmax.min();
|
||||
}
|
||||
|
||||
min_max_tracker<load_type> get_shard_minmax(host_id node) const {
|
||||
min_max_tracker<load_type> minmax;
|
||||
if (_nodes.contains(node)) {
|
||||
throw_on_incomplete_data(node);
|
||||
auto& n = _nodes.at(node);
|
||||
for (auto&& load: n._shards) {
|
||||
minmax.update(load);
|
||||
for (auto&& shard: n._shards) {
|
||||
minmax.update(shard.get_load());
|
||||
}
|
||||
} else {
|
||||
minmax.update(0);
|
||||
@@ -249,18 +410,44 @@ public:
|
||||
return minmax;
|
||||
}
|
||||
|
||||
// Returns nullopt if capacity is not known.
|
||||
std::optional<double> get_allocated_utilization(host_id node, const locator::load_stats& stats, uint64_t target_tablet_size) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
min_max_tracker<size_t> get_shard_minmax_tablet_count(host_id node) const {
|
||||
min_max_tracker<size_t> minmax;
|
||||
if (_nodes.contains(node)) {
|
||||
auto& n = _nodes.at(node);
|
||||
for (auto&& shard: n._shards) {
|
||||
minmax.update(shard.tablet_count);
|
||||
}
|
||||
} else {
|
||||
minmax.update(0);
|
||||
}
|
||||
return minmax;
|
||||
}
|
||||
|
||||
// Returns nullopt if node is not known, or we don't have valid disk capacity.
|
||||
std::optional<load_type> get_allocated_utilization(host_id node) const {
|
||||
if (!_nodes.contains(node) || !has_complete_data(node, true)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto& n = _nodes.at(node);
|
||||
if (!stats.capacity.contains(node)) {
|
||||
const node_load& n = _nodes.at(node);
|
||||
return load_type(n._tablet_count * _default_tablet_size) / n._du.capacity;
|
||||
}
|
||||
|
||||
// Returns nullopt if node is not known, or we don't have tablet sizes or valid disk capacity.
|
||||
std::optional<load_type> get_storage_utilization(host_id node) const {
|
||||
if (!_nodes.contains(node) || !has_complete_data(node)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto capacity = stats.capacity.at(node);
|
||||
return capacity > 0 ? double(n.load() * target_tablet_size) / capacity : 0;
|
||||
return _nodes.at(node).get_load();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace locator
|
||||
|
||||
template<>
|
||||
struct fmt::formatter<locator::disk_usage> : fmt::formatter<string_view> {
|
||||
template <typename FormatContext>
|
||||
auto format(const locator::disk_usage& du, FormatContext& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "cap: {:i} used: {:i} load: {}",
|
||||
utils::pretty_printed_data_size(du.capacity), utils::pretty_printed_data_size(du.used), du.get_load());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -317,6 +317,7 @@ future<tablet_map> network_topology_strategy::allocate_tablets_for_new_table(sch
|
||||
future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, tablet_map tablets) const {
|
||||
natural_endpoints_tracker::check_enough_endpoints(*tm, _dc_rep_factor);
|
||||
load_sketch load(tm);
|
||||
co_await load.populate_with_normalized_load();
|
||||
co_await load.populate(std::nullopt, s->id());
|
||||
|
||||
tablet_logger.debug("Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());
|
||||
@@ -403,7 +404,7 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s
|
||||
if (node.dc_rack().dc == dc && is_rack_to_drop(node.dc_rack().rack)) {
|
||||
tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}",
|
||||
s->ks_name(), s->cf_name(), tb, node.dc_rack().dc, node.dc_rack().rack, tr);
|
||||
load.unload(tr.host, tr.shard);
|
||||
load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
|
||||
} else {
|
||||
filtered.emplace_back(tr);
|
||||
}
|
||||
@@ -445,7 +446,7 @@ tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
|
||||
fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
|
||||
}
|
||||
|
||||
auto new_replica = tablet_replica{min_node, load.next_shard(min_node)};
|
||||
auto new_replica = tablet_replica{min_node, load.next_shard(min_node, 1, service::default_target_tablet_size)};
|
||||
new_replicas.push_back(new_replica);
|
||||
|
||||
tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}",
|
||||
@@ -468,10 +469,10 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
// Track all nodes with no replicas on them for this tablet, per rack.
|
||||
struct node_load {
|
||||
locator::host_id host;
|
||||
uint64_t load;
|
||||
double load;
|
||||
};
|
||||
// for sorting in descending load order
|
||||
// (in terms of number of replicas)
|
||||
// (in terms of load)
|
||||
auto node_load_cmp = [] (const node_load& a, const node_load& b) {
|
||||
return a.load > b.load;
|
||||
};
|
||||
@@ -484,7 +485,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
candidates_list existing_racks;
|
||||
|
||||
// We use this list to start allocating from an
|
||||
// unpoplated rack.
|
||||
// unpopulated rack.
|
||||
candidates_list new_racks;
|
||||
|
||||
for (const auto& [rack, nodes] : all_dc_racks) {
|
||||
@@ -502,7 +503,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
const auto& host_id = node.get().host_id();
|
||||
if (!existing.contains(host_id)) {
|
||||
// FIXME: https://github.com/scylladb/scylladb/issues/26366
|
||||
candidate.nodes.emplace_back(host_id, load.get_avg_shard_load(host_id));
|
||||
candidate.nodes.emplace_back(host_id, load.get_real_avg_shard_load(host_id));
|
||||
}
|
||||
}
|
||||
if (candidate.nodes.empty()) {
|
||||
@@ -552,7 +553,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
|
||||
}
|
||||
auto host_id = nodes.back().host;
|
||||
auto replica = tablet_replica{host_id, load.next_shard(host_id)};
|
||||
auto replica = tablet_replica{host_id, load.next_shard(host_id, 1, service::default_target_tablet_size)};
|
||||
const auto& node = tm->get_topology().get_node(host_id);
|
||||
auto inserted = replicas_per_rack[node.dc_rack().rack].insert(host_id).second;
|
||||
// Sanity check that a node is not used more than once
|
||||
@@ -614,7 +615,7 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, c
|
||||
if (topo.get_node(tr.host).dc_rack().dc != dc || ++nodes_in_dc <= dc_rf) {
|
||||
filtered.emplace_back(tr);
|
||||
} else {
|
||||
load.unload(tr.host, tr.shard);
|
||||
load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
|
||||
}
|
||||
}
|
||||
return filtered;
|
||||
|
||||
@@ -927,6 +927,56 @@ std::optional<uint64_t> load_stats::get_tablet_size(host_id host, const range_ba
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<uint64_t> load_stats::get_tablet_size_in_transition(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const {
|
||||
std::optional<uint64_t> tablet_size_opt;
|
||||
tablet_size_opt = get_tablet_size(host, rb_tid);
|
||||
if (tablet_size_opt) {
|
||||
return tablet_size_opt;
|
||||
}
|
||||
|
||||
// If the tablet is in transition,
|
||||
// try to find it on the leaving replica, in case of tablet migration,
|
||||
// or get the avg tablet size of all the replicas, in case we have a rebuild
|
||||
if (trinfo) {
|
||||
switch (trinfo->transition) {
|
||||
case tablet_transition_kind::migration:
|
||||
// Search for the tablet size on leaving replica
|
||||
if (trinfo->pending_replica && trinfo->pending_replica->host == host) {
|
||||
if (auto leaving_replica = get_leaving_replica(ti, *trinfo)) {
|
||||
tablet_size_opt = get_tablet_size(leaving_replica->host, rb_tid);
|
||||
} else {
|
||||
on_internal_error_noexcept(tablet_logger, ::format("No leaving replica for tablet migration in table {}. ti.replicas: {} trinfo->next: {}",
|
||||
rb_tid.table, ti.replicas, trinfo->next));
|
||||
}
|
||||
}
|
||||
break;
|
||||
case tablet_transition_kind::rebuild:
|
||||
[[fallthrough]];
|
||||
case tablet_transition_kind::rebuild_v2: {
|
||||
// Get the avg tablet size from the available replicas
|
||||
size_t replica_count = 0;
|
||||
uint64_t tablet_size_sum = 0;
|
||||
for (auto& replica : ti.replicas) {
|
||||
auto new_tablet_size_opt = get_tablet_size(replica.host, rb_tid);
|
||||
if (new_tablet_size_opt) {
|
||||
tablet_size_sum += *new_tablet_size_opt;
|
||||
replica_count++;
|
||||
}
|
||||
}
|
||||
if (replica_count) {
|
||||
tablet_size_opt = tablet_size_sum / replica_count;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case tablet_transition_kind::intranode_migration:
|
||||
[[fallthrough]];
|
||||
case tablet_transition_kind::repair:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return tablet_size_opt;
|
||||
}
|
||||
|
||||
lw_shared_ptr<load_stats> load_stats::reconcile_tablets_resize(const std::unordered_set<table_id>& tables, const token_metadata& old_tm, const token_metadata& new_tm) const {
|
||||
lw_shared_ptr<load_stats> reconciled_stats { make_lw_shared<load_stats>(*this) };
|
||||
load_stats& new_stats = *reconciled_stats;
|
||||
|
||||
@@ -489,6 +489,12 @@ struct load_stats {
|
||||
|
||||
std::optional<uint64_t> get_tablet_size(host_id host, const range_based_tablet_id& rb_tid) const;
|
||||
|
||||
// Returns the tablet size on the given host. If the tablet size is not found on the host, we will search for it on
|
||||
// other hosts based on the tablet transition info:
|
||||
// - if the tablet is in migration, and the given host is pending, the tablet size will be searched on the leaving replica
|
||||
// - if the tablet is being rebuilt, we will return the average tablet size of all the replicas
|
||||
std::optional<uint64_t> get_tablet_size_in_transition(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const;
|
||||
|
||||
// Modifies the tablet sizes in load_stats for the given table after a split or merge. The old_tm argument has
|
||||
// to contain the token_metadata pre-resize. The function returns load_stats with tablet token ranges
|
||||
// corresponding to the post-resize tablet_map.
|
||||
|
||||
5
main.cc
5
main.cc
@@ -959,7 +959,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
auto ip = utils::resolve(cfg->prometheus_address || cfg->listen_address, family, preferred).get();
|
||||
|
||||
prometheus::config pctx;
|
||||
pctx.metric_help = "Scylla server statistics";
|
||||
pctx.prefix = cfg->prometheus_prefix();
|
||||
pctx.allow_protobuf = cfg->prometheus_allow_protobuf();
|
||||
prometheus::start(prometheus_server, pctx).get();
|
||||
@@ -1791,7 +1790,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "starting auth cache");
|
||||
auth_cache.start(std::ref(qp)).get();
|
||||
auth_cache.start(std::ref(qp), std::ref(stop_signal.as_sharded_abort_source())).get();
|
||||
auto stop_auth_cache = defer_verbose_shutdown("auth cache", [&] {
|
||||
auth_cache.stop().get();
|
||||
});
|
||||
@@ -2527,7 +2526,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
|
||||
api::set_server_service_levels(ctx, cql_server_ctl, qp).get();
|
||||
|
||||
alternator::controller alternator_ctl(gossiper, proxy, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group);
|
||||
alternator::controller alternator_ctl(gossiper, proxy, ss, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group);
|
||||
|
||||
// Register at_exit last, so that storage_service::drain_on_shutdown will be called first
|
||||
auto do_drain = defer_verbose_shutdown("local storage", [&ss] {
|
||||
|
||||
@@ -1292,7 +1292,7 @@ future<std::tuple<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation
|
||||
messaging_service::make_sink_and_source_for_stream_mutation_fragments(table_schema_version schema_id, streaming::plan_id plan_id, table_id cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, service::session_id session, locator::host_id id) {
|
||||
using value_type = std::tuple<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>;
|
||||
if (is_shutting_down()) {
|
||||
return make_exception_future<value_type>(rpc::closed_error());
|
||||
return make_exception_future<value_type>(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, addr_for_host_id(id), id);
|
||||
return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>().then([this, session, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd> sink) mutable {
|
||||
@@ -1321,7 +1321,7 @@ rpc::sink<streaming::stream_blob_cmd_data> messaging_service::make_sink_for_stre
|
||||
future<std::tuple<rpc::sink<streaming::stream_blob_cmd_data>, rpc::source<streaming::stream_blob_cmd_data>>>
|
||||
messaging_service::make_sink_and_source_for_stream_blob(streaming::stream_blob_meta meta, locator::host_id id) {
|
||||
if (is_shutting_down()) {
|
||||
co_await coroutine::return_exception(rpc::closed_error());
|
||||
co_await coroutine::return_exception(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client = get_rpc_client(messaging_verb::STREAM_BLOB, addr_for_host_id(id), id);
|
||||
auto sink = co_await rpc_client->make_stream_sink<netw::serializer, streaming::stream_blob_cmd_data>();
|
||||
@@ -1370,7 +1370,7 @@ future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wir
|
||||
messaging_service::make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
|
||||
auto verb = messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM;
|
||||
if (is_shutting_down()) {
|
||||
return make_exception_future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>>(rpc::closed_error());
|
||||
return make_exception_future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>>(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
|
||||
return do_make_sink_source<repair_hash_with_cmd, repair_row_on_wire_with_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
|
||||
@@ -1392,7 +1392,7 @@ future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_str
|
||||
messaging_service::make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
|
||||
auto verb = messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM;
|
||||
if (is_shutting_down()) {
|
||||
return make_exception_future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>>(rpc::closed_error());
|
||||
return make_exception_future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>>(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
|
||||
return do_make_sink_source<repair_row_on_wire_with_cmd, repair_stream_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
|
||||
@@ -1414,7 +1414,7 @@ future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd
|
||||
messaging_service::make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
|
||||
auto verb = messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM;
|
||||
if (is_shutting_down()) {
|
||||
return make_exception_future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>>(rpc::closed_error());
|
||||
return make_exception_future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>>(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
|
||||
return do_make_sink_source<repair_stream_cmd, repair_hash_with_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
|
||||
|
||||
@@ -127,20 +127,21 @@ auto send_message(messaging_service* ms, messaging_verb verb, std::optional<loca
|
||||
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
|
||||
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
|
||||
if (ms->is_shutting_down()) {
|
||||
return futurator::make_exception_future(rpc::closed_error());
|
||||
return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
|
||||
auto& rpc_client = *rpc_client_ptr;
|
||||
return rpc_handler(rpc_client, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (std::exception_ptr&& eptr) {
|
||||
ms->increment_dropped_messages(verb);
|
||||
if (try_catch<rpc::closed_error>(eptr)) {
|
||||
if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
|
||||
// This is a transport error
|
||||
if (host_id) {
|
||||
ms->remove_error_rpc_client(verb, *host_id);
|
||||
} else {
|
||||
ms->remove_error_rpc_client(verb, id);
|
||||
}
|
||||
return futurator::make_exception_future(std::move(eptr));
|
||||
return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
|
||||
host_id.value_or(locator::host_id{}), id.addr, exp->what())));
|
||||
} else {
|
||||
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
|
||||
return futurator::make_exception_future(std::move(eptr));
|
||||
@@ -165,20 +166,21 @@ auto send_message_timeout(messaging_service* ms, messaging_verb verb, std::optio
|
||||
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
|
||||
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
|
||||
if (ms->is_shutting_down()) {
|
||||
return futurator::make_exception_future(rpc::closed_error());
|
||||
return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
|
||||
auto& rpc_client = *rpc_client_ptr;
|
||||
return rpc_handler(rpc_client, timeout, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (std::exception_ptr&& eptr) {
|
||||
ms->increment_dropped_messages(verb);
|
||||
if (try_catch<rpc::closed_error>(eptr)) {
|
||||
if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
|
||||
// This is a transport error
|
||||
if (host_id) {
|
||||
ms->remove_error_rpc_client(verb, *host_id);
|
||||
} else {
|
||||
ms->remove_error_rpc_client(verb, id);
|
||||
}
|
||||
return futurator::make_exception_future(std::move(eptr));
|
||||
return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
|
||||
host_id.value_or(locator::host_id{}), id.addr, exp->what())));
|
||||
} else {
|
||||
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
|
||||
return futurator::make_exception_future(std::move(eptr));
|
||||
@@ -206,7 +208,7 @@ auto send_message_cancellable(messaging_service* ms, messaging_verb verb, std::o
|
||||
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
|
||||
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
|
||||
if (ms->is_shutting_down()) {
|
||||
return futurator::make_exception_future(rpc::closed_error());
|
||||
return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
|
||||
auto& rpc_client = *rpc_client_ptr;
|
||||
@@ -222,14 +224,15 @@ auto send_message_cancellable(messaging_service* ms, messaging_verb verb, std::o
|
||||
|
||||
return rpc_handler(rpc_client, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
|
||||
ms->increment_dropped_messages(verb);
|
||||
if (try_catch<rpc::closed_error>(eptr)) {
|
||||
if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
|
||||
// This is a transport error
|
||||
if (host_id) {
|
||||
ms->remove_error_rpc_client(verb, *host_id);
|
||||
} else {
|
||||
ms->remove_error_rpc_client(verb, id);
|
||||
}
|
||||
return futurator::make_exception_future(std::move(eptr));
|
||||
return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
|
||||
host_id.value_or(locator::host_id{}), id.addr, exp->what())));
|
||||
} else if (try_catch<rpc::canceled_error>(eptr)) {
|
||||
// Translate low-level canceled_error into high-level abort_requested_exception.
|
||||
return futurator::make_exception_future(abort_requested_exception{});
|
||||
@@ -255,9 +258,10 @@ auto send_message_timeout_cancellable(messaging_service* ms, messaging_verb verb
|
||||
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
|
||||
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
|
||||
if (ms->is_shutting_down()) {
|
||||
return futurator::make_exception_future(rpc::closed_error());
|
||||
return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
|
||||
}
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, ms->addr_for_host_id(host_id), host_id);
|
||||
auto address = ms->addr_for_host_id(host_id);
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, address, host_id);
|
||||
auto& rpc_client = *rpc_client_ptr;
|
||||
|
||||
auto c = std::make_unique<seastar::rpc::cancellable>();
|
||||
@@ -269,12 +273,13 @@ auto send_message_timeout_cancellable(messaging_service* ms, messaging_verb verb
|
||||
return futurator::make_exception_future(abort_requested_exception{});
|
||||
}
|
||||
|
||||
return rpc_handler(rpc_client, timeout, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
|
||||
return rpc_handler(rpc_client, timeout, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), host_id, address, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
|
||||
ms->increment_dropped_messages(verb);
|
||||
if (try_catch<rpc::closed_error>(eptr)) {
|
||||
if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
|
||||
// This is a transport error
|
||||
ms->remove_error_rpc_client(verb, host_id);
|
||||
return futurator::make_exception_future(std::move(eptr));
|
||||
return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
|
||||
host_id, address.addr, exp->what())));
|
||||
} else if (try_catch<rpc::canceled_error>(eptr)) {
|
||||
// Translate low-level canceled_error into high-level abort_requested_exception.
|
||||
return futurator::make_exception_future(abort_requested_exception{});
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:3cbe2dd05945f8fb76ebce2ea70864063d2b282c4d5080af1f290ead43321ab3
|
||||
size 6444732
|
||||
oid sha256:9d387b5ff44094e9b6c587d3e0cb2e7098ea68924f3f9947ff7574be3c378a4e
|
||||
size 6475784
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ad1705d5c37cc6b6cd24354b83fee8da64a14f918351d357f21cf771a650ad3d
|
||||
size 6452816
|
||||
oid sha256:3b35c1ed982e025b4c3d079c2d14873a115ff8e8d364a19633bf83052e52a059
|
||||
size 6473408
|
||||
|
||||
@@ -176,7 +176,7 @@ void fsm::become_leader() {
|
||||
|
||||
_last_election_time = _clock.now();
|
||||
_ping_leader = false;
|
||||
// a new leader needs to commit at lease one entry to make sure that
|
||||
// a new leader needs to commit at least one entry to make sure that
|
||||
// all existing entries in its log are committed as well. Also it should
|
||||
// send append entries RPC as soon as possible to establish its leadership
|
||||
// (3.4). Do both of those by committing a dummy entry.
|
||||
|
||||
@@ -1020,6 +1020,13 @@ void reader_concurrency_semaphore::consume(reader_permit::impl& permit, resource
|
||||
|
||||
void reader_concurrency_semaphore::signal(const resources& r) noexcept {
|
||||
_resources += r;
|
||||
if (_resources.count > _initial_resources.count || _resources.memory > _initial_resources.memory) [[unlikely]] {
|
||||
on_internal_error_noexcept(rcslog,
|
||||
format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name,
|
||||
_resources, _initial_resources));
|
||||
_resources.count = std::max(_resources.count, _initial_resources.count);
|
||||
_resources.memory = std::max(_resources.memory, _initial_resources.memory);
|
||||
}
|
||||
maybe_wake_execution_loop();
|
||||
}
|
||||
|
||||
|
||||
@@ -1179,6 +1179,7 @@ private:
|
||||
bool full = is_incremental_repair_using_all_sstables();
|
||||
auto& tinfo = tmap.get_tablet_info(id);
|
||||
auto sstables_repaired_at = tinfo.sstables_repaired_at;
|
||||
auto gid = locator::global_tablet_id{tid, id};
|
||||
// Consider this:
|
||||
// 1) n1 is the topology coordinator
|
||||
// 2) n1 schedules and executes a tablet repair with session id s1 for a tablet on n3 an n4.
|
||||
@@ -1190,14 +1191,16 @@ private:
|
||||
// To avoid the deadlock, we can throw in step 7 so that n2 will
|
||||
// proceed to the end_repair stage and release the lock. After that,
|
||||
// the scheduler could schedule the tablet repair again.
|
||||
if (_rs._repair_compaction_locks.contains(_frozen_topology_guard)) {
|
||||
if (_rs._repair_compaction_locks.contains(gid)) {
|
||||
auto msg = fmt::format("Tablet repair session={} table={} is in progress", _frozen_topology_guard, tid);
|
||||
rlogger.info("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
|
||||
co_await utils::get_local_injector().inject("incremental_repair_prepare_wait", utils::wait_for_message(60s));
|
||||
auto reenablers_and_holders = co_await table.get_compaction_reenablers_and_lock_holders_for_repair(_db.local(), _frozen_topology_guard, _range);
|
||||
for (auto& lock_holder : reenablers_and_holders.lock_holders) {
|
||||
_rs._repair_compaction_locks[_frozen_topology_guard].push_back(std::move(lock_holder));
|
||||
_rs._repair_compaction_locks[gid].push_back(std::move(lock_holder));
|
||||
}
|
||||
auto sstables = co_await table.take_storage_snapshot(_range);
|
||||
_incremental_repair_meta.sst_set = make_lw_shared<sstables::sstable_set>(sstables::make_partitioned_sstable_set(_schema, _range));
|
||||
@@ -2836,9 +2839,20 @@ future<> repair_service::init_ms_handlers() {
|
||||
auto& table = local_repair.get_db().local().find_column_family(gid.table);
|
||||
auto erm = table.get_effective_replication_map();
|
||||
auto& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(gid.table);
|
||||
auto* trinfo = tmap.get_tablet_transition_info(gid.tablet);
|
||||
if (!trinfo) {
|
||||
auto msg = fmt::format("Skipped repair_update_compaction_ctrl gid={} session_id={} since tablet is not in transition", gid, topo_guard);
|
||||
rlogger.warn("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
if (trinfo->stage != locator::tablet_transition_stage::end_repair) {
|
||||
auto msg = fmt::format("Skipped repair_update_compaction_ctrl gid={} session_id={} since tablet is not in tablet_transition_stage::end_repair", gid, topo_guard);
|
||||
rlogger.warn("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
auto range = tmap.get_token_range(gid.tablet);
|
||||
co_await table.clear_being_repaired_for_range(range);
|
||||
auto removed = local_repair._repair_compaction_locks.erase(topo_guard);
|
||||
auto removed = local_repair._repair_compaction_locks.erase(gid);
|
||||
rlogger.info("Got repair_update_compaction_ctrl gid={} session_id={} removed={}", gid, topo_guard, removed);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -154,7 +154,7 @@ class repair_service : public seastar::peering_sharded_service<repair_service> {
|
||||
std::unordered_set<locator::host_id> ignore_nodes);
|
||||
|
||||
public:
|
||||
std::unordered_map<service::session_id, std::vector<seastar::rwlock::holder>> _repair_compaction_locks;
|
||||
std::unordered_map<locator::global_tablet_id, std::vector<seastar::rwlock::holder>> _repair_compaction_locks;
|
||||
|
||||
public:
|
||||
repair_service(sharded<service::topology_state_machine>& tsm,
|
||||
|
||||
@@ -84,6 +84,10 @@ class compaction_group {
|
||||
seastar::named_gate _async_gate;
|
||||
// Gates flushes.
|
||||
seastar::named_gate _flush_gate;
|
||||
// Gates sstable being added to the group.
|
||||
// This prevents the group from being considered empty when sstables are being added.
|
||||
// Crucial for tablet split which ACKs split for a table when all pre-split groups are empty.
|
||||
seastar::named_gate _sstable_add_gate;
|
||||
bool _tombstone_gc_enabled = true;
|
||||
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
|
||||
repair_classifier_func _repair_sstable_classifier;
|
||||
@@ -248,6 +252,10 @@ public:
|
||||
return _flush_gate;
|
||||
}
|
||||
|
||||
seastar::named_gate& sstable_add_gate() noexcept {
|
||||
return _sstable_add_gate;
|
||||
}
|
||||
|
||||
compaction::compaction_manager& get_compaction_manager() noexcept;
|
||||
const compaction::compaction_manager& get_compaction_manager() const noexcept;
|
||||
|
||||
@@ -306,8 +314,8 @@ public:
|
||||
uint64_t live_disk_space_used() const;
|
||||
|
||||
void for_each_compaction_group(std::function<void(const compaction_group_ptr&)> action) const;
|
||||
utils::small_vector<compaction_group_ptr, 3> compaction_groups();
|
||||
utils::small_vector<const_compaction_group_ptr, 3> compaction_groups() const;
|
||||
utils::small_vector<compaction_group_ptr, 3> compaction_groups_immediate();
|
||||
utils::small_vector<const_compaction_group_ptr, 3> compaction_groups_immediate() const;
|
||||
|
||||
utils::small_vector<compaction_group_ptr, 3> split_unready_groups() const;
|
||||
bool split_unready_groups_are_empty() const;
|
||||
@@ -434,7 +442,7 @@ public:
|
||||
virtual bool all_storage_groups_split() = 0;
|
||||
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
|
||||
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
|
||||
virtual future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) = 0;
|
||||
virtual future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) = 0;
|
||||
virtual dht::token_range get_token_range_after_split(const dht::token&) const noexcept = 0;
|
||||
|
||||
virtual lw_shared_ptr<sstables::sstable_set> make_sstable_set() const = 0;
|
||||
|
||||
@@ -2793,6 +2793,7 @@ future<> database::flush_all_tables() {
|
||||
});
|
||||
_all_tables_flushed_at = db_clock::now();
|
||||
co_await _commitlog->wait_for_pending_deletes();
|
||||
dblog.info("Forcing new commitlog segment and flushing all tables complete");
|
||||
}
|
||||
|
||||
future<db_clock::time_point> database::get_all_tables_flushed_at(sharded<database>& sharded_db) {
|
||||
@@ -2815,7 +2816,7 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, t
|
||||
co_await flush_table_on_all_shards(sharded_db, uuid);
|
||||
}
|
||||
auto table_shards = co_await get_table_on_all_shards(sharded_db, uuid);
|
||||
co_await table::snapshot_on_all_shards(sharded_db, table_shards, tag);
|
||||
co_await snapshot_table_on_all_shards(sharded_db, table_shards, tag);
|
||||
}
|
||||
|
||||
future<> database::snapshot_tables_on_all_shards(sharded<database>& sharded_db, std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush) {
|
||||
@@ -2951,7 +2952,7 @@ future<> database::truncate_table_on_all_shards(sharded<database>& sharded_db, s
|
||||
auto truncated_at = truncated_at_opt.value_or(db_clock::now());
|
||||
auto name = snapshot_name_opt.value_or(
|
||||
format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name()));
|
||||
co_await table::snapshot_on_all_shards(sharded_db, table_shards, name);
|
||||
co_await snapshot_table_on_all_shards(sharded_db, table_shards, name);
|
||||
}
|
||||
|
||||
co_await sharded_db.invoke_on_all([&] (database& db) {
|
||||
|
||||
@@ -604,9 +604,28 @@ public:
|
||||
|
||||
data_dictionary::table as_data_dictionary() const;
|
||||
|
||||
// The usage of these functions are restricted to preexisting sstables that aren't being
|
||||
// moved anywhere, so should never be used in the context of file streaming and intra
|
||||
// node migration. The only user today is distributed loader, which populates the
|
||||
// sstables for each column family on boot.
|
||||
future<> add_sstable_and_update_cache(sstables::shared_sstable sst,
|
||||
sstables::offstrategy offstrategy = sstables::offstrategy::no);
|
||||
future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
|
||||
|
||||
// Restricted to new sstables produced by external processes such as repair.
|
||||
// The sstable might undergo split if table is in split mode.
|
||||
// If no need for split, the input sstable will only be attached to the sstable set.
|
||||
// If split happens, the output sstables will be attached and the input sstable unlinked.
|
||||
// On failure, the input sstable is unlinked and exception propagated to the caller.
|
||||
// The on_add callback will be called on all sstables to be added into the set.
|
||||
[[nodiscard]] future<std::vector<sstables::shared_sstable>>
|
||||
add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
std::function<future<>(sstables::shared_sstable)> on_add,
|
||||
sstables::offstrategy offstrategy = sstables::offstrategy::no);
|
||||
[[nodiscard]] future<std::vector<sstables::shared_sstable>>
|
||||
add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
|
||||
std::function<future<>(sstables::shared_sstable)> on_add);
|
||||
|
||||
future<> move_sstables_from_staging(std::vector<sstables::shared_sstable>);
|
||||
sstables::shared_sstable make_sstable();
|
||||
void set_truncation_time(db_clock::time_point truncated_at) noexcept {
|
||||
@@ -724,7 +743,9 @@ private:
|
||||
return _config.enable_cache && _schema->caching_options().enabled();
|
||||
}
|
||||
void update_stats_for_new_sstable(const sstables::shared_sstable& sst) noexcept;
|
||||
future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy, bool trigger_compaction);
|
||||
// This function can throw even if the sstable was added into the set. When the sstable was successfully
|
||||
// added, the sstable ptr @sst will be set to nullptr. Allowing caller to optionally discard the sstable.
|
||||
future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy, bool trigger_compaction);
|
||||
future<> do_add_sstable_and_update_cache(sstables::shared_sstable sst, sstables::offstrategy offstrategy, bool trigger_compaction);
|
||||
// Helpers which add sstable on behalf of a compaction group and refreshes compound set.
|
||||
void add_sstable(compaction_group& cg, sstables::shared_sstable sstable);
|
||||
@@ -1037,37 +1058,11 @@ public:
|
||||
db::replay_position set_low_replay_position_mark();
|
||||
db::replay_position highest_flushed_replay_position() const;
|
||||
|
||||
private:
|
||||
using snapshot_file_set = foreign_ptr<std::unique_ptr<std::unordered_set<sstring>>>;
|
||||
|
||||
future<snapshot_file_set> take_snapshot(sstring jsondir);
|
||||
// Writes the table schema and the manifest of all files in the snapshot directory.
|
||||
future<> finalize_snapshot(const global_table_ptr& table_shards, sstring jsondir, std::vector<snapshot_file_set> file_sets);
|
||||
static future<> seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets);
|
||||
public:
|
||||
static future<> snapshot_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name);
|
||||
future<std::pair<std::vector<sstables::shared_sstable>, sstable_list_permit>> snapshot_sstables();
|
||||
|
||||
future<std::unordered_map<sstring, snapshot_details>> get_snapshot_details();
|
||||
static future<snapshot_details> get_snapshot_details(std::filesystem::path snapshot_dir, std::filesystem::path datadir);
|
||||
|
||||
/*!
|
||||
* \brief write the schema to a 'schema.cql' file at the given directory.
|
||||
*
|
||||
* When doing a snapshot, the snapshot directory contains a 'schema.cql' file
|
||||
* with a CQL command that can be used to generate the schema.
|
||||
* The content is is similar to the result of the CQL DESCRIBE command of the table.
|
||||
*
|
||||
* When a schema has indexes, local indexes or views, those indexes and views
|
||||
* are represented by their own schemas.
|
||||
* In those cases, the method would write the relevant information for each of the schemas:
|
||||
*
|
||||
* The schema of the base table would output a file with the CREATE TABLE command
|
||||
* and the schema of the view that is used for the index would output a file with the
|
||||
* CREATE INDEX command.
|
||||
* The same is true for local index and MATERIALIZED VIEW.
|
||||
*/
|
||||
future<> write_schema_as_cql(const global_table_ptr& table_shards, sstring dir) const;
|
||||
|
||||
bool incremental_backups_enabled() const {
|
||||
return _config.enable_incremental_backups;
|
||||
}
|
||||
@@ -1358,7 +1353,8 @@ public:
|
||||
|
||||
// Clones storage of a given tablet. Memtable is flushed first to guarantee that the
|
||||
// snapshot (list of sstables) will include all the data written up to the time it was taken.
|
||||
future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid);
|
||||
// If leave_unsealead is set, all the destination sstables will be left unsealed.
|
||||
future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed);
|
||||
|
||||
friend class compaction_group;
|
||||
friend class compaction::compaction_task_impl;
|
||||
@@ -2019,6 +2015,7 @@ private:
|
||||
keyspace::config make_keyspace_config(const keyspace_metadata& ksm, system_keyspace is_system);
|
||||
struct table_truncate_state;
|
||||
|
||||
static future<> snapshot_table_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name);
|
||||
static future<> truncate_table_on_all_shards(sharded<database>& db, sharded<db::system_keyspace>& sys_ks, const global_table_ptr&, std::optional<db_clock::time_point> truncated_at_opt, bool with_snapshot, std::optional<sstring> snapshot_name_opt);
|
||||
future<> truncate(db::system_keyspace& sys_ks, column_family& cf, std::vector<lw_shared_ptr<replica::table>>& views, const table_truncate_state&);
|
||||
public:
|
||||
|
||||
433
replica/table.cc
433
replica/table.cc
@@ -210,9 +210,9 @@ table::add_memtables_to_reader_list(std::vector<mutation_reader>& readers,
|
||||
auto sgs = storage_groups_for_token_range(token_range);
|
||||
reserve_fn(std::ranges::fold_left(sgs | std::views::transform(std::mem_fn(&storage_group::memtable_count)), uint64_t(0), std::plus{}));
|
||||
for (auto& sg : sgs) {
|
||||
for (auto& cg : sg->compaction_groups()) {
|
||||
sg->for_each_compaction_group([&] (const compaction_group_ptr &cg) {
|
||||
add_memtables_from_cg(*cg);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -423,15 +423,27 @@ bool compaction_group::memtable_has_key(const dht::decorated_key& key) const {
|
||||
}
|
||||
|
||||
api::timestamp_type storage_group::min_memtable_timestamp() const {
|
||||
return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_timestamp)));
|
||||
api::timestamp_type min_timestamp = api::max_timestamp;
|
||||
for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
|
||||
min_timestamp = std::min(min_timestamp, cg->min_memtable_timestamp());
|
||||
});
|
||||
return min_timestamp;
|
||||
}
|
||||
|
||||
api::timestamp_type storage_group::min_memtable_live_timestamp() const {
|
||||
return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_live_timestamp)));
|
||||
api::timestamp_type min_timestamp = api::max_timestamp;
|
||||
for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
|
||||
min_timestamp = std::min(min_timestamp, cg->min_memtable_live_timestamp());
|
||||
});
|
||||
return min_timestamp;
|
||||
}
|
||||
|
||||
api::timestamp_type storage_group::min_memtable_live_row_marker_timestamp() const {
|
||||
return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_live_row_marker_timestamp)));
|
||||
api::timestamp_type min_timestamp = api::max_timestamp;
|
||||
for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
|
||||
min_timestamp = std::min(min_timestamp, cg->min_memtable_live_row_marker_timestamp());
|
||||
});
|
||||
return min_timestamp;
|
||||
}
|
||||
|
||||
api::timestamp_type table::min_memtable_timestamp() const {
|
||||
@@ -721,7 +733,7 @@ public:
|
||||
bool all_storage_groups_split() override { return true; }
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override { return make_ready_future(); }
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override { return make_ready_future(); }
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override {
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override {
|
||||
return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>{sst});
|
||||
}
|
||||
dht::token_range get_token_range_after_split(const dht::token&) const noexcept override { return dht::token_range(); }
|
||||
@@ -879,7 +891,7 @@ public:
|
||||
bool all_storage_groups_split() override;
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override;
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override;
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override;
|
||||
dht::token_range get_token_range_after_split(const dht::token& token) const noexcept override {
|
||||
return tablet_map().get_token_range_after_split(token);
|
||||
}
|
||||
@@ -933,7 +945,7 @@ void storage_group::for_each_compaction_group(std::function<void(const compactio
|
||||
}
|
||||
}
|
||||
|
||||
utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups() {
|
||||
utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups_immediate() {
|
||||
utils::small_vector<compaction_group_ptr, 3> cgs;
|
||||
for_each_compaction_group([&cgs] (const compaction_group_ptr& cg) {
|
||||
cgs.push_back(cg);
|
||||
@@ -941,7 +953,7 @@ utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups()
|
||||
return cgs;
|
||||
}
|
||||
|
||||
utils::small_vector<const_compaction_group_ptr, 3> storage_group::compaction_groups() const {
|
||||
utils::small_vector<const_compaction_group_ptr, 3> storage_group::compaction_groups_immediate() const {
|
||||
utils::small_vector<const_compaction_group_ptr, 3> cgs;
|
||||
for_each_compaction_group([&cgs] (const compaction_group_ptr& cg) {
|
||||
cgs.push_back(cg);
|
||||
@@ -1130,7 +1142,8 @@ future<> tablet_storage_group_manager::maybe_split_compaction_group_of(size_t id
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
tablet_storage_group_manager::maybe_split_sstable(const sstables::shared_sstable& sst) {
|
||||
tablet_storage_group_manager::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
|
||||
co_await utils::get_local_injector().inject("maybe_split_new_sstable_wait", utils::wait_for_message(120s));
|
||||
if (!tablet_map().needs_split()) {
|
||||
co_return std::vector<sstables::shared_sstable>{sst};
|
||||
}
|
||||
@@ -1138,8 +1151,7 @@ tablet_storage_group_manager::maybe_split_sstable(const sstables::shared_sstable
|
||||
auto& cg = compaction_group_for_sstable(sst);
|
||||
auto holder = cg.async_gate().hold();
|
||||
auto& view = cg.view_for_sstable(sst);
|
||||
auto lock_holder = co_await _t.get_compaction_manager().get_incremental_repair_read_lock(view, "maybe_split_sstable");
|
||||
co_return co_await _t.get_compaction_manager().maybe_split_sstable(sst, view, co_await split_compaction_options());
|
||||
co_return co_await _t.get_compaction_manager().maybe_split_new_sstable(sst, view, co_await split_compaction_options());
|
||||
}
|
||||
|
||||
future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
|
||||
@@ -1149,7 +1161,7 @@ future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
|
||||
|
||||
future<std::vector<sstables::shared_sstable>> table::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
|
||||
auto holder = async_gate().hold();
|
||||
co_return co_await _sg_manager->maybe_split_sstable(sst);
|
||||
co_return co_await _sg_manager->maybe_split_new_sstable(sst);
|
||||
}
|
||||
|
||||
dht::token_range table::get_token_range_after_split(const dht::token& token) const noexcept {
|
||||
@@ -1257,7 +1269,7 @@ future<> table::parallel_foreach_compaction_group(std::function<future<>(compact
|
||||
tlogger.info("foreach_compaction_group_wait: released");
|
||||
});
|
||||
|
||||
co_await coroutine::parallel_for_each(sg.compaction_groups(), [&] (compaction_group_ptr cg) -> future<> {
|
||||
co_await coroutine::parallel_for_each(sg.compaction_groups_immediate(), [&] (compaction_group_ptr cg) -> future<> {
|
||||
if (auto holder = try_hold_gate(cg->async_gate())) {
|
||||
co_await action(*cg);
|
||||
}
|
||||
@@ -1330,7 +1342,7 @@ future<utils::chunked_vector<sstables::shared_sstable>> table::take_sstable_set_
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<sstables::entry_descriptor>>
|
||||
table::clone_tablet_storage(locator::tablet_id tid) {
|
||||
table::clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed) {
|
||||
utils::chunked_vector<sstables::entry_descriptor> ret;
|
||||
auto holder = async_gate().hold();
|
||||
|
||||
@@ -1342,7 +1354,7 @@ table::clone_tablet_storage(locator::tablet_id tid) {
|
||||
// by compaction while we are waiting for the lock.
|
||||
auto deletion_guard = co_await get_sstable_list_permit();
|
||||
co_await sg.make_sstable_set()->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) -> future<> {
|
||||
ret.push_back(co_await sst->clone(calculate_generation_for_new_table()));
|
||||
ret.push_back(co_await sst->clone(calculate_generation_for_new_table(), leave_unsealed));
|
||||
});
|
||||
co_return ret;
|
||||
}
|
||||
@@ -1354,10 +1366,10 @@ void table::update_stats_for_new_sstable(const sstables::shared_sstable& sst) no
|
||||
}
|
||||
|
||||
future<>
|
||||
table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy offstrategy,
|
||||
table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy offstrategy,
|
||||
bool trigger_compaction) {
|
||||
auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
|
||||
co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () noexcept {
|
||||
co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () mutable noexcept {
|
||||
// FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
|
||||
// atomically load all opened sstables into column family.
|
||||
if (!offstrategy) {
|
||||
@@ -1369,6 +1381,8 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
|
||||
if (trigger_compaction) {
|
||||
try_trigger_compaction(cg);
|
||||
}
|
||||
// Resetting sstable ptr to inform the caller the sstable has been loaded successfully.
|
||||
sst = nullptr;
|
||||
}), dht::partition_range::make({sst->get_first_decorated_key(), true}, {sst->get_last_decorated_key(), true}), [sst, schema = _schema] (const dht::decorated_key& key) {
|
||||
return sst->filter_has_key(sstables::key::from_partition_key(*schema, key.key()));
|
||||
});
|
||||
@@ -1376,12 +1390,10 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
|
||||
|
||||
future<>
|
||||
table::do_add_sstable_and_update_cache(sstables::shared_sstable new_sst, sstables::offstrategy offstrategy, bool trigger_compaction) {
|
||||
for (auto sst : co_await maybe_split_new_sstable(new_sst)) {
|
||||
auto& cg = compaction_group_for_sstable(sst);
|
||||
// Hold gate to make share compaction group is alive.
|
||||
auto holder = cg.async_gate().hold();
|
||||
co_await do_add_sstable_and_update_cache(cg, std::move(sst), offstrategy, trigger_compaction);
|
||||
}
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
// Hold gate to make share compaction group is alive.
|
||||
auto holder = cg.async_gate().hold();
|
||||
co_await do_add_sstable_and_update_cache(cg, new_sst, offstrategy, trigger_compaction);
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -1399,6 +1411,85 @@ table::add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>
|
||||
trigger_compaction();
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
std::function<future<>(sstables::shared_sstable)> on_add,
|
||||
sstables::offstrategy offstrategy) {
|
||||
std::vector<sstables::shared_sstable> ret, ssts;
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
// This prevents compaction group from being considered empty until the holder is released.
|
||||
// Helpful for tablet split, where split is acked for a table when all pre-split groups are empty.
|
||||
auto sstable_add_holder = cg.sstable_add_gate().hold();
|
||||
|
||||
ret = ssts = co_await maybe_split_new_sstable(new_sst);
|
||||
// on successful split, input sstable is unlinked.
|
||||
new_sst = nullptr;
|
||||
for (auto& sst : ssts) {
|
||||
auto& cg = compaction_group_for_sstable(sst);
|
||||
// Hold gate to make sure compaction group is alive.
|
||||
auto holder = cg.async_gate().hold();
|
||||
co_await on_add(sst);
|
||||
// If do_add_sstable_and_update_cache() throws after sstable has been loaded, the pointer
|
||||
// sst passed by reference will be set to nullptr, so it won't be unlinked in the exception
|
||||
// handler below.
|
||||
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
|
||||
sst = nullptr;
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
// on failed split, input sstable is unlinked here.
|
||||
if (new_sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
co_await new_sst->unlink();
|
||||
}
|
||||
// on failure after successful split, sstables not attached yet will be unlinked
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
co_return std::move(ret);
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
|
||||
std::function<future<>(sstables::shared_sstable)> on_add) {
|
||||
std::exception_ptr ex;
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
|
||||
// so the exception handling below will only have to unlink sstables not processed yet.
|
||||
try {
|
||||
for (auto& sst: new_ssts) {
|
||||
auto ssts = co_await add_new_sstable_and_update_cache(std::exchange(sst, nullptr), on_add);
|
||||
std::ranges::move(ssts, std::back_inserter(ret));
|
||||
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
co_return std::move(ret);
|
||||
}
|
||||
|
||||
future<>
|
||||
table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector<sstables::shared_sstable> ssts) {
|
||||
auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
|
||||
@@ -1892,7 +1983,7 @@ sstables::file_size_stats compaction_group::live_disk_space_used_full_stats() co
|
||||
}
|
||||
|
||||
uint64_t storage_group::live_disk_space_used() const {
|
||||
auto cgs = const_cast<storage_group&>(*this).compaction_groups();
|
||||
auto cgs = const_cast<storage_group&>(*this).compaction_groups_immediate();
|
||||
return std::ranges::fold_left(cgs | std::views::transform(std::mem_fn(&compaction_group::live_disk_space_used)), uint64_t(0), std::plus{});
|
||||
}
|
||||
|
||||
@@ -2019,10 +2110,9 @@ future<std::vector<compaction::compaction_group_view*>> table::get_compaction_gr
|
||||
auto sgs = storage_groups_for_token_range(range);
|
||||
for (auto& sg : sgs) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto cgs = sg->compaction_groups();
|
||||
for (auto& cg : cgs) {
|
||||
sg->for_each_compaction_group([&ret] (const compaction_group_ptr& cg) {
|
||||
ret.push_back(&cg->view_for_unrepaired_data());
|
||||
}
|
||||
});
|
||||
}
|
||||
co_return ret;
|
||||
}
|
||||
@@ -2049,7 +2139,7 @@ future<compaction_reenablers_and_lock_holders> table::get_compaction_reenablers_
|
||||
future<> table::clear_being_repaired_for_range(dht::token_range range) {
|
||||
auto sgs = storage_groups_for_token_range(range);
|
||||
for (auto& sg : sgs) {
|
||||
auto cgs = sg->compaction_groups();
|
||||
auto cgs = sg->compaction_groups_immediate();
|
||||
for (auto& cg : cgs) {
|
||||
auto sstables = cg->all_sstables();
|
||||
co_await coroutine::maybe_yield();
|
||||
@@ -2491,9 +2581,11 @@ future<> table::drop_quarantined_sstables() {
|
||||
}
|
||||
|
||||
bool storage_group::no_compacted_sstable_undeleted() const {
|
||||
return std::ranges::all_of(compaction_groups(), [] (const_compaction_group_ptr& cg) {
|
||||
return cg->compacted_undeleted_sstables().empty();
|
||||
auto ret = true;
|
||||
for_each_compaction_group([&ret] (const compaction_group_ptr& cg) {
|
||||
ret &= cg->compacted_undeleted_sstables().empty();
|
||||
});
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Gets the list of all sstables in the column family, including ones that are
|
||||
@@ -2612,8 +2704,8 @@ public:
|
||||
sstables::sstables_manager& get_sstables_manager() noexcept override {
|
||||
return _t.get_sstables_manager();
|
||||
}
|
||||
sstables::shared_sstable make_sstable() const override {
|
||||
return _t.make_sstable();
|
||||
sstables::shared_sstable make_sstable(sstables::sstable_state state) const override {
|
||||
return _t.make_sstable(state);
|
||||
}
|
||||
sstables::sstable_writer_config configure_writer(sstring origin) const override {
|
||||
auto cfg = _t.get_sstables_manager().configure_writer(std::move(origin));
|
||||
@@ -2731,6 +2823,7 @@ future<> compaction_group::stop(sstring reason) noexcept {
|
||||
auto flush_future = co_await seastar::coroutine::as_future(flush());
|
||||
|
||||
co_await _flush_gate.close();
|
||||
co_await _sstable_add_gate.close();
|
||||
// FIXME: indentation
|
||||
_compaction_disabler_for_views.clear();
|
||||
co_await utils::get_local_injector().inject("compaction_group_stop_wait", utils::wait_for_message(60s));
|
||||
@@ -2744,7 +2837,7 @@ future<> compaction_group::stop(sstring reason) noexcept {
|
||||
}
|
||||
|
||||
bool compaction_group::empty() const noexcept {
|
||||
return _memtables->empty() && live_sstable_count() == 0;
|
||||
return _memtables->empty() && live_sstable_count() == 0 && _sstable_add_gate.get_count() == 0;
|
||||
}
|
||||
|
||||
const schema_ptr& compaction_group::schema() const {
|
||||
@@ -2757,9 +2850,9 @@ void compaction_group::clear_sstables() {
|
||||
}
|
||||
|
||||
void storage_group::clear_sstables() {
|
||||
for (auto cg : compaction_groups()) {
|
||||
for_each_compaction_group([] (const compaction_group_ptr& cg) {
|
||||
cg->clear_sstables();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
table::table(schema_ptr schema, config config, lw_shared_ptr<const storage_options> sopts, compaction::compaction_manager& compaction_manager,
|
||||
@@ -3086,7 +3179,7 @@ future<> table::update_repaired_at_for_merge() {
|
||||
for (auto& x : sgs) {
|
||||
auto sg = x.second;
|
||||
if (sg) {
|
||||
auto cgs = sg->compaction_groups();
|
||||
auto cgs = sg->compaction_groups_immediate();
|
||||
for (auto& cg : cgs) {
|
||||
auto cre = co_await cg->get_compaction_manager().stop_and_disable_compaction("update_repaired_at_for_merge", cg->view_for_unrepaired_data());
|
||||
co_await cg->update_repaired_at_for_merge();
|
||||
@@ -3200,7 +3293,7 @@ db::replay_position table::highest_flushed_replay_position() const {
|
||||
}
|
||||
|
||||
struct manifest_json : public json::json_base {
|
||||
json::json_chunked_list<sstring> files;
|
||||
json::json_chunked_list<std::string_view> files;
|
||||
|
||||
manifest_json() {
|
||||
register_params();
|
||||
@@ -3219,22 +3312,25 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
future<>
|
||||
table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets) {
|
||||
class snapshot_writer {
|
||||
public:
|
||||
virtual future<> init() = 0;
|
||||
virtual future<> sync() = 0;
|
||||
virtual future<output_stream<char>> stream_for(sstring component) = 0;
|
||||
virtual ~snapshot_writer() = default;
|
||||
};
|
||||
|
||||
using snapshot_file_set = foreign_ptr<std::unique_ptr<std::unordered_set<sstring>>>;
|
||||
|
||||
static future<> write_manifest(snapshot_writer& writer, std::vector<snapshot_file_set> file_sets) {
|
||||
manifest_json manifest;
|
||||
for (const auto& fsp : file_sets) {
|
||||
for (auto& rf : *fsp) {
|
||||
manifest.files.push(std::move(rf));
|
||||
manifest.files.push(std::string_view(rf));
|
||||
}
|
||||
}
|
||||
auto streamer = json::stream_object(std::move(manifest));
|
||||
auto jsonfile = jsondir + "/manifest.json";
|
||||
|
||||
tlogger.debug("Storing manifest {}", jsonfile);
|
||||
|
||||
co_await io_check([jsondir] { return recursive_touch_directory(jsondir); });
|
||||
auto f = co_await open_checked_file_dma(general_disk_error_handler, jsonfile, open_flags::wo | open_flags::create | open_flags::truncate);
|
||||
auto out = co_await make_file_output_stream(std::move(f));
|
||||
auto out = co_await writer.stream_for("manifest.json");
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
co_await streamer(std::move(out));
|
||||
@@ -3245,19 +3341,27 @@ table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets)
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
|
||||
co_await io_check(sync_directory, std::move(jsondir));
|
||||
}
|
||||
|
||||
future<> table::write_schema_as_cql(const global_table_ptr& table_shards, sstring dir) const {
|
||||
auto schema_desc = schema()->describe(
|
||||
replica::make_schema_describe_helper(table_shards),
|
||||
cql3::describe_option::STMTS);
|
||||
|
||||
/*!
|
||||
* \brief write the schema to a 'schema.cql' file at the given directory.
|
||||
*
|
||||
* When doing a snapshot, the snapshot directory contains a 'schema.cql' file
|
||||
* with a CQL command that can be used to generate the schema.
|
||||
* The content is is similar to the result of the CQL DESCRIBE command of the table.
|
||||
*
|
||||
* When a schema has indexes, local indexes or views, those indexes and views
|
||||
* are represented by their own schemas.
|
||||
* In those cases, the method would write the relevant information for each of the schemas:
|
||||
*
|
||||
* The schema of the base table would output a file with the CREATE TABLE command
|
||||
* and the schema of the view that is used for the index would output a file with the
|
||||
* CREATE INDEX command.
|
||||
* The same is true for local index and MATERIALIZED VIEW.
|
||||
*/
|
||||
static future<> write_schema_as_cql(snapshot_writer& writer, cql3::description schema_desc) {
|
||||
auto schema_description = std::move(*schema_desc.create_statement);
|
||||
auto schema_file_name = dir + "/schema.cql";
|
||||
auto f = co_await open_checked_file_dma(general_disk_error_handler, schema_file_name, open_flags::wo | open_flags::create | open_flags::truncate);
|
||||
auto out = co_await make_file_output_stream(std::move(f));
|
||||
auto out = co_await writer.stream_for("schema.cql");
|
||||
std::exception_ptr ex;
|
||||
|
||||
auto view = managed_bytes_view(schema_description.as_managed_bytes());
|
||||
@@ -3278,73 +3382,87 @@ future<> table::write_schema_as_cql(const global_table_ptr& table_shards, sstrin
|
||||
}
|
||||
}
|
||||
|
||||
// Runs the orchestration code on an arbitrary shard to balance the load.
|
||||
future<> table::snapshot_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name) {
|
||||
auto* so = std::get_if<storage_options::local>(&table_shards->get_storage_options().value);
|
||||
if (so == nullptr) {
|
||||
throw std::runtime_error("Snapshotting non-local tables is not implemented");
|
||||
class local_snapshot_writer : public snapshot_writer {
|
||||
std::filesystem::path _dir;
|
||||
|
||||
public:
|
||||
local_snapshot_writer(std::filesystem::path dir, sstring name)
|
||||
: _dir(dir / sstables::snapshots_dir / name)
|
||||
{}
|
||||
future<> init() override {
|
||||
co_await io_check([this] { return recursive_touch_directory(_dir.native()); });
|
||||
}
|
||||
if (so->dir.empty()) { // virtual tables don't have initialized local storage
|
||||
future<> sync() override {
|
||||
co_await io_check([this] { return sync_directory(_dir.native()); });
|
||||
}
|
||||
future<output_stream<char>> stream_for(sstring component) override {
|
||||
auto file_name = (_dir / component).native();
|
||||
auto f = co_await open_checked_file_dma(general_disk_error_handler, file_name, open_flags::wo | open_flags::create | open_flags::truncate);
|
||||
co_return co_await make_file_output_stream(std::move(f));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the orchestration code on an arbitrary shard to balance the load.
|
||||
future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name) {
|
||||
auto writer = std::visit(overloaded_functor{
|
||||
[&name] (const data_dictionary::storage_options::local& loc) -> std::unique_ptr<snapshot_writer> {
|
||||
if (loc.dir.empty()) {
|
||||
// virtual tables don't have initialized local storage
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return std::make_unique<local_snapshot_writer>(loc.dir, name);
|
||||
},
|
||||
[] (const data_dictionary::storage_options::s3&) -> std::unique_ptr<snapshot_writer> {
|
||||
throw std::runtime_error("Snapshotting non-local tables is not implemented");
|
||||
}
|
||||
}, table_shards->get_storage_options().value);
|
||||
if (!writer) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto jsondir = (so->dir / sstables::snapshots_dir / name).native();
|
||||
auto orchestrator = std::hash<sstring>()(jsondir) % smp::count;
|
||||
|
||||
auto orchestrator = std::hash<sstring>()(name) % smp::count;
|
||||
co_await smp::submit_to(orchestrator, [&] () -> future<> {
|
||||
auto& t = *table_shards;
|
||||
auto s = t.schema();
|
||||
tlogger.debug("Taking snapshot of {}.{}: directory={}", s->ks_name(), s->cf_name(), jsondir);
|
||||
tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);
|
||||
|
||||
std::vector<table::snapshot_file_set> file_sets;
|
||||
file_sets.reserve(smp::count);
|
||||
std::vector<snapshot_file_set> file_sets(smp::count);
|
||||
|
||||
co_await io_check([&jsondir] { return recursive_touch_directory(jsondir); });
|
||||
co_await coroutine::parallel_for_each(smp::all_cpus(), [&] (unsigned shard) -> future<> {
|
||||
file_sets.emplace_back(co_await smp::submit_to(shard, [&] {
|
||||
return table_shards->take_snapshot(jsondir);
|
||||
}));
|
||||
co_await writer->init();
|
||||
co_await smp::invoke_on_all([&] -> future<> {
|
||||
auto& t = *table_shards;
|
||||
auto [tables, permit] = co_await t.snapshot_sstables();
|
||||
auto table_names = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
|
||||
file_sets[this_shard_id()] = make_foreign(std::make_unique<std::unordered_set<sstring>>(std::move(table_names)));
|
||||
});
|
||||
co_await io_check(sync_directory, jsondir);
|
||||
co_await writer->sync();
|
||||
|
||||
co_await t.finalize_snapshot(table_shards, std::move(jsondir), std::move(file_sets));
|
||||
std::exception_ptr ex;
|
||||
|
||||
tlogger.debug("snapshot {}: writing schema.cql", name);
|
||||
auto schema_desc = s->describe(replica::make_schema_describe_helper(table_shards), cql3::describe_option::STMTS);
|
||||
co_await write_schema_as_cql(*writer, std::move(schema_desc)).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed writing schema file in snapshot in {} with exception {}", name, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
tlogger.debug("snapshot {}: seal_snapshot", name);
|
||||
co_await write_manifest(*writer, std::move(file_sets)).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
|
||||
co_await writer->sync();
|
||||
});
|
||||
}
|
||||
|
||||
future<table::snapshot_file_set> table::take_snapshot(sstring jsondir) {
|
||||
tlogger.trace("take_snapshot {}", jsondir);
|
||||
|
||||
auto sstable_deletion_guard = co_await get_sstable_list_permit();
|
||||
|
||||
future<std::pair<std::vector<sstables::shared_sstable>, table::sstable_list_permit>> table::snapshot_sstables() {
|
||||
auto permit = co_await get_sstable_list_permit();
|
||||
auto tables = *_sstables->all() | std::ranges::to<std::vector<sstables::shared_sstable>>();
|
||||
auto table_names = std::make_unique<std::unordered_set<sstring>>();
|
||||
|
||||
co_await _sstables_manager.dir_semaphore().parallel_for_each(tables, [&jsondir, &table_names] (sstables::shared_sstable sstable) {
|
||||
table_names->insert(sstable->component_basename(sstables::component_type::Data));
|
||||
return io_check([sstable, &dir = jsondir] {
|
||||
return sstable->snapshot(dir);
|
||||
});
|
||||
});
|
||||
co_return make_foreign(std::move(table_names));
|
||||
}
|
||||
|
||||
future<> table::finalize_snapshot(const global_table_ptr& table_shards, sstring jsondir, std::vector<snapshot_file_set> file_sets) {
|
||||
std::exception_ptr ex;
|
||||
|
||||
tlogger.debug("snapshot {}: writing schema.cql", jsondir);
|
||||
co_await write_schema_as_cql(table_shards, jsondir).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
tlogger.debug("snapshot {}: seal_snapshot", jsondir);
|
||||
co_await seal_snapshot(jsondir, std::move(file_sets)).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed to seal snapshot in {}: {}.", jsondir, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
co_return std::make_pair(std::move(tables), std::move(permit));
|
||||
}
|
||||
|
||||
future<bool> table::snapshot_exists(sstring tag) {
|
||||
@@ -3356,6 +3474,7 @@ future<bool> table::snapshot_exists(sstring tag) {
|
||||
sstring jsondir = (so->dir / sstables::snapshots_dir / tag).native();
|
||||
bool exists = false;
|
||||
try {
|
||||
future<stat_data> (&file_stat)(std::string_view, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, jsondir, follow_symlink::no);
|
||||
if (sd.type != directory_entry_type::directory) {
|
||||
throw std::error_code(ENOTDIR, std::system_category());
|
||||
@@ -3385,16 +3504,15 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
continue;
|
||||
}
|
||||
|
||||
lister::scan_dir(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>(), [datadir, &all_snapshots] (fs::path snapshots_dir, directory_entry de) {
|
||||
auto snapshot_name = de.name;
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
return get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).then([&all_snapshots, snapshot_name] (auto details) {
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}).get();
|
||||
auto details = get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).get();
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
}
|
||||
}
|
||||
return all_snapshots;
|
||||
});
|
||||
@@ -3402,38 +3520,65 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
|
||||
future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_dir, fs::path datadir) {
|
||||
table::snapshot_details details{};
|
||||
file snapshot_directory = co_await io_check(open_directory, snapshot_dir.native());
|
||||
file data_directory = co_await io_check(open_directory, datadir.native());
|
||||
file staging_directory;
|
||||
std::optional<fs::path> staging_dir = datadir / sstables::staging_dir;
|
||||
if (!co_await file_exists(staging_dir->native())) {
|
||||
staging_dir.reset();
|
||||
} else {
|
||||
staging_directory = co_await io_check(open_directory, staging_dir->native());
|
||||
}
|
||||
|
||||
co_await lister::scan_dir(snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>(), [datadir, &details] (fs::path snapshot_dir, directory_entry de) -> future<> {
|
||||
auto sd = co_await io_check(file_stat, (snapshot_dir / de.name).native(), follow_symlink::no);
|
||||
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (de.name != "manifest.json" && de.name != "schema.cql") {
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
size = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, (datadir / de.name).native(), follow_symlink::no);
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(datadir / de.name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / de.name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
details.live += size;
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
}
|
||||
} catch (std::system_error& e) {
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
co_return details;
|
||||
}
|
||||
@@ -3447,7 +3592,7 @@ future<> compaction_group::flush() noexcept {
|
||||
}
|
||||
|
||||
future<> storage_group::flush() noexcept {
|
||||
for (auto& cg : compaction_groups()) {
|
||||
for (auto& cg : compaction_groups_immediate()) {
|
||||
co_await cg->flush();
|
||||
}
|
||||
}
|
||||
@@ -3465,7 +3610,11 @@ size_t compaction_group::memtable_count() const noexcept {
|
||||
}
|
||||
|
||||
size_t storage_group::memtable_count() const {
|
||||
return std::ranges::fold_left(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::memtable_count)), size_t(0), std::plus{});
|
||||
size_t count = 0;
|
||||
for_each_compaction_group([&count] (const compaction_group_ptr& cg) {
|
||||
count += cg->memtable_count();
|
||||
});
|
||||
return count;
|
||||
}
|
||||
|
||||
future<> table::flush(std::optional<db::replay_position> pos) {
|
||||
@@ -3483,7 +3632,7 @@ future<> table::flush(std::optional<db::replay_position> pos) {
|
||||
}
|
||||
|
||||
bool storage_group::can_flush() const {
|
||||
return std::ranges::any_of(compaction_groups(), std::mem_fn(&compaction_group::can_flush));
|
||||
return std::ranges::any_of(compaction_groups_immediate(), std::mem_fn(&compaction_group::can_flush));
|
||||
}
|
||||
|
||||
bool table::can_flush() const {
|
||||
@@ -3514,9 +3663,11 @@ bool storage_group::compaction_disabled() const {
|
||||
// Compaction group that has been stopped will be excluded, since the group will not be available for a caller
|
||||
// to disable compaction explicitly on it, e.g. on truncate, and the caller might want to perform a check
|
||||
// that compaction was disabled on all groups. Stopping a group is equivalent to disabling compaction on it.
|
||||
return std::ranges::all_of(compaction_groups()
|
||||
| std::views::filter(std::not_fn(&compaction_group::stopped)), [] (const_compaction_group_ptr& cg) {
|
||||
return cg->compaction_disabled(); });
|
||||
bool all_disabled = true;
|
||||
for_each_compaction_group([&all_disabled] (const compaction_group_ptr& cg) {
|
||||
all_disabled &= cg->stopped() || cg->compaction_disabled();
|
||||
});
|
||||
return all_disabled;
|
||||
}
|
||||
|
||||
// NOTE: does not need to be futurized, but might eventually, depending on
|
||||
@@ -4301,11 +4452,11 @@ std::vector<mutation_source> table::select_memtables_as_mutation_sources(dht::to
|
||||
auto& sg = storage_group_for_token(token);
|
||||
std::vector<mutation_source> mss;
|
||||
mss.reserve(sg.memtable_count());
|
||||
for (auto& cg : sg.compaction_groups()) {
|
||||
sg.for_each_compaction_group([&mss] (const compaction_group_ptr &cg) {
|
||||
for (auto& mt : *cg->memtables()) {
|
||||
mss.emplace_back(mt->as_data_source());
|
||||
}
|
||||
}
|
||||
});
|
||||
return mss;
|
||||
}
|
||||
|
||||
@@ -4465,7 +4616,7 @@ future<> compaction_group::cleanup() {
|
||||
}
|
||||
|
||||
future<> table::clear_inactive_reads_for_tablet(database& db, storage_group& sg) {
|
||||
for (auto& cg_ptr : sg.compaction_groups()) {
|
||||
for (auto& cg_ptr : sg.compaction_groups_immediate()) {
|
||||
co_await db.clear_inactive_reads_for_tablet(_schema->id(), cg_ptr->token_range());
|
||||
}
|
||||
}
|
||||
@@ -4506,13 +4657,13 @@ future<> table::stop_compaction_groups(storage_group& sg) {
|
||||
}
|
||||
|
||||
future<> table::flush_compaction_groups(storage_group& sg) {
|
||||
for (auto& cg_ptr : sg.compaction_groups()) {
|
||||
for (auto& cg_ptr : sg.compaction_groups_immediate()) {
|
||||
co_await cg_ptr->flush();
|
||||
}
|
||||
}
|
||||
|
||||
future<> table::cleanup_compaction_groups(database& db, db::system_keyspace& sys_ks, locator::tablet_id tid, storage_group& sg) {
|
||||
for (auto& cg_ptr : sg.compaction_groups()) {
|
||||
for (auto& cg_ptr : sg.compaction_groups_immediate()) {
|
||||
co_await cg_ptr->cleanup();
|
||||
// FIXME: at this point _highest_rp might be greater than the replay_position of the last cleaned mutation,
|
||||
// and can cover some mutations which weren't cleaned, causing them to be lost during replay.
|
||||
|
||||
@@ -1,6 +1,13 @@
|
||||
find_program(CARGO cargo
|
||||
REQUIRED)
|
||||
|
||||
# Set up RUSTC_WRAPPER for sccache support if configured
|
||||
if(Scylla_RUSTC_WRAPPER)
|
||||
set(RUSTC_WRAPPER_ENV "RUSTC_WRAPPER=${Scylla_RUSTC_WRAPPER}")
|
||||
else()
|
||||
set(RUSTC_WRAPPER_ENV "")
|
||||
endif()
|
||||
|
||||
function(add_rust_library name)
|
||||
# used for profiles defined in Cargo.toml
|
||||
if(CMAKE_CONFIGURATION_TYPES)
|
||||
@@ -16,7 +23,7 @@ function(add_rust_library name)
|
||||
set(library ${target_dir}/lib${name}.a)
|
||||
add_custom_command(
|
||||
OUTPUT ${library}
|
||||
COMMAND ${CMAKE_COMMAND} -E env CARGO_BUILD_DEP_INFO_BASEDIR=. ${CARGO} build --locked --target-dir=${target_dir} --profile=${profile}
|
||||
COMMAND ${CMAKE_COMMAND} -E env CARGO_BUILD_DEP_INFO_BASEDIR=. ${RUSTC_WRAPPER_ENV} ${CARGO} build --locked --target-dir=${target_dir} --profile=${profile}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${target_dir}/${profile}/lib${name}.a ${library}
|
||||
DEPENDS Cargo.lock
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
|
||||
@@ -390,9 +390,11 @@ dark_green = (195, 215, 195)
|
||||
light_red = (255, 200, 200)
|
||||
light_green = (200, 255, 200)
|
||||
light_gray = (240, 240, 240)
|
||||
scylla_blue = (87, 209, 229)
|
||||
|
||||
tablet_colors = {
|
||||
(Tablet.STATE_NORMAL, None): GRAY,
|
||||
(Tablet.STATE_NORMAL, 'repair'): scylla_blue,
|
||||
(Tablet.STATE_JOINING, 'allow_write_both_read_old'): dark_green,
|
||||
(Tablet.STATE_LEAVING, 'allow_write_both_read_old'): dark_red,
|
||||
(Tablet.STATE_JOINING, 'write_both_read_old'): dark_green,
|
||||
@@ -532,6 +534,8 @@ def update_from_cql(initial=False):
|
||||
state = (Tablet.STATE_JOINING, tablet.stage)
|
||||
elif replica in leaving:
|
||||
state = (Tablet.STATE_LEAVING, tablet.stage)
|
||||
elif tablet.stage == 'repair':
|
||||
state = (Tablet.STATE_NORMAL, tablet.stage)
|
||||
else:
|
||||
state = (Tablet.STATE_NORMAL, None)
|
||||
|
||||
|
||||
@@ -4109,6 +4109,16 @@ class scylla_fiber(gdb.Command):
|
||||
return res
|
||||
return None
|
||||
|
||||
# Coroutines need special handling as they allocate the future object on their frame.
|
||||
if name.strip().endswith('[clone .resume]'):
|
||||
self._maybe_log(f"Current task is a coroutine, trying to find the promise in the coroutine frame: 0x{ptr_meta.ptr:x}+{ptr_meta.size}\n", verbose)
|
||||
# Skip the first two pointers, these are the coroutine resume and destroy function pointers.
|
||||
for maybe_tptr in range(ptr_meta.ptr + 2 * _vptr_type().sizeof, ptr_meta.ptr + ptr_meta.size, _vptr_type().sizeof):
|
||||
res = self._probe_pointer(maybe_tptr, scanned_region_size, using_seastar_allocator, verbose)
|
||||
if res is not None:
|
||||
return res
|
||||
return None
|
||||
|
||||
if name.startswith('vtable for seastar::internal::when_all_state'):
|
||||
when_all_state_base_ptr_type = gdb.lookup_type('seastar::internal::when_all_state_base').pointer()
|
||||
when_all_state_base = gdb.Value(int(ptr_meta.ptr)).reinterpret_cast(when_all_state_base_ptr_type)
|
||||
@@ -4195,6 +4205,9 @@ class scylla_fiber(gdb.Command):
|
||||
parser.add_argument("--force-fallback-mode", action="store_true", default=False,
|
||||
help="Force fallback mode to be used, that is, scan a fixed-size region of memory"
|
||||
" (configurable via --scanned-region-size), instead of relying on `scylla ptr` for determining the size of the task objects.")
|
||||
parser.add_argument("--direction", action="store", choices=['forward', 'backward', 'both'], default='both',
|
||||
help="Direction in which to walk the continuation chain. 'forward' walks futures waiting on the given task,"
|
||||
" 'backward' walks futures the given task is waiting on, 'both' does both.")
|
||||
parser.add_argument("task", action="store", help="An expression that evaluates to a valid `seastar::task*` value. Cannot contain white-space.")
|
||||
|
||||
try:
|
||||
@@ -4224,14 +4237,20 @@ class scylla_fiber(gdb.Command):
|
||||
gdb.write("Provided pointer 0x{:016x} is not an object managed by seastar or not a task pointer\n".format(initial_task_ptr))
|
||||
return
|
||||
|
||||
backwards_fiber = self._walk(self._walk_backward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
|
||||
if (args.direction == 'backward' or args.direction == 'both'):
|
||||
backwards_fiber = self._walk(self._walk_backward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
|
||||
else:
|
||||
backwards_fiber = []
|
||||
|
||||
for i, task_info in enumerate(reversed(backwards_fiber)):
|
||||
format_task_line(i - len(backwards_fiber), task_info)
|
||||
|
||||
format_task_line(0, this_task)
|
||||
|
||||
forward_fiber = self._walk(self._walk_forward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
|
||||
if (args.direction == 'forward' or args.direction == 'both'):
|
||||
forward_fiber = self._walk(self._walk_forward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
|
||||
else:
|
||||
forward_fiber = []
|
||||
|
||||
for i, task_info in enumerate(forward_fiber):
|
||||
format_task_line(i + 1, task_info)
|
||||
@@ -5104,10 +5123,15 @@ class scylla_small_objects(gdb.Command):
|
||||
span_end = int(span_start + span.size() * self._page_size)
|
||||
|
||||
# span's free list
|
||||
span_next_free = span.page['freelist']
|
||||
while span_next_free:
|
||||
self._free_in_span.add(int(span_next_free))
|
||||
span_next_free = span_next_free['next']
|
||||
try:
|
||||
span_next_free = span.page['freelist']
|
||||
while span_next_free:
|
||||
self._free_in_span.add(int(span_next_free))
|
||||
span_next_free = span_next_free['next']
|
||||
except gdb.error:
|
||||
# This loop sometimes steps on "Cannot access memory at address", causing CI instability.
|
||||
# Catch the exception and break the freelist traversal loop gracefully.
|
||||
gdb.write(f"Warning: error traversing freelist of span [0x{span_start:x}, 0x{span_end:x}), some of the listed objects in this span may be free objects.\n")
|
||||
|
||||
return span_start, span_end
|
||||
|
||||
@@ -5850,6 +5874,18 @@ class scylla_read_stats(gdb.Command):
|
||||
def __init__(self):
|
||||
gdb.Command.__init__(self, 'scylla read-stats', gdb.COMMAND_USER, gdb.COMPLETE_COMMAND)
|
||||
|
||||
@staticmethod
|
||||
def foreach_permit(semaphore, fn):
|
||||
"""Mirror of reader_concurrency_semaphore::foreach_permit()"""
|
||||
for permit_list in (
|
||||
semaphore['_permit_list'],
|
||||
semaphore['_wait_list']['_admission_queue'],
|
||||
semaphore['_wait_list']['_memory_queue'],
|
||||
semaphore['_ready_list'],
|
||||
semaphore['_inactive_reads']):
|
||||
for permit in intrusive_list(permit_list):
|
||||
fn(permit)
|
||||
|
||||
@staticmethod
|
||||
def dump_reads_from_semaphore(semaphore):
|
||||
try:
|
||||
@@ -5864,7 +5900,7 @@ class scylla_read_stats(gdb.Command):
|
||||
permit_summaries = defaultdict(permit_stats)
|
||||
total = permit_stats()
|
||||
|
||||
for permit in intrusive_list(permit_list):
|
||||
def summarize_permit(permit):
|
||||
schema_name = "*.*"
|
||||
schema = permit['_schema']
|
||||
try:
|
||||
@@ -5884,6 +5920,8 @@ class scylla_read_stats(gdb.Command):
|
||||
permit_summaries[(schema_name, description, state)].add(summary)
|
||||
total.add(summary)
|
||||
|
||||
scylla_read_stats.foreach_permit(semaphore, summarize_permit)
|
||||
|
||||
if not permit_summaries:
|
||||
return
|
||||
|
||||
@@ -5893,7 +5931,9 @@ class scylla_read_stats(gdb.Command):
|
||||
inactive_read_count = len(intrusive_list(semaphore['_inactive_reads']))
|
||||
waiters = int(semaphore["_stats"]["waiters"])
|
||||
|
||||
gdb.write("Semaphore {} with: {}/{} count and {}/{} memory resources, queued: {}, inactive={}\n".format(
|
||||
gdb.write("Semaphore ({}*) 0x{:x} {} with: {}/{} count and {}/{} memory resources, queued: {}, inactive={}\n".format(
|
||||
semaphore.type.name,
|
||||
int(semaphore.address),
|
||||
semaphore_name,
|
||||
initial_count - int(semaphore['_resources']['count']), initial_count,
|
||||
initial_memory - int(semaphore['_resources']['memory']), initial_memory,
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 7ec14e836a...4dcd4df5e7
@@ -82,7 +82,7 @@ seastar::future<> service::client_routes_service::set_client_routes_inner(const
|
||||
auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
|
||||
for (auto& entry : route_entries) {
|
||||
for (const auto& entry : route_entries) {
|
||||
auto mut = co_await make_update_client_route_mutation(guard.write_timestamp(), entry);
|
||||
cmuts.emplace_back(std::move(mut));
|
||||
}
|
||||
@@ -103,24 +103,24 @@ seastar::future<> service::client_routes_service::delete_client_routes_inner(con
|
||||
co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _abort_source);
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::set_client_routes(const std::vector<service::client_routes_service::client_route_entry>& route_entries) {
|
||||
return container().invoke_on(0, [route_entries = std::move(route_entries)] (service::client_routes_service& cr) -> future<> {
|
||||
return cr.with_retry([&] {
|
||||
seastar::future<> service::client_routes_service::set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries) {
|
||||
return container().invoke_on(0, [route_entries = std::move(route_entries)] (service::client_routes_service& cr) mutable -> future<> {
|
||||
return cr.with_retry([&cr, route_entries = std::move(route_entries)] {
|
||||
return cr.set_client_routes_inner(route_entries);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::delete_client_routes(const std::vector<service::client_routes_service::client_route_key>& route_keys) {
|
||||
return container().invoke_on(0, [route_keys = std::move(route_keys)] (service::client_routes_service& cr) -> future<> {
|
||||
return cr.with_retry([&] {
|
||||
seastar::future<> service::client_routes_service::delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys) {
|
||||
return container().invoke_on(0, [route_keys = std::move(route_keys)] (service::client_routes_service& cr) mutable -> future<> {
|
||||
return cr.with_retry([&cr, route_keys = std::move(route_keys)] {
|
||||
return cr.delete_client_routes_inner(route_keys);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
seastar::future<> service::client_routes_service::with_retry(Func&& func) const {
|
||||
seastar::future<> service::client_routes_service::with_retry(Func func) const {
|
||||
int retries = 10;
|
||||
while (true) {
|
||||
try {
|
||||
|
||||
@@ -66,8 +66,8 @@ public:
|
||||
future<mutation> make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key);
|
||||
future<mutation> make_update_client_route_mutation(api::timestamp_type ts, const client_route_entry& entry);
|
||||
future<std::vector<client_route_entry>> get_client_routes() const;
|
||||
seastar::future<> set_client_routes(const std::vector<service::client_routes_service::client_route_entry>& route_entries);
|
||||
seastar::future<> delete_client_routes(const std::vector<service::client_routes_service::client_route_key>& route_keys);
|
||||
seastar::future<> set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries);
|
||||
seastar::future<> delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys);
|
||||
|
||||
|
||||
// notifications
|
||||
@@ -76,7 +76,7 @@ private:
|
||||
seastar::future<> set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries);
|
||||
seastar::future<> delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys);
|
||||
template <typename Func>
|
||||
seastar::future<> with_retry(Func&& func) const;
|
||||
seastar::future<> with_retry(Func func) const;
|
||||
|
||||
abort_source& _abort_source;
|
||||
gms::feature_service& _feature_service;
|
||||
|
||||
@@ -224,7 +224,13 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
|
||||
ks + " can be granted only SELECT or DESCRIBE permissions to a non-superuser.");
|
||||
}
|
||||
|
||||
if (cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) {
|
||||
static const std::unordered_set<auth::resource> vector_search_system_resources = {
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
|
||||
};
|
||||
|
||||
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||
|
||||
(cmd.permission == auth::permission::SELECT && vector_search_system_resources.contains(cmd.resource))) {
|
||||
|
||||
co_return co_await ensure_has_permission<auth::command_desc_with_permission_set>({auth::permission_set::of<auth::permission::SELECT, auth::permission::VECTOR_SEARCH_INDEXING>(), cmd.resource});
|
||||
|
||||
@@ -344,3 +350,17 @@ void service::client_state::update_per_service_level_params(qos::service_level_o
|
||||
|
||||
_workload_type = slo.workload;
|
||||
}
|
||||
|
||||
future<> service::client_state::set_client_options(
|
||||
client_options_cache_type& keys_and_values_cache,
|
||||
const std::unordered_map<sstring, sstring>& client_options) {
|
||||
for (const auto& [key, value] : client_options) {
|
||||
auto cached_key = co_await keys_and_values_cache.get_or_load(key, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
auto cached_value = co_await keys_and_values_cache.get_or_load(value, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
_client_options.emplace_back(std::move(cached_key), std::move(cached_value));
|
||||
}
|
||||
}
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "auth/authenticated_user.hh"
|
||||
#include "auth/authenticator.hh"
|
||||
#include "auth/permission.hh"
|
||||
#include "client_data.hh"
|
||||
|
||||
#include "transport/cql_protocol_extension.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
@@ -102,7 +103,8 @@ private:
|
||||
private volatile String keyspace;
|
||||
#endif
|
||||
std::optional<auth::authenticated_user> _user;
|
||||
std::optional<sstring> _driver_name, _driver_version;
|
||||
std::optional<client_options_cache_entry_type> _driver_name, _driver_version;
|
||||
std::list<client_option_key_value_cached_entry> _client_options;
|
||||
|
||||
auth_state _auth_state = auth_state::UNINITIALIZED;
|
||||
bool _control_connection = false;
|
||||
@@ -151,18 +153,33 @@ public:
|
||||
return _control_connection = true;
|
||||
}
|
||||
|
||||
std::optional<sstring> get_driver_name() const {
|
||||
std::optional<client_options_cache_entry_type> get_driver_name() const {
|
||||
return _driver_name;
|
||||
}
|
||||
void set_driver_name(sstring driver_name) {
|
||||
_driver_name = std::move(driver_name);
|
||||
future<> set_driver_name(client_options_cache_type& keys_and_values_cache, const sstring& driver_name) {
|
||||
_driver_name = co_await keys_and_values_cache.get_or_load(driver_name, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
}
|
||||
|
||||
std::optional<sstring> get_driver_version() const {
|
||||
const auto& get_client_options() const {
|
||||
return _client_options;
|
||||
}
|
||||
|
||||
future<> set_client_options(
|
||||
client_options_cache_type& keys_and_values_cache,
|
||||
const std::unordered_map<sstring, sstring>& client_options);
|
||||
|
||||
std::optional<client_options_cache_entry_type> get_driver_version() const {
|
||||
return _driver_version;
|
||||
}
|
||||
void set_driver_version(sstring driver_version) {
|
||||
_driver_version = std::move(driver_version);
|
||||
future<> set_driver_version(
|
||||
client_options_cache_type& keys_and_values_cache,
|
||||
const sstring& driver_version)
|
||||
{
|
||||
_driver_version = co_await keys_and_values_cache.get_or_load(driver_version, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
}
|
||||
|
||||
client_state(external_tag,
|
||||
|
||||
@@ -79,7 +79,8 @@ group0_state_machine::group0_state_machine(raft_group0_client& client, migration
|
||||
// the node won't try to fetch a topology snapshot if the other
|
||||
// node doesn't support it yet.
|
||||
_topology_change_enabled = true;
|
||||
})) {
|
||||
}))
|
||||
, _in_memory_state_machine_enabled(utils::get_local_injector().is_enabled("group0_enable_sm_immediately")) {
|
||||
_state_id_handler.run();
|
||||
}
|
||||
|
||||
@@ -154,6 +155,27 @@ static future<> notify_client_route_change_if_needed(storage_service& storage_se
|
||||
}
|
||||
}
|
||||
|
||||
// Meant to be used only in error injections.
|
||||
static future<> maybe_partially_apply_cdc_generation_deletion_then_get_stuck(
|
||||
std::function<future<>(utils::chunked_vector<frozen_mutation_and_schema>)> mutate,
|
||||
const utils::chunked_vector<frozen_mutation_and_schema>& mutations) {
|
||||
|
||||
auto is_cdc_generation_data_clearing_mutation = [] (const frozen_mutation_and_schema& fm_s) {
|
||||
return fm_s.s->id() == db::system_keyspace::cdc_generations_v3()->id()
|
||||
&& !fm_s.fm.unfreeze(fm_s.s).partition().row_tombstones().empty();
|
||||
};
|
||||
|
||||
if (std::any_of(mutations.begin(), mutations.end(), is_cdc_generation_data_clearing_mutation)) {
|
||||
utils::chunked_vector<frozen_mutation_and_schema> filtered_mutations;
|
||||
std::copy_if(mutations.begin(), mutations.end(), std::back_inserter(filtered_mutations), is_cdc_generation_data_clearing_mutation);
|
||||
co_await mutate(std::move(filtered_mutations));
|
||||
while (true) {
|
||||
slogger.info("group0 has hung on error injection, waiting for the process to be killed");
|
||||
co_await seastar::sleep(std::chrono::seconds(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
|
||||
utils::chunked_vector<frozen_mutation_and_schema> mutations;
|
||||
client_routes_service::client_route_keys client_routes_update;
|
||||
@@ -178,7 +200,13 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
|
||||
throw std::runtime_error(::format("Error while applying mutations: {}", e));
|
||||
}
|
||||
|
||||
co_await proxy.mutate_locally(std::move(mutations), tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
auto mutate = [&proxy] (utils::chunked_vector<frozen_mutation_and_schema> mutations) {
|
||||
return proxy.mutate_locally(std::move(mutations), tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
};
|
||||
if (utils::get_local_injector().is_enabled("group0_simulate_partial_application_of_cdc_generation_deletion")) {
|
||||
co_await maybe_partially_apply_cdc_generation_deletion_then_get_stuck(mutate, mutations);
|
||||
}
|
||||
co_await mutate(std::move(mutations));
|
||||
|
||||
if (need_system_topology_flush) {
|
||||
slogger.trace("write_mutations_to_database: flushing {}.{}", db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
@@ -271,42 +299,41 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
|
||||
// If we crash before appending the state ID, when we reapply the command after restart, the change will be applied because
|
||||
// the state ID was not yet appended so the above check will pass.
|
||||
|
||||
// TODO: reapplication of a command after a crash may require contacting a quorum (we need to learn that the command
|
||||
// is committed from a leader). But we may want to ensure that group 0 state is consistent after restart even without
|
||||
// access to quorum, which means we cannot allow partially applied commands. We need to ensure that either the entire
|
||||
// change is applied and the state ID is updated or none of this happens.
|
||||
// E.g. use a write-ahead-entry which contains all this information and make sure it's replayed during restarts.
|
||||
std::optional<storage_service::state_change_hint> topology_state_change_hint;
|
||||
modules_to_reload modules_to_reload;
|
||||
|
||||
co_await std::visit(make_visitor(
|
||||
[&] (schema_change& chng) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
co_await _mm.merge_schema_from(locator::host_id{cmd.creator_id.uuid()}, std::move(chng.mutations));
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
},
|
||||
[&] (broadcast_table_query& query) -> future<> {
|
||||
auto result = co_await service::broadcast_tables::execute_broadcast_table_query(_sp, query.query, cmd.new_state_id);
|
||||
_client.set_query_result(cmd.new_state_id, std::move(result));
|
||||
},
|
||||
[&] (topology_change& chng) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
auto tablet_keys = replica::get_tablet_metadata_change_hint(chng.mutations);
|
||||
modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
topology_state_change_hint = {.tablets_hint = replica::get_tablet_metadata_change_hint(chng.mutations)};
|
||||
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(chng.mutations));
|
||||
co_await _ss.topology_transition({.tablets_hint = std::move(tablet_keys)});
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
},
|
||||
[&] (mixed_change& chng) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
topology_state_change_hint.emplace();
|
||||
co_await _mm.merge_schema_from(locator::host_id{cmd.creator_id.uuid()}, std::move(chng.mutations));
|
||||
co_await _ss.topology_transition();
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
},
|
||||
[&] (write_mutations& muts) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(muts.mutations);
|
||||
modules_to_reload = get_modules_to_reload(muts.mutations);
|
||||
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(muts.mutations));
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
}
|
||||
), cmd.change);
|
||||
|
||||
if (_in_memory_state_machine_enabled) {
|
||||
if (topology_state_change_hint) {
|
||||
co_await _ss.topology_transition(std::move(*topology_state_change_hint));
|
||||
}
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
}
|
||||
|
||||
co_await _sp.mutate_locally({std::move(history)}, nullptr);
|
||||
}
|
||||
|
||||
@@ -413,9 +440,23 @@ void group0_state_machine::drop_snapshot(raft::snapshot_id id) {
|
||||
}
|
||||
|
||||
future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
|
||||
// topology_state_load applies persisted state machine state into
|
||||
// memory and thus needs to be protected with apply mutex
|
||||
auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
|
||||
if (_in_memory_state_machine_enabled) {
|
||||
co_await reload_state();
|
||||
}
|
||||
}
|
||||
|
||||
future<> group0_state_machine::enable_in_memory_state_machine() {
|
||||
auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
|
||||
if (!_in_memory_state_machine_enabled) {
|
||||
_in_memory_state_machine_enabled = true;
|
||||
co_await reload_state();
|
||||
}
|
||||
}
|
||||
|
||||
future<> group0_state_machine::reload_state() {
|
||||
// we assume that the apply mutex is held, topology_state_load applies
|
||||
// persisted state machine into memory so it needs to be protected with it
|
||||
co_await _ss.topology_state_load();
|
||||
co_await _ss.view_building_state_load();
|
||||
if (_feature_service.compression_dicts) {
|
||||
|
||||
@@ -113,9 +113,33 @@ class group0_state_machine : public raft_state_machine {
|
||||
gms::feature_service& _feature_service;
|
||||
gms::feature::listener_registration _topology_on_raft_support_listener;
|
||||
|
||||
// This boolean controls whether the in-memory data structures should be updated
|
||||
// after snapshot transfer / command application.
|
||||
//
|
||||
// The reason for the flag is to protect from reading a partially applied state.
|
||||
// A group0 command may consist of multiple mutations that are not applied
|
||||
// in a single, atomic operation, but rather separately. A node can crash
|
||||
// in the middle of applying such a command, leaving the group0 in an inconsistent
|
||||
// state. Thanks to the idempotency of mutations, applying the group0 command
|
||||
// again, fully, will make the state consistent again. Therefore, we use this
|
||||
// flag to control when the in memory state machine should be updated from the
|
||||
// on-disk state - we can only do that if we know that the group0 table state
|
||||
// is consistent.
|
||||
//
|
||||
// The only exception to the above rule is the schema - the schema state is
|
||||
// loaded into memory before group0 is initialized, and the in-memory state
|
||||
// is reloaded even if _in_memory_state_machine_enabled is set to false.
|
||||
// Resolving this exception should be possible, but would require considerable
|
||||
// effort in refactoring the migration manager code. In the meantime, we are
|
||||
// fine with this exception because the migration manager applies all schema
|
||||
// mutations of a single command atomically, in a single commitlog entry -
|
||||
// therefore, we should not observe broken invariants in the schema module.
|
||||
bool _in_memory_state_machine_enabled;
|
||||
|
||||
modules_to_reload get_modules_to_reload(const utils::chunked_vector<canonical_mutation>& mutations);
|
||||
future<> reload_modules(modules_to_reload modules);
|
||||
future<> merge_and_apply(group0_state_machine_merger& merger);
|
||||
future<> reload_state();
|
||||
public:
|
||||
group0_state_machine(raft_group0_client& client, migration_manager& mm, storage_proxy& sp, storage_service& ss,
|
||||
gms::gossiper& gossiper, gms::feature_service& feat, bool topology_change_enabled);
|
||||
@@ -125,6 +149,7 @@ public:
|
||||
future<> load_snapshot(raft::snapshot_id id) override;
|
||||
future<> transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) override;
|
||||
future<> abort() override;
|
||||
future<> enable_in_memory_state_machine();
|
||||
};
|
||||
|
||||
bool should_flush_system_topology_after_applying(const mutation& mut, const data_dictionary::database db);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user