Compare commits
8 Commits
debug_form
...
copilot/pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
173fb1e6d3 | ||
|
|
e252bb1550 | ||
|
|
5713b5efd1 | ||
|
|
979ec5ada8 | ||
|
|
67503a350b | ||
|
|
a90490c3cf | ||
|
|
6f957ea4e0 | ||
|
|
f6605f7b66 |
18
.github/copilot-instructions.md
vendored
18
.github/copilot-instructions.md
vendored
@@ -55,26 +55,22 @@ ninja build/<mode>/test/boost/<test_name>
|
||||
ninja build/<mode>/scylla
|
||||
|
||||
# Run all tests in a file
|
||||
./test.py --mode=<mode> test/<suite>/<test_name>.py
|
||||
./test.py --mode=<mode> <test_path>
|
||||
|
||||
# Run a single test case from a file
|
||||
./test.py --mode=<mode> test/<suite>/<test_name>.py::<test_function_name>
|
||||
|
||||
# Run all tests in a directory
|
||||
./test.py --mode=<mode> test/<suite>/
|
||||
./test.py --mode=<mode> <test_path>::<test_function_name>
|
||||
|
||||
# Examples
|
||||
./test.py --mode=dev test/alternator/
|
||||
./test.py --mode=dev test/cluster/test_raft_voters.py::test_raft_limited_voters_retain_coordinator
|
||||
./test.py --mode=dev test/cqlpy/test_json.py
|
||||
./test.py --mode=dev alternator/
|
||||
./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
|
||||
|
||||
# Optional flags
|
||||
./test.py --mode=dev test/cluster/test_raft_no_quorum.py -v # Verbose output
|
||||
./test.py --mode=dev test/cluster/test_raft_no_quorum.py --repeat 5 # Repeat test 5 times
|
||||
./test.py --mode=dev cluster/test_raft_no_quorum -v # Verbose output
|
||||
./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5 # Repeat test 5 times
|
||||
```
|
||||
|
||||
**Important:**
|
||||
- Use full path with `.py` extension (e.g., `test/cluster/test_raft_no_quorum.py`, not `cluster/test_raft_no_quorum`)
|
||||
- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
|
||||
- To run a single test case, append `::<test_function_name>` to the file path
|
||||
- Add `-v` for verbose output
|
||||
- Add `--repeat <num>` to repeat a test multiple times
|
||||
|
||||
@@ -8,9 +8,6 @@ on:
|
||||
jobs:
|
||||
check-fixes-prefix:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
steps:
|
||||
- name: Check PR body for "Fixes" prefix patterns
|
||||
uses: actions/github-script@v7
|
||||
|
||||
2
.github/workflows/trigger-scylla-ci.yaml
vendored
2
.github/workflows/trigger-scylla-ci.yaml
vendored
@@ -1,6 +1,4 @@
|
||||
name: Trigger Scylla CI Route
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
on:
|
||||
issue_comment:
|
||||
|
||||
3
.github/workflows/trigger_jenkins.yaml
vendored
3
.github/workflows/trigger_jenkins.yaml
vendored
@@ -1,8 +1,5 @@
|
||||
name: Trigger next gating
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
|
||||
@@ -1295,45 +1295,6 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_compaction",
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Trigger compaction of the key-value storage",
|
||||
"type":"void",
|
||||
"nickname":"logstor_compaction",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"major",
|
||||
"description":"When true, perform a major compaction",
|
||||
"required":false,
|
||||
"allowMultiple":false,
|
||||
"type":"boolean",
|
||||
"paramType":"query"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_flush",
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Trigger flush of logstor storage",
|
||||
"type":"void",
|
||||
"nickname":"logstor_flush",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/active_repair/",
|
||||
"operations":[
|
||||
@@ -3268,38 +3229,6 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_info",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Logstor segment information for one table",
|
||||
"type":"table_logstor_info",
|
||||
"nickname":"logstor_info",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"keyspace",
|
||||
"description":"The keyspace",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"table",
|
||||
"description":"table name",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/retrain_dict",
|
||||
"operations":[
|
||||
@@ -3708,47 +3637,6 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"logstor_hist_bucket":{
|
||||
"id":"logstor_hist_bucket",
|
||||
"properties":{
|
||||
"bucket":{
|
||||
"type":"long"
|
||||
},
|
||||
"count":{
|
||||
"type":"long"
|
||||
},
|
||||
"min_data_size":{
|
||||
"type":"long"
|
||||
},
|
||||
"max_data_size":{
|
||||
"type":"long"
|
||||
}
|
||||
}
|
||||
},
|
||||
"table_logstor_info":{
|
||||
"id":"table_logstor_info",
|
||||
"description":"Per-table logstor segment distribution",
|
||||
"properties":{
|
||||
"keyspace":{
|
||||
"type":"string"
|
||||
},
|
||||
"table":{
|
||||
"type":"string"
|
||||
},
|
||||
"compaction_groups":{
|
||||
"type":"long"
|
||||
},
|
||||
"segments":{
|
||||
"type":"long"
|
||||
},
|
||||
"data_size_histogram":{
|
||||
"type":"array",
|
||||
"items":{
|
||||
"$ref":"logstor_hist_bucket"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"tablet_repair_result":{
|
||||
"id":"tablet_repair_result",
|
||||
"description":"Tablet repair result",
|
||||
|
||||
@@ -209,21 +209,6 @@
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/system/chosen_sstable_version",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Get sstable version currently chosen for use in new sstables",
|
||||
"type":"string",
|
||||
"nickname":"get_chosen_sstable_version",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -18,9 +18,7 @@
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/estimated_histogram.hh"
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include "db/data_listeners.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "storage_service.hh"
|
||||
#include "compaction/compaction_manager.hh"
|
||||
#include "unimplemented.hh"
|
||||
@@ -344,56 +342,6 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
|
||||
return ret;
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_toppartitions_generic(sharded<replica::database>& db, std::unique_ptr<http::request> req) {
|
||||
bool filters_provided = false;
|
||||
|
||||
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
|
||||
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
table_filters.emplace(parse_fully_qualified_cf_name(filter));
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_set<sstring> keyspace_filters {};
|
||||
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
keyspace_filters.emplace(std::move(filter));
|
||||
}
|
||||
}
|
||||
|
||||
// when the query is empty return immediately
|
||||
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
|
||||
apilog.debug("toppartitions query: processing results");
|
||||
cf::toppartitions_query_results results;
|
||||
|
||||
results.read_cardinality = 0;
|
||||
results.write_cardinality = 0;
|
||||
|
||||
return make_ready_future<json::json_return_type>(results);
|
||||
}
|
||||
|
||||
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
|
||||
api::req_param<unsigned> capacity(*req, "capacity", 256);
|
||||
api::req_param<unsigned> list_size(*req, "list_size", 10);
|
||||
|
||||
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
|
||||
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
|
||||
|
||||
return seastar::do_with(db::toppartitions_query(db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
|
||||
return run_toppartitions_query(q);
|
||||
});
|
||||
}
|
||||
|
||||
void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
|
||||
cf::get_column_family_name.set(r, [&db] (const_req req){
|
||||
std::vector<sstring> res;
|
||||
@@ -1099,10 +1047,6 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
|
||||
});
|
||||
});
|
||||
|
||||
ss::toppartitions_generic.set(r, [&db] (std::unique_ptr<http::request> req) {
|
||||
return rest_toppartitions_generic(db, std::move(req));
|
||||
});
|
||||
|
||||
cf::force_major_compaction.set(r, [&ctx, &db](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
|
||||
if (!req->get_query_param("split_output").empty()) {
|
||||
fail(unimplemented::cause::API);
|
||||
@@ -1269,7 +1213,6 @@ void unset_column_family(http_context& ctx, routes& r) {
|
||||
cf::get_sstable_count_per_level.unset(r);
|
||||
cf::get_sstables_for_key.unset(r);
|
||||
cf::toppartitions.unset(r);
|
||||
ss::toppartitions_generic.unset(r);
|
||||
cf::force_major_compaction.unset(r);
|
||||
ss::get_load.unset(r);
|
||||
ss::get_metrics_load.unset(r);
|
||||
|
||||
@@ -17,7 +17,9 @@
|
||||
#include "gms/feature_service.hh"
|
||||
#include "schema/schema_builder.hh"
|
||||
#include "sstables/sstables_manager.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <time.h>
|
||||
#include <algorithm>
|
||||
@@ -610,6 +612,56 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
|
||||
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
bool filters_provided = false;
|
||||
|
||||
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
|
||||
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
table_filters.emplace(parse_fully_qualified_cf_name(filter));
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_set<sstring> keyspace_filters {};
|
||||
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
keyspace_filters.emplace(std::move(filter));
|
||||
}
|
||||
}
|
||||
|
||||
// when the query is empty return immediately
|
||||
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
|
||||
apilog.debug("toppartitions query: processing results");
|
||||
httpd::column_family_json::toppartitions_query_results results;
|
||||
|
||||
results.read_cardinality = 0;
|
||||
results.write_cardinality = 0;
|
||||
|
||||
return make_ready_future<json::json_return_type>(results);
|
||||
}
|
||||
|
||||
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
|
||||
api::req_param<unsigned> capacity(*req, "capacity", 256);
|
||||
api::req_param<unsigned> list_size(*req, "list_size", 10);
|
||||
|
||||
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
|
||||
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
|
||||
|
||||
return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
|
||||
return run_toppartitions_query(q);
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
json::json_return_type
|
||||
rest_get_release_version(sharded<service::storage_service>& ss, const_req& req) {
|
||||
@@ -781,28 +833,6 @@ rest_force_keyspace_flush(http_context& ctx, std::unique_ptr<http::request> req)
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_logstor_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
bool major = false;
|
||||
if (auto major_param = req->get_query_param("major"); !major_param.empty()) {
|
||||
major = validate_bool(major_param);
|
||||
}
|
||||
apilog.info("logstor_compaction: major={}", major);
|
||||
auto& db = ctx.db;
|
||||
co_await replica::database::trigger_logstor_compaction_on_all_shards(db, major);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_logstor_flush(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
apilog.info("logstor_flush");
|
||||
auto& db = ctx.db;
|
||||
co_await replica::database::flush_logstor_separator_on_all_shards(db);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_decommission(sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, std::unique_ptr<http::request> req) {
|
||||
@@ -1523,54 +1553,6 @@ rest_sstable_info(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_logstor_info(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
auto keyspace = api::req_param<sstring>(*req, "keyspace", {}).value;
|
||||
auto table = api::req_param<sstring>(*req, "table", {}).value;
|
||||
if (table.empty()) {
|
||||
table = api::req_param<sstring>(*req, "cf", {}).value;
|
||||
}
|
||||
|
||||
if (keyspace.empty()) {
|
||||
throw bad_param_exception("The query parameter 'keyspace' is required");
|
||||
}
|
||||
if (table.empty()) {
|
||||
throw bad_param_exception("The query parameter 'table' is required");
|
||||
}
|
||||
|
||||
keyspace = validate_keyspace(ctx, keyspace);
|
||||
auto tid = validate_table(ctx.db.local(), keyspace, table);
|
||||
|
||||
auto& cf = ctx.db.local().find_column_family(tid);
|
||||
if (!cf.uses_logstor()) {
|
||||
throw bad_param_exception(fmt::format("Table {}.{} does not use logstor", keyspace, table));
|
||||
}
|
||||
|
||||
return do_with(replica::logstor::table_segment_stats{}, [keyspace = std::move(keyspace), table = std::move(table), tid, &ctx] (replica::logstor::table_segment_stats& merged_stats) {
|
||||
return ctx.db.map_reduce([&merged_stats](replica::logstor::table_segment_stats&& shard_stats) {
|
||||
merged_stats += shard_stats;
|
||||
}, [tid](const replica::database& db) {
|
||||
return db.get_logstor_table_segment_stats(tid);
|
||||
}).then([&merged_stats, keyspace = std::move(keyspace), table = std::move(table)] {
|
||||
ss::table_logstor_info result;
|
||||
result.keyspace = keyspace;
|
||||
result.table = table;
|
||||
result.compaction_groups = merged_stats.compaction_group_count;
|
||||
result.segments = merged_stats.segment_count;
|
||||
|
||||
for (const auto& bucket : merged_stats.histogram) {
|
||||
ss::logstor_hist_bucket hist;
|
||||
hist.count = bucket.count;
|
||||
hist.max_data_size = bucket.max_data_size;
|
||||
result.data_size_histogram.push(std::move(hist));
|
||||
}
|
||||
|
||||
return make_ready_future<json::json_return_type>(stream_object(result));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::raft_group0_client& group0_client, std::unique_ptr<http::request> req) {
|
||||
@@ -1802,6 +1784,7 @@ rest_bind(FuncType func, BindArgs&... args) {
|
||||
|
||||
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
|
||||
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
|
||||
ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
|
||||
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
|
||||
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
|
||||
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
|
||||
@@ -1817,8 +1800,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
|
||||
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
|
||||
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
|
||||
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
|
||||
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
|
||||
ss::move.set(r, rest_bind(rest_move, ss));
|
||||
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
|
||||
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
|
||||
@@ -1867,7 +1848,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
|
||||
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
|
||||
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
|
||||
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
|
||||
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
|
||||
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
|
||||
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
|
||||
@@ -1884,6 +1864,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
|
||||
void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::get_token_endpoint.unset(r);
|
||||
ss::toppartitions_generic.unset(r);
|
||||
ss::get_release_version.unset(r);
|
||||
ss::get_scylla_release_version.unset(r);
|
||||
ss::get_schema_version.unset(r);
|
||||
@@ -1897,8 +1878,6 @@ void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::reset_cleanup_needed.unset(r);
|
||||
ss::force_flush.unset(r);
|
||||
ss::force_keyspace_flush.unset(r);
|
||||
ss::logstor_compaction.unset(r);
|
||||
ss::logstor_flush.unset(r);
|
||||
ss::decommission.unset(r);
|
||||
ss::move.unset(r);
|
||||
ss::remove_node.unset(r);
|
||||
@@ -1946,7 +1925,6 @@ void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::get_ownership.unset(r);
|
||||
ss::get_effective_ownership.unset(r);
|
||||
ss::sstable_info.unset(r);
|
||||
ss::logstor_info.unset(r);
|
||||
ss::reload_raft_topology_state.unset(r);
|
||||
ss::upgrade_to_raft_topology.unset(r);
|
||||
ss::raft_topology_upgrade_status.unset(r);
|
||||
|
||||
@@ -190,13 +190,6 @@ void set_system(http_context& ctx, routes& r) {
|
||||
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
|
||||
});
|
||||
});
|
||||
|
||||
hs::get_chosen_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return smp::submit_to(0, [&ctx] {
|
||||
auto format = ctx.db.local().get_user_sstables_manager().get_preferred_sstable_version();
|
||||
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ void cache::set_permission_loader(permission_loader_func loader) {
|
||||
_permission_loader = std::move(loader);
|
||||
}
|
||||
|
||||
lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const noexcept {
|
||||
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
|
||||
auto it = _roles.find(role);
|
||||
if (it == _roles.end()) {
|
||||
return {};
|
||||
@@ -55,16 +55,6 @@ lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const
|
||||
return it->second;
|
||||
}
|
||||
|
||||
void cache::for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const {
|
||||
for (const auto& [name, record] : _roles) {
|
||||
func(name, *record);
|
||||
}
|
||||
}
|
||||
|
||||
size_t cache::roles_count() const noexcept {
|
||||
return _roles.size();
|
||||
}
|
||||
|
||||
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
|
||||
std::unordered_map<resource, permission_set>* perms_cache;
|
||||
lw_shared_ptr<role_record> role_ptr;
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <string_view>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
|
||||
@@ -20,7 +19,7 @@
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/metrics_registration.hh>
|
||||
|
||||
#include "absl-flat_hash_map.hh"
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
|
||||
#include "auth/permission.hh"
|
||||
#include "auth/common.hh"
|
||||
@@ -43,8 +42,8 @@ public:
|
||||
std::unordered_set<role_name_t> member_of;
|
||||
std::unordered_set<role_name_t> members;
|
||||
sstring salted_hash;
|
||||
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
|
||||
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
|
||||
std::unordered_map<sstring, sstring> attributes;
|
||||
std::unordered_map<sstring, permission_set> permissions;
|
||||
private:
|
||||
friend cache;
|
||||
// cached permissions include effects of role's inheritance
|
||||
@@ -53,7 +52,7 @@ public:
|
||||
};
|
||||
|
||||
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
|
||||
lw_shared_ptr<const role_record> get(std::string_view role) const noexcept;
|
||||
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
|
||||
void set_permission_loader(permission_loader_func loader);
|
||||
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
|
||||
future<> prune(const resource& r);
|
||||
@@ -62,15 +61,8 @@ public:
|
||||
future<> load_roles(std::unordered_set<role_name_t> roles);
|
||||
static bool includes_table(const table_id&) noexcept;
|
||||
|
||||
// Returns the number of roles in the cache.
|
||||
size_t roles_count() const noexcept;
|
||||
|
||||
// The callback doesn't suspend (no co_await) so it observes the state
|
||||
// of the cache atomically.
|
||||
void for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const;
|
||||
|
||||
private:
|
||||
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>, sstring_hash, sstring_eq>;
|
||||
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
|
||||
roles_map _roles;
|
||||
// anonymous permissions map exists mainly due to compatibility with
|
||||
// higher layers which use role_or_anonymous to get permissions.
|
||||
|
||||
@@ -32,7 +32,7 @@ namespace {
|
||||
logger mylog{"ldap_role_manager"}; // `log` is taken by math.
|
||||
|
||||
struct url_desc_deleter {
|
||||
void operator()(LDAPURLDesc* p) {
|
||||
void operator()(LDAPURLDesc *p) {
|
||||
ldap_free_urldesc(p);
|
||||
}
|
||||
};
|
||||
@@ -40,7 +40,7 @@ struct url_desc_deleter {
|
||||
using url_desc_ptr = std::unique_ptr<LDAPURLDesc, url_desc_deleter>;
|
||||
|
||||
url_desc_ptr parse_url(std::string_view url) {
|
||||
LDAPURLDesc* desc = nullptr;
|
||||
LDAPURLDesc *desc = nullptr;
|
||||
if (ldap_url_parse(url.data(), &desc)) {
|
||||
mylog.error("error in ldap_url_parse({})", url);
|
||||
}
|
||||
@@ -53,12 +53,8 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
|
||||
mylog.debug("Analyzing search results");
|
||||
for (auto e = ldap_first_entry(ld, res); e; e = ldap_next_entry(ld, e)) {
|
||||
struct deleter {
|
||||
void operator()(berval** p) {
|
||||
ldap_value_free_len(p);
|
||||
}
|
||||
void operator()(char* p) {
|
||||
ldap_memfree(p);
|
||||
}
|
||||
void operator()(berval** p) { ldap_value_free_len(p); }
|
||||
void operator()(char* p) { ldap_memfree(p); }
|
||||
};
|
||||
const std::unique_ptr<char, deleter> dname(ldap_get_dn(ld, e));
|
||||
mylog.debug("Analyzing entry {}", dname.get());
|
||||
@@ -79,29 +75,32 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
|
||||
|
||||
namespace auth {
|
||||
|
||||
ldap_role_manager::ldap_role_manager(std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
|
||||
uint32_t permissions_update_interval_in_ms, utils::observer<uint32_t> permissions_update_interval_in_ms_observer, cql3::query_processor& qp,
|
||||
::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
|
||||
: _std_mgr(qp, rg0c, mm, cache)
|
||||
, _group0_client(rg0c)
|
||||
, _query_template(query_template)
|
||||
, _target_attr(target_attr)
|
||||
, _bind_name(bind_name)
|
||||
, _bind_password(bind_password)
|
||||
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
|
||||
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
|
||||
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
|
||||
, _cache(cache)
|
||||
, _cache_pruner(make_ready_future<>()) {
|
||||
ldap_role_manager::ldap_role_manager(
|
||||
std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
|
||||
uint32_t permissions_update_interval_in_ms,
|
||||
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
|
||||
cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
|
||||
: _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
|
||||
, _bind_password(bind_password)
|
||||
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
|
||||
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
|
||||
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
|
||||
, _cache(cache)
|
||||
, _cache_pruner(make_ready_future<>()) {
|
||||
}
|
||||
|
||||
ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
|
||||
: ldap_role_manager(qp.db().get_config().ldap_url_template(), qp.db().get_config().ldap_attr_role(), qp.db().get_config().ldap_bind_dn(),
|
||||
qp.db().get_config().ldap_bind_passwd(), qp.db().get_config().permissions_update_interval_in_ms(),
|
||||
qp.db().get_config().permissions_update_interval_in_ms.observe([this](const uint32_t& v) {
|
||||
_permissions_update_interval_in_ms = v;
|
||||
}),
|
||||
qp, rg0c, mm, cache) {
|
||||
: ldap_role_manager(
|
||||
qp.db().get_config().ldap_url_template(),
|
||||
qp.db().get_config().ldap_attr_role(),
|
||||
qp.db().get_config().ldap_bind_dn(),
|
||||
qp.db().get_config().ldap_bind_passwd(),
|
||||
qp.db().get_config().permissions_update_interval_in_ms(),
|
||||
qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
|
||||
qp,
|
||||
rg0c,
|
||||
mm,
|
||||
cache) {
|
||||
}
|
||||
|
||||
std::string_view ldap_role_manager::qualified_java_name() const noexcept {
|
||||
@@ -114,16 +113,17 @@ const resource_set& ldap_role_manager::protected_resources() const {
|
||||
|
||||
future<> ldap_role_manager::start() {
|
||||
if (!parse_url(get_url("dummy-user"))) { // Just need host and port -- any user should do.
|
||||
return make_exception_future(std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
|
||||
return make_exception_future(
|
||||
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
|
||||
}
|
||||
_cache_pruner = futurize_invoke([this]() -> future<> {
|
||||
_cache_pruner = futurize_invoke([this] () -> future<> {
|
||||
while (true) {
|
||||
try {
|
||||
co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
|
||||
} catch (const seastar::sleep_aborted&) {
|
||||
co_return; // ignore
|
||||
}
|
||||
co_await _cache.container().invoke_on_all([](cache& c) -> future<> {
|
||||
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
|
||||
try {
|
||||
co_await c.reload_all_permissions();
|
||||
} catch (...) {
|
||||
@@ -165,7 +165,7 @@ future<conn_ptr> ldap_role_manager::connect() {
|
||||
future<conn_ptr> ldap_role_manager::reconnect() {
|
||||
unsigned retries_left = 5;
|
||||
using namespace std::literals::chrono_literals;
|
||||
conn_ptr conn = co_await exponential_backoff_retry::do_until_value(1s, 32s, _as, [this, &retries_left]() -> future<std::optional<conn_ptr>> {
|
||||
conn_ptr conn = co_await exponential_backoff_retry::do_until_value(1s, 32s, _as, [this, &retries_left] () -> future<std::optional<conn_ptr>> {
|
||||
if (!retries_left) {
|
||||
co_return conn_ptr{};
|
||||
}
|
||||
@@ -188,13 +188,11 @@ future<conn_ptr> ldap_role_manager::reconnect() {
|
||||
|
||||
future<> ldap_role_manager::stop() {
|
||||
_as.request_abort();
|
||||
return std::move(_cache_pruner)
|
||||
.then([this] {
|
||||
return _std_mgr.stop();
|
||||
})
|
||||
.then([this] {
|
||||
return _connection_factory.stop();
|
||||
});
|
||||
return std::move(_cache_pruner).then([this] {
|
||||
return _std_mgr.stop();
|
||||
}).then([this] {
|
||||
return _connection_factory.stop();
|
||||
});
|
||||
}
|
||||
|
||||
future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {
|
||||
@@ -223,42 +221,43 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
|
||||
if (!desc) {
|
||||
return make_exception_future<role_set>(std::runtime_error(format("Error parsing URL {}", url)));
|
||||
}
|
||||
return _connection_factory.with_connection(
|
||||
[this, desc = std::move(desc), grantee_name_ = sstring(grantee_name)](ldap_connection& conn) -> future<role_set> {
|
||||
sstring grantee_name = std::move(grantee_name_);
|
||||
ldap_msg_ptr res = co_await conn.search(desc->lud_dn, desc->lud_scope, desc->lud_filter, desc->lud_attrs,
|
||||
/*attrsonly=*/0, /*serverctrls=*/nullptr, /*clientctrls=*/nullptr,
|
||||
/*timeout=*/nullptr, /*sizelimit=*/0);
|
||||
mylog.trace("query_granted: got search results");
|
||||
const auto mtype = ldap_msgtype(res.get());
|
||||
if (mtype != LDAP_RES_SEARCH_ENTRY && mtype != LDAP_RES_SEARCH_RESULT && mtype != LDAP_RES_SEARCH_REFERENCE) {
|
||||
mylog.error("ldap search yielded result {} of type {}", static_cast<const void*>(res.get()), mtype);
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error("ldap_role_manager: search result has wrong type")));
|
||||
return _connection_factory.with_connection([this, desc = std::move(desc), grantee_name_ = sstring(grantee_name)]
|
||||
(ldap_connection& conn) -> future<role_set> {
|
||||
sstring grantee_name = std::move(grantee_name_);
|
||||
ldap_msg_ptr res = co_await conn.search(desc->lud_dn, desc->lud_scope, desc->lud_filter, desc->lud_attrs,
|
||||
/*attrsonly=*/0, /*serverctrls=*/nullptr, /*clientctrls=*/nullptr,
|
||||
/*timeout=*/nullptr, /*sizelimit=*/0);
|
||||
mylog.trace("query_granted: got search results");
|
||||
const auto mtype = ldap_msgtype(res.get());
|
||||
if (mtype != LDAP_RES_SEARCH_ENTRY && mtype != LDAP_RES_SEARCH_RESULT && mtype != LDAP_RES_SEARCH_REFERENCE) {
|
||||
mylog.error("ldap search yielded result {} of type {}", static_cast<const void*>(res.get()), mtype);
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error("ldap_role_manager: search result has wrong type")));
|
||||
}
|
||||
std::vector<sstring> values = get_attr_values(conn.get_ldap(), res.get(), _target_attr.c_str());
|
||||
auth::role_set valid_roles{grantee_name};
|
||||
|
||||
// Each value is a role to be granted.
|
||||
co_await parallel_for_each(values, [this, &valid_roles] (const sstring& ldap_role) {
|
||||
return _std_mgr.exists(ldap_role).then([&valid_roles, &ldap_role] (bool exists) {
|
||||
if (exists) {
|
||||
valid_roles.insert(ldap_role);
|
||||
} else {
|
||||
mylog.error("unrecognized role received from LDAP: {}", ldap_role);
|
||||
}
|
||||
std::vector<sstring> values = get_attr_values(conn.get_ldap(), res.get(), _target_attr.c_str());
|
||||
auth::role_set valid_roles{grantee_name};
|
||||
|
||||
// Each value is a role to be granted.
|
||||
co_await parallel_for_each(values, [this, &valid_roles](const sstring& ldap_role) {
|
||||
return _std_mgr.exists(ldap_role).then([&valid_roles, &ldap_role](bool exists) {
|
||||
if (exists) {
|
||||
valid_roles.insert(ldap_role);
|
||||
} else {
|
||||
mylog.error("unrecognized role received from LDAP: {}", ldap_role);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
co_return std::move(valid_roles);
|
||||
});
|
||||
});
|
||||
|
||||
co_return std::move(valid_roles);
|
||||
});
|
||||
}
|
||||
|
||||
future<role_to_directly_granted_map> ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
|
||||
future<role_to_directly_granted_map>
|
||||
ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
|
||||
role_to_directly_granted_map result;
|
||||
auto roles = co_await query_all(qs);
|
||||
for (auto& role : roles) {
|
||||
for (auto& role: roles) {
|
||||
auto granted_set = co_await query_granted(role, recursive_role_query::no);
|
||||
for (auto& granted : granted_set) {
|
||||
for (auto& granted: granted_set) {
|
||||
if (granted != role) {
|
||||
result.insert({role, granted});
|
||||
}
|
||||
@@ -272,7 +271,7 @@ future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
|
||||
}
|
||||
|
||||
future<> ldap_role_manager::create_role(std::string_view role_name) {
|
||||
return smp::submit_to(0, [this, role_name]() -> future<> {
|
||||
return smp::submit_to(0, [this, role_name] () -> future<> {
|
||||
int retries = 10;
|
||||
while (true) {
|
||||
auto guard = co_await _group0_client.start_operation(_as, ::service::raft_timeout{});
|
||||
@@ -284,8 +283,8 @@ future<> ldap_role_manager::create_role(std::string_view role_name) {
|
||||
} catch (const role_already_exists&) {
|
||||
// ok
|
||||
} catch (const ::service::group0_concurrent_modification& ex) {
|
||||
mylog.warn("Failed to auto-create role \"{}\" due to guard conflict.{}.", role_name,
|
||||
retries ? " Retrying" : " Number of retries exceeded, giving up");
|
||||
mylog.warn("Failed to auto-create role \"{}\" due to guard conflict.{}.",
|
||||
role_name, retries ? " Retrying" : " Number of retries exceeded, giving up");
|
||||
if (retries--) {
|
||||
continue;
|
||||
}
|
||||
@@ -330,7 +329,8 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
|
||||
return _std_mgr.can_login(role_name);
|
||||
}
|
||||
|
||||
future<std::optional<sstring>> ldap_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
|
||||
future<std::optional<sstring>> ldap_role_manager::get_attribute(
|
||||
std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
|
||||
return _std_mgr.get_attribute(role_name, attribute_name, qs);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*
|
||||
* Modified by ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "auth/default_authorizer.hh"
|
||||
#include "auth/permission.hh"
|
||||
|
||||
namespace auth {
|
||||
|
||||
// maintenance_socket_authorizer is used for clients connecting to the
|
||||
// maintenance socket. It grants all permissions unconditionally (like
|
||||
// AllowAllAuthorizer) while still supporting grant/revoke operations
|
||||
// (delegated to the underlying CassandraAuthorizer / default_authorizer).
|
||||
class maintenance_socket_authorizer : public default_authorizer {
|
||||
public:
|
||||
using default_authorizer::default_authorizer;
|
||||
|
||||
~maintenance_socket_authorizer() override = default;
|
||||
|
||||
future<> start() override {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
|
||||
return make_ready_future<permission_set>(permissions::ALL);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace auth
|
||||
@@ -30,7 +30,6 @@
|
||||
#include "auth/default_authorizer.hh"
|
||||
#include "auth/ldap_role_manager.hh"
|
||||
#include "auth/maintenance_socket_authenticator.hh"
|
||||
#include "auth/maintenance_socket_authorizer.hh"
|
||||
#include "auth/maintenance_socket_role_manager.hh"
|
||||
#include "auth/password_authenticator.hh"
|
||||
#include "auth/role_or_anonymous.hh"
|
||||
@@ -867,12 +866,6 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
|
||||
};
|
||||
}
|
||||
|
||||
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp) {
|
||||
return [&qp] {
|
||||
return std::make_unique<maintenance_socket_authorizer>(qp.local());
|
||||
};
|
||||
}
|
||||
|
||||
role_manager_factory make_maintenance_socket_role_manager_factory(
|
||||
sharded<cql3::query_processor>& qp,
|
||||
::service::raft_group0_client& g0,
|
||||
|
||||
@@ -434,11 +434,6 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
|
||||
sharded<::service::migration_manager>& mm,
|
||||
sharded<cache>& cache);
|
||||
|
||||
/// Creates a factory for the maintenance socket authorizer.
|
||||
/// This authorizer is not config-selectable and is only used for the maintenance socket.
|
||||
/// It grants all permissions unconditionally while delegating grant/revoke to the default authorizer.
|
||||
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp);
|
||||
|
||||
/// Creates a factory for the maintenance socket role manager.
|
||||
/// This role manager is not config-selectable and is only used for the maintenance socket.
|
||||
role_manager_factory make_maintenance_socket_role_manager_factory(
|
||||
|
||||
@@ -44,12 +44,13 @@ namespace auth {
|
||||
static logging::logger log("standard_role_manager");
|
||||
|
||||
future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
|
||||
auto role = _cache.get(role_name);
|
||||
auto name = sstring(role_name);
|
||||
auto role = _cache.get(name);
|
||||
if (!role) {
|
||||
return make_ready_future<std::optional<record>>(std::nullopt);
|
||||
}
|
||||
return make_ready_future<std::optional<record>>(std::make_optional(record{
|
||||
.name = sstring(role_name),
|
||||
.name = std::move(name),
|
||||
.is_superuser = role->is_superuser,
|
||||
.can_login = role->can_login,
|
||||
.member_of = role->member_of
|
||||
@@ -392,21 +393,51 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
|
||||
}
|
||||
|
||||
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
|
||||
const sstring query = seastar::format("SELECT * FROM {}.{}",
|
||||
db::system_keyspace::NAME,
|
||||
ROLE_MEMBERS_CF);
|
||||
|
||||
const auto results = co_await _qp.execute_internal(
|
||||
query,
|
||||
db::consistency_level::ONE,
|
||||
qs,
|
||||
cql3::query_processor::cache_internal::yes);
|
||||
|
||||
role_to_directly_granted_map roles_map;
|
||||
_cache.for_each_role([&roles_map] (const cache::role_name_t& name, const cache::role_record& record) {
|
||||
for (const auto& granted_role : record.member_of) {
|
||||
roles_map.emplace(name, granted_role);
|
||||
}
|
||||
});
|
||||
std::transform(
|
||||
results->begin(),
|
||||
results->end(),
|
||||
std::inserter(roles_map, roles_map.begin()),
|
||||
[] (const cql3::untyped_result_set_row& row) {
|
||||
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
|
||||
);
|
||||
|
||||
co_return roles_map;
|
||||
}
|
||||
|
||||
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
|
||||
const sstring query = seastar::format("SELECT {} FROM {}.{}",
|
||||
meta::roles_table::role_col_name,
|
||||
db::system_keyspace::NAME,
|
||||
meta::roles_table::name);
|
||||
|
||||
// To avoid many copies of a view.
|
||||
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
|
||||
|
||||
const auto results = co_await _qp.execute_internal(
|
||||
query,
|
||||
db::consistency_level::LOCAL_ONE,
|
||||
qs,
|
||||
cql3::query_processor::cache_internal::yes);
|
||||
|
||||
role_set roles;
|
||||
roles.reserve(_cache.roles_count());
|
||||
_cache.for_each_role([&roles] (const cache::role_name_t& name, const cache::role_record&) {
|
||||
roles.insert(name);
|
||||
});
|
||||
std::transform(
|
||||
results->begin(),
|
||||
results->end(),
|
||||
std::inserter(roles, roles.begin()),
|
||||
[] (const cql3::untyped_result_set_row& row) {
|
||||
return row.get_as<sstring>(role_col_name_string);}
|
||||
);
|
||||
co_return roles;
|
||||
}
|
||||
|
||||
@@ -429,26 +460,31 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
|
||||
}
|
||||
|
||||
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
|
||||
auto role = _cache.get(role_name);
|
||||
if (!role) {
|
||||
co_return std::nullopt;
|
||||
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
|
||||
db::system_keyspace::NAME,
|
||||
ROLE_ATTRIBUTES_CF);
|
||||
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
|
||||
if (!result_set->empty()) {
|
||||
const cql3::untyped_result_set_row &row = result_set->one();
|
||||
co_return std::optional<sstring>(row.get_as<sstring>("value"));
|
||||
}
|
||||
auto it = role->attributes.find(attribute_name);
|
||||
if (it != role->attributes.end()) {
|
||||
co_return it->second;
|
||||
}
|
||||
co_return std::nullopt;
|
||||
co_return std::optional<sstring>{};
|
||||
}
|
||||
|
||||
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
|
||||
attribute_vals result;
|
||||
_cache.for_each_role([&result, attribute_name] (const cache::role_name_t& name, const cache::role_record& record) {
|
||||
auto it = record.attributes.find(attribute_name);
|
||||
if (it != record.attributes.end()) {
|
||||
result.emplace(name, it->second);
|
||||
}
|
||||
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
|
||||
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
|
||||
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
|
||||
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
|
||||
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
|
||||
if (att_val) {
|
||||
role_to_att_val.emplace(std::move(role), std::move(*att_val));
|
||||
}
|
||||
});
|
||||
}).then([&role_to_att_val] () {
|
||||
return make_ready_future<attribute_vals>(std::move(role_to_att_val));
|
||||
});
|
||||
});
|
||||
});
|
||||
co_return result;
|
||||
}
|
||||
|
||||
future<> standard_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
|
||||
|
||||
219
cdc/split.cc
219
cdc/split.cc
@@ -76,14 +76,14 @@ struct partition_deletion {
|
||||
|
||||
using clustered_column_set = std::map<clustering_key, cdc::one_kind_column_set, clustering_key::less_compare>;
|
||||
|
||||
template <typename Container>
|
||||
template<typename Container>
|
||||
concept EntryContainer = requires(Container& container) {
|
||||
// Parenthesized due to https://bugs.llvm.org/show_bug.cgi?id=45088
|
||||
{ (container.atomic_entries) } -> std::same_as<std::vector<atomic_column_update>&>;
|
||||
{ (container.nonatomic_entries) } -> std::same_as<std::vector<nonatomic_column_update>&>;
|
||||
};
|
||||
|
||||
template <EntryContainer Container>
|
||||
template<EntryContainer Container>
|
||||
static void add_columns_affected_by_entries(cdc::one_kind_column_set& cset, const Container& cont) {
|
||||
for (const auto& entry : cont.atomic_entries) {
|
||||
cset.set(entry.id);
|
||||
@@ -134,7 +134,7 @@ struct batch {
|
||||
ret.emplace(clustering_key::make_empty(), all_columns);
|
||||
}
|
||||
|
||||
auto process_change_type = [&](const auto& changes) {
|
||||
auto process_change_type = [&] (const auto& changes) {
|
||||
for (const auto& change : changes) {
|
||||
auto& cset = ret[change.key];
|
||||
cset.resize(s.regular_columns_count());
|
||||
@@ -211,9 +211,7 @@ private:
|
||||
|
||||
public:
|
||||
extract_collection_visitor(column_id id, std::map<change_key_t, row_update>& updates)
|
||||
: _id(id)
|
||||
, _updates(updates) {
|
||||
}
|
||||
: _id(id), _updates(updates) {}
|
||||
|
||||
void collection_tombstone(const tombstone& t) {
|
||||
auto& entry = get_or_append_entry(t.timestamp + 1, gc_clock::duration(0));
|
||||
@@ -228,9 +226,7 @@ public:
|
||||
cell(key, c);
|
||||
}
|
||||
|
||||
constexpr bool finished() const {
|
||||
return false;
|
||||
}
|
||||
constexpr bool finished() const { return false; }
|
||||
};
|
||||
|
||||
/* Visits all cells and tombstones in a row, putting the encountered changes into buckets
|
||||
@@ -253,46 +249,41 @@ struct extract_row_visitor {
|
||||
|
||||
void collection_column(const column_definition& cdef, auto&& visit_collection) {
|
||||
visit(*cdef.type, make_visitor(
|
||||
[&](const collection_type_impl& ctype) {
|
||||
struct collection_visitor : public extract_collection_visitor<collection_visitor> {
|
||||
data_type _value_type;
|
||||
[&] (const collection_type_impl& ctype) {
|
||||
struct collection_visitor : public extract_collection_visitor<collection_visitor> {
|
||||
data_type _value_type;
|
||||
|
||||
collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
|
||||
: extract_collection_visitor<collection_visitor>(id, updates)
|
||||
, _value_type(ctype.value_comparator()) {
|
||||
}
|
||||
collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
|
||||
: extract_collection_visitor<collection_visitor>(id, updates), _value_type(ctype.value_comparator()) {}
|
||||
|
||||
data_type get_value_type(bytes_view) {
|
||||
return _value_type;
|
||||
}
|
||||
} v(cdef.id, _updates, ctype);
|
||||
data_type get_value_type(bytes_view) {
|
||||
return _value_type;
|
||||
}
|
||||
} v(cdef.id, _updates, ctype);
|
||||
|
||||
visit_collection(v);
|
||||
},
|
||||
[&](const user_type_impl& utype) {
|
||||
struct udt_visitor : public extract_collection_visitor<udt_visitor> {
|
||||
const user_type_impl& _utype;
|
||||
visit_collection(v);
|
||||
},
|
||||
[&] (const user_type_impl& utype) {
|
||||
struct udt_visitor : public extract_collection_visitor<udt_visitor> {
|
||||
const user_type_impl& _utype;
|
||||
|
||||
udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
|
||||
: extract_collection_visitor<udt_visitor>(id, updates)
|
||||
, _utype(utype) {
|
||||
}
|
||||
udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
|
||||
: extract_collection_visitor<udt_visitor>(id, updates), _utype(utype) {}
|
||||
|
||||
data_type get_value_type(bytes_view key) {
|
||||
return _utype.type(deserialize_field_index(key));
|
||||
}
|
||||
} v(cdef.id, _updates, utype);
|
||||
data_type get_value_type(bytes_view key) {
|
||||
return _utype.type(deserialize_field_index(key));
|
||||
}
|
||||
} v(cdef.id, _updates, utype);
|
||||
|
||||
visit_collection(v);
|
||||
},
|
||||
[&](const abstract_type& o) {
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
|
||||
}));
|
||||
visit_collection(v);
|
||||
},
|
||||
[&] (const abstract_type& o) {
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
|
||||
}
|
||||
));
|
||||
}
|
||||
|
||||
constexpr bool finished() const {
|
||||
return false;
|
||||
}
|
||||
constexpr bool finished() const { return false; }
|
||||
};
|
||||
|
||||
struct extract_changes_visitor {
|
||||
@@ -302,8 +293,12 @@ struct extract_changes_visitor {
|
||||
extract_row_visitor v;
|
||||
visit_row_cells(v);
|
||||
|
||||
for (auto& [ts_ttl, row_update] : v._updates) {
|
||||
_result[ts_ttl.first].static_updates.push_back({ts_ttl.second, std::move(row_update.atomic_entries), std::move(row_update.nonatomic_entries)});
|
||||
for (auto& [ts_ttl, row_update]: v._updates) {
|
||||
_result[ts_ttl.first].static_updates.push_back({
|
||||
ts_ttl.second,
|
||||
std::move(row_update.atomic_entries),
|
||||
std::move(row_update.nonatomic_entries)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -324,18 +319,24 @@ struct extract_changes_visitor {
|
||||
} v;
|
||||
visit_row_cells(v);
|
||||
|
||||
for (auto& [ts_ttl, row_update] : v._updates) {
|
||||
for (auto& [ts_ttl, row_update]: v._updates) {
|
||||
// It is important that changes in the resulting `set_of_changes` are listed
|
||||
// in increasing TTL order. The reason is explained in a comment in cdc/log.cc,
|
||||
// search for "#6070".
|
||||
auto [ts, ttl] = ts_ttl;
|
||||
|
||||
if (v._marker && ts == v._marker_ts && ttl == v._marker_ttl) {
|
||||
_result[ts].clustered_inserts.push_back({ttl, ckey, *v._marker, std::move(row_update.atomic_entries), {}});
|
||||
_result[ts].clustered_inserts.push_back({
|
||||
ttl,
|
||||
ckey,
|
||||
*v._marker,
|
||||
std::move(row_update.atomic_entries),
|
||||
{}
|
||||
});
|
||||
|
||||
auto& cr_insert = _result[ts].clustered_inserts.back();
|
||||
bool clustered_update_exists = false;
|
||||
for (auto& nonatomic_up : row_update.nonatomic_entries) {
|
||||
for (auto& nonatomic_up: row_update.nonatomic_entries) {
|
||||
// Updating a collection column with an INSERT statement implies inserting a tombstone.
|
||||
//
|
||||
// For example, suppose that we have:
|
||||
@@ -361,7 +362,12 @@ struct extract_changes_visitor {
|
||||
cr_insert.nonatomic_entries.push_back(std::move(nonatomic_up));
|
||||
} else {
|
||||
if (!clustered_update_exists) {
|
||||
_result[ts].clustered_updates.push_back({ttl, ckey, {}, {}});
|
||||
_result[ts].clustered_updates.push_back({
|
||||
ttl,
|
||||
ckey,
|
||||
{},
|
||||
{}
|
||||
});
|
||||
|
||||
// Multiple iterations of this `for` loop (for different collection columns)
|
||||
// might want to put their `nonatomic_up`s into an UPDATE change;
|
||||
@@ -384,7 +390,12 @@ struct extract_changes_visitor {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
_result[ts].clustered_updates.push_back({ttl, ckey, std::move(row_update.atomic_entries), std::move(row_update.nonatomic_entries)});
|
||||
_result[ts].clustered_updates.push_back({
|
||||
ttl,
|
||||
ckey,
|
||||
std::move(row_update.atomic_entries),
|
||||
std::move(row_update.nonatomic_entries)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -401,9 +412,7 @@ struct extract_changes_visitor {
|
||||
_result[t.timestamp].partition_deletions = partition_deletion{t};
|
||||
}
|
||||
|
||||
constexpr bool finished() const {
|
||||
return false;
|
||||
}
|
||||
constexpr bool finished() const { return false; }
|
||||
};
|
||||
|
||||
set_of_changes extract_changes(const mutation& m) {
|
||||
@@ -417,23 +426,13 @@ namespace cdc {
|
||||
struct find_timestamp_visitor {
|
||||
api::timestamp_type _ts = api::missing_timestamp;
|
||||
|
||||
bool finished() const {
|
||||
return _ts != api::missing_timestamp;
|
||||
}
|
||||
bool finished() const { return _ts != api::missing_timestamp; }
|
||||
|
||||
void visit(api::timestamp_type ts) {
|
||||
_ts = ts;
|
||||
}
|
||||
void visit(const atomic_cell_view& cell) {
|
||||
visit(cell.timestamp());
|
||||
}
|
||||
void visit(api::timestamp_type ts) { _ts = ts; }
|
||||
void visit(const atomic_cell_view& cell) { visit(cell.timestamp()); }
|
||||
|
||||
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
|
||||
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
|
||||
void collection_tombstone(const tombstone& t) {
|
||||
// A collection tombstone with timestamp T can be created with:
|
||||
// UPDATE ks.t USING TIMESTAMP T + 1 SET X = null WHERE ...
|
||||
@@ -442,33 +441,15 @@ struct find_timestamp_visitor {
|
||||
// with cdc$time using timestamp T + 1 instead of T.
|
||||
visit(t.timestamp + 1);
|
||||
}
|
||||
void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void collection_column(const column_definition&, auto&& visit_collection) {
|
||||
visit_collection(*this);
|
||||
}
|
||||
void marker(const row_marker& rm) {
|
||||
visit(rm.timestamp());
|
||||
}
|
||||
void static_row_cells(auto&& visit_row_cells) {
|
||||
visit_row_cells(*this);
|
||||
}
|
||||
void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) {
|
||||
visit_row_cells(*this);
|
||||
}
|
||||
void clustered_row_delete(const clustering_key&, const tombstone& t) {
|
||||
visit(t.timestamp);
|
||||
}
|
||||
void range_delete(const range_tombstone& t) {
|
||||
visit(t.tomb.timestamp);
|
||||
}
|
||||
void partition_delete(const tombstone& t) {
|
||||
visit(t.timestamp);
|
||||
}
|
||||
void live_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
|
||||
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
|
||||
void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
|
||||
void marker(const row_marker& rm) { visit(rm.timestamp()); }
|
||||
void static_row_cells(auto&& visit_row_cells) { visit_row_cells(*this); }
|
||||
void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) { visit_row_cells(*this); }
|
||||
void clustered_row_delete(const clustering_key&, const tombstone& t) { visit(t.timestamp); }
|
||||
void range_delete(const range_tombstone& t) { visit(t.tomb.timestamp); }
|
||||
void partition_delete(const tombstone& t) { visit(t.timestamp); }
|
||||
};
|
||||
|
||||
/* Find some timestamp inside the given mutation.
|
||||
@@ -524,12 +505,8 @@ struct should_split_visitor {
|
||||
|
||||
virtual ~should_split_visitor() = default;
|
||||
|
||||
inline bool finished() const {
|
||||
return _result;
|
||||
}
|
||||
inline void stop() {
|
||||
_result = true;
|
||||
}
|
||||
inline bool finished() const { return _result; }
|
||||
inline void stop() { _result = true; }
|
||||
|
||||
void visit(api::timestamp_type ts, gc_clock::duration ttl = gc_clock::duration(0)) {
|
||||
if (_ts != api::missing_timestamp && _ts != ts) {
|
||||
@@ -540,23 +517,15 @@ struct should_split_visitor {
|
||||
if (_ttl && *_ttl != ttl) {
|
||||
return stop();
|
||||
}
|
||||
_ttl = {ttl};
|
||||
_ttl = { ttl };
|
||||
}
|
||||
|
||||
void visit(const atomic_cell_view& cell) {
|
||||
visit(cell.timestamp(), get_ttl(cell));
|
||||
}
|
||||
void visit(const atomic_cell_view& cell) { visit(cell.timestamp(), get_ttl(cell)); }
|
||||
|
||||
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
|
||||
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
|
||||
|
||||
void collection_tombstone(const tombstone& t) {
|
||||
visit(t.timestamp + 1);
|
||||
}
|
||||
void collection_tombstone(const tombstone& t) { visit(t.timestamp + 1); }
|
||||
|
||||
virtual void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
|
||||
if (_had_row_marker) {
|
||||
@@ -565,12 +534,8 @@ struct should_split_visitor {
|
||||
}
|
||||
visit(cell);
|
||||
}
|
||||
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) {
|
||||
visit(cell);
|
||||
}
|
||||
void collection_column(const column_definition&, auto&& visit_collection) {
|
||||
visit_collection(*this);
|
||||
}
|
||||
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
|
||||
void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
|
||||
|
||||
virtual void marker(const row_marker& rm) {
|
||||
_had_row_marker = true;
|
||||
@@ -641,8 +606,8 @@ bool should_split(const mutation& m, const per_request_options& options) {
|
||||
cdc::inspect_mutation(m, v);
|
||||
|
||||
return v._result
|
||||
// A mutation with no timestamp will be split into 0 mutations:
|
||||
|| v._ts == api::missing_timestamp;
|
||||
// A mutation with no timestamp will be split into 0 mutations:
|
||||
|| v._ts == api::missing_timestamp;
|
||||
}
|
||||
|
||||
// Returns true if the row state and the atomic and nonatomic entries represent
|
||||
@@ -677,7 +642,7 @@ static bool entries_match_row_state(const schema_ptr& base_schema, const cell_ma
|
||||
if (current_values.size() != update.cells.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
std::unordered_map<sstring_view, bytes> current_values_map;
|
||||
for (const auto& entry : current_values) {
|
||||
const auto attr_name = std::string_view(value_cast<sstring>(entry.first));
|
||||
@@ -746,8 +711,8 @@ bool should_skip(batch& changes, const mutation& base_mutation, change_processor
|
||||
return true;
|
||||
}
|
||||
|
||||
void process_changes_with_splitting(
|
||||
const mutation& base_mutation, change_processor& processor, bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
|
||||
void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
|
||||
bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
|
||||
const auto base_schema = base_mutation.schema();
|
||||
auto changes = extract_changes(base_mutation);
|
||||
auto pk = base_mutation.key();
|
||||
@@ -859,8 +824,8 @@ void process_changes_with_splitting(
|
||||
}
|
||||
}
|
||||
|
||||
void process_changes_without_splitting(
|
||||
const mutation& base_mutation, change_processor& processor, bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
|
||||
void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
|
||||
bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
|
||||
if (alternator_strict_compatibility) {
|
||||
auto changes = extract_changes(base_mutation);
|
||||
if (should_skip(changes.begin()->second, base_mutation, processor)) {
|
||||
@@ -877,7 +842,7 @@ void process_changes_without_splitting(
|
||||
|
||||
one_kind_column_set columns{base_schema->static_columns_count()};
|
||||
if (!p.static_row().empty()) {
|
||||
p.static_row().get().for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
|
||||
p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
|
||||
columns.set(id);
|
||||
});
|
||||
processor.produce_preimage(nullptr, columns);
|
||||
@@ -890,7 +855,7 @@ void process_changes_without_splitting(
|
||||
// Row deleted - include all columns in preimage
|
||||
columns.set(0, base_schema->regular_columns_count(), true);
|
||||
} else {
|
||||
cr.row().cells().for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
|
||||
cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
|
||||
columns.set(id);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -946,7 +946,7 @@ sstables::shared_sstable sstables_task_executor::consume_sstable() {
|
||||
auto sst = _sstables.back();
|
||||
_sstables.pop_back();
|
||||
--_cm._stats.pending_tasks; // from this point on, switch_state(pending|active) works the same way as any other task
|
||||
cmlog.debug("consumed {}", sst->get_filename());
|
||||
cmlog.debug("{}", format("consumed {}", sst->get_filename()));
|
||||
return sst;
|
||||
}
|
||||
|
||||
@@ -1208,6 +1208,7 @@ future<> compaction_manager::await_tasks(std::vector<shared_ptr<compaction_task_
|
||||
|
||||
std::vector<shared_ptr<compaction_task_executor>>
|
||||
compaction_manager::do_stop_ongoing_compactions(sstring reason, std::function<bool(const compaction_group_view*)> filter, std::optional<compaction_type> type_opt) noexcept {
|
||||
auto ongoing_compactions = get_compactions(filter).size();
|
||||
auto tasks = _tasks
|
||||
| std::views::filter([&filter, type_opt] (const auto& task) {
|
||||
return filter(task.compacting_table()) && (!type_opt || task.compaction_type() == *type_opt);
|
||||
@@ -1216,7 +1217,6 @@ compaction_manager::do_stop_ongoing_compactions(sstring reason, std::function<bo
|
||||
| std::ranges::to<std::vector<shared_ptr<compaction_task_executor>>>();
|
||||
logging::log_level level = tasks.empty() ? log_level::debug : log_level::info;
|
||||
if (cmlog.is_enabled(level)) {
|
||||
auto ongoing_compactions = get_compactions(filter).size();
|
||||
std::string scope = "";
|
||||
if (!tasks.empty()) {
|
||||
const compaction_group_view* t = tasks.front()->compacting_table();
|
||||
@@ -1268,15 +1268,9 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
|
||||
if (dsm && (this_shard_id() == 0)) {
|
||||
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
|
||||
if (threshold_reached) {
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = true;
|
||||
return cm.drain();
|
||||
});
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
|
||||
}
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = false;
|
||||
cm.enable();
|
||||
});
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1426,17 +1420,11 @@ protected:
|
||||
compaction_strategy cs = t.get_compaction_strategy();
|
||||
compaction_descriptor descriptor = co_await cs.get_sstables_for_compaction(t, _cm.get_strategy_control());
|
||||
int weight = calculate_weight(descriptor);
|
||||
bool debug_enabled = cmlog.is_enabled(log_level::debug);
|
||||
if (debug_enabled) {
|
||||
cmlog.debug("Started minor compaction sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
|
||||
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
|
||||
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
|
||||
}
|
||||
cmlog.debug("Started minor compaction sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
|
||||
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
|
||||
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
|
||||
|
||||
sstring old_sstables;
|
||||
if (debug_enabled) {
|
||||
old_sstables = ::format("{}", descriptor.sstables);
|
||||
}
|
||||
auto old_sstables = ::format("{}", descriptor.sstables);
|
||||
|
||||
if (descriptor.sstables.empty() || !can_proceed() || t.is_auto_compaction_disabled_by_user()) {
|
||||
cmlog.debug("{}: sstables={} can_proceed={} auto_compaction={}", *this, descriptor.sstables.size(), can_proceed(), t.is_auto_compaction_disabled_by_user());
|
||||
@@ -1466,10 +1454,8 @@ protected:
|
||||
try {
|
||||
bool should_update_history = this->should_update_history(descriptor.options.type());
|
||||
compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, on_replace);
|
||||
if (debug_enabled) {
|
||||
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
|
||||
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
|
||||
}
|
||||
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
|
||||
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
|
||||
finish_compaction();
|
||||
if (should_update_history) {
|
||||
// update_history can take a long time compared to
|
||||
@@ -2362,16 +2348,6 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
|
||||
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
|
||||
}
|
||||
|
||||
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
|
||||
std::exception_ptr ex;
|
||||
if (_in_critical_disk_utilization_mode) {
|
||||
ex = std::make_exception_ptr(std::runtime_error("critical disk utilization"));
|
||||
} else {
|
||||
ex = std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
|
||||
}
|
||||
return ex;
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
|
||||
@@ -2381,7 +2357,8 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
|
||||
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
|
||||
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
|
||||
if (is_disabled()) {
|
||||
co_return coroutine::exception(make_disabled_exception(t));
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
|
||||
"reason might be out of space prevention", sst->get_filename()))));
|
||||
}
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
|
||||
@@ -115,8 +115,6 @@ private:
|
||||
uint32_t _disabled_state_count = 0;
|
||||
|
||||
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
|
||||
// precondition: is_disabled() is true.
|
||||
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
|
||||
|
||||
std::optional<future<>> _stop_future;
|
||||
|
||||
@@ -172,7 +170,6 @@ private:
|
||||
shared_tombstone_gc_state _shared_tombstone_gc_state;
|
||||
|
||||
utils::disk_space_monitor::subscription _out_of_space_subscription;
|
||||
bool _in_critical_disk_utilization_mode = false;
|
||||
private:
|
||||
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
|
||||
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
|
||||
|
||||
@@ -33,10 +33,8 @@ future<compaction_descriptor> leveled_compaction_strategy::get_sstables_for_comp
|
||||
auto candidate = manifest.get_compaction_candidates(*state->last_compacted_keys, state->compaction_counter);
|
||||
|
||||
if (!candidate.sstables.empty()) {
|
||||
if (leveled_manifest::logger.is_enabled(logging::log_level::debug)) {
|
||||
auto main_set = co_await table_s.main_sstable_set();
|
||||
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
|
||||
}
|
||||
auto main_set = co_await table_s.main_sstable_set();
|
||||
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
|
||||
co_return candidate;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
#include "compaction_strategy_state.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
|
||||
#include <seastar/util/lazy.hh>
|
||||
#include <ranges>
|
||||
|
||||
namespace compaction {
|
||||
@@ -29,12 +28,12 @@ time_window_compaction_strategy_state_ptr time_window_compaction_strategy::get_s
|
||||
}
|
||||
|
||||
const std::unordered_map<sstring, std::chrono::seconds> time_window_compaction_strategy_options::valid_window_units = {
|
||||
{"MINUTES", 60s}, {"HOURS", 3600s}, {"DAYS", 86400s}};
|
||||
{ "MINUTES", 60s }, { "HOURS", 3600s }, { "DAYS", 86400s }
|
||||
};
|
||||
|
||||
const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions>
|
||||
time_window_compaction_strategy_options::valid_timestamp_resolutions = {
|
||||
{"MICROSECONDS", timestamp_resolutions::microsecond},
|
||||
{"MILLISECONDS", timestamp_resolutions::millisecond},
|
||||
const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions> time_window_compaction_strategy_options::valid_timestamp_resolutions = {
|
||||
{ "MICROSECONDS", timestamp_resolutions::microsecond },
|
||||
{ "MILLISECONDS", timestamp_resolutions::millisecond },
|
||||
};
|
||||
|
||||
static std::chrono::seconds validate_compaction_window_unit(const std::map<sstring, sstring>& options) {
|
||||
@@ -44,8 +43,7 @@ static std::chrono::seconds validate_compaction_window_unit(const std::map<sstri
|
||||
if (tmp_value) {
|
||||
auto valid_window_units_it = time_window_compaction_strategy_options::valid_window_units.find(tmp_value.value());
|
||||
if (valid_window_units_it == time_window_compaction_strategy_options::valid_window_units.end()) {
|
||||
throw exceptions::configuration_exception(
|
||||
fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
|
||||
throw exceptions::configuration_exception(fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
|
||||
}
|
||||
window_unit = valid_window_units_it->second;
|
||||
}
|
||||
@@ -61,12 +59,10 @@ static std::chrono::seconds validate_compaction_window_unit(const std::map<sstri
|
||||
|
||||
static int validate_compaction_window_size(const std::map<sstring, sstring>& options) {
|
||||
auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY);
|
||||
int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value,
|
||||
time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
|
||||
int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value, time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
|
||||
|
||||
if (window_size <= 0) {
|
||||
throw exceptions::configuration_exception(
|
||||
fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
|
||||
throw exceptions::configuration_exception(fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
|
||||
}
|
||||
|
||||
return window_size;
|
||||
@@ -86,30 +82,26 @@ static db_clock::duration validate_expired_sstable_check_frequency_seconds(const
|
||||
try {
|
||||
expired_sstable_check_frequency = std::chrono::seconds(std::stol(tmp_value.value()));
|
||||
} catch (const std::exception& e) {
|
||||
throw exceptions::syntax_exception(fmt::format(
|
||||
"Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
|
||||
throw exceptions::syntax_exception(fmt::format("Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
|
||||
}
|
||||
}
|
||||
|
||||
return expired_sstable_check_frequency;
|
||||
}
|
||||
|
||||
static db_clock::duration validate_expired_sstable_check_frequency_seconds(
|
||||
const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
|
||||
static db_clock::duration validate_expired_sstable_check_frequency_seconds(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
|
||||
db_clock::duration expired_sstable_check_frequency = validate_expired_sstable_check_frequency_seconds(options);
|
||||
unchecked_options.erase(time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
|
||||
return expired_sstable_check_frequency;
|
||||
}
|
||||
|
||||
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options) {
|
||||
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution =
|
||||
time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
|
||||
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
|
||||
|
||||
auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
|
||||
if (tmp_value) {
|
||||
if (!time_window_compaction_strategy_options::valid_timestamp_resolutions.contains(tmp_value.value())) {
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
|
||||
throw exceptions::configuration_exception(fmt::format("Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
|
||||
} else {
|
||||
timestamp_resolution = time_window_compaction_strategy_options::valid_timestamp_resolutions.at(tmp_value.value());
|
||||
}
|
||||
@@ -118,8 +110,7 @@ static time_window_compaction_strategy_options::timestamp_resolutions validate_t
|
||||
return timestamp_resolution;
|
||||
}
|
||||
|
||||
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(
|
||||
const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
|
||||
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
|
||||
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = validate_timestamp_resolution(options);
|
||||
unchecked_options.erase(time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
|
||||
return timestamp_resolution;
|
||||
@@ -154,7 +145,7 @@ void time_window_compaction_strategy_options::validate(const std::map<sstring, s
|
||||
compaction_strategy_impl::validate_min_max_threshold(options, unchecked_options);
|
||||
|
||||
auto it = options.find("enable_optimized_twcs_queries");
|
||||
if (it != options.end() && it->second != "true" && it->second != "false") {
|
||||
if (it != options.end() && it->second != "true" && it->second != "false") {
|
||||
throw exceptions::configuration_exception(fmt::format("enable_optimized_twcs_queries value ({}) must be \"true\" or \"false\"", it->second));
|
||||
}
|
||||
unchecked_options.erase("enable_optimized_twcs_queries");
|
||||
@@ -171,9 +162,7 @@ class classify_by_timestamp {
|
||||
std::vector<int64_t> _known_windows;
|
||||
|
||||
public:
|
||||
explicit classify_by_timestamp(time_window_compaction_strategy_options options)
|
||||
: _options(std::move(options)) {
|
||||
}
|
||||
explicit classify_by_timestamp(time_window_compaction_strategy_options options) : _options(std::move(options)) { }
|
||||
int64_t operator()(api::timestamp_type ts) {
|
||||
const auto window = time_window_compaction_strategy::get_window_for(_options, ts);
|
||||
if (const auto it = std::ranges::find(_known_windows, window); it != _known_windows.end()) {
|
||||
@@ -201,7 +190,7 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
|
||||
auto estimated_window_count = max_data_segregation_window_count;
|
||||
auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live());
|
||||
bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
|
||||
auto estimate_window_count = [this](timestamp_type min_window, timestamp_type max_window) {
|
||||
auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) {
|
||||
const auto window_size = get_window_size(_options);
|
||||
return (max_window + (window_size - 1) - min_window) / window_size;
|
||||
};
|
||||
@@ -221,19 +210,21 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
|
||||
return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
|
||||
}
|
||||
|
||||
mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(
|
||||
const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
|
||||
if (ms_meta.min_timestamp && ms_meta.max_timestamp &&
|
||||
get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
|
||||
mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
|
||||
if (ms_meta.min_timestamp && ms_meta.max_timestamp
|
||||
&& get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
|
||||
return end_consumer;
|
||||
}
|
||||
return [options = _options, end_consumer = std::move(end_consumer)](mutation_reader rd) mutable -> future<> {
|
||||
return mutation_writer::segregate_by_timestamp(std::move(rd), classify_by_timestamp(std::move(options)), end_consumer);
|
||||
return [options = _options, end_consumer = std::move(end_consumer)] (mutation_reader rd) mutable -> future<> {
|
||||
return mutation_writer::segregate_by_timestamp(
|
||||
std::move(rd),
|
||||
classify_by_timestamp(std::move(options)),
|
||||
end_consumer);
|
||||
};
|
||||
}
|
||||
|
||||
compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
|
||||
std::vector<sstables::shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
|
||||
compaction_descriptor
|
||||
time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
|
||||
auto mode = cfg.mode;
|
||||
std::vector<sstables::shared_sstable> single_window;
|
||||
std::vector<sstables::shared_sstable> multi_window;
|
||||
@@ -248,7 +239,7 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
|
||||
|
||||
// Sort input sstables by first_key order
|
||||
// to allow efficient reshaping of disjoint sstables.
|
||||
std::sort(input.begin(), input.end(), [&schema](const sstables::shared_sstable& a, const sstables::shared_sstable& b) {
|
||||
std::sort(input.begin(), input.end(), [&schema] (const sstables::shared_sstable& a, const sstables::shared_sstable& b) {
|
||||
return dht::ring_position(a->get_first_decorated_key()).less_compare(*schema, dht::ring_position(b->get_first_decorated_key()));
|
||||
});
|
||||
|
||||
@@ -262,34 +253,31 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
|
||||
}
|
||||
}
|
||||
|
||||
auto is_disjoint = [&schema, mode, max_sstables](const std::vector<sstables::shared_sstable>& ssts) {
|
||||
auto is_disjoint = [&schema, mode, max_sstables] (const std::vector<sstables::shared_sstable>& ssts) {
|
||||
size_t tolerance = (mode == reshape_mode::relaxed) ? max_sstables : 0;
|
||||
return sstable_set_overlapping_count(schema, ssts) <= tolerance;
|
||||
};
|
||||
|
||||
clogger.debug("time_window_compaction_strategy::get_reshaping_job: offstrategy_threshold={} max_sstables={} multi_window={} disjoint={} "
|
||||
"single_window={} disjoint={}",
|
||||
offstrategy_threshold, max_sstables, multi_window.size(), seastar::value_of([&] {
|
||||
return !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0;
|
||||
}),
|
||||
single_window.size(), seastar::value_of([&] {
|
||||
return !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0;
|
||||
}));
|
||||
clogger.debug("time_window_compaction_strategy::get_reshaping_job: offstrategy_threshold={} max_sstables={} multi_window={} disjoint={} single_window={} disjoint={}",
|
||||
offstrategy_threshold, max_sstables,
|
||||
multi_window.size(), !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0,
|
||||
single_window.size(), !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0);
|
||||
|
||||
auto get_job_size = [](const std::vector<sstables::shared_sstable>& ssts) {
|
||||
auto get_job_size = [] (const std::vector<sstables::shared_sstable>& ssts) {
|
||||
return std::ranges::fold_left(ssts | std::views::transform(std::mem_fn(&sstables::sstable::bytes_on_disk)), uint64_t(0), std::plus{});
|
||||
};
|
||||
|
||||
// Targets a space overhead of 10%. All disjoint sstables can be compacted together as long as they won't
|
||||
// cause an overhead above target. Otherwise, the job targets a maximum of #max_threshold sstables.
|
||||
auto need_trimming = [&](const std::vector<sstables::shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
|
||||
auto need_trimming = [&] (const std::vector<sstables::shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
|
||||
const size_t min_sstables = 2;
|
||||
auto is_above_target_size = job_size > target_job_size;
|
||||
|
||||
return (ssts.size() > max_sstables && !is_disjoint) || (ssts.size() > min_sstables && is_above_target_size);
|
||||
return (ssts.size() > max_sstables && !is_disjoint) ||
|
||||
(ssts.size() > min_sstables && is_above_target_size);
|
||||
};
|
||||
|
||||
auto maybe_trim_job = [&need_trimming](std::vector<sstables::shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
|
||||
auto maybe_trim_job = [&need_trimming] (std::vector<sstables::shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
|
||||
while (need_trimming(ssts, job_size, is_disjoint)) {
|
||||
auto sst = ssts.back();
|
||||
ssts.pop_back();
|
||||
@@ -306,7 +294,7 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
|
||||
// For example, if there are N sstables spanning window W, where N <= 32, then we can produce all data for W
|
||||
// in a single compaction round, removing the need to later compact W to reduce its number of files.
|
||||
auto sort_size = std::min(max_sstables, multi_window.size());
|
||||
std::ranges::partial_sort(multi_window, multi_window.begin() + sort_size, std::ranges::less(), [](const sstables::shared_sstable& a) {
|
||||
std::ranges::partial_sort(multi_window, multi_window.begin() + sort_size, std::ranges::less(), [] (const sstables::shared_sstable &a) {
|
||||
return a->get_stats_metadata().max_timestamp;
|
||||
});
|
||||
maybe_trim_job(multi_window, job_size, disjoint);
|
||||
@@ -346,7 +334,8 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
|
||||
return compaction_descriptor();
|
||||
}
|
||||
|
||||
future<compaction_descriptor> time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
|
||||
future<compaction_descriptor>
|
||||
time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
|
||||
auto state = get_state(table_s);
|
||||
auto compaction_time = gc_clock::now();
|
||||
auto candidates = co_await control.candidates(table_s);
|
||||
@@ -380,8 +369,10 @@ future<compaction_descriptor> time_window_compaction_strategy::get_sstables_for_
|
||||
co_return compaction_descriptor(std::move(compaction_candidates));
|
||||
}
|
||||
|
||||
time_window_compaction_strategy::bucket_compaction_mode time_window_compaction_strategy::compaction_mode(
|
||||
const time_window_compaction_strategy_state& state, const bucket_t& bucket, timestamp_type bucket_key, timestamp_type now, size_t min_threshold) const {
|
||||
time_window_compaction_strategy::bucket_compaction_mode
|
||||
time_window_compaction_strategy::compaction_mode(const time_window_compaction_strategy_state& state,
|
||||
const bucket_t& bucket, timestamp_type bucket_key,
|
||||
timestamp_type now, size_t min_threshold) const {
|
||||
// STCS will also be performed on older window buckets, to avoid a bad write and
|
||||
// space amplification when something like read repair cause small updates to
|
||||
// those past windows.
|
||||
@@ -394,7 +385,8 @@ time_window_compaction_strategy::bucket_compaction_mode time_window_compaction_s
|
||||
return bucket_compaction_mode::none;
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
|
||||
std::vector<sstables::shared_sstable>
|
||||
time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
|
||||
std::vector<sstables::shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time, time_window_compaction_strategy_state& state) {
|
||||
auto most_interesting = get_compaction_candidates(table_s, control, non_expiring_sstables, state);
|
||||
|
||||
@@ -408,29 +400,31 @@ std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_next_
|
||||
|
||||
// if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
|
||||
// ratio is greater than threshold.
|
||||
std::erase_if(non_expiring_sstables, [this, compaction_time, &table_s](const sstables::shared_sstable& sst) -> bool {
|
||||
std::erase_if(non_expiring_sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
|
||||
return !worth_dropping_tombstones(sst, compaction_time, table_s);
|
||||
});
|
||||
if (non_expiring_sstables.empty()) {
|
||||
return {};
|
||||
}
|
||||
auto it = std::ranges::min_element(non_expiring_sstables, [](auto& i, auto& j) {
|
||||
auto it = std::ranges::min_element(non_expiring_sstables, [] (auto& i, auto& j) {
|
||||
return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
|
||||
});
|
||||
return {*it};
|
||||
return { *it };
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
|
||||
std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
|
||||
std::vector<sstables::shared_sstable>
|
||||
time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
|
||||
std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
|
||||
auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
|
||||
// Update the highest window seen, if necessary
|
||||
state.highest_window_seen = std::max(state.highest_window_seen, max_timestamp);
|
||||
|
||||
return newest_bucket(table_s, control, std::move(buckets), table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold(),
|
||||
state.highest_window_seen, state);
|
||||
state.highest_window_seen, state);
|
||||
}
|
||||
|
||||
timestamp_type time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
|
||||
timestamp_type
|
||||
time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
|
||||
using namespace std::chrono;
|
||||
// mask out window size from timestamp to get lower bound of its window
|
||||
auto num_windows = microseconds(timestamp) / sstable_window_size;
|
||||
@@ -438,8 +432,8 @@ timestamp_type time_window_compaction_strategy::get_window_lower_bound(std::chro
|
||||
return duration_cast<microseconds>(num_windows * sstable_window_size).count();
|
||||
}
|
||||
|
||||
std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, timestamp_type> time_window_compaction_strategy::get_buckets(
|
||||
std::vector<sstables::shared_sstable> files, const time_window_compaction_strategy_options& options) {
|
||||
std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, timestamp_type>
|
||||
time_window_compaction_strategy::get_buckets(std::vector<sstables::shared_sstable> files, const time_window_compaction_strategy_options& options) {
|
||||
std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets;
|
||||
|
||||
timestamp_type max_timestamp = 0;
|
||||
@@ -456,13 +450,11 @@ std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, times
|
||||
return std::make_pair(std::move(buckets), max_timestamp);
|
||||
}
|
||||
|
||||
} // namespace compaction
|
||||
}
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<std::map<compaction::timestamp_type, std::vector<sstables::shared_sstable>>> {
|
||||
constexpr auto parse(format_parse_context& ctx) {
|
||||
return ctx.begin();
|
||||
}
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const std::map<compaction::timestamp_type, std::vector<sstables::shared_sstable>>& buckets, fmt::format_context& ctx) const {
|
||||
auto out = fmt::format_to(ctx.out(), " buckets = {{\n");
|
||||
for (auto& [timestamp, sstables] : buckets | std::views::reverse) {
|
||||
@@ -474,9 +466,9 @@ struct fmt::formatter<std::map<compaction::timestamp_type, std::vector<sstables:
|
||||
|
||||
namespace compaction {
|
||||
|
||||
std::vector<sstables::shared_sstable> time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control,
|
||||
std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets, int min_threshold, int max_threshold, timestamp_type now,
|
||||
time_window_compaction_strategy_state& state) {
|
||||
std::vector<sstables::shared_sstable>
|
||||
time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets,
|
||||
int min_threshold, int max_threshold, timestamp_type now, time_window_compaction_strategy_state& state) {
|
||||
clogger.debug("time_window_compaction_strategy::newest_bucket:\n now {}\n{}", now, buckets);
|
||||
|
||||
for (auto&& [key, bucket] : buckets | std::views::reverse) {
|
||||
@@ -517,7 +509,8 @@ std::vector<sstables::shared_sstable> time_window_compaction_strategy::newest_bu
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable> time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold) {
|
||||
std::vector<sstables::shared_sstable>
|
||||
time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold) {
|
||||
auto n = std::min(bucket.size(), size_t(max_threshold));
|
||||
// Trim the largest sstables off the end to meet the maxThreshold
|
||||
std::ranges::partial_sort(bucket, bucket.begin() + n, std::ranges::less(), std::mem_fn(&sstables::sstable::ondisk_data_size));
|
||||
@@ -549,8 +542,8 @@ future<int64_t> time_window_compaction_strategy::estimated_pending_compactions(c
|
||||
co_return n;
|
||||
}
|
||||
|
||||
std::vector<compaction_descriptor> time_window_compaction_strategy::get_cleanup_compaction_jobs(
|
||||
compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
|
||||
std::vector<compaction_descriptor>
|
||||
time_window_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
|
||||
std::vector<compaction_descriptor> ret;
|
||||
for (auto&& [_, sstables] : get_buckets(std::move(candidates), _options).first) {
|
||||
auto per_window_jobs = size_tiered_compaction_strategy(_stcs_options).get_cleanup_compaction_jobs(table_s, std::move(sstables));
|
||||
@@ -563,4 +556,4 @@ std::unique_ptr<sstables::sstable_set_impl> time_window_compaction_strategy::mak
|
||||
return std::make_unique<sstables::time_series_sstable_set>(ts.schema(), _options.enable_optimized_twcs_queries);
|
||||
}
|
||||
|
||||
} // namespace compaction
|
||||
}
|
||||
|
||||
@@ -397,17 +397,6 @@ commitlog_total_space_in_mb: -1
|
||||
# you can cache more hot rows
|
||||
# column_index_size_in_kb: 64
|
||||
|
||||
# sstable format version for newly written sstables.
|
||||
# Currently allowed values are `me` and `ms`.
|
||||
# If not specified in the config, this defaults to `me`.
|
||||
#
|
||||
# The difference between `me` and `ms` are the data structures used
|
||||
# in the primary index.
|
||||
# In short, `ms` needs more CPU during sstable writes,
|
||||
# but should behave better during reads,
|
||||
# although it might behave worse for very long clustering keys.
|
||||
sstable_format: ms
|
||||
|
||||
# Auto-scaling of the promoted index prevents running out of memory
|
||||
# when the promoted index grows too large (due to partitions with many rows
|
||||
# vs. too small column_index_size_in_kb). When the serialized representation
|
||||
|
||||
@@ -896,9 +896,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'replica/multishard_query.cc',
|
||||
'replica/mutation_dump.cc',
|
||||
'replica/querier.cc',
|
||||
'replica/logstor/segment_manager.cc',
|
||||
'replica/logstor/logstor.cc',
|
||||
'replica/logstor/write_buffer.cc',
|
||||
'mutation/atomic_cell.cc',
|
||||
'mutation/canonical_mutation.cc',
|
||||
'mutation/frozen_mutation.cc',
|
||||
@@ -1470,7 +1467,6 @@ idls = ['idl/gossip_digest.idl.hh',
|
||||
'idl/query.idl.hh',
|
||||
'idl/idl_test.idl.hh',
|
||||
'idl/commitlog.idl.hh',
|
||||
'idl/logstor.idl.hh',
|
||||
'idl/tracing.idl.hh',
|
||||
'idl/consistency_level.idl.hh',
|
||||
'idl/cache_temperature.idl.hh',
|
||||
|
||||
@@ -48,15 +48,13 @@ const sstring query_processor::CQL_VERSION = "3.3.1";
|
||||
const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
|
||||
|
||||
struct query_processor::remote {
|
||||
remote(service::migration_manager& mm, service::mapreduce_service& fwd, service::storage_service& ss, service::raft_group0_client& group0_client,
|
||||
service::strong_consistency::coordinator& _sc_coordinator)
|
||||
: mm(mm)
|
||||
, mapreducer(fwd)
|
||||
, ss(ss)
|
||||
, group0_client(group0_client)
|
||||
, sc_coordinator(_sc_coordinator)
|
||||
, gate("query_processor::remote") {
|
||||
}
|
||||
remote(service::migration_manager& mm, service::mapreduce_service& fwd,
|
||||
service::storage_service& ss, service::raft_group0_client& group0_client,
|
||||
service::strong_consistency::coordinator& _sc_coordinator)
|
||||
: mm(mm), mapreducer(fwd), ss(ss), group0_client(group0_client)
|
||||
, sc_coordinator(_sc_coordinator)
|
||||
, gate("query_processor::remote")
|
||||
{}
|
||||
|
||||
service::migration_manager& mm;
|
||||
service::mapreduce_service& mapreducer;
|
||||
@@ -79,34 +77,24 @@ static service::query_state query_state_for_internal_call() {
|
||||
return {service::client_state::for_internal_calls(), empty_service_permit()};
|
||||
}
|
||||
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn,
|
||||
vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg,
|
||||
lang::manager& langm)
|
||||
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
|
||||
, _proxy(proxy)
|
||||
, _db(db)
|
||||
, _mnotifier(mn)
|
||||
, _vector_store_client(vsc)
|
||||
, _mcfg(mcfg)
|
||||
, _cql_config(cql_cfg)
|
||||
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
|
||||
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
|
||||
, _auth_prepared_cache_cfg_cb([this](uint32_t) {
|
||||
(void)_authorized_prepared_cache_config_action.trigger_later();
|
||||
})
|
||||
, _authorized_prepared_cache_config_action([this] {
|
||||
update_authorized_prepared_cache_config();
|
||||
return make_ready_future<>();
|
||||
})
|
||||
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
, _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
, _lang_manager(langm)
|
||||
, _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) {
|
||||
_write_consistency_levels_warned = to_consistency_level_set(v);
|
||||
}))
|
||||
, _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) {
|
||||
_write_consistency_levels_disallowed = to_consistency_level_set(v);
|
||||
})) {
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
|
||||
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
|
||||
, _proxy(proxy)
|
||||
, _db(db)
|
||||
, _mnotifier(mn)
|
||||
, _vector_store_client(vsc)
|
||||
, _mcfg(mcfg)
|
||||
, _cql_config(cql_cfg)
|
||||
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
|
||||
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
|
||||
, _auth_prepared_cache_cfg_cb([this] (uint32_t) { (void) _authorized_prepared_cache_config_action.trigger_later(); })
|
||||
, _authorized_prepared_cache_config_action([this] { update_authorized_prepared_cache_config(); return make_ready_future<>(); })
|
||||
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
, _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
, _lang_manager(langm)
|
||||
, _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) { _write_consistency_levels_warned = to_consistency_level_set(v); }))
|
||||
, _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) { _write_consistency_levels_disallowed = to_consistency_level_set(v); }))
|
||||
{
|
||||
_write_consistency_levels_warned = to_consistency_level_set(_db.get_config().write_consistency_levels_warned());
|
||||
_write_consistency_levels_disallowed = to_consistency_level_set(_db.get_config().write_consistency_levels_disallowed());
|
||||
namespace sm = seastar::metrics;
|
||||
@@ -114,7 +102,7 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
using clevel = db::consistency_level;
|
||||
sm::label cl_label("consistency_level");
|
||||
|
||||
sm::label who_label("who"); // Who queried system tables
|
||||
sm::label who_label("who"); // Who queried system tables
|
||||
const auto user_who_label_instance = who_label("user");
|
||||
const auto internal_who_label_instance = who_label("internal");
|
||||
|
||||
@@ -122,11 +110,17 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
const auto system_ks_label_instance = ks_label("system");
|
||||
|
||||
std::vector<sm::metric_definition> qp_group;
|
||||
qp_group.push_back(sm::make_counter("statements_prepared", _stats.prepare_invocations, sm::description("Counts the total number of parsed CQL requests.")));
|
||||
qp_group.push_back(sm::make_counter(
|
||||
"statements_prepared",
|
||||
_stats.prepare_invocations,
|
||||
sm::description("Counts the total number of parsed CQL requests.")));
|
||||
for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
|
||||
qp_group.push_back(sm::make_counter(
|
||||
"queries", _stats.queries_by_cl[cl], sm::description("Counts queries by consistency level."), {cl_label(clevel(cl)), basic_level})
|
||||
.set_skip_when_empty());
|
||||
qp_group.push_back(
|
||||
sm::make_counter(
|
||||
"queries",
|
||||
_stats.queries_by_cl[cl],
|
||||
sm::description("Counts queries by consistency level."),
|
||||
{cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
|
||||
}
|
||||
_metrics.add_group("query_processor", qp_group);
|
||||
|
||||
@@ -527,23 +521,29 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
|
||||
std::vector<sm::metric_definition> cql_cl_group;
|
||||
for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
|
||||
cql_cl_group.push_back(sm::make_counter("writes_per_consistency_level", _cql_stats.writes_per_consistency_level[cl],
|
||||
sm::description("Counts the number of writes for each consistency level."), {cl_label(clevel(cl)), basic_level})
|
||||
.set_skip_when_empty());
|
||||
cql_cl_group.push_back(
|
||||
sm::make_counter(
|
||||
"writes_per_consistency_level",
|
||||
_cql_stats.writes_per_consistency_level[cl],
|
||||
sm::description("Counts the number of writes for each consistency level."),
|
||||
{cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
|
||||
}
|
||||
_metrics.add_group("cql", cql_cl_group);
|
||||
|
||||
_metrics.add_group(
|
||||
"cql", {
|
||||
sm::make_counter("write_consistency_levels_disallowed_violations", _cql_stats.write_consistency_levels_disallowed_violations,
|
||||
sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
|
||||
"i.e. attempts to write with a forbidden consistency level."),
|
||||
{basic_level}),
|
||||
sm::make_counter("write_consistency_levels_warned_violations", _cql_stats.write_consistency_levels_warned_violations,
|
||||
sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
|
||||
"i.e. attempts to write with a discouraged consistency level."),
|
||||
{basic_level}),
|
||||
});
|
||||
_metrics.add_group("cql", {
|
||||
sm::make_counter(
|
||||
"write_consistency_levels_disallowed_violations",
|
||||
_cql_stats.write_consistency_levels_disallowed_violations,
|
||||
sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
|
||||
"i.e. attempts to write with a forbidden consistency level."),
|
||||
{basic_level}),
|
||||
sm::make_counter(
|
||||
"write_consistency_levels_warned_violations",
|
||||
_cql_stats.write_consistency_levels_warned_violations,
|
||||
sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
|
||||
"i.e. attempts to write with a discouraged consistency level."),
|
||||
{basic_level}),
|
||||
});
|
||||
|
||||
_mnotifier.register_listener(_migration_subscriber.get());
|
||||
}
|
||||
@@ -554,13 +554,15 @@ query_processor::~query_processor() {
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder> query_processor::acquire_strongly_consistent_coordinator() {
|
||||
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
|
||||
query_processor::acquire_strongly_consistent_coordinator() {
|
||||
auto [remote_, holder] = remote();
|
||||
return {remote_.get().sc_coordinator, std::move(holder)};
|
||||
}
|
||||
|
||||
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer, service::storage_service& ss,
|
||||
service::raft_group0_client& group0_client, service::strong_consistency::coordinator& sc_coordinator) {
|
||||
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer,
|
||||
service::storage_service& ss, service::raft_group0_client& group0_client,
|
||||
service::strong_consistency::coordinator& sc_coordinator) {
|
||||
_remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client, sc_coordinator);
|
||||
}
|
||||
|
||||
@@ -580,9 +582,7 @@ future<> query_processor::stop() {
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> query_processor::execute_with_guard(
|
||||
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(
|
||||
service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)>
|
||||
fn,
|
||||
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)> fn,
|
||||
::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options) {
|
||||
// execute all statements that need group0 guard on shard0
|
||||
if (this_shard_id() != 0) {
|
||||
@@ -591,13 +591,13 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
|
||||
|
||||
auto [remote_, holder] = remote();
|
||||
size_t retries = remote_.get().mm.get_concurrent_ddl_retries();
|
||||
while (true) {
|
||||
while (true) {
|
||||
try {
|
||||
auto guard = co_await remote_.get().mm.start_group0_operation();
|
||||
co_return co_await fn(query_state, statement, options, std::move(guard));
|
||||
} catch (const service::group0_concurrent_modification& ex) {
|
||||
log.warn("Failed to execute statement \"{}\" due to guard conflict.{}.", statement->raw_cql_statement,
|
||||
retries ? " Retrying" : " Number of retries exceeded, giving up");
|
||||
log.warn("Failed to execute statement \"{}\" due to guard conflict.{}.",
|
||||
statement->raw_cql_statement, retries ? " Retrying" : " Number of retries exceeded, giving up");
|
||||
if (retries--) {
|
||||
continue;
|
||||
}
|
||||
@@ -606,30 +606,29 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
future<::shared_ptr<result_message>> query_processor::execute_maybe_with_guard(service::query_state& query_state, ::shared_ptr<cql_statement> statement,
|
||||
const query_options& options,
|
||||
future<::shared_ptr<result_message>> (query_processor::*fn)(
|
||||
service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>, Args...),
|
||||
Args... args) {
|
||||
template<typename... Args>
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::execute_maybe_with_guard(service::query_state& query_state, ::shared_ptr<cql_statement> statement, const query_options& options,
|
||||
future<::shared_ptr<result_message>>(query_processor::*fn)(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>, Args...), Args... args) {
|
||||
if (!statement->needs_guard(*this, query_state)) {
|
||||
return (this->*fn)(query_state, std::move(statement), options, std::nullopt, std::forward<Args>(args)...);
|
||||
}
|
||||
static auto exec = [fn](query_processor& qp, Args... args, service::query_state& query_state, ::shared_ptr<cql_statement> statement,
|
||||
const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
static auto exec = [fn] (query_processor& qp, Args... args, service::query_state& query_state, ::shared_ptr<cql_statement> statement, const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
return (qp.*fn)(query_state, std::move(statement), options, std::move(guard), std::forward<Args>(args)...);
|
||||
};
|
||||
return execute_with_guard(std::bind_front(exec, std::ref(*this), std::forward<Args>(args)...), std::move(statement), query_state, options);
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>> query_processor::execute_direct_without_checking_exception_message(
|
||||
const std::string_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::execute_direct_without_checking_exception_message(const std::string_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
|
||||
log.trace("execute_direct: \"{}\"", query_string);
|
||||
tracing::trace(query_state.get_trace_state(), "Parsing a statement");
|
||||
auto p = get_statement(query_string, query_state.get_client_state(), d);
|
||||
auto statement = p->statement;
|
||||
if (statement->get_bound_terms() != options.get_values_count()) {
|
||||
const auto msg = format("Invalid amount of bind variables: expected {:d} received {:d}", statement->get_bound_terms(), options.get_values_count());
|
||||
const auto msg = format("Invalid amount of bind variables: expected {:d} received {:d}",
|
||||
statement->get_bound_terms(),
|
||||
options.get_values_count());
|
||||
throw exceptions::invalid_request_exception(msg);
|
||||
}
|
||||
options.prepare(p->bound_names);
|
||||
@@ -640,13 +639,17 @@ future<::shared_ptr<result_message>> query_processor::execute_direct_without_che
|
||||
metrics.regularStatementsExecuted.inc();
|
||||
#endif
|
||||
auto user = query_state.get_client_state().user();
|
||||
tracing::trace(query_state.get_trace_state(), "Processing a statement for authenticated user: {}",
|
||||
user ? (user->name ? *user->name : "anonymous") : "no user authenticated");
|
||||
tracing::trace(query_state.get_trace_state(), "Processing a statement for authenticated user: {}", user ? (user->name ? *user->name : "anonymous") : "no user authenticated");
|
||||
return execute_maybe_with_guard(query_state, std::move(statement), options, &query_processor::do_execute_direct, std::move(p->warnings));
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>> query_processor::do_execute_direct(service::query_state& query_state, shared_ptr<cql_statement> statement,
|
||||
const query_options& options, std::optional<service::group0_guard> guard, cql3::cql_warnings_vec warnings) {
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::do_execute_direct(
|
||||
service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement,
|
||||
const query_options& options,
|
||||
std::optional<service::group0_guard> guard,
|
||||
cql3::cql_warnings_vec warnings) {
|
||||
auto access_future = co_await coroutine::as_future(statement->check_access(*this, query_state.get_client_state()));
|
||||
if (access_future.failed()) {
|
||||
co_await audit::inspect(statement, query_state, options, true);
|
||||
@@ -671,16 +674,26 @@ future<::shared_ptr<result_message>> query_processor::do_execute_direct(service:
|
||||
co_return std::move(m);
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>> query_processor::execute_prepared_without_checking_exception_message(service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement, const query_options& options, statements::prepared_statement::checked_weak_ptr prepared,
|
||||
cql3::prepared_cache_key_type cache_key, bool needs_authorization) {
|
||||
return execute_maybe_with_guard(
|
||||
query_state, std::move(statement), options, &query_processor::do_execute_prepared, std::move(prepared), std::move(cache_key), needs_authorization);
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::execute_prepared_without_checking_exception_message(
|
||||
service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement,
|
||||
const query_options& options,
|
||||
statements::prepared_statement::checked_weak_ptr prepared,
|
||||
cql3::prepared_cache_key_type cache_key,
|
||||
bool needs_authorization) {
|
||||
return execute_maybe_with_guard(query_state, std::move(statement), options, &query_processor::do_execute_prepared, std::move(prepared), std::move(cache_key), needs_authorization);
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>> query_processor::do_execute_prepared(service::query_state& query_state, shared_ptr<cql_statement> statement,
|
||||
const query_options& options, std::optional<service::group0_guard> guard, statements::prepared_statement::checked_weak_ptr prepared,
|
||||
cql3::prepared_cache_key_type cache_key, bool needs_authorization) {
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::do_execute_prepared(
|
||||
service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement,
|
||||
const query_options& options,
|
||||
std::optional<service::group0_guard> guard,
|
||||
statements::prepared_statement::checked_weak_ptr prepared,
|
||||
cql3::prepared_cache_key_type cache_key,
|
||||
bool needs_authorization) {
|
||||
if (needs_authorization) {
|
||||
co_await statement->check_access(*this, query_state.get_client_state());
|
||||
try {
|
||||
@@ -694,8 +707,8 @@ future<::shared_ptr<result_message>> query_processor::do_execute_prepared(servic
|
||||
co_return co_await process_authorized_statement(std::move(statement), query_state, options, std::move(guard));
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>> query_processor::process_authorized_statement(const ::shared_ptr<cql_statement> statement,
|
||||
service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::process_authorized_statement(const ::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
auto& client_state = query_state.get_client_state();
|
||||
|
||||
++_stats.queries_by_cl[size_t(options.get_consistency())];
|
||||
@@ -705,39 +718,43 @@ future<::shared_ptr<result_message>> query_processor::process_authorized_stateme
|
||||
auto msg = co_await statement->execute_without_checking_exception_message(*this, query_state, options, std::move(guard));
|
||||
|
||||
if (msg) {
|
||||
co_return std::move(msg);
|
||||
co_return std::move(msg);
|
||||
}
|
||||
co_return ::make_shared<result_message::void_message>();
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>> query_processor::prepare(
|
||||
sstring query_string, service::query_state& query_state, cql3::dialect d) {
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(sstring query_string, service::query_state& query_state, cql3::dialect d) {
|
||||
auto& client_state = query_state.get_client_state();
|
||||
return prepare(std::move(query_string), client_state, d);
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>> query_processor::prepare(
|
||||
sstring query_string, const service::client_state& client_state, cql3::dialect d) {
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
|
||||
try {
|
||||
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
|
||||
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
|
||||
auto prepared = get_statement(query_string, client_state, d);
|
||||
prepared->calculate_metadata_id();
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}", bound_terms, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
throwing_assert(bound_terms == prepared->bound_names.size());
|
||||
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("query_processor_prepare_wait_after_cache_get", utils::wait_for_message(std::chrono::seconds(60)));
|
||||
auto prepared = get_statement(query_string, client_state, d);
|
||||
prepared->calculate_metadata_id();
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}",
|
||||
bound_terms,
|
||||
std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
throwing_assert(bound_terms == prepared->bound_names.size());
|
||||
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject(
|
||||
"query_processor_prepare_wait_after_cache_get",
|
||||
utils::wait_for_message(std::chrono::seconds(60)));
|
||||
|
||||
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
|
||||
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
|
||||
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
|
||||
co_return std::move(msg);
|
||||
} catch (typename prepared_statements_cache::statement_is_too_big&) {
|
||||
} catch(typename prepared_statements_cache::statement_is_too_big&) {
|
||||
throw prepared_statement_is_too_big(query_string);
|
||||
}
|
||||
}
|
||||
@@ -748,11 +765,15 @@ static std::string hash_target(std::string_view query_string, std::string_view k
|
||||
return ret;
|
||||
}
|
||||
|
||||
prepared_cache_key_type query_processor::compute_id(std::string_view query_string, std::string_view keyspace, dialect d) {
|
||||
prepared_cache_key_type query_processor::compute_id(
|
||||
std::string_view query_string,
|
||||
std::string_view keyspace,
|
||||
dialect d) {
|
||||
return prepared_cache_key_type(md5_hasher::calculate(hash_target(query_string, keyspace)), d);
|
||||
}
|
||||
|
||||
std::unique_ptr<prepared_statement> query_processor::get_statement(const std::string_view& query, const service::client_state& client_state, dialect d) {
|
||||
std::unique_ptr<prepared_statement>
|
||||
query_processor::get_statement(const std::string_view& query, const service::client_state& client_state, dialect d) {
|
||||
// Measuring allocation cost requires that no yield points exist
|
||||
// between bytes_before and bytes_after. It needs fixing if this
|
||||
// function is ever futurized.
|
||||
@@ -777,7 +798,8 @@ std::unique_ptr<prepared_statement> query_processor::get_statement(const std::st
|
||||
return p;
|
||||
}
|
||||
|
||||
std::unique_ptr<raw::parsed_statement> query_processor::parse_statement(const std::string_view& query, dialect d) {
|
||||
std::unique_ptr<raw::parsed_statement>
|
||||
query_processor::parse_statement(const std::string_view& query, dialect d) {
|
||||
try {
|
||||
{
|
||||
const char* error_injection_key = "query_processor-parse_statement-test_failure";
|
||||
@@ -802,7 +824,8 @@ std::unique_ptr<raw::parsed_statement> query_processor::parse_statement(const st
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<raw::parsed_statement>> query_processor::parse_statements(std::string_view queries, dialect d) {
|
||||
std::vector<std::unique_ptr<raw::parsed_statement>>
|
||||
query_processor::parse_statements(std::string_view queries, dialect d) {
|
||||
try {
|
||||
auto statements = util::do_with_parser(queries, d, std::mem_fn(&cql3_parser::CqlParser::queries));
|
||||
if (statements.empty()) {
|
||||
@@ -831,10 +854,15 @@ std::pair<std::reference_wrapper<struct query_processor::remote>, gate::holder>
|
||||
on_internal_error(log, "attempted to perform distributed query when `query_processor::remote` is unavailable");
|
||||
}
|
||||
|
||||
query_options query_processor::make_internal_options(const statements::prepared_statement::checked_weak_ptr& p, const std::vector<data_value_or_unset>& values,
|
||||
db::consistency_level cl, int32_t page_size, service::node_local_only node_local_only) const {
|
||||
query_options query_processor::make_internal_options(
|
||||
const statements::prepared_statement::checked_weak_ptr& p,
|
||||
const std::vector<data_value_or_unset>& values,
|
||||
db::consistency_level cl,
|
||||
int32_t page_size,
|
||||
service::node_local_only node_local_only) const {
|
||||
if (p->bound_names.size() != values.size()) {
|
||||
throw std::invalid_argument(format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
|
||||
throw std::invalid_argument(
|
||||
format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
|
||||
}
|
||||
auto ni = p->bound_names.begin();
|
||||
raw_value_vector_with_unset bound_values;
|
||||
@@ -842,28 +870,32 @@ query_options query_processor::make_internal_options(const statements::prepared_
|
||||
bound_values.unset.resize(values.size());
|
||||
for (auto& var : values) {
|
||||
auto& n = *ni;
|
||||
std::visit(overloaded_functor{[&](const data_value& v) {
|
||||
if (v.type() == bytes_type) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_value(value_cast<bytes>(v)));
|
||||
} else if (v.is_null()) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_null());
|
||||
} else {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_value(n->type->decompose(v)));
|
||||
}
|
||||
},
|
||||
[&](const unset_value&) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_null());
|
||||
bound_values.unset[std::distance(p->bound_names.begin(), ni)] = true;
|
||||
}},
|
||||
var);
|
||||
std::visit(overloaded_functor {
|
||||
[&] (const data_value& v) {
|
||||
if (v.type() == bytes_type) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_value(value_cast<bytes>(v)));
|
||||
} else if (v.is_null()) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_null());
|
||||
} else {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_value(n->type->decompose(v)));
|
||||
}
|
||||
}, [&] (const unset_value&) {
|
||||
bound_values.values.emplace_back(cql3::raw_value::make_null());
|
||||
bound_values.unset[std::distance(p->bound_names.begin(), ni)] = true;
|
||||
}
|
||||
}, var);
|
||||
++ni;
|
||||
}
|
||||
return query_options(cl, std::move(bound_values),
|
||||
cql3::query_options::specific_options{.page_size = page_size,
|
||||
.state = {},
|
||||
.serial_consistency = db::consistency_level::SERIAL,
|
||||
.timestamp = api::missing_timestamp,
|
||||
.node_local_only = node_local_only});
|
||||
return query_options(
|
||||
cl,
|
||||
std::move(bound_values),
|
||||
cql3::query_options::specific_options {
|
||||
.page_size = page_size,
|
||||
.state = {},
|
||||
.serial_consistency = db::consistency_level::SERIAL,
|
||||
.timestamp = api::missing_timestamp,
|
||||
.node_local_only = node_local_only
|
||||
});
|
||||
}
|
||||
|
||||
statements::prepared_statement::checked_weak_ptr query_processor::prepare_internal(const sstring& query_string) {
|
||||
@@ -885,7 +917,11 @@ struct internal_query_state {
|
||||
};
|
||||
|
||||
internal_query_state query_processor::create_paged_state(
|
||||
const sstring& query_string, db::consistency_level cl, const data_value_list& values, int32_t page_size, std::optional<service::query_state> qs) {
|
||||
const sstring& query_string,
|
||||
db::consistency_level cl,
|
||||
const data_value_list& values,
|
||||
int32_t page_size,
|
||||
std::optional<service::query_state> qs) {
|
||||
auto p = prepare_internal(query_string);
|
||||
auto opts = make_internal_options(p, values, cl, page_size);
|
||||
if (!qs) {
|
||||
@@ -899,7 +935,8 @@ bool query_processor::has_more_results(cql3::internal_query_state& state) const
|
||||
}
|
||||
|
||||
future<> query_processor::for_each_cql_result(
|
||||
cql3::internal_query_state& state, noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
|
||||
cql3::internal_query_state& state,
|
||||
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
|
||||
do {
|
||||
auto msg = co_await execute_paged_internal(state);
|
||||
for (auto& row : *msg) {
|
||||
@@ -910,18 +947,17 @@ future<> query_processor::for_each_cql_result(
|
||||
} while (has_more_results(state));
|
||||
}
|
||||
|
||||
future<::shared_ptr<untyped_result_set>> query_processor::execute_paged_internal(internal_query_state& state) {
|
||||
future<::shared_ptr<untyped_result_set>>
|
||||
query_processor::execute_paged_internal(internal_query_state& state) {
|
||||
state.p->statement->validate(*this, service::client_state::for_internal_calls());
|
||||
::shared_ptr<cql_transport::messages::result_message> msg = co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
|
||||
::shared_ptr<cql_transport::messages::result_message> msg =
|
||||
co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
|
||||
|
||||
class visitor : public result_message::visitor_base {
|
||||
internal_query_state& _state;
|
||||
query_processor& _qp;
|
||||
|
||||
public:
|
||||
visitor(internal_query_state& state, query_processor& qp)
|
||||
: _state(state)
|
||||
, _qp(qp) {
|
||||
visitor(internal_query_state& state, query_processor& qp) : _state(state), _qp(qp) {
|
||||
}
|
||||
virtual ~visitor() = default;
|
||||
void visit(const result_message::rows& rmrs) override {
|
||||
@@ -950,14 +986,23 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_paged_internal
|
||||
co_return ::make_shared<untyped_result_set>(msg);
|
||||
}
|
||||
|
||||
future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
|
||||
const sstring& query_string, db::consistency_level cl, const data_value_list& values, cache_internal cache) {
|
||||
future<::shared_ptr<untyped_result_set>>
|
||||
query_processor::execute_internal(
|
||||
const sstring& query_string,
|
||||
db::consistency_level cl,
|
||||
const data_value_list& values,
|
||||
cache_internal cache) {
|
||||
auto qs = query_state_for_internal_call();
|
||||
co_return co_await execute_internal(query_string, cl, qs, values, cache);
|
||||
}
|
||||
|
||||
future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
|
||||
const sstring& query_string, db::consistency_level cl, service::query_state& query_state, const data_value_list& values, cache_internal cache) {
|
||||
future<::shared_ptr<untyped_result_set>>
|
||||
query_processor::execute_internal(
|
||||
const sstring& query_string,
|
||||
db::consistency_level cl,
|
||||
service::query_state& query_state,
|
||||
const data_value_list& values,
|
||||
cache_internal cache) {
|
||||
|
||||
if (log.is_enabled(logging::log_level::trace)) {
|
||||
log.trace("execute_internal: {}\"{}\" ({})", cache ? "(cached) " : "", query_string, fmt::join(values, ", "));
|
||||
@@ -975,7 +1020,10 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
|
||||
const sstring query_string, service::query_state& query_state, api::timestamp_type timestamp, std::vector<data_value_or_unset> values) {
|
||||
const sstring query_string,
|
||||
service::query_state& query_state,
|
||||
api::timestamp_type timestamp,
|
||||
std::vector<data_value_or_unset> values) {
|
||||
log.debug("get_mutations_internal: \"{}\" ({})", query_string, fmt::join(values, ", "));
|
||||
auto stmt = prepare_internal(query_string);
|
||||
auto mod_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(stmt->statement);
|
||||
@@ -993,8 +1041,12 @@ future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
|
||||
co_return co_await mod_stmt->get_mutations(*this, opts, timeout, true, timestamp, query_state, json_cache, std::move(keys));
|
||||
}
|
||||
|
||||
future<::shared_ptr<untyped_result_set>> query_processor::execute_with_params(
|
||||
statements::prepared_statement::checked_weak_ptr p, db::consistency_level cl, service::query_state& query_state, const data_value_list& values) {
|
||||
future<::shared_ptr<untyped_result_set>>
|
||||
query_processor::execute_with_params(
|
||||
statements::prepared_statement::checked_weak_ptr p,
|
||||
db::consistency_level cl,
|
||||
service::query_state& query_state,
|
||||
const data_value_list& values) {
|
||||
auto opts = make_internal_options(p, values, cl);
|
||||
auto statement = p->statement;
|
||||
|
||||
@@ -1002,24 +1054,30 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_with_params(
|
||||
co_return ::make_shared<untyped_result_set>(msg);
|
||||
}
|
||||
|
||||
future<::shared_ptr<result_message>> query_processor::do_execute_with_params(
|
||||
service::query_state& query_state, shared_ptr<cql_statement> statement, const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
future<::shared_ptr<result_message>>
|
||||
query_processor::do_execute_with_params(
|
||||
service::query_state& query_state,
|
||||
shared_ptr<cql_statement> statement,
|
||||
const query_options& options, std::optional<service::group0_guard> guard) {
|
||||
statement->validate(*this, service::client_state::for_internal_calls());
|
||||
co_return co_await coroutine::try_future(statement->execute(*this, query_state, options, std::move(guard)));
|
||||
}
|
||||
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> query_processor::execute_batch_without_checking_exception_message(
|
||||
::shared_ptr<statements::batch_statement> batch, service::query_state& query_state, query_options& options,
|
||||
future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
query_processor::execute_batch_without_checking_exception_message(
|
||||
::shared_ptr<statements::batch_statement> batch,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
|
||||
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state](auto& e) -> future<> {
|
||||
try {
|
||||
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
|
||||
} catch (...) {
|
||||
log.error("failed to cache the entry: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) -> future<> {
|
||||
try {
|
||||
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
|
||||
} catch (...) {
|
||||
log.error("failed to cache the entry: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
if (access_future.failed()) {
|
||||
@@ -1028,28 +1086,30 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
|
||||
batch->validate();
|
||||
batch->validate(*this, query_state.get_client_state());
|
||||
_stats.queries_by_cl[size_t(options.get_consistency())] += batch->get_statements().size();
|
||||
if (log.is_enabled(logging::log_level::trace)) {
|
||||
if (log.is_enabled(logging::log_level::trace)) {
|
||||
std::ostringstream oss;
|
||||
for (const auto& s : batch->get_statements()) {
|
||||
oss << std::endl << s.statement->raw_cql_statement;
|
||||
for (const auto& s: batch->get_statements()) {
|
||||
oss << std::endl << s.statement->raw_cql_statement;
|
||||
}
|
||||
log.trace("execute_batch({}): {}", batch->get_statements().size(), oss.str());
|
||||
}
|
||||
co_return co_await batch->execute(*this, query_state, options, std::nullopt);
|
||||
}
|
||||
|
||||
future<service::broadcast_tables::query_result> query_processor::execute_broadcast_table_query(const service::broadcast_tables::query& query) {
|
||||
future<service::broadcast_tables::query_result>
|
||||
query_processor::execute_broadcast_table_query(const service::broadcast_tables::query& query) {
|
||||
auto [remote_, holder] = remote();
|
||||
co_return co_await service::broadcast_tables::execute(remote_.get().group0_client, query);
|
||||
}
|
||||
|
||||
future<query::mapreduce_result> query_processor::mapreduce(query::mapreduce_request req, tracing::trace_state_ptr tr_state) {
|
||||
future<query::mapreduce_result>
|
||||
query_processor::mapreduce(query::mapreduce_request req, tracing::trace_state_ptr tr_state) {
|
||||
auto [remote_, holder] = remote();
|
||||
co_return co_await remote_.get().mapreducer.dispatch(std::move(req), std::move(tr_state));
|
||||
}
|
||||
|
||||
future<::shared_ptr<messages::result_message>> query_processor::execute_schema_statement(
|
||||
const statements::schema_altering_statement& stmt, service::query_state& state, const query_options& options, service::group0_batch& mc) {
|
||||
future<::shared_ptr<messages::result_message>>
|
||||
query_processor::execute_schema_statement(const statements::schema_altering_statement& stmt, service::query_state& state, const query_options& options, service::group0_batch& mc) {
|
||||
if (this_shard_id() != 0) {
|
||||
on_internal_error(log, "DDL must be executed on shard 0");
|
||||
}
|
||||
@@ -1103,8 +1163,7 @@ future<> query_processor::announce_schema_statement(const statements::schema_alt
|
||||
co_await remote_.get().mm.announce(std::move(m), std::move(guard), description);
|
||||
}
|
||||
|
||||
query_processor::migration_subscriber::migration_subscriber(query_processor* qp)
|
||||
: _qp{qp} {
|
||||
query_processor::migration_subscriber::migration_subscriber(query_processor* qp) : _qp{qp} {
|
||||
}
|
||||
|
||||
void query_processor::migration_subscriber::on_create_keyspace(const sstring& ks_name) {
|
||||
@@ -1130,7 +1189,10 @@ void query_processor::migration_subscriber::on_create_view(const sstring& ks_nam
|
||||
void query_processor::migration_subscriber::on_update_keyspace(const sstring& ks_name) {
|
||||
}
|
||||
|
||||
void query_processor::migration_subscriber::on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) {
|
||||
void query_processor::migration_subscriber::on_update_column_family(
|
||||
const sstring& ks_name,
|
||||
const sstring& cf_name,
|
||||
bool columns_changed) {
|
||||
// #1255: Ignoring columns_changed deliberately.
|
||||
log.info("Column definitions for {}.{} changed, invalidating related prepared statements", ks_name, cf_name);
|
||||
remove_invalid_prepared_statements(ks_name, cf_name);
|
||||
@@ -1145,7 +1207,9 @@ void query_processor::migration_subscriber::on_update_function(const sstring& ks
|
||||
void query_processor::migration_subscriber::on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) {
|
||||
}
|
||||
|
||||
void query_processor::migration_subscriber::on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) {
|
||||
void query_processor::migration_subscriber::on_update_view(
|
||||
const sstring& ks_name,
|
||||
const sstring& view_name, bool columns_changed) {
|
||||
// scylladb/scylladb#16392 - Materialized views are also tables so we need at least handle
|
||||
// them as such when changed.
|
||||
on_update_column_family(ks_name, view_name, columns_changed);
|
||||
@@ -1174,28 +1238,39 @@ void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name,
|
||||
remove_invalid_prepared_statements(ks_name, view_name);
|
||||
}
|
||||
|
||||
void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::optional<sstring> cf_name) {
|
||||
_qp->_prepared_cache.remove_if([&](::shared_ptr<cql_statement> stmt) {
|
||||
void query_processor::migration_subscriber::remove_invalid_prepared_statements(
|
||||
sstring ks_name,
|
||||
std::optional<sstring> cf_name) {
|
||||
_qp->_prepared_cache.remove_if([&] (::shared_ptr<cql_statement> stmt) {
|
||||
return this->should_invalidate(ks_name, cf_name, stmt);
|
||||
});
|
||||
}
|
||||
|
||||
bool query_processor::migration_subscriber::should_invalidate(sstring ks_name, std::optional<sstring> cf_name, ::shared_ptr<cql_statement> statement) {
|
||||
bool query_processor::migration_subscriber::should_invalidate(
|
||||
sstring ks_name,
|
||||
std::optional<sstring> cf_name,
|
||||
::shared_ptr<cql_statement> statement) {
|
||||
return statement->depends_on(ks_name, cf_name);
|
||||
}
|
||||
|
||||
future<> query_processor::query_internal(const sstring& query_string, db::consistency_level cl, const data_value_list& values, int32_t page_size,
|
||||
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f, std::optional<service::query_state> qs) {
|
||||
future<> query_processor::query_internal(
|
||||
const sstring& query_string,
|
||||
db::consistency_level cl,
|
||||
const data_value_list& values,
|
||||
int32_t page_size,
|
||||
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
|
||||
std::optional<service::query_state> qs) {
|
||||
auto query_state = create_paged_state(query_string, cl, values, page_size, std::move(qs));
|
||||
co_return co_await for_each_cql_result(query_state, std::move(f));
|
||||
}
|
||||
|
||||
future<> query_processor::query_internal(const sstring& query_string, noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
|
||||
future<> query_processor::query_internal(
|
||||
const sstring& query_string,
|
||||
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
|
||||
return query_internal(query_string, db::consistency_level::ONE, {}, 1000, std::move(f));
|
||||
}
|
||||
|
||||
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_shard(
|
||||
unsigned shard, cql3::computed_function_values cached_fn_calls, bool track) {
|
||||
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_shard(unsigned shard, cql3::computed_function_values cached_fn_calls, bool track) {
|
||||
if (track) {
|
||||
_proxy.get_stats().replica_cross_shard_ops++;
|
||||
}
|
||||
@@ -1203,8 +1278,7 @@ shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_s
|
||||
return ::make_shared<cql_transport::messages::result_message::bounce>(my_host_id, shard, std::move(cached_fn_calls));
|
||||
}
|
||||
|
||||
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_node(
|
||||
locator::tablet_replica replica, cql3::computed_function_values cached_fn_calls, seastar::lowres_clock::time_point timeout, bool is_write) {
|
||||
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_node(locator::tablet_replica replica, cql3::computed_function_values cached_fn_calls, seastar::lowres_clock::time_point timeout, bool is_write) {
|
||||
get_cql_stats().forwarded_requests++;
|
||||
return ::make_shared<cql_transport::messages::result_message::bounce>(replica.host, replica.shard, std::move(cached_fn_calls), timeout, is_write);
|
||||
}
|
||||
@@ -1221,7 +1295,7 @@ void query_processor::update_authorized_prepared_cache_config() {
|
||||
utils::loading_cache_config cfg;
|
||||
cfg.max_size = _mcfg.authorized_prepared_cache_size;
|
||||
cfg.expiry = std::min(std::chrono::milliseconds(_db.get_config().permissions_validity_in_ms()),
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(prepared_statements_cache::entry_expiry));
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(prepared_statements_cache::entry_expiry));
|
||||
cfg.refresh = std::chrono::milliseconds(_db.get_config().permissions_update_interval_in_ms());
|
||||
|
||||
if (!_authorized_prepared_cache.update_config(std::move(cfg))) {
|
||||
@@ -1233,4 +1307,4 @@ void query_processor::reset_cache() {
|
||||
_authorized_prepared_cache.reset();
|
||||
}
|
||||
|
||||
} // namespace cql3
|
||||
}
|
||||
|
||||
@@ -265,10 +265,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
|
||||
return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(
|
||||
exceptions::invalid_request_exception(
|
||||
format("Write consistency level {} is forbidden by the current configuration "
|
||||
"setting of write_consistency_levels_disallowed. Please use a different "
|
||||
"consistency level, or remove {} from write_consistency_levels_disallowed "
|
||||
"set in the configuration.", cl, cl)));
|
||||
format("Consistency level {} is not allowed for write operations", cl)));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < _statements.size(); ++i) {
|
||||
@@ -280,8 +277,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
_stats.statements_in_cas_batches += _statements.size();
|
||||
return execute_with_conditions(qp, options, query_state).then([guardrail_state, cl] (auto result) {
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
}
|
||||
return result;
|
||||
});
|
||||
@@ -301,8 +297,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
}
|
||||
auto result = make_shared<cql_transport::messages::result_message::void_message>();
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
}
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(result));
|
||||
});
|
||||
|
||||
@@ -59,8 +59,6 @@ const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
|
||||
|
||||
const sstring cf_prop_defs::KW_TABLETS = "tablets";
|
||||
|
||||
const sstring cf_prop_defs::KW_STORAGE_ENGINE = "storage_engine";
|
||||
|
||||
schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const {
|
||||
schema::extensions_map er;
|
||||
for (auto& p : exts.schema_extensions()) {
|
||||
@@ -108,7 +106,6 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
|
||||
KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS,
|
||||
KW_SYNCHRONOUS_UPDATES, KW_TABLETS,
|
||||
KW_STORAGE_ENGINE,
|
||||
});
|
||||
static std::set<sstring> obsolete_keywords({
|
||||
sstring("index_interval"),
|
||||
@@ -199,20 +196,6 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
}
|
||||
db::tablet_options::validate(*tablet_options_map);
|
||||
}
|
||||
|
||||
if (has_property(KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor") {
|
||||
if (!db.features().logstor) {
|
||||
throw exceptions::configuration_exception(format("The experimental feature 'logstor' must be enabled in order to use the 'logstor' storage engine."));
|
||||
}
|
||||
if (!db.get_config().enable_logstor()) {
|
||||
throw exceptions::configuration_exception(format("The configuration option 'enable_logstor' must be set to true in the configuration in order to use the 'logstor' storage engine."));
|
||||
}
|
||||
} else {
|
||||
throw exceptions::configuration_exception(format("Illegal value for '{}'", KW_STORAGE_ENGINE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::map<sstring, sstring> cf_prop_defs::get_compaction_type_options() const {
|
||||
@@ -413,13 +396,6 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
|
||||
if (auto tablet_options_opt = get_map(KW_TABLETS)) {
|
||||
builder.set_tablet_options(std::move(*tablet_options_opt));
|
||||
}
|
||||
|
||||
if (has_property(KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor") {
|
||||
builder.set_logstor();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const
|
||||
|
||||
@@ -64,8 +64,6 @@ public:
|
||||
|
||||
static const sstring KW_TABLETS;
|
||||
|
||||
static const sstring KW_STORAGE_ENGINE;
|
||||
|
||||
// FIXME: In origin the following consts are in CFMetaData.
|
||||
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
|
||||
static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128;
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
*/
|
||||
|
||||
|
||||
#include "cql3/statements/cf_prop_defs.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include <inttypes.h>
|
||||
#include <boost/regex.hpp>
|
||||
@@ -267,13 +266,6 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
|
||||
stmt_warning("CREATE TABLE WITH COMPACT STORAGE is deprecated and will eventually be removed in a future version.");
|
||||
}
|
||||
|
||||
if (_properties.properties()->has_property(cf_prop_defs::KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = _properties.properties()->get_string(cf_prop_defs::KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor" && !_column_aliases.empty()) {
|
||||
throw exceptions::configuration_exception("The 'logstor' storage engine cannot be used with tables that have clustering columns");
|
||||
}
|
||||
}
|
||||
|
||||
auto& key_aliases = _key_aliases[0];
|
||||
std::vector<data_type> key_types;
|
||||
for (auto&& alias : key_aliases) {
|
||||
|
||||
@@ -273,10 +273,7 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
|
||||
co_return coroutine::exception(
|
||||
std::make_exception_ptr(exceptions::invalid_request_exception(
|
||||
format("Write consistency level {} is forbidden by the current configuration "
|
||||
"setting of write_consistency_levels_disallowed. Please use a different "
|
||||
"consistency level, or remove {} from write_consistency_levels_disallowed "
|
||||
"set in the configuration.", cl, cl))));
|
||||
format("Consistency level {} is not allowed for write operations", cl))));
|
||||
}
|
||||
|
||||
_restrictions->validate_primary_key(options);
|
||||
@@ -284,8 +281,7 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
if (has_conditions()) {
|
||||
auto result = co_await execute_with_condition(qp, qs, options);
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
}
|
||||
co_return result;
|
||||
}
|
||||
@@ -307,8 +303,7 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
|
||||
auto result = seastar::make_shared<cql_transport::messages::result_message::void_message>();
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
}
|
||||
if (keys_size_one) {
|
||||
auto&& table = s->table();
|
||||
|
||||
15
db/config.cc
15
db/config.cc
@@ -679,8 +679,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"The directory where hints files are stored if hinted handoff is enabled.")
|
||||
, view_hints_directory(this, "view_hints_directory", value_status::Used, "",
|
||||
"The directory where materialized-view updates are stored while a view replica is unreachable.")
|
||||
, logstor_directory(this, "logstor_directory", value_status::Used, "",
|
||||
"The directory where data files for logstor storage are stored.")
|
||||
, saved_caches_directory(this, "saved_caches_directory", value_status::Unused, "",
|
||||
"The directory location where table key and row caches are stored.")
|
||||
/**
|
||||
@@ -864,14 +862,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"* offheap_objects Native memory, eliminating NIO buffer heap overhead.")
|
||||
, memtable_cleanup_threshold(this, "memtable_cleanup_threshold", value_status::Invalid, .11,
|
||||
"Ratio of occupied non-flushing memtable size to total permitted size for triggering a flush of the largest memtable. Larger values mean larger flushes and less compaction, but also less concurrent flush activity, which can make it difficult to keep your disks saturated under heavy write load.")
|
||||
, logstor_disk_size_in_mb(this, "logstor_disk_size_in_mb", value_status::Used, 2048,
|
||||
"Total size in megabytes allocated for logstor storage on disk.")
|
||||
, logstor_file_size_in_mb(this, "logstor_file_size_in_mb", value_status::Used, 32,
|
||||
"Total size in megabytes allocated for each logstor data file on disk.")
|
||||
, logstor_separator_delay_limit_ms(this, "logstor_separator_delay_limit_ms", value_status::Used, 100,
|
||||
"Maximum delay in milliseconds for logstor separator debt control.")
|
||||
, logstor_separator_max_memory_in_mb(this, "logstor_separator_max_memory_in_mb", value_status::Used, 256,
|
||||
"Maximum memory in megabytes for logstor separator memory buffers.")
|
||||
, file_cache_size_in_mb(this, "file_cache_size_in_mb", value_status::Unused, 512,
|
||||
"Total memory to use for SSTable-reading buffers.")
|
||||
, memtable_flush_queue_size(this, "memtable_flush_queue_size", value_status::Unused, 4,
|
||||
@@ -1291,7 +1281,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, enable_in_memory_data_store(this, "enable_in_memory_data_store", value_status::Used, false, "Enable in memory mode (system tables are always persisted).")
|
||||
, enable_cache(this, "enable_cache", value_status::Used, true, "Enable cache.")
|
||||
, enable_commitlog(this, "enable_commitlog", value_status::Used, true, "Enable commitlog.")
|
||||
, enable_logstor(this, "enable_logstor", value_status::Used, false, "Enable the logstor storage engine.")
|
||||
, volatile_system_keyspace_for_testing(this, "volatile_system_keyspace_for_testing", value_status::Used, false, "Don't persist system keyspace - testing only!")
|
||||
, api_port(this, "api_port", value_status::Used, 10000, "Http Rest API port.")
|
||||
, api_address(this, "api_address", value_status::Used, "", "Http Rest API address.")
|
||||
@@ -1703,7 +1692,6 @@ void db::config::setup_directories() {
|
||||
maybe_in_workdir(data_file_directories, "data");
|
||||
maybe_in_workdir(hints_directory, "hints");
|
||||
maybe_in_workdir(view_hints_directory, "view_hints");
|
||||
maybe_in_workdir(logstor_directory, "logstor");
|
||||
maybe_in_workdir(saved_caches_directory, "saved_caches");
|
||||
}
|
||||
|
||||
@@ -1873,8 +1861,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
|
||||
{"keyspace-storage-options", feature::KEYSPACE_STORAGE_OPTIONS},
|
||||
{"tablets", feature::UNUSED},
|
||||
{"views-with-tablets", feature::UNUSED},
|
||||
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES},
|
||||
{"logstor", feature::LOGSTOR}
|
||||
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -117,8 +117,7 @@ struct experimental_features_t {
|
||||
ALTERNATOR_STREAMS,
|
||||
BROADCAST_TABLES,
|
||||
KEYSPACE_STORAGE_OPTIONS,
|
||||
STRONGLY_CONSISTENT_TABLES,
|
||||
LOGSTOR,
|
||||
STRONGLY_CONSISTENT_TABLES
|
||||
};
|
||||
static std::map<sstring, feature> map(); // See enum_option.
|
||||
static std::vector<enum_option<experimental_features_t>> all();
|
||||
@@ -202,7 +201,6 @@ public:
|
||||
named_value<uint64_t> data_file_capacity;
|
||||
named_value<sstring> hints_directory;
|
||||
named_value<sstring> view_hints_directory;
|
||||
named_value<sstring> logstor_directory;
|
||||
named_value<sstring> saved_caches_directory;
|
||||
named_value<sstring> commit_failure_policy;
|
||||
named_value<sstring> disk_failure_policy;
|
||||
@@ -246,10 +244,6 @@ public:
|
||||
named_value<bool> defragment_memory_on_idle;
|
||||
named_value<sstring> memtable_allocation_type;
|
||||
named_value<double> memtable_cleanup_threshold;
|
||||
named_value<uint32_t> logstor_disk_size_in_mb;
|
||||
named_value<uint32_t> logstor_file_size_in_mb;
|
||||
named_value<uint32_t> logstor_separator_delay_limit_ms;
|
||||
named_value<uint32_t> logstor_separator_max_memory_in_mb;
|
||||
named_value<uint32_t> file_cache_size_in_mb;
|
||||
named_value<uint32_t> memtable_flush_queue_size;
|
||||
named_value<uint32_t> memtable_flush_writers;
|
||||
@@ -370,7 +364,6 @@ public:
|
||||
named_value<bool> enable_in_memory_data_store;
|
||||
named_value<bool> enable_cache;
|
||||
named_value<bool> enable_commitlog;
|
||||
named_value<bool> enable_logstor;
|
||||
named_value<bool> volatile_system_keyspace_for_testing;
|
||||
named_value<uint16_t> api_port;
|
||||
named_value<sstring> api_address;
|
||||
|
||||
@@ -63,14 +63,15 @@ namespace db {
|
||||
|
||||
namespace schema_tables {
|
||||
|
||||
static constexpr std::initializer_list<table_kind> all_table_kinds = {table_kind::table, table_kind::view};
|
||||
static constexpr std::initializer_list<table_kind> all_table_kinds = {
|
||||
table_kind::table,
|
||||
table_kind::view
|
||||
};
|
||||
|
||||
static schema_ptr get_table_holder(table_kind k) {
|
||||
switch (k) {
|
||||
case table_kind::table:
|
||||
return tables();
|
||||
case table_kind::view:
|
||||
return views();
|
||||
case table_kind::table: return tables();
|
||||
case table_kind::view: return views();
|
||||
}
|
||||
abort();
|
||||
}
|
||||
@@ -93,18 +94,15 @@ void table_selector::add(sstring name) {
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace schema_tables
|
||||
}
|
||||
|
||||
} // namespace db
|
||||
}
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<db::schema_tables::table_kind> {
|
||||
constexpr auto parse(format_parse_context& ctx) {
|
||||
return ctx.begin();
|
||||
}
|
||||
template <> struct fmt::formatter<db::schema_tables::table_kind> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(db::schema_tables::table_kind k, fmt::format_context& ctx) const {
|
||||
switch (k) {
|
||||
using enum db::schema_tables::table_kind;
|
||||
using enum db::schema_tables::table_kind;
|
||||
case table:
|
||||
return fmt::format_to(ctx.out(), "table");
|
||||
case view:
|
||||
@@ -127,8 +125,11 @@ static std::optional<table_id> table_id_from_mutations(const schema_mutations& s
|
||||
return table_id(table_row.get_nonnull<utils::UUID>("id"));
|
||||
}
|
||||
|
||||
static future<std::map<table_id, schema_mutations>> read_tables_for_keyspaces(sharded<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names,
|
||||
table_kind kind, const std::unordered_map<sstring, table_selector>& tables_per_keyspace) {
|
||||
static
|
||||
future<std::map<table_id, schema_mutations>>
|
||||
read_tables_for_keyspaces(sharded<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names, table_kind kind,
|
||||
const std::unordered_map<sstring, table_selector>& tables_per_keyspace)
|
||||
{
|
||||
std::map<table_id, schema_mutations> result;
|
||||
for (auto&& [keyspace_name, sel] : tables_per_keyspace) {
|
||||
if (!sel.tables.contains(kind)) {
|
||||
@@ -148,30 +149,32 @@ static future<std::map<table_id, schema_mutations>> read_tables_for_keyspaces(sh
|
||||
|
||||
// Extracts the names of tables affected by a schema mutation.
|
||||
// The mutation must target one of the tables in schema_tables_holding_schema_mutations().
|
||||
static table_selector get_affected_tables(const sstring& keyspace_name, const mutation& m) {
|
||||
static
|
||||
table_selector get_affected_tables(const sstring& keyspace_name, const mutation& m) {
|
||||
const schema& s = *m.schema();
|
||||
auto get_table_name = [&](const clustering_key& ck) {
|
||||
auto get_table_name = [&] (const clustering_key& ck) {
|
||||
// The first component of the clustering key in each table listed in
|
||||
// schema_tables_holding_schema_mutations contains the table name.
|
||||
return value_cast<sstring>(utf8_type->deserialize(ck.get_component(s, 0)));
|
||||
};
|
||||
table_selector result;
|
||||
if (m.partition().partition_tombstone()) {
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} contains a partition tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} contains a partition tombstone",
|
||||
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
result.all_in_keyspace = true;
|
||||
}
|
||||
for (auto&& e : m.partition().row_tombstones()) {
|
||||
const range_tombstone& rt = e.tombstone();
|
||||
if (rt.start.size(s) == 0 || rt.end.size(s) == 0) {
|
||||
slogger.trace(
|
||||
"Mutation of {}.{} for keyspace {} contains a multi-table range tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} contains a multi-table range tombstone",
|
||||
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
result.all_in_keyspace = true;
|
||||
break;
|
||||
}
|
||||
auto table_name = get_table_name(rt.start);
|
||||
if (table_name != get_table_name(rt.end)) {
|
||||
slogger.trace(
|
||||
"Mutation of {}.{} for keyspace {} contains a multi-table range tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} contains a multi-table range tombstone",
|
||||
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name);
|
||||
result.all_in_keyspace = true;
|
||||
break;
|
||||
}
|
||||
@@ -180,17 +183,16 @@ static table_selector get_affected_tables(const sstring& keyspace_name, const mu
|
||||
for (auto&& row : m.partition().clustered_rows()) {
|
||||
result.add(get_table_name(row.key()));
|
||||
}
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} affects tables: {}, all_in_keyspace: {}", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name,
|
||||
result.tables, result.all_in_keyspace);
|
||||
slogger.trace("Mutation of {}.{} for keyspace {} affects tables: {}, all_in_keyspace: {}",
|
||||
m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name, result.tables, result.all_in_keyspace);
|
||||
return result;
|
||||
}
|
||||
|
||||
future<schema_result> static read_schema_for_keyspaces(
|
||||
sharded<service::storage_proxy>& proxy, const sstring& schema_table_name, const std::set<sstring>& keyspace_names) {
|
||||
auto map = [&proxy, schema_table_name](const sstring& keyspace_name) {
|
||||
return read_schema_partition_for_keyspace(proxy, schema_table_name, keyspace_name);
|
||||
};
|
||||
auto insert = [](schema_result&& result, auto&& schema_entity) {
|
||||
future<schema_result>
|
||||
static read_schema_for_keyspaces(sharded<service::storage_proxy>& proxy, const sstring& schema_table_name, const std::set<sstring>& keyspace_names)
|
||||
{
|
||||
auto map = [&proxy, schema_table_name] (const sstring& keyspace_name) { return read_schema_partition_for_keyspace(proxy, schema_table_name, keyspace_name); };
|
||||
auto insert = [] (schema_result&& result, auto&& schema_entity) {
|
||||
if (!schema_entity.second->empty()) {
|
||||
result.insert(std::move(schema_entity));
|
||||
}
|
||||
@@ -200,11 +202,11 @@ future<schema_result> static read_schema_for_keyspaces(
|
||||
}
|
||||
|
||||
// Returns names of live table definitions of given keyspace
|
||||
future<std::vector<sstring>> static read_table_names_of_keyspace(
|
||||
sharded<service::storage_proxy>& proxy, const sstring& keyspace_name, schema_ptr schema_table) {
|
||||
future<std::vector<sstring>>
|
||||
static read_table_names_of_keyspace(sharded<service::storage_proxy>& proxy, const sstring& keyspace_name, schema_ptr schema_table) {
|
||||
auto pkey = dht::decorate_key(*schema_table, partition_key::from_singular(*schema_table, keyspace_name));
|
||||
auto&& rs = co_await db::system_keyspace::query(proxy.local().get_db(), schema_table->ks_name(), schema_table->cf_name(), pkey);
|
||||
co_return rs->rows() | std::views::transform([schema_table](const query::result_set_row& row) {
|
||||
co_return rs->rows() | std::views::transform([schema_table] (const query::result_set_row& row) {
|
||||
const sstring name = schema_table->clustering_key_columns().begin()->name_as_text();
|
||||
return row.get_nonnull<sstring>(name);
|
||||
}) | std::ranges::to<std::vector>();
|
||||
@@ -240,7 +242,8 @@ static void maybe_delete_schema_version(mutation& m) {
|
||||
}
|
||||
}
|
||||
|
||||
future<> schema_applier::merge_keyspaces() {
|
||||
future<> schema_applier::merge_keyspaces()
|
||||
{
|
||||
/*
|
||||
* - we don't care about entriesOnlyOnLeft() or entriesInCommon(), because only the changes are of interest to us
|
||||
* - of all entriesOnlyOnRight(), we only care about ones that have live columns; it's possible to have a ColumnFamily
|
||||
@@ -277,16 +280,21 @@ future<> schema_applier::merge_keyspaces() {
|
||||
for (auto& name : created) {
|
||||
slogger.info("Creating keyspace {}", name);
|
||||
auto sk_after_v = _after.scylla_keyspaces.contains(name) ? _after.scylla_keyspaces.at(name) : nullptr;
|
||||
auto ksm = co_await create_keyspace_metadata(schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
|
||||
auto ksm = co_await create_keyspace_metadata(
|
||||
schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
|
||||
_affected_keyspaces.created.push_back(
|
||||
co_await replica::database::prepare_create_keyspace_on_all_shards(sharded_db, _proxy, *ksm, _pending_token_metadata));
|
||||
co_await replica::database::prepare_create_keyspace_on_all_shards(
|
||||
sharded_db, _proxy, *ksm, _pending_token_metadata));
|
||||
_affected_keyspaces.names.created.insert(name);
|
||||
}
|
||||
for (auto& name : altered) {
|
||||
slogger.info("Altering keyspace {}", name);
|
||||
auto sk_after_v = _after.scylla_keyspaces.contains(name) ? _after.scylla_keyspaces.at(name) : nullptr;
|
||||
auto tmp_ksm = co_await create_keyspace_metadata(schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
|
||||
_affected_keyspaces.altered.push_back(co_await replica::database::prepare_update_keyspace_on_all_shards(sharded_db, *tmp_ksm, _pending_token_metadata));
|
||||
auto tmp_ksm = co_await create_keyspace_metadata(
|
||||
schema_result_value_type{name, _after.keyspaces.at(name)}, sk_after_v);
|
||||
_affected_keyspaces.altered.push_back(
|
||||
co_await replica::database::prepare_update_keyspace_on_all_shards(
|
||||
sharded_db, *tmp_ksm, _pending_token_metadata));
|
||||
_affected_keyspaces.names.altered.insert(name);
|
||||
}
|
||||
for (auto& key : _affected_keyspaces.names.dropped) {
|
||||
@@ -319,7 +327,7 @@ static std::vector<column_definition> get_primary_key_definition(const schema_pt
|
||||
static std::vector<bytes> get_primary_key(const std::vector<column_definition>& primary_key, const query::result_set_row* row) {
|
||||
std::vector<bytes> key;
|
||||
for (const auto& column : primary_key) {
|
||||
const data_value* val = row->get_data_value(column.name_as_text());
|
||||
const data_value *val = row->get_data_value(column.name_as_text());
|
||||
key.push_back(val->serialize_nonnull());
|
||||
}
|
||||
return key;
|
||||
@@ -330,7 +338,7 @@ static std::map<std::vector<bytes>, const query::result_set_row*> build_row_map(
|
||||
const std::vector<query::result_set_row>& rows = result.rows();
|
||||
auto primary_key = get_primary_key_definition(result.schema());
|
||||
std::map<std::vector<bytes>, const query::result_set_row*> ret;
|
||||
for (const auto& row : rows) {
|
||||
for (const auto& row: rows) {
|
||||
auto key = get_primary_key(primary_key, &row);
|
||||
ret.insert(std::pair(std::move(key), &row));
|
||||
}
|
||||
@@ -383,8 +391,8 @@ struct aggregate_diff {
|
||||
std::vector<std::pair<const query::result_set_row*, const query::result_set_row*>> dropped;
|
||||
};
|
||||
|
||||
static aggregate_diff diff_aggregates_rows(
|
||||
const schema_result& aggr_before, const schema_result& aggr_after, const schema_result& scylla_aggr_before, const schema_result& scylla_aggr_after) {
|
||||
static aggregate_diff diff_aggregates_rows(const schema_result& aggr_before, const schema_result& aggr_after,
|
||||
const schema_result& scylla_aggr_before, const schema_result& scylla_aggr_after) {
|
||||
using map = std::map<std::vector<bytes>, const query::result_set_row*>;
|
||||
auto aggr_diff = difference(aggr_before, aggr_after, indirect_equal_to<lw_shared_ptr<query::result_set>>());
|
||||
|
||||
@@ -428,11 +436,15 @@ static aggregate_diff diff_aggregates_rows(
|
||||
|
||||
for (const auto& k : diff.entries_only_on_left) {
|
||||
auto entry = scylla_aggr_rows_before.find(k);
|
||||
dropped.push_back({aggr_before_rows.find(k)->second, (entry != scylla_aggr_rows_before.end()) ? entry->second : nullptr});
|
||||
dropped.push_back({
|
||||
aggr_before_rows.find(k)->second, (entry != scylla_aggr_rows_before.end()) ? entry->second : nullptr
|
||||
});
|
||||
}
|
||||
for (const auto& k : diff.entries_only_on_right) {
|
||||
auto entry = scylla_aggr_rows_after.find(k);
|
||||
created.push_back({aggr_after_rows.find(k)->second, (entry != scylla_aggr_rows_after.end()) ? entry->second : nullptr});
|
||||
created.push_back({
|
||||
aggr_after_rows.find(k)->second, (entry != scylla_aggr_rows_after.end()) ? entry->second : nullptr
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -440,10 +452,11 @@ static aggregate_diff diff_aggregates_rows(
|
||||
}
|
||||
|
||||
// see the comments for merge_keyspaces()
|
||||
future<> schema_applier::merge_types() {
|
||||
future<> schema_applier::merge_types()
|
||||
{
|
||||
auto diff = diff_rows(_before.types, _after.types);
|
||||
co_await _affected_user_types.start();
|
||||
co_await _affected_user_types.invoke_on_all([&](affected_user_types_per_shard& af) mutable -> future<> {
|
||||
co_await _affected_user_types.invoke_on_all([&] (affected_user_types_per_shard& af) mutable -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
|
||||
std::map<sstring, std::reference_wrapper<replica::keyspace>> new_keyspaces_per_shard;
|
||||
@@ -465,12 +478,16 @@ future<> schema_applier::merge_types() {
|
||||
// version of view to "before" version of base table and "after" to "after"
|
||||
// respectively.
|
||||
enum class schema_diff_side {
|
||||
left, // old, before
|
||||
left, // old, before
|
||||
right, // new, after
|
||||
};
|
||||
|
||||
static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>& proxy, const std::map<table_id, schema_mutations>& before,
|
||||
const std::map<table_id, schema_mutations>& after, bool reload, noncopyable_function<schema_ptr(schema_mutations sm, schema_diff_side)> create_schema) {
|
||||
static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>& proxy,
|
||||
const std::map<table_id, schema_mutations>& before,
|
||||
const std::map<table_id, schema_mutations>& after,
|
||||
bool reload,
|
||||
noncopyable_function<schema_ptr (schema_mutations sm, schema_diff_side)> create_schema)
|
||||
{
|
||||
schema_diff_per_shard d;
|
||||
auto diff = difference(before, after);
|
||||
for (auto&& key : diff.entries_only_on_left) {
|
||||
@@ -490,10 +507,10 @@ static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>&
|
||||
d.altered.emplace_back(schema_diff_per_shard::altered_schema{s_before, s});
|
||||
}
|
||||
if (reload) {
|
||||
for (auto&& key : diff.entries_in_common) {
|
||||
for (auto&& key: diff.entries_in_common) {
|
||||
auto s = create_schema(std::move(after.at(key)), schema_diff_side::right);
|
||||
slogger.info("Reloading {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
|
||||
d.altered.emplace_back(schema_diff_per_shard::altered_schema{s, s});
|
||||
d.altered.emplace_back(schema_diff_per_shard::altered_schema {s, s});
|
||||
}
|
||||
}
|
||||
return d;
|
||||
@@ -507,9 +524,7 @@ static schema_diff_per_shard diff_table_or_view(sharded<service::storage_proxy>&
|
||||
constexpr size_t max_concurrent = 8;
|
||||
|
||||
|
||||
in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(
|
||||
replica::database& db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types)
|
||||
: _stored_user_types(db.as_user_types_storage()) {
|
||||
in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(replica::database& db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) : _stored_user_types(db.as_user_types_storage()) {
|
||||
// initialize metadata for new keyspaces
|
||||
for (auto& ks_per_shard : affected_keyspaces.created) {
|
||||
auto metadata = ks_per_shard[this_shard_id()]->metadata();
|
||||
@@ -537,7 +552,7 @@ in_progress_types_storage_per_shard::in_progress_types_storage_per_shard(
|
||||
auto& ks_name = type->_keyspace;
|
||||
_in_progress_types[ks_name].remove_type(type);
|
||||
}
|
||||
for (const auto& ks_name : affected_keyspaces.names.dropped) {
|
||||
for (const auto &ks_name : affected_keyspaces.names.dropped) {
|
||||
// can't reference a type when it's keyspace is being dropped
|
||||
_in_progress_types[ks_name] = data_dictionary::user_types_metadata();
|
||||
}
|
||||
@@ -555,9 +570,8 @@ std::shared_ptr<data_dictionary::user_types_storage> in_progress_types_storage_p
|
||||
return _stored_user_types;
|
||||
}
|
||||
|
||||
future<> in_progress_types_storage::init(
|
||||
sharded<replica::database>& sharded_db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) {
|
||||
co_await sharded_db.invoke_on_all([&](replica::database& db) {
|
||||
future<> in_progress_types_storage::init(sharded<replica::database>& sharded_db, const affected_keyspaces& affected_keyspaces, const affected_user_types& affected_types) {
|
||||
co_await sharded_db.invoke_on_all([&] (replica::database& db) {
|
||||
shards[this_shard_id()] = make_foreign(seastar::make_shared<in_progress_types_storage_per_shard>(db, affected_keyspaces, affected_types));
|
||||
});
|
||||
}
|
||||
@@ -571,7 +585,8 @@ in_progress_types_storage_per_shard& in_progress_types_storage::local() {
|
||||
// that when a base schema and a subset of its views are modified together (i.e.,
|
||||
// upon an alter table or alter type statement), then they are published together
|
||||
// as well, without any deferring in-between.
|
||||
future<> schema_applier::merge_tables_and_views() {
|
||||
future<> schema_applier::merge_tables_and_views()
|
||||
{
|
||||
auto& user_types = _types_storage.local();
|
||||
co_await _affected_tables_and_views.tables_and_views.start();
|
||||
|
||||
@@ -582,10 +597,10 @@ future<> schema_applier::merge_tables_and_views() {
|
||||
|
||||
// Create CDC tables before non-CDC base tables, because we want the base tables with CDC enabled
|
||||
// to point to their CDC tables.
|
||||
local_cdc = diff_table_or_view(_proxy, _before.cdc, _after.cdc, _reload, [&](schema_mutations sm, schema_diff_side) {
|
||||
local_cdc = diff_table_or_view(_proxy, _before.cdc, _after.cdc, _reload, [&] (schema_mutations sm, schema_diff_side) {
|
||||
return create_table_from_mutations(_proxy, std::move(sm), user_types, nullptr);
|
||||
});
|
||||
local_tables = diff_table_or_view(_proxy, _before.tables, _after.tables, _reload, [&](schema_mutations sm, schema_diff_side side) {
|
||||
local_tables = diff_table_or_view(_proxy, _before.tables, _after.tables, _reload, [&] (schema_mutations sm, schema_diff_side side) {
|
||||
// If the table has CDC enabled, find the CDC schema version and set it in the table schema.
|
||||
// If the table is created or altered with CDC enabled, then the CDC
|
||||
// table is also created or altered in the same operation, so we can
|
||||
@@ -621,7 +636,7 @@ future<> schema_applier::merge_tables_and_views() {
|
||||
|
||||
return create_table_from_mutations(_proxy, std::move(sm), user_types, cdc_schema);
|
||||
});
|
||||
local_views = diff_table_or_view(_proxy, _before.views, _after.views, _reload, [&](schema_mutations sm, schema_diff_side side) {
|
||||
local_views = diff_table_or_view(_proxy, _before.views, _after.views, _reload, [&] (schema_mutations sm, schema_diff_side side) {
|
||||
// The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
|
||||
// If we don't do it we are leaving a window where write commands to this schema are illegal.
|
||||
// There are 3 possibilities:
|
||||
@@ -668,26 +683,31 @@ future<> schema_applier::merge_tables_and_views() {
|
||||
frozen_schema_diff tables_frozen = co_await local_tables.freeze();
|
||||
frozen_schema_diff cdc_frozen = co_await local_cdc.freeze();
|
||||
frozen_schema_diff views_frozen = co_await local_views.freeze();
|
||||
co_await _affected_tables_and_views.tables_and_views.invoke_on_others(
|
||||
[this, &tables_frozen, &cdc_frozen, &views_frozen](affected_tables_and_views_per_shard& tables_and_views) -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
tables_and_views.tables = co_await schema_diff_per_shard::copy_from(db, _types_storage, tables_frozen);
|
||||
tables_and_views.cdc = co_await schema_diff_per_shard::copy_from(db, _types_storage, cdc_frozen);
|
||||
tables_and_views.views = co_await schema_diff_per_shard::copy_from(db, _types_storage, views_frozen);
|
||||
});
|
||||
co_await _affected_tables_and_views.tables_and_views.invoke_on_others([this, &tables_frozen, &cdc_frozen, &views_frozen] (affected_tables_and_views_per_shard& tables_and_views) -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
tables_and_views.tables = co_await schema_diff_per_shard::copy_from(
|
||||
db, _types_storage, tables_frozen);
|
||||
tables_and_views.cdc = co_await schema_diff_per_shard::copy_from(
|
||||
db, _types_storage, cdc_frozen);
|
||||
tables_and_views.views = co_await schema_diff_per_shard::copy_from(
|
||||
db, _types_storage, views_frozen);
|
||||
});
|
||||
|
||||
auto& db = _proxy.local().get_db();
|
||||
co_await max_concurrent_for_each(local_views.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
|
||||
co_await max_concurrent_for_each(local_views.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
|
||||
auto uuid = dt->id();
|
||||
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
_affected_tables_and_views.table_shards.insert({uuid,
|
||||
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
});
|
||||
co_await max_concurrent_for_each(local_tables.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
|
||||
co_await max_concurrent_for_each(local_tables.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
|
||||
auto uuid = dt->id();
|
||||
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
_affected_tables_and_views.table_shards.insert({uuid,
|
||||
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
});
|
||||
co_await max_concurrent_for_each(local_cdc.dropped, max_concurrent, [&db, this](schema_ptr& dt) -> future<> {
|
||||
co_await max_concurrent_for_each(local_cdc.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
|
||||
auto uuid = dt->id();
|
||||
_affected_tables_and_views.table_shards.insert({uuid, co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
_affected_tables_and_views.table_shards.insert({uuid,
|
||||
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -699,8 +719,8 @@ future<frozen_schema_diff> schema_diff_per_shard::freeze() const {
|
||||
}
|
||||
for (const auto& a : altered) {
|
||||
result.altered.push_back(frozen_schema_diff::altered_schema{
|
||||
.old_schema = extended_frozen_schema(a.old_schema),
|
||||
.new_schema = extended_frozen_schema(a.new_schema),
|
||||
.old_schema = extended_frozen_schema(a.old_schema),
|
||||
.new_schema = extended_frozen_schema(a.new_schema),
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
@@ -723,8 +743,8 @@ future<schema_diff_per_shard> schema_diff_per_shard::copy_from(replica::database
|
||||
}
|
||||
for (const auto& a : oth.altered) {
|
||||
result.altered.push_back(schema_diff_per_shard::altered_schema{
|
||||
.old_schema = a.old_schema.unfreeze(commited_ctxt),
|
||||
.new_schema = a.new_schema.unfreeze(ctxt),
|
||||
.old_schema = a.old_schema.unfreeze(commited_ctxt),
|
||||
.new_schema = a.new_schema.unfreeze(ctxt),
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
@@ -738,7 +758,7 @@ future<schema_diff_per_shard> schema_diff_per_shard::copy_from(replica::database
|
||||
|
||||
static future<> notify_tables_and_views(service::migration_notifier& notifier, const affected_tables_and_views& diff) {
|
||||
auto it = diff.tables_and_views.local().columns_changed.cbegin();
|
||||
auto notify = [&](auto& r, auto&& f) -> future<> {
|
||||
auto notify = [&] (auto& r, auto&& f) -> future<> {
|
||||
co_await max_concurrent_for_each(r, max_concurrent, std::move(f));
|
||||
};
|
||||
|
||||
@@ -747,41 +767,24 @@ static future<> notify_tables_and_views(service::migration_notifier& notifier, c
|
||||
const auto& views = diff.tables_and_views.local().views;
|
||||
|
||||
// View drops are notified first, because a table can only be dropped if its views are already deleted
|
||||
co_await notify(views.dropped, [&](auto&& dt) {
|
||||
return notifier.drop_view(view_ptr(dt));
|
||||
});
|
||||
co_await notify(tables.dropped, [&](auto&& dt) {
|
||||
return notifier.drop_column_family(dt);
|
||||
});
|
||||
co_await notify(cdc.dropped, [&](auto&& dt) {
|
||||
return notifier.drop_column_family(dt);
|
||||
});
|
||||
co_await notify(views.dropped, [&] (auto&& dt) { return notifier.drop_view(view_ptr(dt)); });
|
||||
co_await notify(tables.dropped, [&] (auto&& dt) { return notifier.drop_column_family(dt); });
|
||||
co_await notify(cdc.dropped, [&] (auto&& dt) { return notifier.drop_column_family(dt); });
|
||||
// Table creations are notified first, in case a view is created right after the table
|
||||
co_await notify(tables.created, [&](auto&& gs) {
|
||||
return notifier.create_column_family(gs);
|
||||
});
|
||||
co_await notify(cdc.created, [&](auto&& gs) {
|
||||
return notifier.create_column_family(gs);
|
||||
});
|
||||
co_await notify(views.created, [&](auto&& gs) {
|
||||
return notifier.create_view(view_ptr(gs));
|
||||
});
|
||||
co_await notify(tables.created, [&] (auto&& gs) { return notifier.create_column_family(gs); });
|
||||
co_await notify(cdc.created, [&] (auto&& gs) { return notifier.create_column_family(gs); });
|
||||
co_await notify(views.created, [&] (auto&& gs) { return notifier.create_view(view_ptr(gs)); });
|
||||
// Table altering is notified first, in case new base columns appear
|
||||
co_await notify(tables.altered, [&](auto&& altered) {
|
||||
return notifier.update_column_family(altered.new_schema, *it++);
|
||||
});
|
||||
co_await notify(cdc.altered, [&](auto&& altered) {
|
||||
return notifier.update_column_family(altered.new_schema, *it++);
|
||||
});
|
||||
co_await notify(views.altered, [&](auto&& altered) {
|
||||
return notifier.update_view(view_ptr(altered.new_schema), *it++);
|
||||
});
|
||||
co_await notify(tables.altered, [&] (auto&& altered) { return notifier.update_column_family(altered.new_schema, *it++); });
|
||||
co_await notify(cdc.altered, [&] (auto&& altered) { return notifier.update_column_family(altered.new_schema, *it++); });
|
||||
co_await notify(views.altered, [&] (auto&& altered) { return notifier.update_view(view_ptr(altered.new_schema), *it++); });
|
||||
}
|
||||
|
||||
static void drop_cached_func(replica::database& db, const query::result_set_row& row) {
|
||||
auto language = row.get_nonnull<sstring>("language");
|
||||
if (language == "wasm") {
|
||||
cql3::functions::function_name name{row.get_nonnull<sstring>("keyspace_name"), row.get_nonnull<sstring>("function_name")};
|
||||
cql3::functions::function_name name{
|
||||
row.get_nonnull<sstring>("keyspace_name"), row.get_nonnull<sstring>("function_name")};
|
||||
auto arg_types = read_arg_types(row, name.keyspace, db.user_types());
|
||||
db.lang().remove(name, arg_types);
|
||||
}
|
||||
@@ -790,13 +793,14 @@ static void drop_cached_func(replica::database& db, const query::result_set_row&
|
||||
future<> schema_applier::merge_functions() {
|
||||
auto diff = diff_rows(_before.functions, _after.functions);
|
||||
co_await _functions_batch.start();
|
||||
co_await _functions_batch.invoke_on_all(coroutine::lambda([&](cql3::functions::change_batch& batch) -> future<> {
|
||||
co_await _functions_batch.invoke_on_all(coroutine::lambda([&] (cql3::functions::change_batch& batch) -> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
for (const auto& val : diff.created) {
|
||||
batch.add_function(co_await create_func(db, *val, _types_storage.local()));
|
||||
}
|
||||
for (const auto& val : diff.dropped) {
|
||||
cql3::functions::function_name name{val->get_nonnull<sstring>("keyspace_name"), val->get_nonnull<sstring>("function_name")};
|
||||
cql3::functions::function_name name{
|
||||
val->get_nonnull<sstring>("keyspace_name"), val->get_nonnull<sstring>("function_name")};
|
||||
auto commited_storage = _types_storage.local().committed_storage();
|
||||
auto arg_types = read_arg_types(*val, name.keyspace, *commited_storage);
|
||||
// as we don't yield between dropping cache and committing batch
|
||||
@@ -814,13 +818,14 @@ future<> schema_applier::merge_functions() {
|
||||
future<> schema_applier::merge_aggregates() {
|
||||
auto diff = diff_aggregates_rows(_before.aggregates, _after.aggregates, _before.scylla_aggregates, _after.scylla_aggregates);
|
||||
|
||||
co_await _functions_batch.invoke_on_all([&](cql3::functions::change_batch& batch) -> future<> {
|
||||
co_await _functions_batch.invoke_on_all([&] (cql3::functions::change_batch& batch)-> future<> {
|
||||
auto& db = _proxy.local().get_db().local();
|
||||
for (const auto& val : diff.created) {
|
||||
batch.add_function(create_aggregate(db, *val.first, val.second, batch, _types_storage.local()));
|
||||
}
|
||||
for (const auto& val : diff.dropped) {
|
||||
cql3::functions::function_name name{val.first->get_nonnull<sstring>("keyspace_name"), val.first->get_nonnull<sstring>("aggregate_name")};
|
||||
cql3::functions::function_name name{
|
||||
val.first->get_nonnull<sstring>("keyspace_name"), val.first->get_nonnull<sstring>("aggregate_name")};
|
||||
auto commited_storage = _types_storage.local().committed_storage();
|
||||
auto arg_types = read_arg_types(*val.first, name.keyspace, *commited_storage);
|
||||
batch.remove_aggregate(name, arg_types);
|
||||
@@ -855,15 +860,15 @@ future<schema_persisted_state> schema_applier::get_schema_persisted_state() {
|
||||
auto [tables, cdc] = extract_cdc(std::move(tables_and_cdc));
|
||||
|
||||
schema_persisted_state v{
|
||||
.keyspaces = co_await read_schema_for_keyspaces(_proxy, KEYSPACES, _keyspaces),
|
||||
.scylla_keyspaces = co_await read_schema_for_keyspaces(_proxy, SCYLLA_KEYSPACES, _keyspaces),
|
||||
.tables = std::move(tables),
|
||||
.types = co_await read_schema_for_keyspaces(_proxy, TYPES, _keyspaces),
|
||||
.views = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::view, _affected_tables),
|
||||
.cdc = std::move(cdc),
|
||||
.functions = co_await read_schema_for_keyspaces(_proxy, FUNCTIONS, _keyspaces),
|
||||
.aggregates = co_await read_schema_for_keyspaces(_proxy, AGGREGATES, _keyspaces),
|
||||
.scylla_aggregates = co_await read_schema_for_keyspaces(_proxy, SCYLLA_AGGREGATES, _keyspaces),
|
||||
.keyspaces = co_await read_schema_for_keyspaces(_proxy, KEYSPACES, _keyspaces),
|
||||
.scylla_keyspaces = co_await read_schema_for_keyspaces(_proxy, SCYLLA_KEYSPACES, _keyspaces),
|
||||
.tables = std::move(tables),
|
||||
.types = co_await read_schema_for_keyspaces(_proxy, TYPES, _keyspaces),
|
||||
.views = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::view, _affected_tables),
|
||||
.cdc = std::move(cdc),
|
||||
.functions = co_await read_schema_for_keyspaces(_proxy, FUNCTIONS, _keyspaces),
|
||||
.aggregates = co_await read_schema_for_keyspaces(_proxy, AGGREGATES, _keyspaces),
|
||||
.scylla_aggregates = co_await read_schema_for_keyspaces(_proxy, SCYLLA_AGGREGATES, _keyspaces),
|
||||
};
|
||||
co_return v;
|
||||
}
|
||||
@@ -919,11 +924,10 @@ class pending_schema_getter : public service::schema_getter {
|
||||
private:
|
||||
schema_applier& _sa;
|
||||
sharded<replica::database>& _db;
|
||||
|
||||
public:
|
||||
pending_schema_getter(schema_applier& sa)
|
||||
: _sa(sa)
|
||||
, _db(sa._proxy.local().get_db()) {};
|
||||
pending_schema_getter(schema_applier& sa) :
|
||||
_sa(sa), _db(sa._proxy.local().get_db()) {
|
||||
};
|
||||
|
||||
virtual flat_hash_map<sstring, locator::replication_strategy_ptr> get_keyspaces_replication() const override {
|
||||
flat_hash_map<sstring, locator::replication_strategy_ptr> out;
|
||||
@@ -985,7 +989,8 @@ future<> schema_applier::update_tablets() {
|
||||
if (_tablet_hint) {
|
||||
slogger.info("Tablet metadata changed");
|
||||
pending_schema_getter getter{*this};
|
||||
_token_metadata_change = co_await _ss.local().prepare_token_metadata_change(_pending_token_metadata.local(), getter);
|
||||
_token_metadata_change = co_await _ss.local().prepare_token_metadata_change(
|
||||
_pending_token_metadata.local(), getter);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -994,7 +999,8 @@ future<> schema_applier::update_tablets() {
|
||||
future<> schema_applier::load_mutable_token_metadata() {
|
||||
locator::mutable_token_metadata_ptr current_token_metadata = co_await _ss.local().get_mutable_token_metadata_ptr();
|
||||
if (_tablet_hint) {
|
||||
auto new_token_metadata = co_await _ss.local().prepare_tablet_metadata(_tablet_hint, current_token_metadata);
|
||||
auto new_token_metadata = co_await _ss.local().prepare_tablet_metadata(
|
||||
_tablet_hint, current_token_metadata);
|
||||
co_return co_await _pending_token_metadata.assign(new_token_metadata);
|
||||
}
|
||||
co_await _pending_token_metadata.assign(current_token_metadata);
|
||||
@@ -1109,13 +1115,14 @@ future<> schema_applier::commit() {
|
||||
// However, we can only acquire the (write) lock after preparing all
|
||||
// entities for the pending schema change that need to iterate over tables_metadata;
|
||||
// otherwise, such iteration would deadlock.
|
||||
_metadata_locks = std::make_unique<replica::tables_metadata_lock_on_all_shards>(co_await replica::database::lock_tables_metadata(sharded_db));
|
||||
_metadata_locks = std::make_unique<replica::tables_metadata_lock_on_all_shards>(
|
||||
co_await replica::database::lock_tables_metadata(sharded_db));
|
||||
// Run func first on shard 0
|
||||
// to allow "seeding" of the effective_replication_map
|
||||
// with a new e_r_m instance.
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
commit_on_shard(sharded_db.local());
|
||||
co_await sharded_db.invoke_on_others([this](replica::database& db) {
|
||||
co_await sharded_db.invoke_on_others([this] (replica::database& db) {
|
||||
commit_on_shard(db);
|
||||
});
|
||||
// unlock as some functions in post_commit() may read data under those locks
|
||||
@@ -1147,11 +1154,12 @@ future<> schema_applier::finalize_tables_and_views() {
|
||||
|
||||
if (_tablet_hint) {
|
||||
auto& db = sharded_db.local();
|
||||
co_await db.get_compaction_manager().get_shared_tombstone_gc_state().flush_pending_repair_time_update(db);
|
||||
co_await db.get_compaction_manager().get_shared_tombstone_gc_state().
|
||||
flush_pending_repair_time_update(db);
|
||||
_ss.local().wake_up_topology_state_machine();
|
||||
}
|
||||
|
||||
co_await sharded_db.invoke_on_all([&diff](replica::database& db) -> future<> {
|
||||
co_await sharded_db.invoke_on_all([&diff] (replica::database& db) -> future<> {
|
||||
const auto& tables = diff.tables_and_views.local().tables;
|
||||
const auto& cdc = diff.tables_and_views.local().cdc;
|
||||
const auto& views = diff.tables_and_views.local().views;
|
||||
@@ -1176,14 +1184,15 @@ future<> schema_applier::finalize_tables_and_views() {
|
||||
//
|
||||
// Drop column mapping entries for dropped tables since these will not be TTLed automatically
|
||||
// and will stay there forever if we don't clean them up manually
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.created, max_concurrent, [this](const schema_ptr& gs) -> future<> {
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.created, max_concurrent, [this] (const schema_ptr& gs) -> future<> {
|
||||
co_await store_column_mapping(_proxy, gs, false);
|
||||
});
|
||||
co_await max_concurrent_for_each(
|
||||
diff.tables_and_views.local().tables.altered, max_concurrent, [this](const schema_diff_per_shard::altered_schema& altered) -> future<> {
|
||||
co_await when_all_succeed(store_column_mapping(_proxy, altered.old_schema, true), store_column_mapping(_proxy, altered.new_schema, false));
|
||||
});
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.dropped, max_concurrent, [this](const schema_ptr& s) -> future<> {
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.altered, max_concurrent, [this] (const schema_diff_per_shard::altered_schema& altered) -> future<> {
|
||||
co_await when_all_succeed(
|
||||
store_column_mapping(_proxy, altered.old_schema, true),
|
||||
store_column_mapping(_proxy, altered.new_schema, false));
|
||||
});
|
||||
co_await max_concurrent_for_each(diff.tables_and_views.local().tables.dropped, max_concurrent, [this] (const schema_ptr& s) -> future<> {
|
||||
co_await drop_column_mapping(_sys_ks.local(), s->id(), s->version());
|
||||
});
|
||||
}
|
||||
@@ -1191,7 +1200,7 @@ future<> schema_applier::finalize_tables_and_views() {
|
||||
future<> schema_applier::post_commit() {
|
||||
co_await finalize_tables_and_views();
|
||||
auto& sharded_db = _proxy.local().get_db();
|
||||
co_await sharded_db.invoke_on_all([&](replica::database& db) -> future<> {
|
||||
co_await sharded_db.invoke_on_all([&] (replica::database& db) -> future<> {
|
||||
auto& notifier = db.get_notifier();
|
||||
// notify about keyspaces
|
||||
for (const auto& name : _affected_keyspaces.names.created) {
|
||||
@@ -1251,8 +1260,8 @@ static future<> execute_do_merge_schema(sharded<service::storage_proxy>& proxy,
|
||||
co_await ap.post_commit();
|
||||
}
|
||||
|
||||
static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, sharded<db::system_keyspace>& sys_ks,
|
||||
utils::chunked_vector<mutation> mutations, bool reload) {
|
||||
static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, sharded<db::system_keyspace>& sys_ks, utils::chunked_vector<mutation> mutations, bool reload)
|
||||
{
|
||||
slogger.trace("do_merge_schema: {}", mutations);
|
||||
schema_applier ap(proxy, ss, sys_ks, reload);
|
||||
co_await execute_do_merge_schema(proxy, ap, std::move(mutations)).finally([&ap]() {
|
||||
@@ -1269,22 +1278,22 @@ static future<> do_merge_schema(sharded<service::storage_proxy>& proxy, sharded<
|
||||
* @throws ConfigurationException If one of metadata attributes has invalid value
|
||||
* @throws IOException If data was corrupted during transportation or failed to apply fs operations
|
||||
*/
|
||||
future<> merge_schema(sharded<db::system_keyspace>& sys_ks, sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss,
|
||||
utils::chunked_vector<mutation> mutations, bool reload) {
|
||||
future<> merge_schema(sharded<db::system_keyspace>& sys_ks, sharded<service::storage_proxy>& proxy, sharded<service::storage_service>& ss, utils::chunked_vector<mutation> mutations, bool reload)
|
||||
{
|
||||
if (this_shard_id() != 0) {
|
||||
// mutations must be applied on the owning shard (0).
|
||||
co_await smp::submit_to(0, coroutine::lambda([&, fmuts = freeze(mutations)]() mutable -> future<> {
|
||||
co_await smp::submit_to(0, coroutine::lambda([&, fmuts = freeze(mutations)] () mutable -> future<> {
|
||||
co_await merge_schema(sys_ks, proxy, ss, co_await unfreeze_gently(fmuts), reload);
|
||||
}));
|
||||
co_return;
|
||||
}
|
||||
co_await with_merge_lock([&]() mutable -> future<> {
|
||||
co_await with_merge_lock([&] () mutable -> future<> {
|
||||
co_await do_merge_schema(proxy, ss, sys_ks, std::move(mutations), reload);
|
||||
auto version = co_await get_group0_schema_version(sys_ks.local());
|
||||
co_await update_schema_version_and_announce(sys_ks, proxy, version);
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace schema_tables
|
||||
}
|
||||
|
||||
} // namespace db
|
||||
}
|
||||
|
||||
@@ -336,8 +336,6 @@ schema_ptr scylla_tables(schema_features features) {
|
||||
// since it is written to only after the cluster feature is enabled.
|
||||
sb.with_column("tablets", map_type_impl::get_instance(utf8_type, utf8_type, false));
|
||||
|
||||
sb.with_column("storage_engine", utf8_type);
|
||||
|
||||
sb.with_hash_version();
|
||||
s = sb.build();
|
||||
}
|
||||
@@ -1678,9 +1676,6 @@ mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type times
|
||||
m.set_clustered_cell(ckey, cdef, make_map_mutation(map, cdef, timestamp));
|
||||
}
|
||||
}
|
||||
if (table->logstor_enabled()) {
|
||||
m.set_clustered_cell(ckey, "storage_engine", "logstor", timestamp);
|
||||
}
|
||||
// In-memory tables are deprecated since scylla-2024.1.0
|
||||
// FIXME: delete the column when there's no live version supporting it anymore.
|
||||
// Writing it here breaks upgrade rollback to versions that do not support the in_memory schema_feature
|
||||
@@ -2166,13 +2161,6 @@ static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, sche
|
||||
auto tablet_options = db::tablet_options(*opt_map);
|
||||
builder.set_tablet_options(tablet_options.to_map());
|
||||
}
|
||||
if (auto storage_engine = table_row.get<sstring>("storage_engine")) {
|
||||
if (*storage_engine == "logstor") {
|
||||
builder.set_logstor();
|
||||
} else {
|
||||
throw std::invalid_argument(format("Invalid value for storage_engine: {}", *storage_engine));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, const data_dictionary::user_types_storage& user_types, schema_ptr cdc_schema, std::optional<table_schema_version> version)
|
||||
|
||||
@@ -3052,7 +3052,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
const bool tablet_balancing_not_supported = _db.features().strongly_consistent_tables || _db.features().logstor;
|
||||
const bool strongly_consistent_tables = _db.features().strongly_consistent_tables;
|
||||
|
||||
for (auto& row : *rs) {
|
||||
if (!row.has("host_id")) {
|
||||
@@ -3289,7 +3289,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
ret.session = service::session_id(some_row.get_as<utils::UUID>("session"));
|
||||
}
|
||||
|
||||
if (tablet_balancing_not_supported) {
|
||||
if (strongly_consistent_tables) {
|
||||
ret.tablet_balancing_enabled = false;
|
||||
} else if (some_row.has("tablet_balancing_enabled")) {
|
||||
ret.tablet_balancing_enabled = some_row.get_as<bool>("tablet_balancing_enabled");
|
||||
|
||||
@@ -2647,7 +2647,7 @@ future<> view_builder::add_new_view(view_ptr view, build_step& step) {
|
||||
}
|
||||
|
||||
if (this_shard_id() == smp::count - 1) {
|
||||
inject_failure("add_new_view_fail_last_shard");
|
||||
co_await utils::get_local_injector().inject("add_new_view_pause_last_shard", utils::wait_for_message(5min));
|
||||
}
|
||||
|
||||
co_await _sys_ks.register_view_for_building(view->ks_name(), view->cf_name(), step.current_token());
|
||||
|
||||
@@ -29,8 +29,8 @@ static logging::logger blogger("boot_strapper");
|
||||
|
||||
namespace dht {
|
||||
|
||||
future<> boot_strapper::bootstrap(
|
||||
streaming::stream_reason reason, gms::gossiper& gossiper, service::frozen_topology_guard topo_guard, locator::host_id replace_address) {
|
||||
future<> boot_strapper::bootstrap(streaming::stream_reason reason, gms::gossiper& gossiper, service::frozen_topology_guard topo_guard,
|
||||
locator::host_id replace_address) {
|
||||
blogger.debug("Beginning bootstrap process: sorted_tokens={}", get_token_metadata().sorted_tokens());
|
||||
sstring description;
|
||||
if (reason == streaming::stream_reason::bootstrap) {
|
||||
@@ -41,8 +41,7 @@ future<> boot_strapper::bootstrap(
|
||||
throw std::runtime_error("Wrong stream_reason provided: it can only be replace or bootstrap");
|
||||
}
|
||||
try {
|
||||
auto streamer = make_lw_shared<range_streamer>(
|
||||
_db, _stream_manager, _token_metadata_ptr, _abort_source, _tokens, _address, _dr, description, reason, topo_guard);
|
||||
auto streamer = make_lw_shared<range_streamer>(_db, _stream_manager, _token_metadata_ptr, _abort_source, _tokens, _address, _dr, description, reason, topo_guard);
|
||||
auto nodes_to_filter = gossiper.get_unreachable_members();
|
||||
if (reason == streaming::stream_reason::replace) {
|
||||
nodes_to_filter.insert(std::move(replace_address));
|
||||
@@ -72,8 +71,7 @@ std::unordered_set<token> boot_strapper::get_random_bootstrap_tokens(const token
|
||||
}
|
||||
|
||||
if (num_tokens == 1) {
|
||||
blogger.warn(
|
||||
"Picking random token for a single vnode. You should probably add more vnodes; failing that, you should probably specify the token manually");
|
||||
blogger.warn("Picking random token for a single vnode. You should probably add more vnodes; failing that, you should probably specify the token manually");
|
||||
}
|
||||
|
||||
auto tokens = get_random_tokens(std::move(tmptr), num_tokens);
|
||||
@@ -88,8 +86,7 @@ std::unordered_set<token> boot_strapper::get_bootstrap_tokens(token_metadata_ptr
|
||||
return get_bootstrap_tokens(std::move(tmptr), cfg.initial_token(), cfg.num_tokens(), check);
|
||||
}
|
||||
|
||||
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(
|
||||
const token_metadata_ptr tmptr, sstring tokens_string, uint32_t num_tokens, check_token_endpoint check) {
|
||||
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(const token_metadata_ptr tmptr, sstring tokens_string, uint32_t num_tokens, check_token_endpoint check) {
|
||||
std::unordered_set<sstring> initial_tokens;
|
||||
try {
|
||||
boost::split(initial_tokens, tokens_string, boost::is_any_of(sstring(", ")));
|
||||
@@ -105,8 +102,7 @@ std::unordered_set<token> boot_strapper::get_bootstrap_tokens(
|
||||
for (auto& token_string : initial_tokens) {
|
||||
auto token = dht::token::from_sstring(token_string);
|
||||
if (check && tmptr->get_endpoint(token)) {
|
||||
throw std::runtime_error(
|
||||
format("Bootstrapping to existing token {} is not allowed (decommission/removenode the old node first).", token_string));
|
||||
throw std::runtime_error(format("Bootstrapping to existing token {} is not allowed (decommission/removenode the old node first).", token_string));
|
||||
}
|
||||
tokens.insert(token);
|
||||
}
|
||||
|
||||
@@ -26,9 +26,10 @@ static logging::logger logger("range_streamer");
|
||||
|
||||
using inet_address = gms::inet_address;
|
||||
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector> range_streamer::get_range_fetch_map(
|
||||
const std::unordered_map<dht::token_range, std::vector<locator::host_id>>& ranges_with_sources,
|
||||
const std::unordered_set<std::unique_ptr<i_source_filter>>& source_filters, const sstring& keyspace) {
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector>
|
||||
range_streamer::get_range_fetch_map(const std::unordered_map<dht::token_range, std::vector<locator::host_id>>& ranges_with_sources,
|
||||
const std::unordered_set<std::unique_ptr<i_source_filter>>& source_filters,
|
||||
const sstring& keyspace) {
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map_map;
|
||||
const auto& topo = _token_metadata_ptr->get_topology();
|
||||
for (const auto& x : ranges_with_sources) {
|
||||
@@ -78,8 +79,8 @@ std::unordered_map<locator::host_id, dht::token_range_vector> range_streamer::ge
|
||||
}
|
||||
|
||||
// Must be called from a seastar thread
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_streamer::get_all_ranges_with_sources_for(
|
||||
const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges) {
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>>
|
||||
range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges) {
|
||||
logger.debug("{} ks={}", __func__, keyspace_name);
|
||||
|
||||
auto range_addresses = erm->get_range_host_ids().get();
|
||||
@@ -113,24 +114,24 @@ std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_stream
|
||||
}
|
||||
|
||||
// Must be called from a seastar thread
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_streamer::get_all_ranges_with_strict_sources_for(
|
||||
const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges, gms::gossiper& gossiper) {
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>>
|
||||
range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_name, const locator::vnode_effective_replication_map* erm, dht::token_range_vector desired_ranges, gms::gossiper& gossiper) {
|
||||
logger.debug("{} ks={}", __func__, keyspace_name);
|
||||
SCYLLA_ASSERT(_tokens.empty() == false);
|
||||
SCYLLA_ASSERT (_tokens.empty() == false);
|
||||
|
||||
auto& strat = erm->get_replication_strategy();
|
||||
|
||||
// Active ranges
|
||||
//Active ranges
|
||||
auto metadata_clone = get_token_metadata().clone_only_token_map().get();
|
||||
auto range_addresses = strat.get_range_host_ids(metadata_clone).get();
|
||||
|
||||
// Pending ranges
|
||||
//Pending ranges
|
||||
metadata_clone.update_topology(_address, _dr);
|
||||
metadata_clone.update_normal_tokens(_tokens, _address).get();
|
||||
auto pending_range_addresses = strat.get_range_host_ids(metadata_clone).get();
|
||||
auto pending_range_addresses = strat.get_range_host_ids(metadata_clone).get();
|
||||
metadata_clone.clear_gently().get();
|
||||
|
||||
// Collects the source that will have its range moved to the new node
|
||||
//Collects the source that will have its range moved to the new node
|
||||
std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_sources;
|
||||
|
||||
logger.debug("keyspace={}, desired_ranges.size={}, range_addresses.size={}", keyspace_name, desired_ranges.size(), range_addresses.size());
|
||||
@@ -149,12 +150,11 @@ std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_stream
|
||||
}
|
||||
|
||||
std::unordered_set<locator::host_id> new_endpoints(it->second.begin(), it->second.end());
|
||||
// Due to CASSANDRA-5953 we can have a higher RF then we have endpoints.
|
||||
// So we need to be careful to only be strict when endpoints == RF
|
||||
//Due to CASSANDRA-5953 we can have a higher RF then we have endpoints.
|
||||
//So we need to be careful to only be strict when endpoints == RF
|
||||
if (old_endpoints.size() == erm->get_replication_factor()) {
|
||||
std::erase_if(old_endpoints, [&new_endpoints](locator::host_id ep) {
|
||||
return new_endpoints.contains(ep);
|
||||
});
|
||||
std::erase_if(old_endpoints,
|
||||
[&new_endpoints] (locator::host_id ep) { return new_endpoints.contains(ep); });
|
||||
if (old_endpoints.size() != 1) {
|
||||
throw std::runtime_error(format("Expected 1 endpoint but found {:d}", old_endpoints.size()));
|
||||
}
|
||||
@@ -163,7 +163,7 @@ std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_stream
|
||||
}
|
||||
}
|
||||
|
||||
// Validate
|
||||
//Validate
|
||||
auto it = range_sources.find(desired_range);
|
||||
if (it == range_sources.end()) {
|
||||
throw std::runtime_error(format("No sources found for {}", desired_range));
|
||||
@@ -176,9 +176,7 @@ std::unordered_map<dht::token_range, std::vector<locator::host_id>> range_stream
|
||||
locator::host_id source_id = it->second.front();
|
||||
|
||||
if (gossiper.is_enabled() && !gossiper.is_alive(source_id)) {
|
||||
throw std::runtime_error(format("A node required to move the data consistently is down ({}). If you wish to move the data from a potentially "
|
||||
"inconsistent replica, restart the node with consistent_rangemovement=false",
|
||||
source_id));
|
||||
throw std::runtime_error(format("A node required to move the data consistently is down ({}). If you wish to move the data from a potentially inconsistent replica, restart the node with consistent_rangemovement=false", source_id));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,8 +188,12 @@ bool range_streamer::use_strict_sources_for_ranges(const sstring& keyspace_name,
|
||||
auto nr_nodes_in_ring = get_token_metadata().get_normal_token_owners().size();
|
||||
bool everywhere_topology = erm.get_replication_strategy().get_type() == locator::replication_strategy_type::everywhere_topology;
|
||||
// Use strict when number of nodes in the ring is equal or more than RF
|
||||
auto strict = _db.local().get_config().consistent_rangemovement() && !_tokens.empty() && !everywhere_topology && nr_nodes_in_ring >= rf;
|
||||
logger.debug("use_strict_sources_for_ranges: ks={}, nr_nodes_in_ring={}, rf={}, strict={}", keyspace_name, nr_nodes_in_ring, rf, strict);
|
||||
auto strict = _db.local().get_config().consistent_rangemovement()
|
||||
&& !_tokens.empty()
|
||||
&& !everywhere_topology
|
||||
&& nr_nodes_in_ring >= rf;
|
||||
logger.debug("use_strict_sources_for_ranges: ks={}, nr_nodes_in_ring={}, rf={}, strict={}",
|
||||
keyspace_name, nr_nodes_in_ring, rf, strict);
|
||||
return strict;
|
||||
}
|
||||
|
||||
@@ -212,36 +214,34 @@ void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_
|
||||
}
|
||||
|
||||
// TODO: This is the legacy range_streamer interface, it is add_rx_ranges which adds rx ranges.
|
||||
future<> range_streamer::add_ranges(const sstring& keyspace_name, locator::static_effective_replication_map_ptr erm, dht::token_range_vector ranges,
|
||||
gms::gossiper& gossiper, bool is_replacing) {
|
||||
return seastar::async([this, keyspace_name, ermp = std::move(erm), ranges = std::move(ranges), &gossiper, is_replacing]() mutable {
|
||||
if (_nr_tx_added) {
|
||||
throw std::runtime_error("Mixed sending and receiving is not supported");
|
||||
}
|
||||
_nr_rx_added++;
|
||||
auto erm = ermp->maybe_as_vnode_effective_replication_map();
|
||||
SCYLLA_ASSERT(erm != nullptr);
|
||||
auto ranges_for_keyspace = !is_replacing && use_strict_sources_for_ranges(keyspace_name, *erm)
|
||||
? get_all_ranges_with_strict_sources_for(keyspace_name, erm, std::move(ranges), gossiper)
|
||||
: get_all_ranges_with_sources_for(keyspace_name, erm, std::move(ranges));
|
||||
future<> range_streamer::add_ranges(const sstring& keyspace_name, locator::static_effective_replication_map_ptr erm, dht::token_range_vector ranges, gms::gossiper& gossiper, bool is_replacing) {
|
||||
return seastar::async([this, keyspace_name, ermp = std::move(erm), ranges= std::move(ranges), &gossiper, is_replacing] () mutable {
|
||||
if (_nr_tx_added) {
|
||||
throw std::runtime_error("Mixed sending and receiving is not supported");
|
||||
}
|
||||
_nr_rx_added++;
|
||||
auto erm = ermp->maybe_as_vnode_effective_replication_map();
|
||||
SCYLLA_ASSERT(erm != nullptr);
|
||||
auto ranges_for_keyspace = !is_replacing && use_strict_sources_for_ranges(keyspace_name, *erm)
|
||||
? get_all_ranges_with_strict_sources_for(keyspace_name, erm, std::move(ranges), gossiper)
|
||||
: get_all_ranges_with_sources_for(keyspace_name, erm, std::move(ranges));
|
||||
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
for (auto& x : ranges_for_keyspace) {
|
||||
logger.debug("{} : keyspace {} range {} exists on {}", _description, keyspace_name, x.first, x.second);
|
||||
}
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
for (auto& x : ranges_for_keyspace) {
|
||||
logger.debug("{} : keyspace {} range {} exists on {}", _description, keyspace_name, x.first, x.second);
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map =
|
||||
get_range_fetch_map(ranges_for_keyspace, _source_filters, keyspace_name);
|
||||
utils::clear_gently(ranges_for_keyspace).get();
|
||||
std::unordered_map<locator::host_id, dht::token_range_vector> range_fetch_map = get_range_fetch_map(ranges_for_keyspace, _source_filters, keyspace_name);
|
||||
utils::clear_gently(ranges_for_keyspace).get();
|
||||
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
for (auto& x : range_fetch_map) {
|
||||
logger.debug("{} : keyspace={}, ranges={} from source={}, range_size={}", _description, keyspace_name, x.second, x.first, x.second.size());
|
||||
}
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
for (auto& x : range_fetch_map) {
|
||||
logger.debug("{} : keyspace={}, ranges={} from source={}, range_size={}", _description, keyspace_name, x.second, x.first, x.second.size());
|
||||
}
|
||||
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
|
||||
});
|
||||
}
|
||||
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
|
||||
});
|
||||
}
|
||||
|
||||
future<> range_streamer::stream_async() {
|
||||
@@ -250,73 +250,73 @@ future<> range_streamer::stream_async() {
|
||||
_token_metadata_ptr = nullptr;
|
||||
logger.info("{} starts, nr_ranges_remaining={}", _description, _nr_ranges_remaining);
|
||||
auto start = lowres_clock::now();
|
||||
return do_for_each(_to_stream, [this, description = _description](auto& stream) {
|
||||
return do_for_each(_to_stream, [this, description = _description] (auto& stream) {
|
||||
const auto& keyspace = stream.first;
|
||||
auto& ip_range_vec = stream.second;
|
||||
auto ips = ip_range_vec | std::views::keys | std::ranges::to<std::list>();
|
||||
// Fetch from or send to peer node in parallel
|
||||
logger.info("{} with {} for keyspace={} started, nodes_to_stream={}", description, ips, keyspace, ip_range_vec.size());
|
||||
return parallel_for_each(ip_range_vec, [this, description, keyspace](auto& ip_range) {
|
||||
auto& source = ip_range.first;
|
||||
auto& range_vec = ip_range.second;
|
||||
return seastar::with_semaphore(_limiter, 1, [this, description, keyspace, source, &range_vec]() mutable {
|
||||
return seastar::async([this, description, keyspace, source, &range_vec]() mutable {
|
||||
// TODO: It is better to use fiber instead of thread here because
|
||||
// creating a thread per peer can be some memory in a large cluster.
|
||||
auto start_time = lowres_clock::now();
|
||||
unsigned sp_index = 0;
|
||||
unsigned nr_ranges_streamed = 0;
|
||||
size_t nr_ranges_total = range_vec.size();
|
||||
auto do_streaming = [&](dht::token_range_vector&& ranges_to_stream) {
|
||||
auto sp = stream_plan(_stream_manager.local(), format("{}-{}-index-{:d}", description, keyspace, sp_index++), _reason, _topo_guard);
|
||||
auto abort_listener = _abort_source.subscribe([&]() noexcept {
|
||||
sp.abort();
|
||||
});
|
||||
_abort_source.check();
|
||||
logger.info("{} with {} for keyspace={}, streaming [{}, {}) out of {} ranges", description, source, keyspace, nr_ranges_streamed,
|
||||
nr_ranges_streamed + ranges_to_stream.size(), nr_ranges_total);
|
||||
auto ranges_streamed = ranges_to_stream.size();
|
||||
if (_nr_rx_added) {
|
||||
sp.request_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
|
||||
} else if (_nr_tx_added) {
|
||||
sp.transfer_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
|
||||
}
|
||||
sp.execute().discard_result().get();
|
||||
// Update finished percentage
|
||||
nr_ranges_streamed += ranges_streamed;
|
||||
_nr_ranges_remaining -= ranges_streamed;
|
||||
float percentage = _nr_total_ranges == 0 ? 1 : (_nr_total_ranges - _nr_ranges_remaining) / (float)_nr_total_ranges;
|
||||
_stream_manager.local().update_finished_percentage(_reason, percentage);
|
||||
logger.info("Finished {} out of {} ranges for {}, finished percentage={}", _nr_total_ranges - _nr_ranges_remaining, _nr_total_ranges,
|
||||
_reason, percentage);
|
||||
};
|
||||
dht::token_range_vector ranges_to_stream;
|
||||
try {
|
||||
for (auto it = range_vec.begin(); it < range_vec.end();) {
|
||||
ranges_to_stream.push_back(*it);
|
||||
++it;
|
||||
auto fraction = _db.local().get_config().stream_plan_ranges_fraction();
|
||||
size_t nr_ranges_per_stream_plan = nr_ranges_total * fraction;
|
||||
if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
|
||||
continue;
|
||||
} else {
|
||||
do_streaming(std::exchange(ranges_to_stream, {}));
|
||||
it = range_vec.erase(range_vec.begin(), it);
|
||||
}
|
||||
}
|
||||
if (ranges_to_stream.size() > 0) {
|
||||
do_streaming(std::exchange(ranges_to_stream, {}));
|
||||
range_vec.clear();
|
||||
}
|
||||
} catch (...) {
|
||||
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
|
||||
logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
|
||||
throw;
|
||||
return parallel_for_each(ip_range_vec, [this, description, keyspace] (auto& ip_range) {
|
||||
auto& source = ip_range.first;
|
||||
auto& range_vec = ip_range.second;
|
||||
return seastar::with_semaphore(_limiter, 1, [this, description, keyspace, source, &range_vec] () mutable {
|
||||
return seastar::async([this, description, keyspace, source, &range_vec] () mutable {
|
||||
// TODO: It is better to use fiber instead of thread here because
|
||||
// creating a thread per peer can be some memory in a large cluster.
|
||||
auto start_time = lowres_clock::now();
|
||||
unsigned sp_index = 0;
|
||||
unsigned nr_ranges_streamed = 0;
|
||||
size_t nr_ranges_total = range_vec.size();
|
||||
auto do_streaming = [&] (dht::token_range_vector&& ranges_to_stream) {
|
||||
auto sp = stream_plan(_stream_manager.local(), format("{}-{}-index-{:d}", description, keyspace, sp_index++),
|
||||
_reason, _topo_guard);
|
||||
auto abort_listener = _abort_source.subscribe([&] () noexcept { sp.abort(); });
|
||||
_abort_source.check();
|
||||
logger.info("{} with {} for keyspace={}, streaming [{}, {}) out of {} ranges",
|
||||
description, source, keyspace,
|
||||
nr_ranges_streamed, nr_ranges_streamed + ranges_to_stream.size(), nr_ranges_total);
|
||||
auto ranges_streamed = ranges_to_stream.size();
|
||||
if (_nr_rx_added) {
|
||||
sp.request_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
|
||||
} else if (_nr_tx_added) {
|
||||
sp.transfer_ranges(source, keyspace, std::move(ranges_to_stream), _tables);
|
||||
}
|
||||
sp.execute().discard_result().get();
|
||||
// Update finished percentage
|
||||
nr_ranges_streamed += ranges_streamed;
|
||||
_nr_ranges_remaining -= ranges_streamed;
|
||||
float percentage = _nr_total_ranges == 0 ? 1 : (_nr_total_ranges - _nr_ranges_remaining) / (float)_nr_total_ranges;
|
||||
_stream_manager.local().update_finished_percentage(_reason, percentage);
|
||||
logger.info("Finished {} out of {} ranges for {}, finished percentage={}",
|
||||
_nr_total_ranges - _nr_ranges_remaining, _nr_total_ranges, _reason, percentage);
|
||||
};
|
||||
dht::token_range_vector ranges_to_stream;
|
||||
try {
|
||||
for (auto it = range_vec.begin(); it < range_vec.end();) {
|
||||
ranges_to_stream.push_back(*it);
|
||||
++it;
|
||||
auto fraction = _db.local().get_config().stream_plan_ranges_fraction();
|
||||
size_t nr_ranges_per_stream_plan = nr_ranges_total * fraction;
|
||||
if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
|
||||
continue;
|
||||
} else {
|
||||
do_streaming(std::exchange(ranges_to_stream, {}));
|
||||
it = range_vec.erase(range_vec.begin(), it);
|
||||
}
|
||||
}
|
||||
if (ranges_to_stream.size() > 0) {
|
||||
do_streaming(std::exchange(ranges_to_stream, {}));
|
||||
range_vec.clear();
|
||||
}
|
||||
} catch (...) {
|
||||
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
|
||||
logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
|
||||
});
|
||||
});
|
||||
logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
|
||||
throw;
|
||||
}
|
||||
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
|
||||
logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
|
||||
});
|
||||
});
|
||||
});
|
||||
}).finally([this, start] {
|
||||
auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start).count();
|
||||
@@ -344,4 +344,4 @@ size_t range_streamer::nr_ranges_to_stream() {
|
||||
return nr_ranges_remaining;
|
||||
}
|
||||
|
||||
} // namespace dht
|
||||
} // dht
|
||||
|
||||
63
dht/token.hh
63
dht/token.hh
@@ -30,31 +30,6 @@ enum class token_kind {
|
||||
after_all_keys,
|
||||
};
|
||||
|
||||
// Represents a token for partition keys.
|
||||
// Has a disengaged state, which sorts before all engaged states.
|
||||
struct raw_token {
|
||||
int64_t value;
|
||||
|
||||
/// Constructs a disengaged token.
|
||||
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
|
||||
|
||||
/// Constructs an engaged token.
|
||||
/// The token must be of token_kind::key kind.
|
||||
explicit raw_token(const token&);
|
||||
|
||||
explicit raw_token(int64_t v) : value(v) {};
|
||||
|
||||
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
|
||||
std::strong_ordering operator<=>(const token& o) const noexcept;
|
||||
|
||||
/// Returns true iff engaged.
|
||||
explicit operator bool() const noexcept {
|
||||
return value != std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
};
|
||||
|
||||
using raw_token_opt = seastar::optimized_optional<raw_token>;
|
||||
|
||||
class token {
|
||||
// INT64_MIN is not a legal token, but a special value used to represent
|
||||
// infinity in token intervals.
|
||||
@@ -77,10 +52,6 @@ public:
|
||||
|
||||
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
|
||||
|
||||
token(raw_token raw) noexcept
|
||||
: token(raw ? kind::key : kind::before_all_keys, raw.value)
|
||||
{ }
|
||||
|
||||
// This constructor seems redundant with the bytes_view constructor, but
|
||||
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
|
||||
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
|
||||
@@ -252,29 +223,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
raw_token::raw_token(const token& t)
|
||||
: value(t.raw())
|
||||
{
|
||||
#ifdef DEBUG
|
||||
assert(t._kind == token::kind::key);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline
|
||||
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
|
||||
switch (o._kind) {
|
||||
case token::kind::after_all_keys:
|
||||
return std::strong_ordering::less;
|
||||
case token::kind::before_all_keys:
|
||||
// before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
|
||||
// So we can order them by just comparing raw values.
|
||||
[[fallthrough]];
|
||||
case token::kind::key:
|
||||
return value <=> o._data;
|
||||
}
|
||||
}
|
||||
|
||||
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
|
||||
if (l1 == l2) {
|
||||
return std::strong_ordering::equal;
|
||||
@@ -381,17 +329,6 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
|
||||
template <typename FormatContext>
|
||||
auto format(const dht::raw_token& t, FormatContext& ctx) const {
|
||||
if (!t) {
|
||||
return fmt::format_to(ctx.out(), "null");
|
||||
}
|
||||
return fmt::format_to(ctx.out(), "{}", t.value);
|
||||
}
|
||||
};
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
|
||||
2
dist/common/sysconfig/scylla-node-exporter
vendored
2
dist/common/sysconfig/scylla-node-exporter
vendored
@@ -1 +1 @@
|
||||
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --collector.systemd --collector.systemd.unit-include='^(scylla-server|systemd-coredump.*)\.service$' --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
|
||||
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
|
||||
|
||||
@@ -139,7 +139,7 @@ The ``WHERE`` clause
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``WHERE`` clause specifies which rows must be queried. It is composed of relations on the columns that are part of
|
||||
the ``PRIMARY KEY``, and relations can be joined only with ``AND`` (``OR`` and other logical operators are not supported).
|
||||
the ``PRIMARY KEY``.
|
||||
|
||||
Not all relations are allowed in a query. For instance, non-equal relations (where ``IN`` is considered as an equal
|
||||
relation) on a partition key are not supported (see the use of the ``TOKEN`` method below to do non-equal queries on
|
||||
@@ -200,23 +200,6 @@ The tuple notation may also be used for ``IN`` clauses on clustering columns::
|
||||
WHERE userid = 'john doe'
|
||||
AND (blog_title, posted_at) IN (('John''s Blog', '2012-01-01'), ('Extreme Chess', '2014-06-01'))
|
||||
|
||||
This tuple notation is different from boolean grouping. For example, the following query is not supported::
|
||||
|
||||
SELECT * FROM users
|
||||
WHERE (country = 'BR' AND state = 'SP')
|
||||
|
||||
because parentheses are only allowed around a single relation, so this works: ``(country = 'BR') AND (state = 'SP')``, but this does not: ``(country = 'BR' AND state = 'SP')``.
|
||||
Similarly, an extended query of the form of::
|
||||
|
||||
SELECT * FROM users
|
||||
WHERE (country = 'BR' AND state = 'SP')
|
||||
OR (country = 'BR' AND state = 'RJ')
|
||||
|
||||
won't work due to both: grouping boolean expressions and not supporting ``OR``, so when possible,
|
||||
rewrite such queries with ``IN`` on the varying column, for example
|
||||
``country = 'BR' AND state IN ('SP', 'RJ')``, or run multiple queries and merge
|
||||
the results client-side.
|
||||
|
||||
The ``CONTAINS`` operator may only be used on collection columns (lists, sets, and maps). In the case of maps,
|
||||
``CONTAINS`` applies to the map values. The ``CONTAINS KEY`` operator may only be used on map columns and applies to the
|
||||
map keys.
|
||||
|
||||
@@ -1,236 +0,0 @@
|
||||
.. highlight:: cql
|
||||
|
||||
.. _cql-guardrails:
|
||||
|
||||
CQL Guardrails
|
||||
==============
|
||||
|
||||
ScyllaDB provides a set of configurable guardrail parameters that help operators
|
||||
enforce best practices and prevent misconfigurations that could degrade cluster
|
||||
health, availability, or performance. Guardrails operate at two severity levels:
|
||||
|
||||
* **Warn**: The request succeeds, but the server includes a warning in the CQL
|
||||
response. Depending on the specific guardrail, the warning may also be logged on the server side.
|
||||
* **Fail**: The request is rejected with an error/exception (the specific type
|
||||
depends on the guardrail). The user must correct the request or adjust the
|
||||
guardrail configuration to proceed.
|
||||
|
||||
.. note::
|
||||
|
||||
Guardrails are checked only when a statement is
|
||||
executed. They do not retroactively validate existing keyspaces, tables, or
|
||||
previously completed writes.
|
||||
|
||||
For the full list of configuration properties, including types, defaults, and
|
||||
liveness information, see :doc:`Configuration Parameters </reference/configuration-parameters>`.
|
||||
|
||||
.. _guardrails-replication-factor:
|
||||
|
||||
Replication Factor Guardrails
|
||||
-----------------------------
|
||||
|
||||
These four parameters control the minimum and maximum allowed replication factor
|
||||
(RF) values. They are evaluated whenever a ``CREATE KEYSPACE`` or
|
||||
``ALTER KEYSPACE`` statement is executed. Each data center's RF is checked
|
||||
individually.
|
||||
|
||||
An RF of ``0`` — which means "do not replicate to this data center" — is
|
||||
always allowed and never triggers a guardrail.
|
||||
|
||||
A threshold value of ``-1`` disables the corresponding check.
|
||||
|
||||
``minimum_replication_factor_warn_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF is set to a value greater than ``0`` and lower than
|
||||
this threshold, the server attaches a warning to the CQL response identifying
|
||||
the offending data center and RF value.
|
||||
|
||||
**When to use.** The default of ``3`` is the standard recommendation for
|
||||
production clusters. An RF below ``3`` means that the cluster cannot tolerate
|
||||
even a single node failure without data loss or read unavailability (assuming
|
||||
``QUORUM`` consistency). Keep this at ``3`` unless your deployment has specific
|
||||
constraints (e.g., a development or test cluster with fewer than 3 nodes).
|
||||
|
||||
``minimum_replication_factor_fail_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF is set to a value greater than ``0`` and lower than
|
||||
this threshold, the request is rejected with a ``ConfigurationException``
|
||||
identifying the offending data center and RF value.
|
||||
|
||||
**When to use.** Enable this parameter (e.g., set to ``3``) in production
|
||||
environments where allowing a low RF would be operationally dangerous. Unlike
|
||||
the warn threshold, this provides a hard guarantee that no keyspace can be
|
||||
created or altered to have an RF below the limit.
|
||||
|
||||
``maximum_replication_factor_warn_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF exceeds this threshold, the server attaches a warning to the CQL response identifying
|
||||
the offending data center and RF value.
|
||||
|
||||
**When to use.** An excessively high RF increases write amplification and
|
||||
storage costs proportionally. For example, an RF of ``5`` means every write
|
||||
is replicated to five nodes. Set this threshold to alert operators who
|
||||
may unintentionally set an RF that is too high.
|
||||
|
||||
``maximum_replication_factor_fail_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF exceeds this threshold, the request is rejected with a ``ConfigurationException``
|
||||
identifying the offending data center and RF value.
|
||||
|
||||
**When to use.** Enable this parameter to prevent accidental creation of
|
||||
keyspaces with an unreasonably high RF. An extremely high RF wastes storage and
|
||||
network bandwidth and can lead to write latency spikes. This is a hard limit —
|
||||
the keyspace creation or alteration will not proceed until the RF is lowered.
|
||||
|
||||
**Metrics.** ScyllaDB exposes per-shard metrics that track the number of
|
||||
times each replication factor guardrail has been triggered:
|
||||
|
||||
* ``scylla_cql_minimum_replication_factor_warn_violations``
|
||||
* ``scylla_cql_minimum_replication_factor_fail_violations``
|
||||
* ``scylla_cql_maximum_replication_factor_warn_violations``
|
||||
* ``scylla_cql_maximum_replication_factor_fail_violations``
|
||||
|
||||
A sustained increase in any of these metrics indicates that
|
||||
``CREATE KEYSPACE`` or ``ALTER KEYSPACE`` requests are hitting the configured
|
||||
thresholds.
|
||||
|
||||
.. _guardrails-replication-strategy:
|
||||
|
||||
Replication Strategy Guardrails
|
||||
-------------------------------
|
||||
|
||||
These two parameters control which replication strategies trigger warnings or
|
||||
are rejected when a keyspace is created or altered.
|
||||
|
||||
``replication_strategy_warn_list``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
|
||||
statement is on this list, the server attaches a warning to the CQL response
|
||||
identifying the discouraged strategy and the affected keyspace.
|
||||
|
||||
**When to use.** ``SimpleStrategy`` is not recommended for production use.
|
||||
It places replicas without awareness of data center or rack topology, which
|
||||
can undermine fault tolerance in multi-DC deployments. Even in single-DC
|
||||
deployments, ``NetworkTopologyStrategy`` is recommended because it keeps the
|
||||
schema ready for future topology changes.
|
||||
|
||||
The default configuration warns on ``SimpleStrategy``, which is appropriate
|
||||
for most deployments. If you have existing keyspaces that use
|
||||
``SimpleStrategy``, see :doc:`Update Topology Strategy From Simple to Network
|
||||
</operating-scylla/procedures/cluster-management/update-topology-strategy-from-simple-to-network>`
|
||||
for the migration procedure.
|
||||
|
||||
``replication_strategy_fail_list``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
|
||||
statement is on this list, the request is rejected with a
|
||||
``ConfigurationException`` identifying the forbidden strategy and the affected
|
||||
keyspace.
|
||||
|
||||
**When to use.** In production environments, add ``SimpleStrategy`` to this
|
||||
list to enforce ``NetworkTopologyStrategy`` across all keyspaces. This helps
|
||||
prevent new production keyspaces from being created with a topology-unaware
|
||||
strategy.
|
||||
|
||||
**Metrics.** The following per-shard metrics track replication strategy
|
||||
guardrail violations:
|
||||
|
||||
* ``scylla_cql_replication_strategy_warn_list_violations``
|
||||
* ``scylla_cql_replication_strategy_fail_list_violations``
|
||||
|
||||
.. _guardrails-write-consistency-level:
|
||||
|
||||
Write Consistency Level Guardrails
|
||||
----------------------------------
|
||||
|
||||
These two parameters control which consistency levels (CL) are allowed for
|
||||
write operations (``INSERT``, ``UPDATE``, ``DELETE``, and ``BATCH``
|
||||
statements).
|
||||
|
||||
Be aware that adding warnings to CQL responses can significantly increase
|
||||
network traffic and reduce overall throughput.
|
||||
|
||||
``write_consistency_levels_warned``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If a write operation uses a consistency level on this list, the server attaches
|
||||
a warning to the CQL response identifying the discouraged consistency level.
|
||||
|
||||
**When to use.** Use this parameter to alert application developers when they
|
||||
use a consistency level that, while technically functional, is not recommended
|
||||
for the workload. Common examples:
|
||||
|
||||
* **Warn on** ``ANY``: writes at ``ANY`` are acknowledged as soon as at least
|
||||
one node (including a coordinator acting as a hinted handoff store) receives
|
||||
the mutation. This means data may not be persisted on any replica node at
|
||||
the time of acknowledgement, risking data loss if the coordinator fails
|
||||
before hinted handoff completes.
|
||||
* **Warn on** ``ALL``: writes at ``ALL`` require every replica to acknowledge
|
||||
the write. If any single replica is down, the write fails. This significantly
|
||||
reduces write availability.
|
||||
|
||||
``write_consistency_levels_disallowed``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If a write operation uses a consistency level on this list, the request is
|
||||
rejected with an ``InvalidRequestException`` identifying the forbidden
|
||||
consistency level.
|
||||
|
||||
**When to use.** Use this parameter to hard-block consistency levels that are
|
||||
considered unsafe for your deployment:
|
||||
|
||||
* **Disallow** ``ANY``: in production environments, ``ANY`` is almost never
|
||||
appropriate. It provides the weakest durability guarantee and is a common
|
||||
source of data-loss incidents when operators or application developers use it
|
||||
unintentionally.
|
||||
* **Disallow** ``ALL``: in clusters where high write availability is critical,
|
||||
blocking ``ALL`` prevents a single node failure from causing write
|
||||
unavailability.
|
||||
|
||||
**Metrics.** The following per-shard metrics track write consistency level
|
||||
guardrail violations:
|
||||
|
||||
* ``scylla_cql_write_consistency_levels_warned_violations``
|
||||
* ``scylla_cql_write_consistency_levels_disallowed_violations``
|
||||
|
||||
Additionally, ScyllaDB exposes the
|
||||
``scylla_cql_writes_per_consistency_level`` metric, labeled by consistency
|
||||
level, which tracks the total number of write requests per CL. This metric is
|
||||
useful for understanding the current write-CL distribution across the cluster
|
||||
*before* deciding which levels to warn on or disallow. For example, querying
|
||||
this metric can reveal whether any application is inadvertently using ``ANY``
|
||||
or ``ALL`` for writes.
|
||||
|
||||
.. _guardrails-compact-storage:
|
||||
|
||||
Compact Storage Guardrail
|
||||
-------------------------
|
||||
|
||||
``enable_create_table_with_compact_storage``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This boolean parameter controls whether ``CREATE TABLE`` statements with the
|
||||
deprecated ``COMPACT STORAGE`` option are allowed. Unlike the other guardrails,
|
||||
it acts as a simple on/off switch rather than using separate warn and fail
|
||||
thresholds.
|
||||
|
||||
**When to use.** Leave this at the default (``false``) for all new
|
||||
deployments. ``COMPACT STORAGE`` is a legacy feature that will be permanently
|
||||
removed in a future version of ScyllaDB. Set to ``true`` only if you have a specific,
|
||||
temporary need to create compact storage tables (e.g., compatibility with legacy
|
||||
applications during a migration). For details on the ``COMPACT STORAGE`` option, see
|
||||
:ref:`Compact Tables <compact-tables>` in the Data Definition documentation.
|
||||
|
||||
Additional References
|
||||
---------------------
|
||||
|
||||
* :doc:`Consistency Level </cql/consistency>`
|
||||
* :doc:`Data Definition (CREATE/ALTER KEYSPACE) </cql/ddl>`
|
||||
* :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`
|
||||
* :doc:`Metrics Reference </reference/metrics>`
|
||||
@@ -17,7 +17,6 @@ CQL Reference
|
||||
secondary-indexes
|
||||
time-to-live
|
||||
functions
|
||||
guardrails
|
||||
wasm
|
||||
json
|
||||
mv
|
||||
@@ -47,7 +46,6 @@ It allows you to create keyspaces and tables, insert and query tables, and more.
|
||||
* :doc:`Data Types </cql/types>`
|
||||
* :doc:`Definitions </cql/definitions>`
|
||||
* :doc:`Global Secondary Indexes </cql/secondary-indexes>`
|
||||
* :doc:`CQL Guardrails </cql/guardrails>`
|
||||
* :doc:`Expiring Data with Time to Live (TTL) </cql/time-to-live>`
|
||||
* :doc:`Functions </cql/functions>`
|
||||
* :doc:`JSON Support </cql/json>`
|
||||
|
||||
@@ -1,111 +1,347 @@
|
||||
# Introduction
|
||||
# Prototype design: auditing all keyspaces and per-role auditing
|
||||
|
||||
Similar to the approach described in CASSANDRA-12151, we add the
|
||||
concept of an audit specification. An audit has a target (syslog or a
|
||||
table) and a set of events/actions that it wants recorded. We
|
||||
introduce new CQL syntax for Scylla users to describe and manipulate
|
||||
audit specifications.
|
||||
## Summary
|
||||
|
||||
Prior art:
|
||||
- Microsoft SQL Server [audit
|
||||
description](https://docs.microsoft.com/en-us/sql/relational-databases/security/auditing/sql-server-audit-database-engine?view=sql-server-ver15)
|
||||
- pgAudit [docs](https://github.com/pgaudit/pgaudit/blob/master/README.md)
|
||||
- MySQL audit_log docs in
|
||||
[MySQL](https://dev.mysql.com/doc/refman/8.0/en/audit-log.html) and
|
||||
[Azure](https://docs.microsoft.com/en-us/azure/mysql/concepts-audit-logs)
|
||||
- DynamoDB can [use CloudTrail](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/logging-using-cloudtrail.html) to log all events
|
||||
Extend the existing `scylla.yaml`-driven audit subsystem with two focused capabilities:
|
||||
|
||||
# CQL extensions
|
||||
1. allow auditing **all keyspaces** without enumerating them one by one
|
||||
2. allow auditing only a configured set of **roles**
|
||||
|
||||
## Create an audit
|
||||
The prototype should stay close to the current implementation in `audit/`:
|
||||
|
||||
```cql
|
||||
CREATE AUDIT [IF NOT EXISTS] audit-name WITH TARGET { SYSLOG | table-name }
|
||||
[ AND TRIGGER KEYSPACE IN (ks1, ks2, ks3) ]
|
||||
[ AND TRIGGER TABLE IN (tbl1, tbl2, tbl3) ]
|
||||
[ AND TRIGGER ROLE IN (usr1, usr2, usr3) ]
|
||||
[ AND TRIGGER CATEGORY IN (cat1, cat2, cat3) ]
|
||||
;
|
||||
- keep the existing backends (`table`, `syslog`, or both)
|
||||
- keep the existing category / keyspace / table filters
|
||||
- preserve live updates for audit configuration
|
||||
- avoid any schema change to `audit.audit_log`
|
||||
|
||||
This is intentionally a small extension of the current auditing model, not a redesign around new CQL statements such as `CREATE AUDIT`.
|
||||
|
||||
## Motivation
|
||||
|
||||
Today Scylla exposes three main audit selectors:
|
||||
|
||||
- `audit_categories`
|
||||
- `audit_tables`
|
||||
- `audit_keyspaces`
|
||||
|
||||
This leaves two operational gaps:
|
||||
|
||||
1. **Auditing all keyspaces is cumbersome.**
|
||||
Large installations may create keyspaces dynamically, or manage many tenant keyspaces. Requiring operators to keep
|
||||
`audit_keyspaces` synchronized with the full keyspace list is error-prone and defeats the point of cluster-wide auditing.
|
||||
2. **Auditing is all-or-nothing with respect to users.**
|
||||
Once a category/keyspace/table combination matches, any authenticated user generating that traffic is audited.
|
||||
Operators want to narrow the scope to specific tenants, service accounts, or privileged roles.
|
||||
|
||||
These two additions also work well together: "audit all keyspaces, but only for selected roles" is a practical way to reduce
|
||||
both audit volume and performance impact.
|
||||
|
||||
## Goals
|
||||
|
||||
- Add a way to express "all keyspaces" in the current configuration model.
|
||||
- Add a new role filter that limits auditing to selected roles.
|
||||
- Preserve backwards compatibility for existing configurations.
|
||||
- Keep the evaluation cheap on the request path.
|
||||
- Support live configuration updates, consistent with the existing audit options.
|
||||
|
||||
## Non-goals
|
||||
|
||||
- Introducing `CREATE AUDIT`, `ALTER AUDIT`, or other new CQL syntax.
|
||||
- Adding per-role audit destinations.
|
||||
- Adding different categories per role.
|
||||
- Expanding role matching through the full granted-role graph in the prototype.
|
||||
- Changing the on-disk audit table schema.
|
||||
|
||||
## Current behavior
|
||||
|
||||
At the moment, audit logging is controlled by:
|
||||
|
||||
- `audit`
|
||||
- `audit_categories`
|
||||
- `audit_tables`
|
||||
- `audit_keyspaces`
|
||||
|
||||
The current decision rule in `audit::should_log()` is effectively:
|
||||
|
||||
```text
|
||||
category matches
|
||||
&& (
|
||||
keyspace is listed in audit_keyspaces
|
||||
|| table is listed in audit_tables
|
||||
|| category in {AUTH, ADMIN, DCL}
|
||||
)
|
||||
```
|
||||
|
||||
From this point on, every database event that matches all present
|
||||
triggers will be recorded in the target. When the target is a table,
|
||||
it behaves like the [current
|
||||
design](https://docs.scylladb.com/operating-scylla/security/auditing/#table-storage).
|
||||
Observations:
|
||||
|
||||
The audit name must be different from all other audits, unless IF NOT
|
||||
EXISTS precedes it, in which case the existing audit must be identical
|
||||
to the new definition. Case sensitivity and length limit are the same
|
||||
as for table names.
|
||||
- `AUTH`, `ADMIN`, and `DCL` are already global once their category is enabled.
|
||||
- `DDL`, `DML`, and `QUERY` need a matching keyspace or table.
|
||||
- An empty `audit_keyspaces` means "audit no keyspaces", not "audit every keyspace".
|
||||
- There is no role-based filter; the authenticated user is recorded in the log but is not part of the decision.
|
||||
- The exact implementation to preserve is in `audit/audit.cc` (`should_log()`, `inspect()`, and `inspect_login()`).
|
||||
|
||||
A trigger kind (ie, `KEYSPACE`, `TABLE`, `ROLE`, or `CATEGORY`) can be
|
||||
specified at most once.
|
||||
## Proposed configuration
|
||||
|
||||
## Show an audit
|
||||
### 1. Add `audit_all_keyspaces`
|
||||
|
||||
```cql
|
||||
DESCRIBE AUDIT [audit-name ...];
|
||||
Introduce a new live-update boolean option:
|
||||
|
||||
Examples:
|
||||
|
||||
```yaml
|
||||
# Audit all keyspaces for matching categories
|
||||
audit_all_keyspaces: true
|
||||
|
||||
# Audit all keyspaces for selected roles
|
||||
audit_all_keyspaces: true
|
||||
audit_roles: "alice,bob"
|
||||
```
|
||||
|
||||
Prints definitions of all audits named herein. If no names are
|
||||
provided, prints all audits.
|
||||
Semantics:
|
||||
|
||||
## Delete an audit
|
||||
- `audit_all_keyspaces: false` keeps the existing behavior.
|
||||
- `audit_all_keyspaces: true` makes every keyspace match.
|
||||
- `audit_keyspaces` keeps its existing meaning: an explicit list of keyspaces, or no keyspace-wide auditing when left empty.
|
||||
- `audit_all_keyspaces: true` and a non-empty `audit_keyspaces` must be rejected as invalid configuration,
|
||||
because the two options express overlapping scope in different ways.
|
||||
- A dedicated boolean is preferable to overloading `audit_keyspaces`, because it avoids changing the meaning of existing configurations.
|
||||
- This also keeps the behavior aligned with today's `audit_tables` handling, where leaving `audit_tables` empty does not introduce a new wildcard syntax.
|
||||
|
||||
```cql
|
||||
DROP AUDIT audit-name;
|
||||
### 2. Add `audit_roles`
|
||||
|
||||
Introduce a new live-update configuration option:
|
||||
|
||||
```yaml
|
||||
audit_roles: "alice,bob,service_api"
|
||||
```
|
||||
|
||||
Stops logging events specified by this audit. Doesn't impact the
|
||||
already logged events. If the target is a table, it remains as it is.
|
||||
Semantics:
|
||||
|
||||
## Alter an audit
|
||||
- empty `audit_roles` means **no role filtering**, preserving today's behavior
|
||||
- non-empty `audit_roles` means audit only requests whose effective logged username matches one of the configured roles
|
||||
- matching is byte-for-byte exact, using the same role name that is already written to the audit record's `username` column / syslog field
|
||||
- the prototype should compare against the post-authentication role name from the session and audit log,
|
||||
with no additional case folding or role-graph expansion
|
||||
|
||||
```cql
|
||||
ALTER AUDIT audit-name WITH {same syntax as CREATE}
|
||||
Examples:
|
||||
|
||||
```yaml
|
||||
# Audit all roles in a single keyspace (current behavior, made explicit)
|
||||
audit_keyspaces: "ks1"
|
||||
audit_roles: ""
|
||||
|
||||
# Audit two roles across all keyspaces
|
||||
audit_all_keyspaces: true
|
||||
audit_roles: "alice,bob"
|
||||
|
||||
# Audit a service role, but only for selected tables
|
||||
audit_tables: "ks1.orders,ks1.payments"
|
||||
audit_roles: "billing_service"
|
||||
```
|
||||
|
||||
Any trigger provided will be updated (or newly created, if previously
|
||||
absent). To drop a trigger, use `IN *`.
|
||||
## Decision rule after the change
|
||||
|
||||
## Permissions
|
||||
After the prototype, the rule becomes:
|
||||
|
||||
Only superusers can modify audits or turn them on and off.
|
||||
```text
|
||||
category matches
|
||||
&& role matches
|
||||
&& (
|
||||
category in {AUTH, ADMIN, DCL}
|
||||
|| audit_all_keyspaces
|
||||
|| keyspace is listed in audit_keyspaces
|
||||
|| table is listed in audit_tables
|
||||
)
|
||||
```
|
||||
|
||||
Only superusers can read tables that are audit targets; no user can
|
||||
modify them. Only superusers can drop tables that are audit targets,
|
||||
after the audit itself is dropped. If a superuser doesn't drop a
|
||||
target table, it remains in existence indefinitely.
|
||||
Where:
|
||||
|
||||
# Implementation
|
||||
- `role matches` is always true when `audit_roles` is empty
|
||||
- `audit_all_keyspaces` is true when the new boolean option is enabled
|
||||
|
||||
## Efficient trigger evaluation
|
||||
For login auditing, the rule is simply:
|
||||
|
||||
```text
|
||||
AUTH category enabled && role matches(login username)
|
||||
```
|
||||
|
||||
## Implementation details
|
||||
|
||||
### Configuration parsing
|
||||
|
||||
Add a new config entry:
|
||||
|
||||
- `db::config::audit_all_keyspaces`
|
||||
- `db::config::audit_roles`
|
||||
|
||||
It should mirror the existing audit selectors:
|
||||
|
||||
- `audit_all_keyspaces`: type `named_value<bool>`, liveness `LiveUpdate`, default `false`
|
||||
- `audit_roles`: type `named_value<sstring>`, liveness `LiveUpdate`, default empty string
|
||||
|
||||
Parsing changes:
|
||||
|
||||
- keep `parse_audit_tables()` as-is
|
||||
- keep `parse_audit_keyspaces()` semantics as-is
|
||||
- add `parse_audit_roles()` that returns a set of role names
|
||||
- normalize empty or whitespace-only keyspace lists to an empty configuration rather than treating them as real keyspace names
|
||||
- add cross-field validation so `audit_all_keyspaces: true` cannot be combined with a non-empty
|
||||
`audit_keyspaces`, both at startup and during live updates
|
||||
|
||||
To avoid re-parsing on every request, the `audit::audit` service should store:
|
||||
|
||||
```c++
|
||||
namespace audit {
|
||||
|
||||
/// Stores triggers from an AUDIT statement.
|
||||
class triggers {
|
||||
// Use trie structures for speedy string lookup.
|
||||
optional<trie> _ks_trigger, _tbl_trigger, _usr_trigger;
|
||||
|
||||
// A logical-AND filter.
|
||||
optional<unsigned> _cat_trigger;
|
||||
|
||||
public:
|
||||
/// True iff every non-null trigger matches the corresponding ainf element.
|
||||
bool should_audit(const audit_info& ainf);
|
||||
};
|
||||
|
||||
} // namespace audit
|
||||
bool _audit_all_keyspaces;
|
||||
std::set<sstring> _audited_keyspaces;
|
||||
std::set<sstring> _audited_roles;
|
||||
```
|
||||
|
||||
To prevent modification of target tables, `audit::inspect()` will
|
||||
check the statement and throw if it is disallowed, similar to what
|
||||
`check_access()` currently does.
|
||||
Using a dedicated boolean keeps the hot-path check straightforward and avoids reinterpreting the existing
|
||||
`_audited_keyspaces` selector.
|
||||
|
||||
## Persisting audit definitions
|
||||
Using `std::set` for the explicit selectors keeps the prototype aligned with the current implementation and minimizes code churn.
|
||||
If profiling later shows lookup cost matters here, the container choice can be revisited independently of the feature semantics.
|
||||
|
||||
Obviously, an audit definition must survive a server restart and stay
|
||||
consistent among all nodes in a cluster. We'll accomplish both by
|
||||
storing audits in a system table.
|
||||
### Audit object changes
|
||||
|
||||
The current `audit_info` already carries:
|
||||
|
||||
- category
|
||||
- keyspace
|
||||
- table
|
||||
- query text
|
||||
|
||||
The username is available separately from `service::query_state` and is already passed to storage helpers when an entry is written.
|
||||
For the prototype there is no need to duplicate the username into `audit_info`.
|
||||
|
||||
Instead:
|
||||
|
||||
- change `should_log()` to take the effective username as an additional input
|
||||
- change `should_log_login()` to check the username against `audit_roles`
|
||||
- keep the storage helpers unchanged, because they already persist the username
|
||||
- update the existing internal call sites in `inspect()` and `inspect_login()` to pass the username through
|
||||
|
||||
One possible interface shape is:
|
||||
|
||||
```c++
|
||||
bool should_log(std::string_view username, const audit_info* info) const;
|
||||
bool should_log_login(std::string_view username) const;
|
||||
```
|
||||
|
||||
### Role semantics
|
||||
|
||||
For the prototype, "role" means the role name already associated with the current client session:
|
||||
|
||||
- successful authenticated sessions use the session's user name
|
||||
- failed login events use the login name from the authentication attempt
|
||||
- failed login events are still subject to `audit_roles`, matched against the attempted login name
|
||||
|
||||
This keeps the feature easy to explain and aligns the filter with what users already see in audit output.
|
||||
|
||||
The prototype should **not** try to expand inherited roles. If a user logs in as `alice` and inherits permissions from another role,
|
||||
the audit filter still matches `alice`. This keeps the behavior deterministic and avoids expensive role graph lookups on the request path.
|
||||
|
||||
### Keyspace semantics
|
||||
|
||||
`audit_all_keyspaces: true` should affect any statement whose `audit_info` carries a keyspace name.
|
||||
|
||||
Important consequences:
|
||||
|
||||
- it makes `DDL` / `DML` / `QUERY` auditing effectively cluster-wide
|
||||
- it does not change the existing global handling of `AUTH`, `ADMIN`, and `DCL`
|
||||
- statements that naturally have no keyspace name continue to depend on their category-specific behavior
|
||||
|
||||
No extra schema or metadata scan is required: the request already carries the keyspace information needed for the decision.
|
||||
|
||||
## Backwards compatibility
|
||||
|
||||
This design keeps existing behavior intact:
|
||||
|
||||
- existing clusters that do not set `audit_roles` continue to audit all roles
|
||||
- existing clusters that leave `audit_keyspaces` empty continue to audit no keyspaces
|
||||
- existing explicit keyspace/table lists keep their current meaning
|
||||
|
||||
The feature is enabled only by a new explicit boolean, so existing `audit_keyspaces` values do not need to be reinterpreted.
|
||||
The only newly-invalid combination is enabling `audit_all_keyspaces` while also listing explicit keyspaces.
|
||||
|
||||
## Operational considerations
|
||||
|
||||
### Performance and volume
|
||||
|
||||
`audit_all_keyspaces: true` can significantly increase audit volume, especially with `QUERY` and `DML`.
|
||||
|
||||
The intended mitigation is to combine it with:
|
||||
|
||||
- a narrow `audit_categories`
|
||||
- a narrow `audit_roles`
|
||||
|
||||
That combination gives operators a simple and cheap filter model:
|
||||
|
||||
- first by category
|
||||
- then by role
|
||||
- then by keyspace/table scope
|
||||
|
||||
### Live updates
|
||||
|
||||
`audit_roles` should follow the same live-update behavior as the current audit filters.
|
||||
|
||||
Changing:
|
||||
|
||||
- `audit_roles`
|
||||
- `audit_all_keyspaces`
|
||||
- `audit_keyspaces`
|
||||
- `audit_tables`
|
||||
- `audit_categories`
|
||||
|
||||
should update the in-memory selectors on all shards without restarting the node.
|
||||
|
||||
### Prototype limitation
|
||||
|
||||
Because matching is done against the authenticated session role name, `audit_roles` cannot express "audit everyone who inherits role X".
|
||||
Operators must list the concrete login roles they want to audit. This is a deliberate trade-off in the prototype to keep matching cheap
|
||||
and avoid role graph lookups on every audited request.
|
||||
|
||||
Example: if `alice` inherits permissions from `admin_role`, configuring `audit_roles: "admin_role"` would not audit requests from
|
||||
`alice`; to audit those requests, `alice` itself must be listed.
|
||||
|
||||
### Audit table schema
|
||||
|
||||
No schema change is needed. The audit table already includes `username`, which is sufficient for both storage and later analysis.
|
||||
|
||||
## Testing plan
|
||||
|
||||
The prototype should extend existing audit coverage rather than introduce a separate test framework.
|
||||
|
||||
### Parser / unit coverage
|
||||
|
||||
Add focused tests for:
|
||||
|
||||
- empty `audit_roles`
|
||||
- specific `audit_roles`
|
||||
- `audit_all_keyspaces: true`
|
||||
- invalid mixed configuration: `audit_all_keyspaces: true` with non-empty `audit_keyspaces`
|
||||
- empty or whitespace-only keyspace lists such as `",,,"` or `" "`, which should normalize to an empty configuration and therefore audit no keyspaces
|
||||
- boolean config parsing for `audit_all_keyspaces`
|
||||
|
||||
### Behavioral coverage
|
||||
|
||||
Extend the existing audit tests in `test/cluster/dtest/audit_test.py` with scenarios such as:
|
||||
|
||||
1. `audit_all_keyspaces: true` audits statements in multiple keyspaces without listing them explicitly
|
||||
2. `audit_roles: "alice"` logs requests from `alice` but not from `bob`
|
||||
3. `audit_all_keyspaces: true` + `audit_roles: "alice"` only logs `alice`'s traffic cluster-wide
|
||||
4. login auditing respects `audit_roles`
|
||||
5. live-updating `audit_roles` changes behavior without restart
|
||||
6. setting `audit_all_keyspaces: true` together with explicit `audit_keyspaces` is rejected with a clear error
|
||||
|
||||
## Future evolution
|
||||
|
||||
This prototype is deliberately small, but it fits a broader audit-spec design if we decide to revisit that later.
|
||||
|
||||
In a future CQL-driven design, these two additions map naturally to triggers such as:
|
||||
|
||||
- `TRIGGER KEYSPACE IN *`
|
||||
- `TRIGGER ROLE IN (...)`
|
||||
|
||||
That means the prototype is not throwaway work: it improves the current operational model immediately while keeping a clean path
|
||||
toward richer audit objects in the future.
|
||||
|
||||
@@ -1,124 +0,0 @@
|
||||
# Logstor
|
||||
|
||||
## Introduction
|
||||
|
||||
Logstor is a log-structured storage engine for ScyllaDB optimized for key-value workloads. It provides an alternative storage backend for key-value tables - tables with a partition key only, with no clustering columns.
|
||||
|
||||
Unlike the traditional LSM-tree based storage, logstor uses a log-structured approach with in-memory indexing, making it particularly suitable for workloads with frequent overwrites and point lookups.
|
||||
|
||||
## Architecture
|
||||
|
||||
Logstor consists of several key components:
|
||||
|
||||
### Components
|
||||
|
||||
#### Primary Index
|
||||
|
||||
The primary index is entirely in memory and maps each partition key to its location in the log segments. It consists of one B-tree per table, ordered by token.
|
||||
|
||||
#### Segment Manager
|
||||
|
||||
The `segment_manager` handles the allocation and management of fixed-size segments (default 128KB). Segments are grouped into large files (default 32MB). Key responsibilities include:
|
||||
|
||||
- **Segment allocation**: Provides segments for writing new data
|
||||
- **Space reclamation**: Tracks free space in each segment
|
||||
- **Compaction**: Copies live data from sparse segments to reclaim space
|
||||
- **Recovery**: Scans segments on startup to rebuild the index
|
||||
- **Separator**: Rewrites segments that have records from different compaction groups into new segments that are separated by compaction group.
|
||||
|
||||
The data in the segments consists of records of type `log_record`. Each record contains the value for some key as a `canonical_mutation` and additional metadata.
|
||||
|
||||
The `segment_manager` receives new writes via a `write_buffer` and writes them sequentially to the active segment with 4k-block alignment.
|
||||
|
||||
#### Write Buffer
|
||||
|
||||
The `write_buffer` manages a buffer of log records and handles the serialization of the records including headers and alignment. It can be used to write multiple records to the buffer and then write the buffer to the segment manager.
|
||||
|
||||
The `buffered_writer` manages multiple write buffers for user writes, an active buffer and multiple flushing ones, to batch writes and manage backpressure.
|
||||
|
||||
### Data Flow
|
||||
|
||||
**Write Path:**
|
||||
1. Application writes mutation to logstor
|
||||
2. Mutation is converted to a log record
|
||||
3. Record is written to write buffer
|
||||
4. The buffer is switched and written to the active segment.
|
||||
5. Index is updated with new record locations
|
||||
6. Old record locations (for overwrites) are marked as free
|
||||
|
||||
**Read Path:**
|
||||
1. Application requests data for a partition key
|
||||
2. Index lookup returns record location
|
||||
3. Segment manager reads record from disk
|
||||
4. Record is deserialized into a mutation and returned
|
||||
|
||||
**Separator:**
|
||||
1. When a record is written to the active segment, it is also written to its compaction group's separator buffer. The separator buffer holds a reference to the original segment.
|
||||
2. The separator buffer is flushed when it's full, or requested to flush for other reason. It is written into a new segment in the compaction group, and it updates the location of the records from the original mixed segments to the new segments in the compaction group.
|
||||
3. After the separator buffer is flushed and all records from the original segment are moved, it releases its reference to the segment. When there are no more references to the segment, it is freed.
|
||||
|
||||
**Compaction:**
|
||||
1. The amount of live data is tracked for each segment in its segment_descriptor. The segment descriptors are stored in a histogram by live data.
|
||||
2. A segment set from a single compaction group is submitted for compaction.
|
||||
3. Compaction picks segments for compaction from the segment set. It chooses segments with the lowest utilization such that compacting them results in net gain of free segments.
|
||||
4. It reads the segments, finds all live records, and writes them into a write buffer. When the buffer is full, it is flushed into a new segment, and the index location of each record is updated to point to the new location.
|
||||
5. After all live records are rewritten the old segments are freed.
|
||||
|
||||
## Usage
|
||||
|
||||
### Enabling Logstor
|
||||
|
||||
To use logstor, enable it in the configuration:
|
||||
|
||||
```yaml
|
||||
enable_logstor: true
|
||||
|
||||
experimental_features:
|
||||
- logstor
|
||||
```
|
||||
|
||||
### Creating Tables
|
||||
|
||||
Tables using logstor must have no clustering columns and must be created with the `storage_engine` property set to 'logstor':
|
||||
|
||||
```cql
|
||||
CREATE TABLE keyspace.user_profiles (
|
||||
user_id uuid PRIMARY KEY,
|
||||
name text,
|
||||
email text,
|
||||
metadata frozen<map<text, text>>
|
||||
) WITH storage_engine = 'logstor';
|
||||
```
|
||||
|
||||
### Basic Operations
|
||||
|
||||
**Insert/Update:**
|
||||
|
||||
```cql
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'value1');
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (2, 'value2');
|
||||
|
||||
-- Overwrite with new value
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'updated_value');
|
||||
```
|
||||
|
||||
Currently, updates must write the full row. Updating individual columns is not yet supported. Each write replaces the entire partition.
|
||||
|
||||
**Select:**
|
||||
|
||||
```cql
|
||||
SELECT * FROM keyspace.table_name WHERE pk = 1;
|
||||
-- Returns: (1, 'updated_value')
|
||||
|
||||
SELECT pk, v FROM keyspace.table_name WHERE pk = 2;
|
||||
-- Returns: (2, 'value2')
|
||||
|
||||
SELECT * FROM keyspace.table_name;
|
||||
-- Returns: (1, 'updated_value'), (2, 'value2')
|
||||
```
|
||||
|
||||
**Delete:**
|
||||
|
||||
```cql
|
||||
DELETE FROM keyspace.table_name WHERE pk = 1;
|
||||
```
|
||||
@@ -52,7 +52,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
@@ -125,7 +125,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
|
||||
@@ -133,19 +133,19 @@ Install ScyllaDB
|
||||
|
||||
sudo yum install scylla
|
||||
|
||||
Running the command installs the latest official version of ScyllaDB.
|
||||
Alternatively, you can install a specific patch version:
|
||||
Running the command installs the latest official version of ScyllaDB Open Source.
|
||||
Alternatively, you can install a specific patch version:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo yum install scylla-<your patch version>
|
||||
|
||||
Example: The following example shows installing ScyllaDB 2025.3.1.
|
||||
Example: The following example shows the command to install ScyllaDB 5.2.3.
|
||||
|
||||
.. code-block:: console
|
||||
:class: hide-copy-button
|
||||
|
||||
sudo yum install scylla-2025.3.1
|
||||
sudo yum install scylla-5.2.3
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
|
||||
|
||||
@@ -36,8 +36,11 @@ release versions, run:
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
|
||||
|
||||
|
||||
To install a non-default version, run the command with the ``--scylla-version``
|
||||
option to specify the version you want to install.
|
||||
Versions 2025.1 and Later
|
||||
==============================
|
||||
|
||||
Run the command with the ``--scylla-version`` option to specify the version
|
||||
you want to install.
|
||||
|
||||
**Example**
|
||||
|
||||
@@ -47,4 +50,20 @@ option to specify the version you want to install.
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
|
||||
|
||||
|
||||
Versions Earlier than 2025.1
|
||||
================================
|
||||
|
||||
To install a supported version of *ScyllaDB Enterprise*, run the command with:
|
||||
|
||||
* ``--scylla-product scylla-enterprise`` to specify that you want to install
|
||||
ScyllaDB Enterprise.
|
||||
* ``--scylla-version`` to specify the version you want to install.
|
||||
|
||||
For example:
|
||||
|
||||
.. code:: console
|
||||
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
|
||||
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
@@ -57,11 +57,12 @@ To enable shared dictionaries:
|
||||
internode_compression_enable_advanced: true
|
||||
rpc_dict_training_when: when_leader
|
||||
|
||||
.. note::
|
||||
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
|
||||
|
||||
Some dictionary training data may be encrypted using storage-level encryption
|
||||
(if enabled) instead of database-level encryption, meaning protection is
|
||||
applied at the storage layer rather than within the database itself.
|
||||
Trained dictionaries contain randomly chosen samples of data transferred between
|
||||
nodes. The data samples are persisted in the Raft log, which is not encrypted.
|
||||
As a result, some data from otherwise encrypted tables might be stored on disk
|
||||
unencrypted.
|
||||
|
||||
|
||||
Reference
|
||||
|
||||
@@ -727,12 +727,7 @@ public:
|
||||
|
||||
// now we need one page more to be able to save one for next lap
|
||||
auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
|
||||
// If the underlying stream is already at EOF (e.g. buf1 came from
|
||||
// cached _next while the previous read_exactly drained the source),
|
||||
// skip the read_exactly call — it would return empty anyway.
|
||||
auto buf2 = _input.eof()
|
||||
? temporary_buffer<char>()
|
||||
: co_await _input.read_exactly(fill_size);
|
||||
auto buf2 = co_await _input.read_exactly(fill_size);
|
||||
|
||||
temporary_buffer<char> output(buf1.size() + buf2.size());
|
||||
|
||||
|
||||
@@ -172,7 +172,6 @@ public:
|
||||
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
|
||||
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
|
||||
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
|
||||
gms::feature logstor { *this, "LOGSTOR"sv };
|
||||
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
|
||||
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
|
||||
gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
|
||||
|
||||
591
gms/gossiper.cc
591
gms/gossiper.cc
File diff suppressed because it is too large
Load Diff
@@ -48,7 +48,6 @@ set(idl_headers
|
||||
messaging_service.idl.hh
|
||||
paxos.idl.hh
|
||||
raft.idl.hh
|
||||
raft_util.idl.hh
|
||||
raft_storage.idl.hh
|
||||
group0.idl.hh
|
||||
hinted_handoff.idl.hh
|
||||
@@ -56,7 +55,6 @@ set(idl_headers
|
||||
storage_proxy.idl.hh
|
||||
storage_service.idl.hh
|
||||
strong_consistency/state_machine.idl.hh
|
||||
logstor.idl.hh
|
||||
group0_state_machine.idl.hh
|
||||
mapreduce_request.idl.hh
|
||||
replica_exception.idl.hh
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "idl/frozen_schema.idl.hh"
|
||||
#include "idl/token.idl.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
|
||||
namespace replica {
|
||||
namespace logstor {
|
||||
|
||||
struct primary_index_key {
|
||||
dht::decorated_key dk;
|
||||
};
|
||||
|
||||
class log_record {
|
||||
replica::logstor::primary_index_key key;
|
||||
replica::logstor::record_generation generation;
|
||||
table_id table;
|
||||
canonical_mutation mut;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
3
init.cc
3
init.cc
@@ -96,9 +96,6 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES)) {
|
||||
disabled.insert("STRONGLY_CONSISTENT_TABLES"s);
|
||||
}
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::LOGSTOR)) {
|
||||
disabled.insert("LOGSTOR"s);
|
||||
}
|
||||
if (!cfg.table_digest_insensitive_to_expiry()) {
|
||||
disabled.insert("TABLE_DIGEST_INSENSITIVE_TO_EXPIRY"s);
|
||||
}
|
||||
|
||||
@@ -33,14 +33,15 @@ size_t hash<locator::endpoint_dc_rack>::operator()(const locator::endpoint_dc_ra
|
||||
return utils::tuple_hash()(std::tie(v.dc, v.rack));
|
||||
}
|
||||
|
||||
} // namespace std
|
||||
}
|
||||
|
||||
namespace locator {
|
||||
|
||||
static logging::logger logger("network_topology_strategy");
|
||||
|
||||
network_topology_strategy::network_topology_strategy(replication_strategy_params params, const topology* topo)
|
||||
: abstract_replication_strategy(params, replication_strategy_type::network_topology) {
|
||||
network_topology_strategy::network_topology_strategy(replication_strategy_params params, const topology* topo) :
|
||||
abstract_replication_strategy(params,
|
||||
replication_strategy_type::network_topology) {
|
||||
auto opts = _config_options;
|
||||
|
||||
logger.debug("options={}", opts);
|
||||
@@ -64,7 +65,8 @@ network_topology_strategy::network_topology_strategy(replication_strategy_params
|
||||
if (boost::equals(key, "replication_factor")) {
|
||||
on_internal_error(rslogger, "replication_factor should have been replaced with a DC:RF mapping by now");
|
||||
} else {
|
||||
throw exceptions::configuration_exception(format("'{}' is not a valid option, did you mean (lowercase) 'replication_factor'?", key));
|
||||
throw exceptions::configuration_exception(format(
|
||||
"'{}' is not a valid option, did you mean (lowercase) 'replication_factor'?", key));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,8 +109,8 @@ class natural_endpoints_tracker {
|
||||
, _rf_left(std::min(rf, node_count))
|
||||
// If there aren't enough racks in this DC to fill the RF, we'll still use at least one node from each rack,
|
||||
// and the difference is to be filled by the first encountered nodes.
|
||||
, _acceptable_rack_repeats(rf - rack_count) {
|
||||
}
|
||||
, _acceptable_rack_repeats(rf - rack_count)
|
||||
{}
|
||||
|
||||
/**
|
||||
* Attempts to add an endpoint to the replicas for this datacenter, adding to the endpoints set if successful.
|
||||
@@ -199,7 +201,8 @@ public:
|
||||
, _tp(_tm.get_topology())
|
||||
, _dc_rep_factor(dc_rep_factor)
|
||||
, _token_owners(_tm.get_datacenter_token_owners())
|
||||
, _racks(_tm.get_datacenter_racks_token_owners()) {
|
||||
, _racks(_tm.get_datacenter_racks_token_owners())
|
||||
{
|
||||
// not aware of any cluster members
|
||||
SCYLLA_ASSERT(!_token_owners.empty() && !_racks.empty());
|
||||
|
||||
@@ -248,14 +251,16 @@ public:
|
||||
for (const auto& [dc, rf_data] : dc_rf) {
|
||||
auto rf = rf_data.count();
|
||||
if (rf > endpoints_in(dc)) {
|
||||
throw exceptions::configuration_exception(
|
||||
seastar::format("Datacenter {} doesn't have enough token-owning nodes for replication_factor={}", dc, rf));
|
||||
throw exceptions::configuration_exception(seastar::format(
|
||||
"Datacenter {} doesn't have enough token-owning nodes for replication_factor={}", dc, rf));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
future<host_id_set> network_topology_strategy::calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const {
|
||||
future<host_id_set>
|
||||
network_topology_strategy::calculate_natural_endpoints(
|
||||
const token& search_token, const token_metadata& tm) const {
|
||||
|
||||
natural_endpoints_tracker tracker(tm, _dc_rep_factor);
|
||||
|
||||
@@ -280,14 +285,12 @@ void network_topology_strategy::validate_options(const gms::feature_service& fs,
|
||||
for (auto& c : _config_options) {
|
||||
if (c.first == sstring("replication_factor")) {
|
||||
on_internal_error(rslogger, fmt::format("'replication_factor' tag should be unrolled into a list of DC:RF by now."
|
||||
"_config_options:{}",
|
||||
_config_options));
|
||||
"_config_options:{}", _config_options));
|
||||
}
|
||||
auto dc = dcs.find(c.first);
|
||||
if (dc == dcs.end()) {
|
||||
throw exceptions::configuration_exception(format("Unrecognized strategy option {{{}}} "
|
||||
"passed to NetworkTopologyStrategy",
|
||||
this->to_qualified_class_name(c.first)));
|
||||
"passed to NetworkTopologyStrategy", this->to_qualified_class_name(c.first)));
|
||||
}
|
||||
auto racks = dc->second | std::views::keys | std::ranges::to<std::unordered_set<sstring>>();
|
||||
auto rf = parse_replication_factor(c.second);
|
||||
@@ -308,8 +311,8 @@ future<tablet_map> network_topology_strategy::allocate_tablets_for_new_table(sch
|
||||
rslogger.info("Rounding up tablet count from {} to {} for table {}.{}", tablet_count, aligned_tablet_count, s->ks_name(), s->cf_name());
|
||||
tablet_count = aligned_tablet_count;
|
||||
}
|
||||
co_return co_await reallocate_tablets(
|
||||
std::move(s), std::move(tm), tablet_map(tablet_count, get_consistency() != data_dictionary::consistency_config_option::eventual));
|
||||
co_return co_await reallocate_tablets(std::move(s), std::move(tm),
|
||||
tablet_map(tablet_count, get_consistency() != data_dictionary::consistency_config_option::eventual));
|
||||
}
|
||||
|
||||
future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, tablet_map tablets) const {
|
||||
@@ -318,15 +321,16 @@ future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, t
|
||||
co_await load.populate_with_normalized_load();
|
||||
co_await load.populate(std::nullopt, s->id());
|
||||
|
||||
tablet_logger.debug(
|
||||
"Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());
|
||||
tablet_logger.debug("Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());
|
||||
|
||||
for (tablet_id tb : tablets.tablet_ids()) {
|
||||
auto tinfo = tablets.get_tablet_info(tb);
|
||||
tinfo.replicas = co_await reallocate_tablets(s, tm, load, tablets, tb);
|
||||
if (tablets.has_raft_info()) {
|
||||
if (!tablets.get_tablet_raft_info(tb).group_id) {
|
||||
tablets.set_tablet_raft_info(tb, tablet_raft_info{.group_id = raft::group_id{utils::UUID_gen::get_time_UUID()}});
|
||||
tablets.set_tablet_raft_info(tb, tablet_raft_info {
|
||||
.group_id = raft::group_id{utils::UUID_gen::get_time_UUID()}
|
||||
});
|
||||
}
|
||||
}
|
||||
tablets.set_tablet(tb, std::move(tinfo));
|
||||
@@ -336,8 +340,7 @@ future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, t
|
||||
co_return tablets;
|
||||
}
|
||||
|
||||
future<tablet_replica_set> network_topology_strategy::reallocate_tablets(
|
||||
schema_ptr s, token_metadata_ptr tm, load_sketch& load, const tablet_map& cur_tablets, tablet_id tb) const {
|
||||
future<tablet_replica_set> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, load_sketch& load, const tablet_map& cur_tablets, tablet_id tb) const {
|
||||
tablet_replica_set replicas;
|
||||
// Current number of replicas per dc
|
||||
std::unordered_map<sstring, size_t> nodes_per_dc;
|
||||
@@ -361,8 +364,8 @@ future<tablet_replica_set> network_topology_strategy::reallocate_tablets(
|
||||
if (new_rf && new_rf->is_rack_based()) {
|
||||
auto diff = diff_racks(old_racks_per_dc[dc], new_rf->get_rack_list());
|
||||
|
||||
tablet_logger.debug("reallocate_tablets {}.{} tablet_id={} dc={} old_racks={} add_racks={} del_racks={}", s->ks_name(), s->cf_name(), tb, dc,
|
||||
old_racks_per_dc[dc], diff.added, diff.removed);
|
||||
tablet_logger.debug("reallocate_tablets {}.{} tablet_id={} dc={} old_racks={} add_racks={} del_racks={}",
|
||||
s->ks_name(), s->cf_name(), tb, dc, old_racks_per_dc[dc], diff.added, diff.removed);
|
||||
|
||||
if (!diff) {
|
||||
continue;
|
||||
@@ -392,18 +395,23 @@ future<tablet_replica_set> network_topology_strategy::reallocate_tablets(
|
||||
co_return replicas;
|
||||
}
|
||||
|
||||
tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas, const sstring& dc, const rack_list& racks_to_drop) const {
|
||||
tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s,
|
||||
token_metadata_ptr tm,
|
||||
load_sketch& load,
|
||||
tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas,
|
||||
const sstring& dc,
|
||||
const rack_list& racks_to_drop) const {
|
||||
auto& topo = tm->get_topology();
|
||||
tablet_replica_set filtered;
|
||||
auto is_rack_to_drop = [&racks_to_drop](const sstring& rack) {
|
||||
auto is_rack_to_drop = [&racks_to_drop] (const sstring& rack) {
|
||||
return std::ranges::contains(racks_to_drop, rack);
|
||||
};
|
||||
for (const auto& tr : cur_replicas) {
|
||||
auto& node = topo.get_node(tr.host);
|
||||
if (node.dc_rack().dc == dc && is_rack_to_drop(node.dc_rack().rack)) {
|
||||
tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}", s->ks_name(), s->cf_name(), tb, node.dc_rack().dc,
|
||||
node.dc_rack().rack, tr);
|
||||
tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}",
|
||||
s->ks_name(), s->cf_name(), tb, node.dc_rack().dc, node.dc_rack().rack, tr);
|
||||
load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
|
||||
} else {
|
||||
filtered.emplace_back(tr);
|
||||
@@ -412,17 +420,22 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s
|
||||
return filtered;
|
||||
}
|
||||
|
||||
tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas, const sstring& dc, const rack_list& racks_to_add) const {
|
||||
tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
|
||||
token_metadata_ptr tm,
|
||||
load_sketch& load,
|
||||
tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas,
|
||||
const sstring& dc,
|
||||
const rack_list& racks_to_add) const {
|
||||
auto nodes = tm->get_datacenter_racks_token_owners_nodes();
|
||||
auto& dc_nodes = nodes.at(dc);
|
||||
auto new_replicas = cur_replicas;
|
||||
|
||||
for (auto&& rack : racks_to_add) {
|
||||
for (auto&& rack: racks_to_add) {
|
||||
host_id min_node;
|
||||
double min_load = std::numeric_limits<double>::max();
|
||||
|
||||
for (auto&& node : dc_nodes.at(rack)) {
|
||||
for (auto&& node: dc_nodes.at(rack)) {
|
||||
if (!node.get().is_normal()) {
|
||||
continue;
|
||||
}
|
||||
@@ -437,26 +450,29 @@ tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
|
||||
}
|
||||
|
||||
if (!min_node) {
|
||||
throw std::runtime_error(fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
|
||||
throw std::runtime_error(
|
||||
fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
|
||||
}
|
||||
|
||||
auto new_replica = tablet_replica{min_node, load.next_shard(min_node, 1, service::default_target_tablet_size)};
|
||||
new_replicas.push_back(new_replica);
|
||||
|
||||
tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}", s->ks_name(), s->cf_name(), tb.id, dc, rack, min_load,
|
||||
new_replica);
|
||||
tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, dc, rack, min_load, new_replica);
|
||||
}
|
||||
return new_replicas;
|
||||
}
|
||||
|
||||
future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
|
||||
std::map<sstring, std::unordered_set<locator::host_id>>& replicas_per_rack, const tablet_replica_set& cur_replicas, sstring dc, size_t dc_node_count,
|
||||
size_t dc_rf) const {
|
||||
std::map<sstring, std::unordered_set<locator::host_id>>& replicas_per_rack,
|
||||
const tablet_replica_set& cur_replicas,
|
||||
sstring dc, size_t dc_node_count, size_t dc_rf) const {
|
||||
static thread_local std::default_random_engine rnd_engine{std::random_device{}()};
|
||||
|
||||
auto replicas = cur_replicas;
|
||||
// all_dc_racks is ordered lexicographically on purpose
|
||||
auto all_dc_racks = tm->get_datacenter_racks_token_owners_nodes().at(dc) | std::ranges::to<std::map>();
|
||||
auto all_dc_racks = tm->get_datacenter_racks_token_owners_nodes().at(dc)
|
||||
| std::ranges::to<std::map>();
|
||||
|
||||
// Track all nodes with no replicas on them for this tablet, per rack.
|
||||
struct node_load {
|
||||
@@ -465,7 +481,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
};
|
||||
// for sorting in descending load order
|
||||
// (in terms of load)
|
||||
auto node_load_cmp = [](const node_load& a, const node_load& b) {
|
||||
auto node_load_cmp = [] (const node_load& a, const node_load& b) {
|
||||
return a.load > b.load;
|
||||
};
|
||||
|
||||
@@ -517,7 +533,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
|
||||
// ensure fairness across racks (in particular if rf < number_of_racks)
|
||||
// by rotating the racks order
|
||||
auto append_candidate_racks = [&](candidates_list& racks) {
|
||||
auto append_candidate_racks = [&] (candidates_list& racks) {
|
||||
if (auto size = racks.size()) {
|
||||
auto it = racks.begin() + tb.id % size;
|
||||
std::move(it, racks.end(), std::back_inserter(candidate_racks));
|
||||
@@ -529,19 +545,20 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
append_candidate_racks(existing_racks);
|
||||
|
||||
if (candidate_racks.empty()) {
|
||||
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{}: no candidate racks found for dc={} allocated={} rf={}: existing={}",
|
||||
s->ks_name(), s->cf_name(), dc, dc_node_count, dc_rf, replicas_per_rack));
|
||||
on_internal_error(tablet_logger,
|
||||
seastar::format("allocate_replica {}.{}: no candidate racks found for dc={} allocated={} rf={}: existing={}",
|
||||
s->ks_name(), s->cf_name(), dc, dc_node_count, dc_rf, replicas_per_rack));
|
||||
}
|
||||
|
||||
auto candidate_rack = candidate_racks.begin();
|
||||
|
||||
auto allocate_replica = [&](candidates_list::iterator& candidate) {
|
||||
auto allocate_replica = [&] (candidates_list::iterator& candidate) {
|
||||
const auto& rack = candidate->rack;
|
||||
auto& nodes = candidate->nodes;
|
||||
if (nodes.empty()) {
|
||||
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{} tablet_id={}: candidates vector for rack={} is empty for allocating "
|
||||
"tablet replicas in dc={} allocated={} rf={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
|
||||
on_internal_error(tablet_logger,
|
||||
seastar::format("allocate_replica {}.{} tablet_id={}: candidates vector for rack={} is empty for allocating tablet replicas in dc={} allocated={} rf={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
|
||||
}
|
||||
auto host_id = nodes.back().host;
|
||||
auto replica = tablet_replica{host_id, load.next_shard(host_id, 1, service::default_target_tablet_size)};
|
||||
@@ -549,13 +566,13 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
auto inserted = replicas_per_rack[node.dc_rack().rack].insert(host_id).second;
|
||||
// Sanity check that a node is not used more than once
|
||||
if (!inserted) {
|
||||
on_internal_error(tablet_logger, seastar::format("allocate_replica {}.{} tablet_id={}: allocated replica={} node already used when allocating "
|
||||
"tablet replicas in dc={} allocated={} rf={}: replicas={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, replica, dc, dc_node_count, dc_rf, replicas));
|
||||
on_internal_error(tablet_logger,
|
||||
seastar::format("allocate_replica {}.{} tablet_id={}: allocated replica={} node already used when allocating tablet replicas in dc={} allocated={} rf={}: replicas={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, replica, dc, dc_node_count, dc_rf, replicas));
|
||||
}
|
||||
nodes.pop_back();
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: allocated tablet replica={} dc={} rack={}: nodes remaining in rack={}", s->ks_name(),
|
||||
s->cf_name(), tb.id, replica, node.dc_rack().dc, node.dc_rack().rack, nodes.size());
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: allocated tablet replica={} dc={} rack={}: nodes remaining in rack={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, replica, node.dc_rack().dc, node.dc_rack().rack, nodes.size());
|
||||
if (nodes.empty()) {
|
||||
candidate = candidate_racks.erase(candidate);
|
||||
} else {
|
||||
@@ -566,8 +583,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
}
|
||||
if (tablet_logger.is_enabled(log_level::trace)) {
|
||||
if (candidate != candidate_racks.end()) {
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: next rack={} nodes={}", s->ks_name(), s->cf_name(), tb.id, candidate->rack,
|
||||
candidate->nodes.size());
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: next rack={} nodes={}", s->ks_name(), s->cf_name(), tb.id, candidate->rack, candidate->nodes.size());
|
||||
} else {
|
||||
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: no candidate racks left", s->ks_name(), s->cf_name(), tb.id);
|
||||
}
|
||||
@@ -575,15 +591,15 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
return replica;
|
||||
};
|
||||
|
||||
tablet_logger.debug("allocate_replica {}.{} tablet_id={}: allocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id, dc,
|
||||
dc_node_count, dc_rf);
|
||||
tablet_logger.debug("allocate_replica {}.{} tablet_id={}: allocating tablet replicas in dc={} allocated={} rf={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);
|
||||
|
||||
for (size_t remaining = dc_rf - dc_node_count; remaining; --remaining) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (candidate_rack == candidate_racks.end()) {
|
||||
on_internal_error(tablet_logger, format("allocate_replica {}.{} tablet_id={}: ran out of candidates for allocating tablet replicas in dc={} "
|
||||
"allocated={} rf={}: remaining={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf, remaining));
|
||||
on_internal_error(tablet_logger,
|
||||
format("allocate_replica {}.{} tablet_id={}: ran out of candidates for allocating tablet replicas in dc={} allocated={} rf={}: remaining={}",
|
||||
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf, remaining));
|
||||
}
|
||||
replicas.emplace_back(allocate_replica(candidate_rack));
|
||||
}
|
||||
@@ -592,9 +608,9 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
|
||||
}
|
||||
|
||||
tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, const locator::topology& topo, load_sketch& load, tablet_id tb,
|
||||
const tablet_replica_set& cur_replicas, sstring dc, size_t dc_node_count, size_t dc_rf) const {
|
||||
tablet_logger.debug("drop_tablets_in_dc {}.{} tablet_id={}: deallocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id,
|
||||
dc, dc_node_count, dc_rf);
|
||||
const tablet_replica_set& cur_replicas,
|
||||
sstring dc, size_t dc_node_count, size_t dc_rf) const {
|
||||
tablet_logger.debug("drop_tablets_in_dc {}.{} tablet_id={}: deallocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);
|
||||
|
||||
// Leave dc_rf replicas in dc, effectively deallocating in reverse order,
|
||||
// to maintain replica pairing between the base table and its materialized views.
|
||||
@@ -613,7 +629,8 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, c
|
||||
return filtered;
|
||||
}
|
||||
|
||||
sstring network_topology_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
|
||||
sstring network_topology_strategy::sanity_check_read_replicas(const effective_replication_map& erm,
|
||||
const host_id_vector_replica_set& read_replicas) const {
|
||||
const auto& topology = erm.get_topology();
|
||||
|
||||
struct rf_node_count {
|
||||
@@ -646,4 +663,4 @@ sstring network_topology_strategy::sanity_check_read_replicas(const effective_re
|
||||
using registry = class_registrator<abstract_replication_strategy, network_topology_strategy, replication_strategy_params, const topology*>;
|
||||
static registry registrator("org.apache.cassandra.locator.NetworkTopologyStrategy");
|
||||
static registry registrator_short_name("NetworkTopologyStrategy");
|
||||
} // namespace locator
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -611,10 +611,6 @@ public:
|
||||
/// Returns tablet_id of a tablet which owns a given token.
|
||||
tablet_id get_tablet_id(token) const;
|
||||
|
||||
// Returns the side of the tablet's range that a given token belongs to.
|
||||
// Less expensive than get_tablet_id_and_range_side() when tablet_id is already known.
|
||||
tablet_range_side get_tablet_range_side(token) const;
|
||||
|
||||
// Returns tablet_id and also the side of the tablet's range that a given token belongs to.
|
||||
std::pair<tablet_id, tablet_range_side> get_tablet_id_and_range_side(token) const;
|
||||
|
||||
|
||||
@@ -26,16 +26,12 @@
|
||||
|
||||
struct node_printer {
|
||||
const locator::node* v;
|
||||
node_printer(const locator::node* n) noexcept
|
||||
: v(n) {
|
||||
}
|
||||
node_printer(const locator::node* n) noexcept : v(n) {}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<node_printer> {
|
||||
constexpr auto parse(format_parse_context& ctx) {
|
||||
return ctx.begin();
|
||||
}
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const node_printer& np, fmt::format_context& ctx) const {
|
||||
const locator::node* node = np.v;
|
||||
auto out = fmt::format_to(ctx.out(), "node={}", fmt::ptr(node));
|
||||
@@ -47,9 +43,7 @@ struct fmt::formatter<node_printer> {
|
||||
};
|
||||
|
||||
static auto lazy_backtrace() {
|
||||
return seastar::value_of([] {
|
||||
return current_backtrace();
|
||||
});
|
||||
return seastar::value_of([] { return current_backtrace(); });
|
||||
}
|
||||
|
||||
namespace locator {
|
||||
@@ -57,12 +51,11 @@ namespace locator {
|
||||
static logging::logger tlogger("topology");
|
||||
|
||||
thread_local const endpoint_dc_rack endpoint_dc_rack::default_location = {
|
||||
.dc = locator::production_snitch_base::default_dc,
|
||||
.rack = locator::production_snitch_base::default_rack,
|
||||
.dc = locator::production_snitch_base::default_dc,
|
||||
.rack = locator::production_snitch_base::default_rack,
|
||||
};
|
||||
|
||||
node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded,
|
||||
this_node is_this_node, node::idx_type idx, bool draining)
|
||||
node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded, this_node is_this_node, node::idx_type idx, bool draining)
|
||||
: _topology(topology)
|
||||
, _host_id(id)
|
||||
, _dc_rack(std::move(dc_rack))
|
||||
@@ -71,11 +64,10 @@ node::node(const locator::topology* topology, locator::host_id id, endpoint_dc_r
|
||||
, _excluded(excluded)
|
||||
, _draining(draining)
|
||||
, _is_this_node(is_this_node)
|
||||
, _idx(idx) {
|
||||
}
|
||||
, _idx(idx)
|
||||
{}
|
||||
|
||||
node_holder node::make(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded,
|
||||
node::this_node is_this_node, node::idx_type idx, bool draining) {
|
||||
node_holder node::make(const locator::topology* topology, locator::host_id id, endpoint_dc_rack dc_rack, state state, shard_id shard_count, bool excluded, node::this_node is_this_node, node::idx_type idx, bool draining) {
|
||||
return std::make_unique<node>(topology, std::move(id), std::move(dc_rack), std::move(state), shard_count, excluded, is_this_node, idx, draining);
|
||||
}
|
||||
|
||||
@@ -85,22 +77,14 @@ node_holder node::clone() const {
|
||||
|
||||
std::string node::to_string(node::state s) {
|
||||
switch (s) {
|
||||
case state::none:
|
||||
return "none";
|
||||
case state::bootstrapping:
|
||||
return "bootstrapping";
|
||||
case state::replacing:
|
||||
return "replacing";
|
||||
case state::normal:
|
||||
return "normal";
|
||||
case state::being_decommissioned:
|
||||
return "being_decommissioned";
|
||||
case state::being_removed:
|
||||
return "being_removed";
|
||||
case state::being_replaced:
|
||||
return "being_replaced";
|
||||
case state::left:
|
||||
return "left";
|
||||
case state::none: return "none";
|
||||
case state::bootstrapping: return "bootstrapping";
|
||||
case state::replacing: return "replacing";
|
||||
case state::normal: return "normal";
|
||||
case state::being_decommissioned: return "being_decommissioned";
|
||||
case state::being_removed: return "being_removed";
|
||||
case state::being_replaced: return "being_replaced";
|
||||
case state::left: return "left";
|
||||
}
|
||||
__builtin_unreachable();
|
||||
}
|
||||
@@ -117,19 +101,21 @@ future<> topology::clear_gently() noexcept {
|
||||
}
|
||||
|
||||
topology::topology(shallow_copy, config cfg)
|
||||
: _shard(this_shard_id())
|
||||
, _cfg(cfg)
|
||||
, _sort_by_proximity(true) {
|
||||
: _shard(this_shard_id())
|
||||
, _cfg(cfg)
|
||||
, _sort_by_proximity(true)
|
||||
{
|
||||
// constructor for shallow copying of token_metadata_impl
|
||||
}
|
||||
|
||||
topology::topology(config cfg)
|
||||
: _shard(this_shard_id())
|
||||
, _cfg(cfg)
|
||||
, _sort_by_proximity(!cfg.disable_proximity_sorting)
|
||||
, _random_engine(std::random_device{}()) {
|
||||
tlogger.trace("topology[{}]: constructing using config: endpoint={} id={} dc={} rack={}", fmt::ptr(this), cfg.this_endpoint, cfg.this_host_id,
|
||||
cfg.local_dc_rack.dc, cfg.local_dc_rack.rack);
|
||||
: _shard(this_shard_id())
|
||||
, _cfg(cfg)
|
||||
, _sort_by_proximity(!cfg.disable_proximity_sorting)
|
||||
, _random_engine(std::random_device{}())
|
||||
{
|
||||
tlogger.trace("topology[{}]: constructing using config: endpoint={} id={} dc={} rack={}", fmt::ptr(this),
|
||||
cfg.this_endpoint, cfg.this_host_id, cfg.local_dc_rack.dc, cfg.local_dc_rack.rack);
|
||||
add_node(cfg.this_host_id, cfg.local_dc_rack, node::state::none);
|
||||
}
|
||||
|
||||
@@ -145,7 +131,8 @@ topology::topology(topology&& o) noexcept
|
||||
, _dc_racks(std::move(o._dc_racks))
|
||||
, _sort_by_proximity(o._sort_by_proximity)
|
||||
, _datacenters(std::move(o._datacenters))
|
||||
, _random_engine(std::move(o._random_engine)) {
|
||||
, _random_engine(std::move(o._random_engine))
|
||||
{
|
||||
SCYLLA_ASSERT(_shard == this_shard_id());
|
||||
tlogger.trace("topology[{}]: move from [{}]", fmt::ptr(this), fmt::ptr(&o));
|
||||
|
||||
@@ -166,18 +153,16 @@ topology& topology::operator=(topology&& o) noexcept {
|
||||
|
||||
void topology::set_host_id_cfg(host_id this_host_id) {
|
||||
if (_cfg.this_host_id) {
|
||||
on_internal_error(tlogger,
|
||||
fmt::format("topology[{}] set_host_id_cfg can be caller only once current id {} new id {}", fmt::ptr(this), _cfg.this_host_id, this_host_id));
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg can be caller only once current id {} new id {}", fmt::ptr(this), _cfg.this_host_id, this_host_id));
|
||||
}
|
||||
if (_nodes.size() != 1) {
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while nodes size is greater than 1", fmt::ptr(this)));
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while nodes size is greater than 1", fmt::ptr(this)));
|
||||
}
|
||||
if (!_this_node) {
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes is null", fmt::ptr(this)));
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes is null", fmt::ptr(this)));
|
||||
}
|
||||
if (_this_node->host_id()) {
|
||||
on_internal_error(
|
||||
tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes has non null id {}", fmt::ptr(this), _this_node->host_id()));
|
||||
on_internal_error(tlogger, fmt::format("topology[{}] set_host_id_cfg called while _this_nodes has non null id {}", fmt::ptr(this), _this_node->host_id()));
|
||||
}
|
||||
|
||||
remove_node(*_this_node);
|
||||
@@ -218,8 +203,7 @@ const node& topology::add_node(node_holder nptr) {
|
||||
|
||||
if (nptr->topology() != this) {
|
||||
if (nptr->topology()) {
|
||||
on_fatal_internal_error(tlogger,
|
||||
seastar::format("topology[{}]: {} belongs to different topology={}", fmt::ptr(this), node_printer(node), fmt::ptr(node->topology())));
|
||||
on_fatal_internal_error(tlogger, seastar::format("topology[{}]: {} belongs to different topology={}", fmt::ptr(this), node_printer(node), fmt::ptr(node->topology())));
|
||||
}
|
||||
nptr->set_topology(this);
|
||||
}
|
||||
@@ -235,8 +219,7 @@ const node& topology::add_node(node_holder nptr) {
|
||||
try {
|
||||
if (is_configured_this_node(*node)) {
|
||||
if (_this_node) {
|
||||
on_internal_error(tlogger,
|
||||
seastar::format("topology[{}]: {}: local node already mapped to {}", fmt::ptr(this), node_printer(node), node_printer(this_node())));
|
||||
on_internal_error(tlogger, seastar::format("topology[{}]: {}: local node already mapped to {}", fmt::ptr(this), node_printer(node), node_printer(this_node())));
|
||||
}
|
||||
locator::node& n = *_nodes.back();
|
||||
n._is_this_node = node::this_node::yes;
|
||||
@@ -255,25 +238,14 @@ const node& topology::add_node(node_holder nptr) {
|
||||
return *node;
|
||||
}
|
||||
|
||||
void topology::update_node(node& node, std::optional<host_id> opt_id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st,
|
||||
std::optional<shard_id> opt_shard_count) {
|
||||
void topology::update_node(node& node, std::optional<host_id> opt_id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> opt_shard_count) {
|
||||
tlogger.debug("topology[{}]: update_node: {}: to: host_id={} dc={} rack={} state={} shard_count={}, at {}", fmt::ptr(this), node_printer(&node),
|
||||
seastar::value_of([&] {
|
||||
return opt_id ? format("{}", *opt_id) : "unchanged";
|
||||
}),
|
||||
seastar::value_of([&] {
|
||||
return opt_dr ? format("{}", opt_dr->dc) : "unchanged";
|
||||
}),
|
||||
seastar::value_of([&] {
|
||||
return opt_dr ? format("{}", opt_dr->rack) : "unchanged";
|
||||
}),
|
||||
seastar::value_of([&] {
|
||||
return opt_st ? format("{}", *opt_st) : "unchanged";
|
||||
}),
|
||||
seastar::value_of([&] {
|
||||
return opt_shard_count ? format("{}", *opt_shard_count) : "unchanged";
|
||||
}),
|
||||
lazy_backtrace());
|
||||
opt_id ? format("{}", *opt_id) : "unchanged",
|
||||
opt_dr ? format("{}", opt_dr->dc) : "unchanged",
|
||||
opt_dr ? format("{}", opt_dr->rack) : "unchanged",
|
||||
opt_st ? format("{}", *opt_st) : "unchanged",
|
||||
opt_shard_count ? format("{}", *opt_shard_count) : "unchanged",
|
||||
lazy_backtrace());
|
||||
|
||||
bool changed = false;
|
||||
if (opt_id) {
|
||||
@@ -285,8 +257,7 @@ void topology::update_node(node& node, std::optional<host_id> opt_id, std::optio
|
||||
on_internal_error(tlogger, seastar::format("This node host_id is already set: {}: new host_id={}", node_printer(&node), *opt_id));
|
||||
}
|
||||
if (_nodes_by_host_id.contains(*opt_id)) {
|
||||
on_internal_error(tlogger, seastar::format("Cannot update node host_id: {}: new host_id already exists: {}", node_printer(&node),
|
||||
node_printer(find_node(*opt_id))));
|
||||
on_internal_error(tlogger, seastar::format("Cannot update node host_id: {}: new host_id already exists: {}", node_printer(&node), node_printer(find_node(*opt_id))));
|
||||
}
|
||||
changed = true;
|
||||
} else {
|
||||
@@ -471,11 +442,11 @@ const node* topology::find_node(node::idx_type idx) const noexcept {
|
||||
return _nodes.at(idx).get();
|
||||
}
|
||||
|
||||
const node& topology::add_or_update_endpoint(
|
||||
host_id id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> shard_count) {
|
||||
tlogger.trace("topology[{}]: add_or_update_endpoint: host_id={} dc={} rack={} state={} shards={}, at {}", fmt::ptr(this), id,
|
||||
opt_dr.value_or(endpoint_dc_rack{}).dc, opt_dr.value_or(endpoint_dc_rack{}).rack, opt_st.value_or(node::state::none), shard_count,
|
||||
lazy_backtrace());
|
||||
const node& topology::add_or_update_endpoint(host_id id, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st, std::optional<shard_id> shard_count)
|
||||
{
|
||||
tlogger.trace("topology[{}]: add_or_update_endpoint: host_id={} dc={} rack={} state={} shards={}, at {}", fmt::ptr(this),
|
||||
id, opt_dr.value_or(endpoint_dc_rack{}).dc, opt_dr.value_or(endpoint_dc_rack{}).rack, opt_st.value_or(node::state::none), shard_count,
|
||||
lazy_backtrace());
|
||||
|
||||
auto* n = find_node(id);
|
||||
if (n) {
|
||||
@@ -483,10 +454,14 @@ const node& topology::add_or_update_endpoint(
|
||||
return *n;
|
||||
}
|
||||
|
||||
return add_node(id, opt_dr.value_or(endpoint_dc_rack::default_location), opt_st.value_or(node::state::none), shard_count.value_or(0));
|
||||
return add_node(id,
|
||||
opt_dr.value_or(endpoint_dc_rack::default_location),
|
||||
opt_st.value_or(node::state::none),
|
||||
shard_count.value_or(0));
|
||||
}
|
||||
|
||||
bool topology::remove_endpoint(locator::host_id host_id) {
|
||||
bool topology::remove_endpoint(locator::host_id host_id)
|
||||
{
|
||||
auto node = find_node(host_id);
|
||||
tlogger.debug("topology[{}]: remove_endpoint: host_id={}: {}", fmt::ptr(this), host_id, node_printer(node));
|
||||
// Do not allow removing yourself from the topology
|
||||
@@ -527,7 +502,7 @@ void topology::do_sort_by_proximity(locator::host_id address, host_id_vector_rep
|
||||
locator::host_id id;
|
||||
int distance;
|
||||
};
|
||||
auto host_infos = addresses | std::views::transform([&](locator::host_id id) {
|
||||
auto host_infos = addresses | std::views::transform([&] (locator::host_id id) {
|
||||
const auto& loc1 = get_location(id);
|
||||
return info{id, distance(address, loc, id, loc1)};
|
||||
}) | std::ranges::to<utils::small_vector<info, host_id_vector_replica_set::internal_capacity()>>();
|
||||
@@ -589,12 +564,11 @@ std::unordered_set<locator::host_id> topology::get_all_host_ids() const {
|
||||
return ids;
|
||||
}
|
||||
|
||||
std::unordered_map<sstring, std::unordered_set<host_id>> topology::get_datacenter_host_ids() const {
|
||||
std::unordered_map<sstring, std::unordered_set<host_id>>
|
||||
topology::get_datacenter_host_ids() const {
|
||||
std::unordered_map<sstring, std::unordered_set<host_id>> ret;
|
||||
for (auto& [dc, nodes] : _dc_nodes) {
|
||||
ret[dc] = nodes | std::views::transform([](const node& n) {
|
||||
return n.host_id();
|
||||
}) | std::ranges::to<std::unordered_set>();
|
||||
ret[dc] = nodes | std::views::transform([] (const node& n) { return n.host_id(); }) | std::ranges::to<std::unordered_set>();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
9
main.cc
9
main.cc
@@ -19,6 +19,8 @@
|
||||
#include "gms/inet_address.hh"
|
||||
#include "auth/allow_all_authenticator.hh"
|
||||
#include "auth/allow_all_authorizer.hh"
|
||||
#include "auth/maintenance_socket_authenticator.hh"
|
||||
#include "auth/maintenance_socket_role_manager.hh"
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/signal.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
@@ -1962,11 +1964,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
checkpoint(stop_signal, "loading non-system sstables");
|
||||
replica::distributed_loader::init_non_system_keyspaces(db, proxy, sys_ks).get();
|
||||
|
||||
checkpoint(stop_signal, "recovering logstor");
|
||||
db.invoke_on_all([] (replica::database& db) {
|
||||
return db.recover_logstor();
|
||||
}).get();
|
||||
|
||||
// Depends on all keyspaces being initialized because after this call
|
||||
// we can be reloading schema.
|
||||
mm.local().register_feature_listeners();
|
||||
@@ -2105,7 +2102,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
if (cfg->maintenance_socket() != "ignore") {
|
||||
checkpoint(stop_signal, "starting maintenance auth service");
|
||||
maintenance_auth_service.start(std::ref(qp), std::ref(group0_client),
|
||||
auth::make_maintenance_socket_authorizer_factory(qp),
|
||||
auth::make_authorizer_factory(auth::allow_all_authorizer_name, qp),
|
||||
auth::make_maintenance_socket_authenticator_factory(qp, group0_client, mm, auth_cache),
|
||||
auth::make_maintenance_socket_role_manager_factory(qp, group0_client, mm, auth_cache),
|
||||
maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
|
||||
|
||||
@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
|
||||
.entity = stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -8,10 +8,9 @@
|
||||
|
||||
"""exec_cql.py
|
||||
Execute CQL statements from a file where each non-empty, non-comment line is exactly one CQL statement.
|
||||
Connects via a Unix domain socket (maintenance socket), bypassing authentication.
|
||||
Requires python cassandra-driver. Stops at first failure.
|
||||
Usage:
|
||||
./exec_cql.py --file ./conf/auth.cql --socket /path/to/cql.m
|
||||
./exec_cql.py --file ./conf/auth.cql [--host 127.0.0.1 --port 9042]
|
||||
"""
|
||||
import argparse, os, sys
|
||||
from typing import Sequence
|
||||
@@ -27,27 +26,18 @@ def read_statements(path: str) -> list[tuple[int, str]]:
|
||||
stms.append((lineno, line))
|
||||
return stms
|
||||
|
||||
def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout: float) -> int:
|
||||
"""Execute CQL statements via a Unix domain socket (maintenance socket).
|
||||
|
||||
The maintenance socket only starts listening after the auth subsystem is
|
||||
fully initialised, so a successful connect means the node is ready.
|
||||
"""
|
||||
from cassandra.cluster import Cluster
|
||||
from cassandra.connection import UnixSocketEndPoint # type: ignore
|
||||
from cassandra.policies import WhiteListRoundRobinPolicy # type: ignore
|
||||
|
||||
ep = UnixSocketEndPoint(socket_path)
|
||||
def exec_driver(statements: list[tuple[int, str]], host: str, port: int, timeout: float, username: str, password: str) -> int:
|
||||
try:
|
||||
cluster = Cluster(
|
||||
contact_points=[ep],
|
||||
load_balancing_policy=WhiteListRoundRobinPolicy([ep]),
|
||||
)
|
||||
session = cluster.connect()
|
||||
except Exception as e:
|
||||
print(f'ERROR: failed to connect to maintenance socket {socket_path}: {e}', file=sys.stderr)
|
||||
from cassandra.cluster import Cluster
|
||||
from cassandra.auth import PlainTextAuthProvider # type: ignore
|
||||
except Exception:
|
||||
print('ERROR: cassandra-driver not installed. Install with: pip install cassandra-driver', file=sys.stderr)
|
||||
return 2
|
||||
|
||||
auth_provider = None
|
||||
if username != "":
|
||||
auth_provider = PlainTextAuthProvider(username=username, password=password)
|
||||
cluster = Cluster([host], port=port, auth_provider=auth_provider)
|
||||
session = cluster.connect()
|
||||
try:
|
||||
for _, (lineno, s) in enumerate(statements, 1):
|
||||
try:
|
||||
@@ -60,11 +50,13 @@ def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout
|
||||
return 0
|
||||
|
||||
def main(argv: Sequence[str]) -> int:
|
||||
ap = argparse.ArgumentParser(description='Execute one-line CQL statements from file via maintenance socket')
|
||||
ap = argparse.ArgumentParser(description='Execute one-line CQL statements from file (driver only)')
|
||||
ap.add_argument('--file', required=True)
|
||||
ap.add_argument('--socket', required=True,
|
||||
help='Path to the Unix domain maintenance socket (<workdir>/cql.m)')
|
||||
ap.add_argument('--host', default='127.0.0.1')
|
||||
ap.add_argument('--port', type=int, default=9042)
|
||||
ap.add_argument('--timeout', type=float, default=30.0)
|
||||
ap.add_argument('--username', default='cassandra')
|
||||
ap.add_argument('--password', default='cassandra')
|
||||
args = ap.parse_args(argv)
|
||||
if not os.path.isfile(args.file):
|
||||
print(f"File not found: {args.file}", file=sys.stderr)
|
||||
@@ -73,7 +65,7 @@ def main(argv: Sequence[str]) -> int:
|
||||
if not stmts:
|
||||
print('No statements found', file=sys.stderr)
|
||||
return 1
|
||||
rc = exec_statements(stmts, args.socket, args.timeout)
|
||||
rc = exec_driver(stmts, args.host, args.port, args.timeout, args.username, args.password)
|
||||
if rc == 0:
|
||||
print('All statements executed successfully')
|
||||
return rc
|
||||
|
||||
58
pgo/pgo.py
58
pgo/pgo.py
@@ -15,7 +15,6 @@ from typing import Any, Optional
|
||||
import asyncio
|
||||
import contextlib
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -365,14 +364,12 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str,
|
||||
llvm_profile_file = f"{addr}-%m.profraw"
|
||||
scylla_workdir = f"{addr}"
|
||||
logfile = f"{addr}.log"
|
||||
socket = maintenance_socket_path(cluster_workdir, addr)
|
||||
command = [
|
||||
"env",
|
||||
f"LLVM_PROFILE_FILE={llvm_profile_file}",
|
||||
f"SCYLLA_HOME={os.path.realpath(os.getcwd())}", # We assume that the script has Scylla's `conf/` as its filesystem neighbour.
|
||||
os.path.realpath(executable),
|
||||
f"--workdir={scylla_workdir}",
|
||||
f"--maintenance-socket={socket}",
|
||||
"--ring-delay-ms=0",
|
||||
"--developer-mode=yes",
|
||||
"--memory=1G",
|
||||
@@ -394,7 +391,6 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str,
|
||||
f"--authenticator=PasswordAuthenticator",
|
||||
f"--authorizer=CassandraAuthorizer",
|
||||
] + list(extra_opts)
|
||||
training_logger.info(f"Using maintenance socket {socket}")
|
||||
return await run(['bash', '-c', fr"""exec {shlex.join(command)} >{q(logfile)} 2>&1"""], cwd=cluster_workdir)
|
||||
|
||||
async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optional[list[str]], workdir: PathLike, cluster_name: str, extra_opts: list[str]) -> list[Process]:
|
||||
@@ -437,25 +433,16 @@ async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optiona
|
||||
procs.append(proc)
|
||||
await wait_for_node(proc, addrs[i], timeout)
|
||||
except:
|
||||
await stop_cluster(procs, addrs, cluster_workdir=workdir)
|
||||
await stop_cluster(procs, addrs)
|
||||
raise
|
||||
return procs
|
||||
|
||||
async def stop_cluster(procs: list[Process], addrs: list[str], cluster_workdir: PathLike) -> None:
|
||||
async def stop_cluster(procs: list[Process], addrs: list[str]) -> None:
|
||||
"""Stops a Scylla cluster started with start_cluster().
|
||||
Doesn't return until all nodes exit, even if stop_cluster() is cancelled.
|
||||
|
||||
"""
|
||||
await clean_gather(*[cancel_process(p, timeout=60) for p in procs])
|
||||
_cleanup_short_sockets(cluster_workdir, addrs)
|
||||
|
||||
def _cleanup_short_sockets(cluster_workdir: PathLike, addrs: list[str]) -> None:
|
||||
"""Remove short maintenance socket files created in /tmp."""
|
||||
for addr in addrs:
|
||||
try:
|
||||
os.unlink(maintenance_socket_path(cluster_workdir, addr))
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
async def wait_for_port(addr: str, port: int) -> None:
|
||||
await bash(fr'until printf "" >>/dev/tcp/{addr}/{port}; do sleep 0.1; done 2>/dev/null')
|
||||
@@ -465,33 +452,6 @@ async def merge_profraw(directory: PathLike) -> None:
|
||||
if glob.glob(f"{directory}/*.profraw"):
|
||||
await bash(fr"llvm-profdata merge {q(directory)}/*.profraw -output {q(directory)}/prof.profdata")
|
||||
|
||||
def maintenance_socket_path(cluster_workdir: PathLike, addr: str) -> str:
|
||||
"""Return the maintenance socket path for a node.
|
||||
|
||||
Returns a short deterministic path in /tmp (derived from an MD5 hash of
|
||||
the natural ``<cluster_workdir>/<addr>/cql.m`` path) to stay within the
|
||||
Unix domain socket length limit.
|
||||
The same path is passed to Scylla via ``--maintenance-socket`` in
|
||||
``start_node()``.
|
||||
"""
|
||||
natural = os.path.realpath(f"{cluster_workdir}/{addr}/cql.m")
|
||||
path_hash = hashlib.md5(natural.encode()).hexdigest()[:12]
|
||||
return os.path.join(tempfile.gettempdir(), f'pgo-{path_hash}.m')
|
||||
|
||||
async def setup_cassandra_user(workdir: PathLike, addr: str) -> None:
|
||||
"""Create the ``cassandra`` superuser via the maintenance socket.
|
||||
|
||||
The default cassandra superuser is no longer seeded automatically, but
|
||||
``cassandra-stress`` hardcodes ``user=cassandra password=cassandra``.
|
||||
We create the role over the maintenance socket so that cassandra-stress
|
||||
and other tools that rely on the default credentials keep working.
|
||||
"""
|
||||
socket = maintenance_socket_path(workdir, addr)
|
||||
stmt = "CREATE ROLE cassandra WITH PASSWORD = 'cassandra' AND SUPERUSER = true AND LOGIN = true;"
|
||||
f = q(socket)
|
||||
# Write the statement to a temp file and execute it via exec_cql.py.
|
||||
await bash(fr"""tmpf=$(mktemp); echo {q(stmt)} > "$tmpf"; python3 ./exec_cql.py --file "$tmpf" --socket {f}; rc=$?; rm -f "$tmpf"; exit $rc""")
|
||||
|
||||
async def get_bolt_opts(executable: PathLike) -> list[str]:
|
||||
"""Returns the extra opts which have to be passed to a BOLT-instrumented Scylla
|
||||
to trigger a generation of a BOLT profile file.
|
||||
@@ -543,7 +503,7 @@ async def with_cluster(executable: PathLike, workdir: PathLike, cpusets: Optiona
|
||||
yield addrs, procs
|
||||
finally:
|
||||
training_logger.info(f"Stopping the cluster in {workdir}")
|
||||
await stop_cluster(procs, addrs, cluster_workdir=workdir)
|
||||
await stop_cluster(procs, addrs)
|
||||
training_logger.info(f"Stopped the cluster in {workdir}")
|
||||
|
||||
################################################################################
|
||||
@@ -597,10 +557,8 @@ def kw(**kwargs):
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def with_cs_populate(executable: PathLike, workdir: PathLike) -> AsyncIterator[str]:
|
||||
"""Provides a Scylla cluster, creates the cassandra superuser, and waits
|
||||
for compactions to end before stopping it."""
|
||||
"""Provides a Scylla cluster and waits for compactions to end before stopping it."""
|
||||
async with with_cluster(executable=executable, workdir=workdir) as (addrs, procs):
|
||||
await setup_cassandra_user(workdir, addrs[0])
|
||||
yield addrs[0]
|
||||
async with asyncio.timeout(3600):
|
||||
# Should it also flush memtables?
|
||||
@@ -709,10 +667,9 @@ populators["decommission_dataset"] = populate_decommission
|
||||
# AUTH CONNECTIONS STRESS ==================================================
|
||||
|
||||
async def populate_auth_conns(executable: PathLike, workdir: PathLike) -> None:
|
||||
# Create roles, table and permissions via CQL script over the maintenance socket.
|
||||
# Create roles, table and permissions via CQL script.
|
||||
async with with_cs_populate(executable=executable, workdir=workdir) as server:
|
||||
socket = maintenance_socket_path(workdir, server)
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/auth.cql --socket {q(socket)}")
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/auth.cql --host {server}")
|
||||
|
||||
async def train_auth_conns(executable: PathLike, workdir: PathLike) -> None:
|
||||
# Repeatedly connect as the reader user and perform simple reads to stress
|
||||
@@ -765,8 +722,7 @@ populators["si_dataset"] = populate_si
|
||||
|
||||
async def populate_counters(executable: PathLike, workdir: PathLike) -> None:
|
||||
async with with_cs_populate(executable=executable, workdir=workdir) as server:
|
||||
socket = maintenance_socket_path(workdir, server)
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/counters.cql --socket {q(socket)}")
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/counters.cql --host {server}")
|
||||
# Sleeps added in reaction to schema disagreement errors.
|
||||
# FIXME: get rid of this sleep and find a sane way to wait for schema
|
||||
# agreement.
|
||||
|
||||
@@ -68,7 +68,6 @@ public:
|
||||
using resources = reader_resources;
|
||||
|
||||
friend class reader_permit;
|
||||
friend struct reader_concurrency_semaphore_tester;
|
||||
|
||||
enum class evict_reason {
|
||||
permit, // evicted due to permit shortage
|
||||
|
||||
1740
repair/repair.cc
1740
repair/repair.cc
File diff suppressed because it is too large
Load Diff
@@ -3253,13 +3253,10 @@ private:
|
||||
// sequentially because the rows from repair follower 1 to
|
||||
// repair master might reduce the amount of missing data
|
||||
// between repair master and repair follower 2.
|
||||
auto working_hashes = master.working_row_hashes().get();
|
||||
repair_hash_set set_diff = get_set_diff(master.peer_row_hash_sets(node_idx), working_hashes);
|
||||
repair_hash_set set_diff = get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get());
|
||||
// Request missing sets from peer node
|
||||
if (rlogger.is_enabled(logging::log_level::debug)) {
|
||||
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
|
||||
node, working_hashes.size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
|
||||
}
|
||||
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
|
||||
node, master.working_row_hashes().get().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
|
||||
// If we need to pull all rows from the peer. We can avoid
|
||||
// sending the row hashes on wire by setting needs_all_rows flag.
|
||||
auto needs_all_rows = repair_meta::needs_all_rows_t(set_diff.size() == master.peer_row_hash_sets(node_idx).size());
|
||||
@@ -3272,9 +3269,7 @@ private:
|
||||
master.get_row_diff(std::move(set_diff), needs_all_rows, node, node_idx, dst_cpu_id);
|
||||
ns.state = repair_state::get_row_diff_finished;
|
||||
}
|
||||
if (rlogger.is_enabled(logging::log_level::debug)) {
|
||||
rlogger.debug("After get_row_diff node {}, hash_sets={}", master.myhostid(), master.working_row_hashes().get().size());
|
||||
}
|
||||
rlogger.debug("After get_row_diff node {}, hash_sets={}", master.myhostid(), master.working_row_hashes().get().size());
|
||||
} catch (...) {
|
||||
rlogger.warn("repair[{}]: get_row_diff: got error from node={}, keyspace={}, table={}, range={}, error={}",
|
||||
_shard_task.global_repair_id.uuid(), node, _shard_task.get_keyspace(), _cf_name, _range, std::current_exception());
|
||||
|
||||
@@ -9,9 +9,6 @@ target_sources(replica
|
||||
memtable.cc
|
||||
exceptions.cc
|
||||
dirty_memory_manager.cc
|
||||
logstor/segment_manager.cc
|
||||
logstor/logstor.cc
|
||||
logstor/write_buffer.cc
|
||||
multishard_query.cc
|
||||
mutation_dump.cc
|
||||
schema_describe_helper.cc
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
// FIXME: un-nest compaction_reenabler, so we can forward declare it and remove this include.
|
||||
#include "compaction/compaction_manager.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "replica/logstor/compaction.hh"
|
||||
#include "sstables/sstable_set.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
@@ -34,10 +33,6 @@ class effective_replication_map;
|
||||
|
||||
namespace replica {
|
||||
|
||||
namespace logstor {
|
||||
class primary_index;
|
||||
}
|
||||
|
||||
using enable_backlog_tracker = bool_class<class enable_backlog_tracker_tag>;
|
||||
|
||||
enum class repair_sstable_classification {
|
||||
@@ -96,12 +91,6 @@ class compaction_group {
|
||||
bool _tombstone_gc_enabled = true;
|
||||
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
|
||||
repair_classifier_func _repair_sstable_classifier;
|
||||
|
||||
lw_shared_ptr<logstor::segment_set> _logstor_segments;
|
||||
std::optional<logstor::separator_buffer> _logstor_separator;
|
||||
std::vector<future<>> _separator_flushes;
|
||||
seastar::semaphore _separator_flush_sem{1};
|
||||
|
||||
private:
|
||||
std::unique_ptr<compaction_group_view> make_compacting_view();
|
||||
std::unique_ptr<compaction_group_view> make_non_compacting_view();
|
||||
@@ -234,7 +223,6 @@ public:
|
||||
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept;
|
||||
// Triggers regular compaction.
|
||||
void trigger_compaction();
|
||||
void trigger_logstor_compaction();
|
||||
bool compaction_disabled() const;
|
||||
future<unsigned> estimate_pending_compactions() const;
|
||||
|
||||
@@ -243,7 +231,6 @@ public:
|
||||
|
||||
size_t live_sstable_count() const noexcept;
|
||||
uint64_t live_disk_space_used() const noexcept;
|
||||
size_t logstor_disk_space_used() const noexcept;
|
||||
sstables::file_size_stats live_disk_space_used_full_stats() const noexcept;
|
||||
uint64_t total_disk_space_used() const noexcept;
|
||||
sstables::file_size_stats total_disk_space_used_full_stats() const noexcept;
|
||||
@@ -275,37 +262,12 @@ public:
|
||||
compaction::compaction_manager& get_compaction_manager() noexcept;
|
||||
const compaction::compaction_manager& get_compaction_manager() const noexcept;
|
||||
|
||||
logstor::segment_manager& get_logstor_segment_manager() noexcept;
|
||||
const logstor::segment_manager& get_logstor_segment_manager() const noexcept;
|
||||
|
||||
logstor::compaction_manager& get_logstor_compaction_manager() noexcept;
|
||||
const logstor::compaction_manager& get_logstor_compaction_manager() const noexcept;
|
||||
|
||||
logstor::primary_index& get_logstor_index() noexcept;
|
||||
|
||||
future<> split(compaction::compaction_type_options::split opt, tasks::task_info tablet_split_task_info);
|
||||
|
||||
void set_repair_sstable_classifier(repair_classifier_func repair_sstable_classifier) {
|
||||
_repair_sstable_classifier = std::move(repair_sstable_classifier);
|
||||
}
|
||||
|
||||
void add_logstor_segment(logstor::segment_descriptor& desc) {
|
||||
_logstor_segments->add_segment(desc);
|
||||
}
|
||||
|
||||
future<> discard_logstor_segments();
|
||||
|
||||
future<> flush_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
logstor::separator_buffer& get_separator_buffer(size_t write_size);
|
||||
|
||||
logstor::segment_set& logstor_segments() noexcept {
|
||||
return *_logstor_segments;
|
||||
}
|
||||
|
||||
const logstor::segment_set& logstor_segments() const noexcept {
|
||||
return *_logstor_segments;
|
||||
}
|
||||
|
||||
friend class storage_group;
|
||||
};
|
||||
|
||||
@@ -350,14 +312,7 @@ public:
|
||||
|
||||
const compaction_group_ptr& main_compaction_group() const noexcept;
|
||||
const std::vector<compaction_group_ptr>& split_ready_compaction_groups() const;
|
||||
// Selects the compaction group for the given token. Computes the range side
|
||||
// from the token only when in splitting mode. This avoids the cost of computing
|
||||
// range side on the hot path when it's not needed.
|
||||
compaction_group_ptr& select_compaction_group(dht::token, const locator::tablet_map&) noexcept;
|
||||
// Selects the compaction group for an sstable spanning a token range.
|
||||
// If the first and last tokens fall on different sides of the split point,
|
||||
// the sstable belongs to the main compaction group.
|
||||
compaction_group_ptr& select_compaction_group(dht::token first, dht::token last, const locator::tablet_map&) noexcept;
|
||||
compaction_group_ptr& select_compaction_group(locator::tablet_range_side) noexcept;
|
||||
|
||||
uint64_t live_disk_space_used() const;
|
||||
|
||||
@@ -477,9 +432,7 @@ public:
|
||||
// refresh_mutation_source must be called when there are changes to data source
|
||||
// structures but logical state of data is not changed (e.g. when state for a
|
||||
// new tablet replica is allocated).
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
|
||||
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
|
||||
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
|
||||
|
||||
1594
replica/database.cc
1594
replica/database.cc
File diff suppressed because it is too large
Load Diff
@@ -16,7 +16,6 @@
|
||||
#include <seastar/core/execution_stage.hh>
|
||||
#include <seastar/core/when_all.hh>
|
||||
#include "replica/global_table_ptr.hh"
|
||||
#include "replica/logstor/compaction.hh"
|
||||
#include "types/user.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/hash.hh"
|
||||
@@ -36,7 +35,6 @@
|
||||
#include <seastar/core/gate.hh>
|
||||
#include "db/commitlog/replay_position.hh"
|
||||
#include "db/commitlog/commitlog_types.hh"
|
||||
#include "logstor/logstor.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "db/view/view.hh"
|
||||
#include "db/snapshot-ctl.hh"
|
||||
@@ -546,9 +544,6 @@ private:
|
||||
utils::phased_barrier _flush_barrier;
|
||||
std::vector<view_ptr> _views;
|
||||
|
||||
logstor::logstor* _logstor = nullptr;
|
||||
std::unique_ptr<logstor::primary_index> _logstor_index;
|
||||
|
||||
std::unique_ptr<cell_locker> _counter_cell_locks; // Memory-intensive; allocate only when needed.
|
||||
|
||||
// Labels used to identify writes and reads for this table in the rate_limiter structure.
|
||||
@@ -616,10 +611,6 @@ public:
|
||||
sstables::offstrategy offstrategy = sstables::offstrategy::no);
|
||||
future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
|
||||
|
||||
bool add_logstor_segment(logstor::segment_descriptor&, dht::token first_token, dht::token last_token);
|
||||
|
||||
logstor::separator_buffer& get_logstor_separator_buffer(dht::token token, size_t write_size);
|
||||
|
||||
// Restricted to new sstables produced by external processes such as repair.
|
||||
// The sstable might undergo split if table is in split mode.
|
||||
// If no need for split, the input sstable will only be attached to the sstable set.
|
||||
@@ -842,21 +833,6 @@ public:
|
||||
// to issue disk operations safely.
|
||||
void mark_ready_for_writes(db::commitlog* cl);
|
||||
|
||||
void init_logstor(logstor::logstor* ls);
|
||||
|
||||
bool uses_logstor() const {
|
||||
return _logstor != nullptr;
|
||||
}
|
||||
|
||||
logstor::primary_index& logstor_index() noexcept {
|
||||
return *_logstor_index;
|
||||
}
|
||||
const logstor::primary_index& logstor_index() const noexcept {
|
||||
return *_logstor_index;
|
||||
}
|
||||
|
||||
size_t get_logstor_memory_usage() const;
|
||||
|
||||
// Creates a mutation reader which covers all data sources for this column family.
|
||||
// Caller needs to ensure that column_family remains live (FIXME: relax this).
|
||||
// Note: for data queries use query() instead.
|
||||
@@ -882,14 +858,6 @@ public:
|
||||
return make_mutation_reader(std::move(schema), std::move(permit), range, full_slice);
|
||||
}
|
||||
|
||||
mutation_reader make_logstor_mutation_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) const;
|
||||
|
||||
// The streaming mutation reader differs from the regular mutation reader in that:
|
||||
// - Reflects all writes accepted by replica prior to creation of the
|
||||
// reader and a _bounded_ amount of writes which arrive later.
|
||||
@@ -1079,7 +1047,6 @@ public:
|
||||
bool needs_flush() const;
|
||||
future<> clear(); // discards memtable(s) without flushing them to disk.
|
||||
future<db::replay_position> discard_sstables(db_clock::time_point);
|
||||
future<> discard_logstor_segments();
|
||||
|
||||
bool can_flush() const;
|
||||
|
||||
@@ -1131,7 +1098,6 @@ public:
|
||||
void start_compaction();
|
||||
void trigger_compaction();
|
||||
void try_trigger_compaction(compaction_group& cg) noexcept;
|
||||
void trigger_logstor_compaction();
|
||||
// Triggers offstrategy compaction, if needed, in the background.
|
||||
void trigger_offstrategy_compaction();
|
||||
// Performs offstrategy compaction, if needed, returning
|
||||
@@ -1160,22 +1126,6 @@ public:
|
||||
return _compaction_manager;
|
||||
}
|
||||
|
||||
logstor::segment_manager& get_logstor_segment_manager() noexcept {
|
||||
return _logstor->get_segment_manager();
|
||||
}
|
||||
|
||||
const logstor::segment_manager& get_logstor_segment_manager() const noexcept {
|
||||
return _logstor->get_segment_manager();
|
||||
}
|
||||
|
||||
logstor::compaction_manager& get_logstor_compaction_manager() noexcept {
|
||||
return _logstor->get_compaction_manager();
|
||||
}
|
||||
|
||||
future<> flush_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
|
||||
future<logstor::table_segment_stats> get_logstor_segment_stats() const;
|
||||
|
||||
table_stats& get_stats() const {
|
||||
return _stats;
|
||||
}
|
||||
@@ -1663,8 +1613,6 @@ private:
|
||||
dirty_memory_manager _system_dirty_memory_manager;
|
||||
dirty_memory_manager _dirty_memory_manager;
|
||||
|
||||
timer<lowres_clock> _dirty_memory_threshold_controller;
|
||||
|
||||
database_config _dbcfg;
|
||||
flush_controller _memtable_controller;
|
||||
drain_progress _drain_progress {};
|
||||
@@ -1707,8 +1655,6 @@ private:
|
||||
bool _enable_autocompaction_toggle = false;
|
||||
querier_cache _querier_cache;
|
||||
|
||||
std::unique_ptr<logstor::logstor> _logstor;
|
||||
|
||||
std::unique_ptr<db::large_data_handler> _large_data_handler;
|
||||
std::unique_ptr<db::large_data_handler> _nop_large_data_handler;
|
||||
|
||||
@@ -1750,8 +1696,6 @@ public:
|
||||
std::shared_ptr<data_dictionary::user_types_storage> as_user_types_storage() const noexcept;
|
||||
const data_dictionary::user_types_storage& user_types() const noexcept;
|
||||
future<> init_commitlog();
|
||||
future<> init_logstor();
|
||||
future<> recover_logstor();
|
||||
const gms::feature_service& features() const { return _feat; }
|
||||
future<> apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::rp_handle&&, db::timeout_clock::time_point timeout);
|
||||
future<> apply_in_memory(const mutation& m, column_family& cf, db::rp_handle&&, db::timeout_clock::time_point timeout);
|
||||
@@ -2052,13 +1996,6 @@ public:
|
||||
// a wrapper around flush_all_tables, allowing the caller to express intent more clearly
|
||||
future<> flush_commitlog() { return flush_all_tables(); }
|
||||
|
||||
static future<> trigger_logstor_compaction_on_all_shards(sharded<database>& sharded_db, bool major);
|
||||
void trigger_logstor_compaction(bool major);
|
||||
static future<> flush_logstor_separator_on_all_shards(sharded<database>& sharded_db);
|
||||
future<> flush_logstor_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
future<logstor::table_segment_stats> get_logstor_table_segment_stats(table_id table) const;
|
||||
size_t get_logstor_memory_usage() const;
|
||||
|
||||
static future<db_clock::time_point> get_all_tables_flushed_at(sharded<database>& sharded_db);
|
||||
|
||||
static future<> drop_cache_for_table_on_all_shards(sharded<database>& sharded_db, table_id id);
|
||||
|
||||
@@ -142,16 +142,6 @@ void region_group::notify_unspooled_pressure_relieved() {
|
||||
_relief.signal();
|
||||
}
|
||||
|
||||
void region_group::update_limits(size_t unspooled_hard_limit, size_t unspooled_soft_limit, size_t real_hard_limit) {
|
||||
_cfg.unspooled_hard_limit = unspooled_hard_limit;
|
||||
_cfg.unspooled_soft_limit = unspooled_soft_limit;
|
||||
_cfg.real_hard_limit = real_hard_limit;
|
||||
|
||||
// check pressure with the new limits
|
||||
update_real(0);
|
||||
update_unspooled(0);
|
||||
}
|
||||
|
||||
bool region_group::do_update_real_and_check_relief(ssize_t delta) {
|
||||
_real_total_memory += delta;
|
||||
|
||||
@@ -221,18 +211,9 @@ dirty_memory_manager::dirty_memory_manager(replica::database& db, size_t thresho
|
||||
.real_hard_limit = threshold,
|
||||
.start_reclaiming = std::bind_front(&dirty_memory_manager::start_reclaiming, this)
|
||||
}, deferred_work_sg)
|
||||
, _threshold(threshold)
|
||||
, _soft_limit(soft_limit)
|
||||
, _flush_serializer(1)
|
||||
, _waiting_flush(flush_when_needed()) {}
|
||||
|
||||
void dirty_memory_manager::update_threshold(size_t threshold) {
|
||||
if (threshold != _threshold) {
|
||||
_threshold = threshold;
|
||||
_region_group.update_limits(threshold / 2, threshold * _soft_limit / 2, threshold);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dirty_memory_manager::setup_collectd(sstring namestr) {
|
||||
namespace sm = seastar::metrics;
|
||||
|
||||
@@ -268,8 +268,6 @@ public:
|
||||
}
|
||||
void update_unspooled(ssize_t delta);
|
||||
|
||||
void update_limits(size_t unspooled_hard_limit, size_t unspooled_soft_limit, size_t real_hard_limit);
|
||||
|
||||
void increase_usage(logalloc::region* r) { // Called by memtable's region_listener
|
||||
// It would be easier to call update, but it is unfortunately broken in boost versions up to at
|
||||
// least 1.59.
|
||||
@@ -397,9 +395,6 @@ class dirty_memory_manager {
|
||||
// memory usage minus bytes that were already written to disk.
|
||||
dirty_memory_manager_logalloc::region_group _region_group;
|
||||
|
||||
size_t _threshold;
|
||||
double _soft_limit;
|
||||
|
||||
// We would like to serialize the flushing of memtables. While flushing many memtables
|
||||
// simultaneously can sustain high levels of throughput, the memory is not freed until the
|
||||
// memtable is totally gone. That means that if we have throttled requests, they will stay
|
||||
@@ -488,8 +483,6 @@ public:
|
||||
return _region_group;
|
||||
}
|
||||
|
||||
void update_threshold(size_t threshold);
|
||||
|
||||
void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
|
||||
_region_group.update_real(-delta);
|
||||
_region_group.update_unspooled(delta);
|
||||
|
||||
@@ -1,177 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "types.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "write_buffer.hh"
|
||||
#include "utils/log_heap.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
constexpr log_heap_options segment_descriptor_hist_options(4 * 1024, 3, 128 * 1024);
|
||||
|
||||
struct segment_set;
|
||||
|
||||
struct segment_descriptor : public log_heap_hook<segment_descriptor_hist_options> {
|
||||
// free_space = segment_size - net_data_size
|
||||
// initially set to segment_size
|
||||
// when writing records, decrease by total net data size
|
||||
// when freeing a record, increase by the record's net data size
|
||||
size_t free_space{0};
|
||||
size_t record_count{0};
|
||||
segment_generation seg_gen{1};
|
||||
segment_set* owner{nullptr}; // non-owning, set when added to a segment_set
|
||||
|
||||
void reset(size_t segment_size) noexcept {
|
||||
free_space = segment_size;
|
||||
record_count = 0;
|
||||
}
|
||||
|
||||
size_t net_data_size(size_t segment_size) const noexcept {
|
||||
return segment_size - free_space;
|
||||
}
|
||||
|
||||
void on_free_segment() noexcept {
|
||||
++seg_gen;
|
||||
}
|
||||
|
||||
void on_write(size_t net_data_size, size_t cnt = 1) noexcept {
|
||||
free_space -= net_data_size;
|
||||
record_count += cnt;
|
||||
}
|
||||
|
||||
void on_write(log_location loc) noexcept {
|
||||
on_write(loc.size);
|
||||
}
|
||||
|
||||
void on_free(size_t net_data_size, size_t cnt = 1) noexcept {
|
||||
free_space += net_data_size;
|
||||
record_count -= cnt;
|
||||
}
|
||||
|
||||
void on_free(log_location loc) noexcept {
|
||||
on_free(loc.size);
|
||||
}
|
||||
};
|
||||
|
||||
using segment_descriptor_hist = log_heap<segment_descriptor, segment_descriptor_hist_options>;
|
||||
|
||||
struct segment_set {
|
||||
segment_descriptor_hist _segments;
|
||||
size_t _segment_count{0};
|
||||
|
||||
void add_segment(segment_descriptor& desc) {
|
||||
desc.owner = this;
|
||||
_segments.push(desc);
|
||||
++_segment_count;
|
||||
}
|
||||
|
||||
void update_segment(segment_descriptor& desc) {
|
||||
_segments.adjust_up(desc);
|
||||
}
|
||||
|
||||
void remove_segment(segment_descriptor& desc) {
|
||||
_segments.erase(desc);
|
||||
desc.owner = nullptr;
|
||||
--_segment_count;
|
||||
}
|
||||
|
||||
size_t segment_count() const noexcept {
|
||||
return _segment_count;
|
||||
}
|
||||
};
|
||||
|
||||
class segment_ref {
|
||||
struct state {
|
||||
log_segment_id id;
|
||||
std::function<void()> on_last_release;
|
||||
std::function<void()> on_failure;
|
||||
bool flush_failure{false};
|
||||
~state() {
|
||||
if (!flush_failure) {
|
||||
if (on_last_release) on_last_release();
|
||||
} else {
|
||||
if (on_failure) on_failure();
|
||||
}
|
||||
}
|
||||
};
|
||||
lw_shared_ptr<state> _state;
|
||||
public:
|
||||
segment_ref() = default;
|
||||
|
||||
// Copyable: copying increments the shared ref count
|
||||
segment_ref(const segment_ref&) = default;
|
||||
segment_ref& operator=(const segment_ref&) = default;
|
||||
segment_ref(segment_ref&&) noexcept = default;
|
||||
segment_ref& operator=(segment_ref&&) noexcept = default;
|
||||
|
||||
log_segment_id id() const noexcept { return _state->id; }
|
||||
bool empty() const noexcept { return !_state; }
|
||||
|
||||
void set_flush_failure() noexcept { if (_state) _state->flush_failure = true; }
|
||||
|
||||
private:
|
||||
friend class segment_manager_impl;
|
||||
explicit segment_ref(log_segment_id id, std::function<void()> on_last_release, std::function<void()> on_failure)
|
||||
: _state(make_lw_shared<state>(id, std::move(on_last_release), std::move(on_failure)))
|
||||
{}
|
||||
};
|
||||
|
||||
struct separator_buffer {
|
||||
write_buffer* buf;
|
||||
utils::chunked_vector<future<>> pending_updates;
|
||||
utils::chunked_vector<segment_ref> held_segments;
|
||||
std::optional<size_t> min_seq_num;
|
||||
bool flushed{false};
|
||||
|
||||
separator_buffer(write_buffer* wb)
|
||||
: buf(wb)
|
||||
{}
|
||||
|
||||
~separator_buffer() {
|
||||
if (!flushed && buf && buf->has_data()) {
|
||||
for (auto& seg_ref : held_segments) {
|
||||
seg_ref.set_flush_failure();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
separator_buffer(const separator_buffer&) = delete;
|
||||
separator_buffer& operator=(const separator_buffer&) = delete;
|
||||
|
||||
separator_buffer(separator_buffer&&) noexcept = default;
|
||||
separator_buffer& operator=(separator_buffer&&) noexcept = default;
|
||||
|
||||
future<log_location_with_holder> write(log_record_writer writer) {
|
||||
return buf->write(std::move(writer));
|
||||
}
|
||||
|
||||
bool can_fit(const log_record_writer& writer) const noexcept {
|
||||
return buf->can_fit(writer);
|
||||
}
|
||||
|
||||
bool can_fit(size_t write_size) const noexcept {
|
||||
return buf->can_fit(write_size);
|
||||
}
|
||||
};
|
||||
|
||||
class compaction_manager {
|
||||
public:
|
||||
virtual ~compaction_manager() = default;
|
||||
|
||||
virtual separator_buffer allocate_separator_buffer() = 0;
|
||||
|
||||
virtual future<> flush_separator_buffer(separator_buffer, replica::compaction_group&) = 0;
|
||||
|
||||
virtual void submit(replica::compaction_group&) = 0;
|
||||
|
||||
virtual future<> stop_ongoing_compactions(replica::compaction_group&) = 0;
|
||||
};
|
||||
|
||||
}
|
||||
@@ -1,167 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "dht/decorated_key.hh"
|
||||
#include "dht/ring_position.hh"
|
||||
#include "types.hh"
|
||||
#include "utils/bptree.hh"
|
||||
#include "utils/double-decker.hh"
|
||||
#include "utils/phased_barrier.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
class primary_index_entry {
|
||||
dht::decorated_key _key;
|
||||
index_entry _e;
|
||||
struct {
|
||||
bool _head : 1;
|
||||
bool _tail : 1;
|
||||
bool _train : 1;
|
||||
} _flags{};
|
||||
public:
|
||||
primary_index_entry(dht::decorated_key key, index_entry e)
|
||||
: _key(std::move(key))
|
||||
, _e(std::move(e))
|
||||
{ }
|
||||
|
||||
primary_index_entry(primary_index_entry&&) noexcept = default;
|
||||
|
||||
bool is_head() const noexcept { return _flags._head; }
|
||||
void set_head(bool v) noexcept { _flags._head = v; }
|
||||
bool is_tail() const noexcept { return _flags._tail; }
|
||||
void set_tail(bool v) noexcept { _flags._tail = v; }
|
||||
bool with_train() const noexcept { return _flags._train; }
|
||||
void set_train(bool v) noexcept { _flags._train = v; }
|
||||
|
||||
const dht::decorated_key& key() const noexcept { return _key; }
|
||||
const index_entry& entry() const noexcept { return _e; }
|
||||
|
||||
friend class primary_index;
|
||||
|
||||
friend dht::ring_position_view ring_position_view_to_compare(const primary_index_entry& e) { return e._key; }
|
||||
};
|
||||
|
||||
class primary_index final {
|
||||
public:
|
||||
using partitions_type = double_decker<int64_t, primary_index_entry,
|
||||
dht::raw_token_less_comparator, dht::ring_position_comparator,
|
||||
16, bplus::key_search::linear>;
|
||||
private:
|
||||
partitions_type _partitions;
|
||||
schema_ptr _schema;
|
||||
size_t _key_count = 0;
|
||||
|
||||
mutable utils::phased_barrier _reads_phaser{"logstor_primary_index"};
|
||||
|
||||
public:
|
||||
explicit primary_index(schema_ptr schema)
|
||||
: _partitions(dht::raw_token_less_comparator{})
|
||||
, _schema(std::move(schema))
|
||||
{}
|
||||
|
||||
void set_schema(schema_ptr s) {
|
||||
_schema = std::move(s);
|
||||
}
|
||||
|
||||
void clear() {
|
||||
_partitions.clear();
|
||||
_key_count = 0;
|
||||
}
|
||||
|
||||
utils::phased_barrier::operation start_read() const {
|
||||
return _reads_phaser.start();
|
||||
}
|
||||
|
||||
future<> await_pending_reads() {
|
||||
return _reads_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
std::optional<index_entry> get(const primary_index_key& key) const {
|
||||
auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
|
||||
if (it != _partitions.end()) {
|
||||
return it->_e;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<index_entry> exchange(const primary_index_key& key, index_entry new_entry) {
|
||||
partitions_type::bound_hint hint;
|
||||
auto i = _partitions.lower_bound(key.dk, dht::ring_position_comparator(*_schema), hint);
|
||||
if (hint.match) {
|
||||
auto old_entry = i->_e;
|
||||
i->_e = std::move(new_entry);
|
||||
return old_entry;
|
||||
} else {
|
||||
_partitions.emplace_before(i, key.dk.token().raw(), hint, key.dk, std::move(new_entry));
|
||||
++_key_count;
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
bool update_record_location(const primary_index_key& key, log_location old_location, log_location new_location) {
|
||||
auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
|
||||
if (it != _partitions.end()) {
|
||||
if (it->_e.location == old_location) {
|
||||
it->_e.location = new_location;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::pair<bool, std::optional<index_entry>> insert_if_newer(const primary_index_key& key, index_entry new_entry) {
|
||||
partitions_type::bound_hint hint;
|
||||
auto i = _partitions.lower_bound(key.dk, dht::ring_position_comparator(*_schema), hint);
|
||||
if (hint.match) {
|
||||
if (i->_e.generation < new_entry.generation) {
|
||||
auto old_entry = i->_e;
|
||||
i->_e = std::move(new_entry);
|
||||
return {true, std::make_optional(old_entry)};
|
||||
} else {
|
||||
return {false, std::make_optional(i->_e)};
|
||||
}
|
||||
} else {
|
||||
_partitions.emplace_before(i, key.dk.token().raw(), hint, key.dk, std::move(new_entry));
|
||||
++_key_count;
|
||||
return {true, std::nullopt};
|
||||
}
|
||||
}
|
||||
|
||||
bool erase(const primary_index_key& key, log_location loc) {
|
||||
auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
|
||||
if (it != _partitions.end() && it->_e.location == loc) {
|
||||
it.erase(dht::raw_token_less_comparator{});
|
||||
--_key_count;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Iteration over all entries, in ring order.
auto begin() const noexcept { return _partitions.begin(); }
auto end() const noexcept { return _partitions.end(); }

bool empty() const noexcept { return _partitions.empty(); }

// Number of keys currently indexed.
size_t get_key_count() const noexcept { return _key_count; }

// Approximate footprint: counts only the index_entry payloads, not the
// per-node key storage or tree overhead — TODO confirm this is intentional.
size_t get_memory_usage() const noexcept { return _key_count * sizeof(index_entry); }
|
||||
|
||||
// First entry with key >= pos (for positioning at range start).
partitions_type::const_iterator lower_bound(const dht::ring_position_view& pos) const {
    return _partitions.lower_bound(pos, dht::ring_position_comparator(*_schema));
}

// First entry with key strictly > key (for advancing past a key after a yield).
partitions_type::const_iterator upper_bound(const dht::decorated_key& key) const {
    return _partitions.upper_bound(key, dht::ring_position_comparator(*_schema));
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
@@ -1,297 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#include "replica/logstor/logstor.hh"
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include "readers/from_mutations.hh"
|
||||
#include "keys/keys.hh"
|
||||
#include "replica/logstor/segment_manager.hh"
|
||||
#include "replica/logstor/types.hh"
|
||||
#include "utils/managed_bytes.hh"
|
||||
#include <openssl/ripemd.h>
|
||||
#include <openssl/evp.h>
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
seastar::logger logstor_logger("logstor");
|
||||
|
||||
// Build the segment manager from its config and attach the buffered writer
// to it, flushing in the configured scheduling group.
logstor::logstor(logstor_config config)
    : _segment_manager(config.segment_manager_cfg)
    , _write_buffer(_segment_manager, config.flush_sg) {
}

// Replay persisted log state; delegated entirely to the segment manager.
future<> logstor::do_recovery(replica::database& db) {
    co_await _segment_manager.do_recovery(db);
}
|
||||
|
||||
// Start components bottom-up: segment manager first, then the write
// buffer that feeds it.
future<> logstor::start() {
    logstor_logger.info("Starting logstor");

    co_await _segment_manager.start();
    co_await _write_buffer.start();

    logstor_logger.info("logstor started");
}

// Stop in the reverse of start order: write buffer first, then the
// segment manager.
future<> logstor::stop() {
    logstor_logger.info("Stopping logstor");

    co_await _write_buffer.stop();
    co_await _segment_manager.stop();

    logstor_logger.info("logstor stopped");
}
|
||||
|
||||
// Memory accounting: currently reports only the segment manager's usage
// (the write buffer's fixed buffers are not counted here — TODO confirm).
size_t logstor::get_memory_usage() const {
    return _segment_manager.get_memory_usage();
}
|
||||
|
||||
// Append mutation `m` to the log and update the compaction group's index.
//
// The record's generation is the successor of the current index entry's
// generation for this key, or 1 for a brand-new key. Once the buffered
// write resolves to an on-disk location, the index is pointed at it and
// the previously stored record for the key (if any) is freed.
//
// `cg_holder` is forwarded into the write buffer so the compaction group
// stays alive across the write; the gate holder `op` returned by the
// buffer keeps the buffer open while the index is updated.
future<> logstor::write(const mutation& m, compaction_group& cg, seastar::gate::holder cg_holder) {
    primary_index_key key(m.decorated_key());
    table_id table = m.schema()->id();
    auto& index = cg.get_logstor_index();

    // TODO ?
    // Next generation for this key: previous + 1, or 1 if not yet indexed.
    record_generation gen = index.get(key)
        .transform([](const index_entry& entry) {
            return entry.generation + 1;
        }).value_or(record_generation(1));

    log_record record {
        .key = key,
        .generation = gen,
        .table = table,
        .mut = canonical_mutation(m)
    };

    // Note: `record` copies `key`; the capture below moves `key` only after
    // write() has been invoked, so the ordering is safe.
    return _write_buffer.write(std::move(record), &cg, std::move(cg_holder)).then_unpack([this, &index, gen, key = std::move(key)]
            (log_location location, seastar::gate::holder op) {
        index_entry new_entry {
            .location = location,
            .generation = gen,
        };

        auto old_entry = index.exchange(key, std::move(new_entry));

        // If overwriting, free old record
        if (old_entry) {
            _segment_manager.free_record(old_entry->location);
        }
    }).handle_exception([] (std::exception_ptr ep) {
        // Log, then re-propagate the failure to the caller.
        logstor_logger.error("Error writing mutation: {}", ep);
        return make_exception_future<>(ep);
    });
}
|
||||
|
||||
// Look up `key` in `index` and read its record from the log.
// Returns std::nullopt when the key is not indexed.
// `op` is the index's read tracker — presumably tied to the reads phaser;
// TODO confirm start_read() semantics.
future<std::optional<log_record>> logstor::read(const primary_index& index, primary_index_key key) {
    auto op = index.start_read();

    auto entry_opt = index.get(key);
    if (!entry_opt.has_value()) {
        return make_ready_future<std::optional<log_record>>(std::nullopt);
    }

    const auto& entry = *entry_opt;

    // `key` and `op` are captured to keep them alive across the disk read.
    return _segment_manager.read(entry.location).then([key = std::move(key), op = std::move(op)] (log_record record) {
        return std::optional<log_record>(std::move(record));
    }).handle_exception([] (std::exception_ptr ep) {
        // Log, then re-propagate the failure to the caller.
        logstor_logger.error("Error reading record: {}", ep);
        return make_exception_future<std::optional<log_record>>(ep);
    });
}
|
||||
|
||||
// Read the latest mutation for partition key `dk`, or std::nullopt when
// the key is absent. The schema parameter is unused in this body —
// presumably kept for interface symmetry; TODO confirm.
future<std::optional<canonical_mutation>> logstor::read(const schema& s, const primary_index& index, const dht::decorated_key& dk) {
    primary_index_key key(dk);
    // NOTE(review): the continuation captures `dk` by reference; the caller
    // must keep it alive until the returned future resolves.
    return read(index, key).then([&dk] (std::optional<log_record> record_opt) -> std::optional<canonical_mutation> {
        if (!record_opt.has_value()) {
            return std::nullopt;
        }

        auto& record = *record_opt;

        // Sanity check: the record read back must carry the key we looked
        // up; a mismatch indicates index/log corruption.
        if (record.mut.key() != dk.key()) [[unlikely]] {
            throw std::runtime_error(fmt::format(
                "Key mismatch reading log entry: expected {}, got {}",
                dk.key(), record.mut.key()
            ));
        }

        return std::optional<canonical_mutation>(std::move(record.mut));
    });
}
|
||||
|
||||
segment_manager& logstor::get_segment_manager() noexcept {
    return _segment_manager;
}

const segment_manager& logstor::get_segment_manager() const noexcept {
    return _segment_manager;
}

// The compaction manager is owned by the segment manager.
compaction_manager& logstor::get_compaction_manager() noexcept {
    return _segment_manager.get_compaction_manager();
}

const compaction_manager& logstor::get_compaction_manager() const noexcept {
    return _segment_manager.get_compaction_manager();
}
|
||||
|
||||
// Create a mutation reader over all partitions of `index` that fall within
// `pr`, in ring order. Each partition is fetched from the log and streamed
// through a single-partition reader sliced by `slice`.
mutation_reader logstor::make_reader(schema_ptr schema,
        const primary_index& index,
        reader_permit permit,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state) {

    class logstor_range_reader : public mutation_reader::impl {
        logstor* _logstor;
        const primary_index& _index;
        dht::partition_range _pr;
        query::partition_slice _slice;
        tracing::trace_state_ptr _trace_state;
        std::optional<dht::decorated_key> _last_key; // owns the key, safe across yields
        mutation_reader_opt _current_partition_reader;
        dht::ring_position_comparator _cmp;

        // Finds the next iterator to process, safe to call after any co_await:
        // instead of caching an index iterator across yields, we re-seek from
        // the owned _last_key.
        primary_index::partitions_type::const_iterator find_next() const {
            auto it = _last_key
                ? _index.upper_bound(*_last_key) // strictly after last key
                : position_at_range_start(); // initial positioning
            // If start was exclusive and we haven't yet seen a key
            return it;
        }

        // Position at the first index entry satisfying _pr's start bound.
        primary_index::partitions_type::const_iterator position_at_range_start() const {
            if (!_pr.start()) {
                return _index.begin();
            }
            auto it = _index.lower_bound(_pr.start()->value());
            if (!_pr.start()->is_inclusive() && it != _index.end()) {
                // Exclusive start bound: skip an exact match of the bound.
                if (_cmp(it->key(), _pr.start()->value()) == 0) {
                    ++it;
                }
            }
            return it;
        }

        // True when entry `e` lies past _pr's end bound.
        bool exceeds_range_end(const primary_index_entry& e) const {
            if (!_pr.end()) return false;
            auto c = _cmp(e.key(), _pr.end()->value());
            return _pr.end()->is_inclusive() ? c > 0 : c >= 0;
        }

    public:
        logstor_range_reader(schema_ptr s, const primary_index& idx, reader_permit p,
                logstor* ls, dht::partition_range pr,
                query::partition_slice slice, tracing::trace_state_ptr ts)
            : impl(std::move(s), std::move(p))
            , _logstor(ls), _index(idx), _pr(std::move(pr))
            , _slice(std::move(slice)), _trace_state(std::move(ts))
            , _cmp(*_schema)
        {}

        virtual future<> fill_buffer() override {
            while (!is_buffer_full() && !_end_of_stream) {
                // Drain current partition's reader first
                if (_current_partition_reader) {
                    co_await _current_partition_reader->fill_buffer();
                    _current_partition_reader->move_buffer_content_to(*this);
                    if (!_current_partition_reader->is_end_of_stream()) {
                        continue;
                    }
                    co_await _current_partition_reader->close();
                    _current_partition_reader = std::nullopt;
                    // _last_key was already set when we opened the reader
                }

                // Find next key in range (safe after co_await since we use _last_key)
                auto it = find_next();
                if (it == _index.end() || exceeds_range_end(*it)) {
                    _end_of_stream = true;
                    break;
                }

                // Snapshot the key before yielding
                auto current_key = it->key();

                auto guard = reader_permit::awaits_guard(_permit);
                auto cmut = co_await _logstor->read(*_schema, _index, current_key);

                _last_key = current_key; // mark as visited even if not found (tombstoned)

                if (!cmut) {
                    continue; // key was removed between index lookup and read
                }

                tracing::trace(_trace_state, "logstor_range_reader: fetched key {}", current_key);

                // Stream the partition through a single-mutation reader so
                // slicing is applied uniformly.
                _current_partition_reader = make_mutation_reader_from_mutations(
                    _schema, _permit, cmut->to_mutation(_schema),
                    _slice, streamed_mutation::forwarding::no
                );
            }
        }

        virtual future<> next_partition() override {
            clear_buffer_to_next_partition();
            if (!is_buffer_empty()) return make_ready_future<>();
            _end_of_stream = false;
            if (_current_partition_reader) {
                auto fut = _current_partition_reader->close();
                _current_partition_reader = std::nullopt;
                return fut;
            }
            return make_ready_future<>();
        }

        virtual future<> fast_forward_to(const dht::partition_range& pr) override {
            clear_buffer();
            _end_of_stream = false;
            _pr = pr;
            _last_key = std::nullopt; // re-position from new range start
            if (_current_partition_reader) {
                auto fut = _current_partition_reader->close();
                _current_partition_reader = std::nullopt;
                return fut;
            }
            return make_ready_future<>();
        }

        virtual future<> fast_forward_to(position_range pr) override {
            // Intra-partition fast-forward is delegated to the open
            // partition reader; a no-op when none is open.
            if (_current_partition_reader) {
                clear_buffer();
                return _current_partition_reader->fast_forward_to(std::move(pr));
            }
            return make_ready_future<>();
        }

        virtual future<> close() noexcept override {
            if (_current_partition_reader) {
                return _current_partition_reader->close();
            }
            return make_ready_future<>();
        }
    };

    return make_mutation_reader<logstor_range_reader>(
        std::move(schema), index, std::move(permit), this, pr, slice, std::move(trace_state)
    );
}
|
||||
|
||||
// Forward hook registration to the segment manager.
void logstor::set_trigger_compaction_hook(std::function<void()> fn) {
    _segment_manager.set_trigger_compaction_hook(std::move(fn));
}

void logstor::set_trigger_separator_flush_hook(std::function<void(size_t)> fn) {
    _segment_manager.set_trigger_separator_flush_hook(std::move(fn));
}
|
||||
|
||||
}
|
||||
@@ -1,81 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/temporary_buffer.hh>
|
||||
#include <optional>
|
||||
#include <seastar/core/scheduling.hh>
|
||||
#include "readers/mutation_reader.hh"
|
||||
#include "replica/compaction_group.hh"
|
||||
#include "types.hh"
|
||||
#include "index.hh"
|
||||
#include "segment_manager.hh"
|
||||
#include "write_buffer.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "dht/decorated_key.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class compaction_group;
|
||||
class database;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
extern seastar::logger logstor_logger;
|
||||
|
||||
// Top-level logstor configuration.
struct logstor_config {
    segment_manager_config segment_manager_cfg;
    seastar::scheduling_group flush_sg;  // scheduling group used for buffer flushes
};
|
||||
|
||||
// Log-structured store: routes writes through a buffered writer into the
// segment manager, and serves reads/range scans against per-compaction-group
// primary indexes.
class logstor {

    segment_manager _segment_manager;  // owns on-disk segments
    buffered_writer _write_buffer;     // batches records before flushing

public:

    explicit logstor(logstor_config);

    // Non-copyable: owns the segment manager and write buffer.
    logstor(const logstor&) = delete;
    logstor& operator=(const logstor&) = delete;

    // Replay persisted log state (delegated to the segment manager).
    future<> do_recovery(replica::database&);

    future<> start();
    future<> stop();

    size_t get_memory_usage() const;

    segment_manager& get_segment_manager() noexcept;
    const segment_manager& get_segment_manager() const noexcept;

    compaction_manager& get_compaction_manager() noexcept;
    const compaction_manager& get_compaction_manager() const noexcept;

    // Append a mutation; `cg_holder` keeps the compaction group alive for
    // the duration of the buffered write.
    future<> write(const mutation&, compaction_group&, seastar::gate::holder cg_holder);

    // Read the latest record for a key, or nullopt when absent.
    future<std::optional<log_record>> read(const primary_index&, primary_index_key);

    future<std::optional<canonical_mutation>> read(const schema&, const primary_index&, const dht::decorated_key&);

    /// Create a mutation reader over all partitions of `index` within `pr`.
    mutation_reader make_reader(schema_ptr schema,
            const primary_index& index,
            reader_permit permit,
            const dht::partition_range& pr,
            const query::partition_slice& slice,
            tracing::trace_state_ptr trace_state = nullptr);

    // Hooks forwarded to the segment manager — presumably for tests;
    // TODO confirm.
    void set_trigger_compaction_hook(std::function<void()> fn);
    void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);
};
|
||||
|
||||
} // namespace logstor
|
||||
} // namespace replica
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,128 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <filesystem>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <seastar/core/file.hh>
|
||||
#include <seastar/core/rwlock.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/queue.hh>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include "bytes_fwd.hh"
|
||||
#include "replica/logstor/write_buffer.hh"
|
||||
#include "types.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class database;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
class compaction_manager;
|
||||
class segment_set;
|
||||
class primary_index;
|
||||
|
||||
static constexpr size_t default_segment_size = 128 * 1024;
|
||||
static constexpr size_t default_file_size = 32 * 1024 * 1024;
|
||||
|
||||
/// Configuration for the segment manager
struct segment_manager_config {
    std::filesystem::path base_dir;                  // directory holding segment files
    size_t segment_size = default_segment_size;      // 128 KiB default
    size_t file_size = default_file_size;            // 32 MiB default
    size_t disk_size;                                // no default: must be supplied
    bool compaction_enabled = true;
    size_t max_segments_per_compaction = 8;
    seastar::scheduling_group compaction_sg;
    utils::updateable_value<float> compaction_static_shares;
    seastar::scheduling_group separator_sg;
    uint32_t separator_delay_limit_ms;               // milliseconds (per the name)
    size_t max_separator_memory = 1 * 1024 * 1024;   // 1 MiB
};
|
||||
|
||||
// One bucket of the per-table segment histogram: how many segments fall
// into the bucket and the largest data size observed among them.
struct table_segment_histogram_bucket {
    size_t count;
    size_t max_data_size;

    // Merge another bucket into this one.
    // Takes `other` by const reference: merging must never mutate the
    // source, and const/temporary operands must be accepted.
    table_segment_histogram_bucket& operator+=(const table_segment_histogram_bucket& other) {
        count += other.count;
        max_data_size = std::max(max_data_size, other.max_data_size);
        return *this;
    }
};
|
||||
|
||||
struct table_segment_stats {
|
||||
size_t compaction_group_count{0};
|
||||
size_t segment_count{0};
|
||||
std::vector<table_segment_histogram_bucket> histogram;
|
||||
|
||||
table_segment_stats& operator+=(table_segment_stats& other) {
|
||||
compaction_group_count += other.compaction_group_count;
|
||||
segment_count += other.segment_count;
|
||||
histogram.resize(std::max(histogram.size(), other.histogram.size()));
|
||||
for (size_t i = 0; i < other.histogram.size(); i++) {
|
||||
histogram[i] += other.histogram[i];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
class segment_manager_impl;
|
||||
class log_index;
|
||||
|
||||
// Pimpl facade over segment_manager_impl: manages log segments on disk,
// serving record reads/writes and exposing compaction controls.
class segment_manager {
    std::unique_ptr<segment_manager_impl> _impl;
private:
    segment_manager_impl& get_impl() noexcept;
    const segment_manager_impl& get_impl() const noexcept;
public:
    // Block alignment used for segment I/O — presumably for direct I/O;
    // TODO confirm.
    static constexpr size_t block_alignment = 4096;

    explicit segment_manager(segment_manager_config config);
    ~segment_manager();

    // Non-copyable: uniquely owns its implementation.
    segment_manager(const segment_manager&) = delete;
    segment_manager& operator=(const segment_manager&) = delete;

    future<> do_recovery(replica::database&);

    future<> start();
    future<> stop();

    // Flush a filled write buffer; resolves to its base on-disk location.
    future<log_location> write(write_buffer& wb);

    future<log_record> read(log_location location);

    // Release the record at `location` (space accounted for reclamation).
    void free_record(log_location location);

    // Invoke `callback` for every record in the given segments.
    future<> for_each_record(const std::vector<log_segment_id>& segments,
            std::function<future<>(log_location, log_record)> callback);

    compaction_manager& get_compaction_manager() noexcept;
    const compaction_manager& get_compaction_manager() const noexcept;

    void set_trigger_compaction_hook(std::function<void()> fn);
    void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);

    size_t get_segment_size() const noexcept;

    future<> discard_segments(segment_set&);

    size_t get_memory_usage() const;

    future<> await_pending_writes();

    friend class segment_manager_impl;

};
|
||||
|
||||
}
|
||||
}
|
||||
@@ -1,80 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <fmt/format.h>
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "replica/logstor/utils.hh"
|
||||
#include "dht/decorated_key.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// Strongly-typed identifier of a log segment.
struct log_segment_id {
    uint32_t value;

    // Defaulted comparisons: ordering follows the raw numeric id.
    bool operator==(const log_segment_id& other) const noexcept = default;
    auto operator<=>(const log_segment_id& other) const noexcept = default;
};
|
||||
|
||||
// Physical address of a record in the log: the segment it lives in, the
// byte offset inside that segment, and its serialized size.
struct log_location {
    log_segment_id segment;
    uint32_t offset;
    uint32_t size;

    bool operator==(const log_location& other) const noexcept = default;
};
|
||||
|
||||
// Key of the primary index: a decorated (token-annotated) partition key.
struct primary_index_key {
    dht::decorated_key dk;
};
|
||||
|
||||
// Generations are small wraparound counters (see generation_base).
using record_generation = generation_base<uint16_t>;
using segment_generation = generation_base<uint16_t>;

// Value stored in the primary index: where the latest record for a key
// lives on disk, and the generation it was written with.
struct index_entry {
    log_location location;
    record_generation generation;

    bool operator==(const index_entry& other) const noexcept = default;
};
|
||||
|
||||
// A single logical record as stored in the log: the partition key, the
// per-key write generation, the owning table and the mutation payload.
struct log_record {
    primary_index_key key;
    record_generation generation;
    table_id table;
    canonical_mutation mut;
};
|
||||
|
||||
}
|
||||
|
||||
// fmt formatter specializations for logstor value types (used by logging).
template <>
struct fmt::formatter<replica::logstor::log_segment_id> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const replica::logstor::log_segment_id& id, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "segment({})", id.value);
    }
};

template <>
struct fmt::formatter<replica::logstor::log_location> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const replica::logstor::log_location& loc, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{{segment:{}, offset:{}, size:{}}}",
            loc.segment, loc.offset, loc.size);
    }
};

// Formats as the underlying decorated key.
template <>
struct fmt::formatter<replica::logstor::primary_index_key> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const replica::logstor::primary_index_key& key, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{}", key.dk);
    }
};
|
||||
@@ -1,104 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <concepts>
|
||||
#include "serializer.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// an unsigned integer that can be incremented and compared with wraparound semantics
|
||||
template <std::unsigned_integral T>
|
||||
class generation_base {
|
||||
T _value;
|
||||
|
||||
public:
|
||||
|
||||
using underlying = T;
|
||||
|
||||
constexpr generation_base() noexcept : _value(0) {}
|
||||
constexpr explicit generation_base(T value) noexcept : _value(value) {}
|
||||
|
||||
constexpr T value() const noexcept { return _value; }
|
||||
|
||||
constexpr generation_base& operator++() noexcept {
|
||||
++_value;
|
||||
return *this;
|
||||
}
|
||||
|
||||
constexpr generation_base operator++(int) noexcept {
|
||||
auto old = *this;
|
||||
++_value;
|
||||
return old;
|
||||
}
|
||||
|
||||
constexpr generation_base& operator+=(T delta) noexcept {
|
||||
_value += delta;
|
||||
return *this;
|
||||
}
|
||||
|
||||
constexpr generation_base operator+(T delta) const noexcept {
|
||||
return generation_base(_value + delta);
|
||||
}
|
||||
|
||||
constexpr bool operator==(const generation_base& other) const noexcept = default;
|
||||
|
||||
/// Comparison using wraparound semantics.
|
||||
/// Returns true if this generation is less than other, accounting for wraparound.
|
||||
/// Assumes generations are within half the value space of each other.
|
||||
constexpr bool operator<(const generation_base& other) const noexcept {
|
||||
// Use signed comparison after converting difference to signed type
|
||||
// This handles wraparound: if diff > max/2, it's treated as negative
|
||||
using signed_type = std::make_signed_t<T>;
|
||||
auto diff = static_cast<signed_type>(_value - other._value);
|
||||
return diff < 0;
|
||||
}
|
||||
|
||||
constexpr bool operator<=(const generation_base& other) const noexcept {
|
||||
return *this == other || *this < other;
|
||||
}
|
||||
|
||||
constexpr bool operator>(const generation_base& other) const noexcept {
|
||||
return other < *this;
|
||||
}
|
||||
|
||||
constexpr bool operator>=(const generation_base& other) const noexcept {
|
||||
return other <= *this;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// Format a generation as its raw underlying integer value.
template <std::unsigned_integral T>
struct fmt::formatter<replica::logstor::generation_base<T>> : fmt::formatter<T> {
    template <typename FormatContext>
    auto format(const replica::logstor::generation_base<T>& gen, FormatContext& ctx) const {
        return fmt::formatter<T>::format(gen.value(), ctx);
    }
};
|
||||
|
||||
namespace ser {

// Serialize generation_base<T> as its raw underlying integer, delegating
// to the underlying type's serializer for write/read/skip.
template <std::unsigned_integral T>
struct serializer<replica::logstor::generation_base<T>> {
    template <typename Output>
    static void write(Output& out, const replica::logstor::generation_base<T>& g) {
        serializer<typename replica::logstor::generation_base<T>::underlying>::write(out, g.value());
    }
    template <typename Input>
    static replica::logstor::generation_base<T> read(Input& in) {
        auto val = serializer<typename replica::logstor::generation_base<T>::underlying>::read(in);
        return replica::logstor::generation_base<T>(val);
    }
    template <typename Input>
    static void skip(Input& in) {
        serializer<typename replica::logstor::generation_base<T>::underlying>::skip(in);
    }
};

}
|
||||
@@ -1,278 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#include "write_buffer.hh"
|
||||
#include "segment_manager.hh"
|
||||
#include "bytes_fwd.hh"
|
||||
#include "logstor.hh"
|
||||
#include "replica/logstor/types.hh"
|
||||
#include <seastar/core/simple-stream.hh>
|
||||
#include <seastar/core/with_scheduling_group.hh>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include "serializer_impl.hh"
|
||||
#include "idl/logstor.dist.hh"
|
||||
#include "idl/logstor.dist.impl.hh"
|
||||
#include <seastar/core/align.hh>
|
||||
#include <seastar/core/aligned_buffer.hh>
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// Pre-compute the serialized size of the wrapped record by running the
// serializer over a measuring (counting-only) stream; cached in _size.
void log_record_writer::compute_size() const {
    seastar::measuring_output_stream ms;
    ser::serialize(ms, _record);
    _size = ms.size();
}

// Serialize the wrapped record into `out`.
void log_record_writer::write(ostream& out) const {
    ser::serialize(out, _record);
}
|
||||
|
||||
// write_buffer
|
||||
|
||||
// A write buffer owns one 4096-aligned memory area of `buffer_size` bytes.
// When `with_record_copy` is set, each record written is also retained in
// _records_copy — presumably so it can be replayed/inspected later;
// TODO confirm.
write_buffer::write_buffer(size_t buffer_size, bool with_record_copy)
    : _buffer_size(buffer_size)
    , _buffer(seastar::allocate_aligned_buffer<char>(buffer_size, 4096))
    , _with_record_copy(with_record_copy)
{
    if (_with_record_copy) {
        // Rough capacity guess: assume ~100 bytes per record.
        _records_copy.reserve(_buffer_size / 100);
    }
    reset();
}
|
||||
|
||||
// Rewind the buffer to a pristine state: a fresh output stream over the
// backing memory, a reserved header substream, and cleared counters,
// record copies, promise and write gate.
void write_buffer::reset() {
    _stream = seastar::simple_memory_output_stream(_buffer.get(), _buffer_size);
    // Reserve space at the front for the buffer header; it is serialized
    // later by write_header().
    _header_stream = _stream.write_substream(buffer_header_size);
    _buffer_header = {};
    _net_data_size = 0;
    _record_count = 0;
    _written = {};
    _records_copy.clear();
    _write_gate = {};
}
|
||||
|
||||
// Close the write gate, waiting for in-flight writers; a no-op when the
// gate is already closed.
future<> write_buffer::close() {
    if (!_write_gate.is_closed()) {
        co_await _write_gate.close();
    }
}
|
||||
|
||||
// Largest single record payload this buffer can accept: total capacity
// minus the buffer header and one record header.
size_t write_buffer::get_max_write_size() const noexcept {
    return _buffer_size - (buffer_header_size + record_header_size);
}

// Whether a record of `data_size` payload bytes still fits in the
// remaining stream space.
bool write_buffer::can_fit(size_t data_size) const noexcept {
    // Calculate total space needed including header, data, and alignment padding
    auto total_size = record_header_size + data_size;
    auto aligned_size = align_up(total_size, record_alignment);
    return aligned_size <= _stream.size();
}

// True once at least one record has been written past the buffer header.
bool write_buffer::has_data() const noexcept {
    return offset_in_buffer() > buffer_header_size;
}
|
||||
|
||||
// Serialize one record into the buffer.
//
// Returns a future resolving to the record's final on-disk location —
// known only once the flushed buffer's base location is published via
// complete_writes() — together with a gate holder keeping the buffer open
// for follow-up work such as index updates.
//
// Throws std::runtime_error when the record does not fit in the remaining
// space; callers are expected to check can_fit() first.
future<log_location_with_holder> write_buffer::write(log_record_writer writer, compaction_group* cg, seastar::gate::holder cg_holder) {
    const auto data_size = writer.size();

    if (!can_fit(data_size)) {
        throw std::runtime_error(fmt::format("Write size {} exceeds buffer size {}", data_size, _stream.size()));
    }

    auto rh = record_header {
        .data_size = data_size
    };
    ser::serialize(_stream, rh);

    // Write actual data
    size_t data_offset_in_buffer = offset_in_buffer();
    auto data_out = _stream.write_substream(data_size);
    writer.write(data_out);

    _net_data_size += data_size;
    _record_count++;

    // Add padding to align record
    pad_to_alignment(record_alignment);

    // Translates the buffer's eventual base location into this record's
    // absolute on-disk location.
    auto record_location = [data_offset_in_buffer, data_size] (log_location base_location) {
        return log_location {
            .segment = base_location.segment,
            .offset = base_location.offset + data_offset_in_buffer,
            .size = data_size
        };
    };

    if (_with_record_copy) {
        // Retain the serialized record together with its eventual location;
        // the compaction-group holder keeps the group alive meanwhile.
        _records_copy.push_back(record_in_buffer {
            .writer = std::move(writer),
            .offset_in_buffer = data_offset_in_buffer,
            .data_size = data_size,
            .loc = _written.get_shared_future().then(record_location),
            .cg = cg,
            .cg_holder = std::move(cg_holder)
        });
    }

    // hold the write buffer until the write is complete, and pass the holder to the
    // caller for follow-up operations that should continue holding the buffer, such
    // as index updates.
    auto op = _write_gate.hold();

    return _written.get_shared_future().then([record_location, op = std::move(op)] (log_location base_location) mutable {
        return std::make_tuple(record_location(base_location), std::move(op));
    });
}
|
||||
|
||||
// Like write(), but releases the gate holder as soon as the write
// completes. Use carefully, only when no follow-up work (e.g. an index
// update) needs the buffer kept open.
future<log_location> write_buffer::write_no_holder(log_record_writer writer) {
    return write(std::move(writer)).then_unpack([] (log_location loc, seastar::gate::holder op) {
        // `op` is dropped here, leaving the gate immediately.
        return loc;
    });
}
|
||||
|
||||
void write_buffer::pad_to_alignment(size_t alignment) {
|
||||
auto current_pos = offset_in_buffer();
|
||||
auto next_pos = align_up(current_pos, alignment);
|
||||
auto padding = next_pos - current_pos;
|
||||
if (padding > 0) {
|
||||
_stream.fill('\0', padding);
|
||||
}
|
||||
}
|
||||
|
||||
// Record the net data size in the header and pad the tail so the whole
// buffer ends on an `alignment` boundary.
void write_buffer::finalize(size_t alignment) {
    _buffer_header.data_size = static_cast<uint32_t>(offset_in_buffer() - buffer_header_size);
    pad_to_alignment(alignment);
}

// Serialize the header into the substream reserved up front by reset().
void write_buffer::write_header(segment_generation seg_gen) {
    _buffer_header.magic = buffer_header_magic;
    _buffer_header.seg_gen = seg_gen;
    ser::serialize<buffer_header>(_header_stream, _buffer_header);
}
|
||||
|
||||
// Publish the flushed buffer's base location to all pending writers and
// close the write gate.
future<> write_buffer::complete_writes(log_location base_location) {
    _written.set_value(base_location);
    co_await close();
}

// Fail all pending writers with `ex` (unless a result was already
// published) and close the write gate.
future<> write_buffer::abort_writes(std::exception_ptr ex) {
    if (!_written.available()) {
        _written.set_exception(std::move(ex));
    }
    co_await close();
}

// Access the retained record copies; valid only when the buffer was
// constructed with record copying enabled.
std::vector<write_buffer::record_in_buffer>& write_buffer::records() {
    if (!_with_record_copy) {
        on_internal_error(logstor_logger, "requesting records but the write buffer has no record copy enabled");
    }
    return _records_copy;
}
|
||||
|
||||
size_t write_buffer::estimate_required_segments(size_t net_data_size, size_t record_count, size_t segment_size) {
|
||||
// Calculate total size needed including headers and alignment padding
|
||||
size_t total_size = record_header_size * record_count + net_data_size;
|
||||
|
||||
// not perfect so let's multiply by some overhead constant
|
||||
total_size = static_cast<size_t>(total_size * 1.1);
|
||||
|
||||
return align_up(total_size, segment_size) / segment_size;
|
||||
|
||||
}
|
||||
|
||||
// buffered_writer
|
||||
|
||||
buffered_writer::buffered_writer(segment_manager& sm, seastar::scheduling_group flush_sg)
    : _sm(sm)
    , _available_buffers(num_flushing_buffers)
    , _flush_sg(flush_sg) {
    // One extra buffer beyond the flushing set serves as the active buffer.
    constexpr size_t total_buffers = num_flushing_buffers + 1;

    _buffers.reserve(total_buffers);
    for (size_t idx = 0; idx < total_buffers; ++idx) {
        _buffers.emplace_back(_sm.get_segment_size(), true);
    }

    // Buffer 0 starts out active; the rest go straight into the pool of
    // buffers available for the flush rotation.
    _active_buffer = active_buffer{
        .buf = &_buffers[0],
    };
    for (size_t idx = 1; idx < total_buffers; ++idx) {
        _available_buffers.push(&_buffers[idx]);
    }
}
|
||||
|
||||
future<> buffered_writer::start() {
    // Nothing to initialize beyond what the constructor already set up;
    // just announce startup for log correlation.
    logstor_logger.info("Starting write buffer");
    co_return;
}
|
||||
|
||||
future<> buffered_writer::stop() {
    // Idempotent: a second stop() after the gate is closed is a no-op.
    if (_async_gate.is_closed()) {
        co_return;
    }
    logstor_logger.info("Stopping write buffer");

    // Closing the gate waits for all in-flight writes and the background
    // switch/flush chains started in write().
    co_await _async_gate.close();
    logstor_logger.info("Write buffer stopped");
}
|
||||
|
||||
future<log_location_with_holder> buffered_writer::write(log_record record, compaction_group* cg, seastar::gate::holder cg_holder) {
    // Keep the writer alive (stop() blocked) for the duration of this write.
    auto holder = _async_gate.hold();

    log_record_writer writer(std::move(record));

    // A record larger than an empty buffer's capacity can never fit, so
    // waiting would deadlock -- reject it up front.
    if (writer.size() > _active_buffer.buf->get_max_write_size()) {
        throw std::runtime_error(fmt::format("Write size {} exceeds buffer size {}", writer.size(), _active_buffer.buf->get_max_write_size()));
    }

    // Check if write fits in current buffer. Each buffer switch broadcasts
    // _buffer_switched, so waiters re-check against the new active buffer.
    while (!_active_buffer.buf->can_fit(writer)) {
        co_await _buffer_switched.wait();
    }

    // Write to buffer at current position
    auto fut = _active_buffer.buf->write(std::move(writer), cg, std::move(cg_holder));

    // Trigger flush for the active buffer if not in progress:
    // the first write into a buffer starts a background switch+flush chain;
    // flush_requested ensures it is started only once per buffer.
    if (!std::exchange(_active_buffer.flush_requested, true)) {
        // NOTE(review): the background future is intentionally dropped; an
        // exception from switch_buffer()/flush() would be lost here --
        // confirm this is intended and that flush() cannot fail silently.
        (void)with_gate(_async_gate, [this] {
            return switch_buffer().then([this] (write_buffer* old_buf) mutable {
                return with_scheduling_group(_flush_sg, [this, old_buf] mutable {
                    return flush(old_buf);
                });
            });
        });
    }

    co_return co_await std::move(fut);
}
|
||||
|
||||
// Swap in a fresh buffer as the active one and return the previous active
// buffer (to be flushed by the caller). Resets flush_requested for the new
// buffer via active_buffer's default member initializer.
future<write_buffer*> buffered_writer::switch_buffer() {
    // Wait for and get the next available buffer
    auto* new_buf = co_await _available_buffers.pop_eventually();

    // std::move on the raw pointers here would be a no-op copy
    // (clang-tidy performance-move-const-arg), so plain assignment is used.
    auto old_active_buffer = std::exchange(_active_buffer, active_buffer{
        .buf = new_buf,
    });

    // Wake writers blocked on a full buffer so they retry against the new one.
    _buffer_switched.broadcast();

    co_return old_active_buffer.buf;
}
|
||||
|
||||
// Persist `buf` through the segment manager, then recycle it into the pool
// of available buffers. Presumably the segment manager resolves the
// per-record location futures handed out by write_buffer::write() -- see
// segment_manager::write.
future<> buffered_writer::flush(write_buffer* buf) {
    co_await _sm.write(*buf);

    // Return the flushed buffer to the available queue.
    // (std::move on a raw pointer is a no-op copy -- clang-tidy
    // performance-move-const-arg -- so it is passed plainly.)
    buf->reset();
    _available_buffers.push(buf);
}
|
||||
|
||||
}
|
||||
@@ -1,294 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/temporary_buffer.hh>
|
||||
#include <seastar/core/aligned_buffer.hh>
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/core/scheduling.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/queue.hh>
|
||||
#include <seastar/core/simple-stream.hh>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include "types.hh"
|
||||
#include "serializer.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class compaction_group;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
class segment_manager;
|
||||
|
||||
// Writer for log records that handles serialization and size computation
|
||||
// Writer for log records that handles serialization and size computation.
// Owns the record and caches its serialized size so repeated size queries
// (e.g. for fit checks) don't re-serialize.
class log_record_writer {

    using ostream = seastar::simple_memory_output_stream;

    log_record _record;
    // Lazily computed serialized size; mutable so the const accessor size()
    // can populate the cache on first use.
    mutable std::optional<size_t> _size;

    // Computes the serialized size into _size.
    void compute_size() const;

public:
    explicit log_record_writer(log_record record)
        : _record(std::move(record))
    {}

    // Get serialized size (computed lazily on first call, cached after).
    size_t size() const {
        if (!_size) {
            compute_size();
        }
        return *_size;
    }

    // Write the record to an output stream
    void write(ostream& out) const;

    // Read-only access to the wrapped record.
    const log_record& record() const {
        return _record;
    }
};
|
||||
|
||||
using log_location_with_holder = std::tuple<log_location, seastar::gate::holder>;
|
||||
|
||||
// Manages a single aligned buffer for accumulating records and writing
|
||||
// them to the segment manager.
|
||||
//
|
||||
// usage:
|
||||
//
|
||||
// create write buffer with specified size:
|
||||
// write_buffer wb(buffer_size);
|
||||
// write data to the buffer if fits and get a future for the log location when flushed:
|
||||
// log_record_writer writer(record);
|
||||
// auto loc_fut = wb.write(writer);
|
||||
// flush the buffer to the segment manager:
|
||||
// co_await sm.write(wb);
|
||||
// await individual write locations:
|
||||
// auto record_loc = co_await std::move(loc_fut);
|
||||
class write_buffer {
public:

    using ostream = seastar::simple_memory_output_stream;

    // On-disk layout:
    // buffer: buffer_header | record_1 | ... | record_n | 0-padding
    // record: record_header | record_data | 0-padding
    //
    // buffer_header and record are aligned by record_alignment
    // buffer_header and record_header have explicit sizes and serialization below

    // Magic value identifying a valid buffer header on disk.
    static constexpr uint32_t buffer_header_magic = 0x4c475342;
    static constexpr size_t record_alignment = 8;

    struct buffer_header {
        uint32_t magic;
        uint32_t data_size; // size of all records data following the buffer_header
        segment_generation seg_gen;
        uint16_t reserved1;
        uint32_t reserved2;
    };
    // Explicit wire size of buffer_header (independent of struct padding).
    static constexpr size_t buffer_header_size = 3 * sizeof(uint32_t) + sizeof(uint16_t) + sizeof(segment_generation::underlying);

    static_assert(buffer_header_size % record_alignment == 0, "Buffer header size must be aligned by record_alignment");

    struct record_header {
        uint32_t data_size; // size of the record data following the record_header
    };
    // Explicit wire size of record_header.
    static constexpr size_t record_header_size = sizeof(uint32_t);

private:

    using aligned_buffer_type = std::unique_ptr<char[], free_deleter>;

    // Total capacity of the aligned buffer, in bytes.
    size_t _buffer_size;
    aligned_buffer_type _buffer;
    // Stream over the record area; its remaining size tracks free space.
    seastar::simple_memory_output_stream _stream;
    buffer_header _buffer_header;
    // Stream over the reserved header region at the front of the buffer.
    seastar::simple_memory_output_stream _header_stream;

    // Payload bytes written (excluding headers/padding) and record count.
    size_t _net_data_size{0};
    size_t _record_count{0};

    // Resolved with the buffer's base on-disk location once flushed;
    // per-record locations are derived from it.
    shared_promise<log_location> _written;

    // Keeps the buffer open while individual writes (and their follow-up
    // index updates) are still in flight.
    seastar::gate _write_gate;

    struct record_in_buffer {
        log_record_writer writer;
        size_t offset_in_buffer;
        size_t data_size;
        future<log_location> loc;
        compaction_group* cg;
        seastar::gate::holder cg_holder;
    };

    // When enabled, every written record is also tracked in _records_copy
    // (used by compaction -- see records()).
    bool _with_record_copy;
    std::vector<record_in_buffer> _records_copy;

public:

    write_buffer(size_t buffer_size, bool with_record_copy);

    // Return the buffer to its pristine state so it can be reused.
    void reset();

    write_buffer(const write_buffer&) = delete;
    write_buffer& operator=(const write_buffer&) = delete;

    write_buffer(write_buffer&&) noexcept = default;
    write_buffer& operator=(write_buffer&&) noexcept = default;

    future<> close();

    size_t get_buffer_size() const noexcept { return _buffer_size; }
    // Current write position: capacity minus remaining stream space.
    size_t offset_in_buffer() const noexcept { return _buffer_size - _stream.size(); }

    bool can_fit(size_t data_size) const noexcept;

    bool can_fit(const log_record_writer& writer) const noexcept {
        return can_fit(writer.size());
    }

    bool has_data() const noexcept;

    size_t get_max_write_size() const noexcept;

    size_t get_net_data_size() const noexcept { return _net_data_size; }
    size_t get_record_count() const noexcept { return _record_count; }

    // Write a record to the buffer.
    // Returns a future that will be resolved with the log location once flushed and a gate holder
    // that keeps the write buffer open. The gate should be held for index updates after the write
    // is done.
    future<log_location_with_holder> write(log_record_writer, compaction_group*, seastar::gate::holder cg_holder);

    // Convenience overload: no compaction group association.
    future<log_location_with_holder> write(log_record_writer writer) {
        return write(std::move(writer), nullptr, {});
    }

    // Write a record to the buffer.
    // Returns a future that will be resolved with the log location once flushed.
    // If there are follow-up operations to the write such as index updates then consider
    // using write_with_holder instead to keep the write buffer open until those operations are complete.
    future<log_location> write_no_holder(log_record_writer);

    // Rough upper bound on segments needed for the given payload/record count.
    static size_t estimate_required_segments(size_t net_data_size, size_t record_count, size_t segment_size);

private:

    const char* data() const noexcept { return _buffer.get(); }

    // Serialize the buffer header (magic + generation) into _header_stream.
    void write_header(segment_generation);

    // get all write records in the buffer.
    // with_record_copy must be set to true when creating the write_buffer.
    std::vector<record_in_buffer>& records();

    /// Complete all tracked writes with their locations when the buffer is flushed to base_location
    future<> complete_writes(log_location base_location);
    /// Fail all tracked writes with the given exception (if not already resolved).
    future<> abort_writes(std::exception_ptr);

    // Zero-fill to the next multiple of `alignment`.
    void pad_to_alignment(size_t alignment);
    // Record data_size in the header, then pad to `alignment`.
    void finalize(size_t alignment);

    friend class segment_manager_impl;
    friend class compaction_manager_impl;
};
|
||||
|
||||
// Manages multiple buffers, a single active buffer and multiple flushing buffers.
|
||||
// When switch is requested for the active buffer, it waits for a flushing buffer to
|
||||
// become available, and continuing to accumulate writes until then.
|
||||
// Manages multiple buffers: a single active buffer and multiple flushing
// buffers. When a switch is requested for the active buffer, it waits for a
// flushing buffer to become available, continuing to accumulate writes until
// then.
class buffered_writer {
    // Number of buffers in the flush rotation (plus one active buffer).
    static constexpr size_t num_flushing_buffers = 4;

    segment_manager& _sm;

    struct active_buffer {
        write_buffer* buf;
        // True once a background switch+flush has been scheduled for this
        // buffer, so it is started at most once.
        bool flush_requested{false};
    } _active_buffer;

    // Backing storage for all buffers; pointers into it are rotated between
    // _active_buffer and _available_buffers.
    std::vector<write_buffer> _buffers;
    seastar::queue<write_buffer*> _available_buffers;
    seastar::gate _async_gate;
    // Signalled on every buffer switch so writers waiting for space retry.
    seastar::condition_variable _buffer_switched;
    // Scheduling group under which background flushes run.
    seastar::scheduling_group _flush_sg;

public:
    explicit buffered_writer(segment_manager& sm, seastar::scheduling_group flush_sg);

    buffered_writer(const buffered_writer&) = delete;
    buffered_writer& operator=(const buffered_writer&) = delete;

    future<> start();
    future<> stop();

    // Append a record; resolves with its log location (and a gate holder)
    // once the containing buffer is flushed.
    future<log_location_with_holder> write(log_record, compaction_group* cg = nullptr, seastar::gate::holder cg_holder = {});

private:
    // Swap in an available buffer as active; returns the previous active one.
    future<write_buffer*> switch_buffer();
    // Write `buf` via the segment manager, then recycle it.
    future<> flush(write_buffer*);

};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
namespace ser {
|
||||
|
||||
// Wire serialization for write_buffer::buffer_header.
// Field order (magic, data_size, seg_gen, reserved1, reserved2) is the
// on-disk format -- write/read/skip must stay in lockstep.
template <>
struct serializer<replica::logstor::write_buffer::buffer_header> {
    template <typename Output>
    static void write(Output& out, const replica::logstor::write_buffer::buffer_header& h) {
        serializer<uint32_t>::write(out, h.magic);
        serializer<uint32_t>::write(out, h.data_size);
        serializer<replica::logstor::segment_generation>::write(out, h.seg_gen);
        serializer<uint16_t>::write(out, h.reserved1);
        serializer<uint32_t>::write(out, h.reserved2);
    }
    template <typename Input>
    static replica::logstor::write_buffer::buffer_header read(Input& in) {
        replica::logstor::write_buffer::buffer_header h;
        h.magic = serializer<uint32_t>::read(in);
        h.data_size = serializer<uint32_t>::read(in);
        h.seg_gen = serializer<replica::logstor::segment_generation>::read(in);
        h.reserved1 = serializer<uint16_t>::read(in);
        h.reserved2 = serializer<uint32_t>::read(in);
        return h;
    }
    template <typename Input>
    static void skip(Input& in) {
        // Must skip exactly the fields written above, in the same order.
        serializer<uint32_t>::skip(in);
        serializer<uint32_t>::skip(in);
        serializer<replica::logstor::segment_generation>::skip(in);
        serializer<uint16_t>::skip(in);
        serializer<uint32_t>::skip(in);
    }
};
|
||||
|
||||
// Wire serialization for write_buffer::record_header (a single uint32_t
// data_size preceding each record's payload).
template <>
struct serializer<replica::logstor::write_buffer::record_header> {
    template <typename Output>
    static void write(Output& out, const replica::logstor::write_buffer::record_header& h) {
        serializer<uint32_t>::write(out, h.data_size);
    }
    template <typename Input>
    static replica::logstor::write_buffer::record_header read(Input& in) {
        replica::logstor::write_buffer::record_header h;
        h.data_size = serializer<uint32_t>::read(in);
        return h;
    }
    template <typename Input>
    static void skip(Input& in) {
        serializer<uint32_t>::skip(in);
    }
};
|
||||
} // namespace ser
|
||||
425
replica/table.cc
425
replica/table.cc
@@ -217,17 +217,6 @@ table::add_memtables_to_reader_list(std::vector<mutation_reader>& readers,
|
||||
}
|
||||
}
|
||||
|
||||
mutation_reader
|
||||
table::make_logstor_mutation_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) const {
|
||||
return _logstor->make_reader(std::move(s), logstor_index(), std::move(permit), pr, slice, std::move(trace_state));
|
||||
}
|
||||
|
||||
mutation_reader
|
||||
table::make_mutation_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
@@ -240,10 +229,6 @@ table::make_mutation_reader(schema_ptr s,
|
||||
return (*_virtual_reader).make_mutation_reader(s, std::move(permit), range, slice, trace_state, fwd, fwd_mr);
|
||||
}
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return make_logstor_mutation_reader(s, std::move(permit), range, slice, std::move(trace_state), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
std::vector<mutation_reader> readers;
|
||||
|
||||
// We're assuming that cache and memtables are both read atomically
|
||||
@@ -731,9 +716,7 @@ public:
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override {
|
||||
return get_compaction_group();
|
||||
@@ -779,11 +762,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct background_merge_guard {
|
||||
compaction::compaction_reenabler compaction_guard;
|
||||
locator::effective_replication_map_ptr erm_guard;
|
||||
};
|
||||
|
||||
class tablet_storage_group_manager final : public storage_group_manager {
|
||||
replica::table& _t;
|
||||
locator::host_id _my_host_id;
|
||||
@@ -804,7 +782,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
|
||||
utils::phased_barrier _merge_fiber_barrier;
|
||||
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
|
||||
// Holds compaction reenabler which disables compaction temporarily during tablet merge
|
||||
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
|
||||
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
|
||||
private:
|
||||
const schema_ptr& schema() const {
|
||||
return _t.schema();
|
||||
@@ -828,8 +806,7 @@ private:
|
||||
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
|
||||
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
|
||||
// are merged into a new storage group with id (X >> 1).
|
||||
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
|
||||
// When merge completes, compaction groups of sibling tablets are added to same storage
|
||||
// group, but they're not merged yet into one, since the merge completion handler happens
|
||||
@@ -845,8 +822,9 @@ private:
|
||||
return tablet_map().get_tablet_id(t).value();
|
||||
}
|
||||
|
||||
size_t storage_group_of(dht::token t) const {
|
||||
auto idx = tablet_id_for_token(t);
|
||||
std::pair<size_t, locator::tablet_range_side> storage_group_of(dht::token t) const {
|
||||
auto [id, side] = tablet_map().get_tablet_id_and_range_side(t);
|
||||
auto idx = id.value();
|
||||
#ifndef SCYLLA_BUILD_MODE_RELEASE
|
||||
if (idx >= tablet_count()) {
|
||||
on_fatal_internal_error(tlogger, format("storage_group_of: index out of range: idx={} size_log2={} size={} token={}",
|
||||
@@ -858,7 +836,7 @@ private:
|
||||
idx, sg.token_range(), t));
|
||||
}
|
||||
#endif
|
||||
return idx;
|
||||
return { idx, side };
|
||||
}
|
||||
|
||||
repair_classifier_func make_repair_sstable_classifier_func() const {
|
||||
@@ -922,9 +900,7 @@ public:
|
||||
std::exchange(_stop_fut, make_ready_future())).discard_result();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override;
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override;
|
||||
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
|
||||
@@ -935,7 +911,7 @@ public:
|
||||
return log2ceil(tablet_map().tablet_count());
|
||||
}
|
||||
storage_group& storage_group_for_token(dht::token token) const override {
|
||||
return storage_group_for_id(storage_group_of(token));
|
||||
return storage_group_for_id(storage_group_of(token).first);
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats() const override;
|
||||
@@ -983,20 +959,9 @@ size_t storage_group::to_idx(locator::tablet_range_side side) const {
|
||||
return size_t(side);
|
||||
}
|
||||
|
||||
compaction_group_ptr& storage_group::select_compaction_group(dht::token token, const locator::tablet_map& tmap) noexcept {
|
||||
compaction_group_ptr& storage_group::select_compaction_group(locator::tablet_range_side side) noexcept {
|
||||
if (splitting_mode()) {
|
||||
return _split_ready_groups[to_idx(tmap.get_tablet_range_side(token))];
|
||||
}
|
||||
return _main_cg;
|
||||
}
|
||||
|
||||
compaction_group_ptr& storage_group::select_compaction_group(dht::token first, dht::token last, const locator::tablet_map& tmap) noexcept {
|
||||
if (splitting_mode()) {
|
||||
auto first_side = tmap.get_tablet_range_side(first);
|
||||
auto last_side = tmap.get_tablet_range_side(last);
|
||||
if (first_side == last_side) {
|
||||
return _split_ready_groups[to_idx(first_side)];
|
||||
}
|
||||
return _split_ready_groups[to_idx(side)];
|
||||
}
|
||||
return _main_cg;
|
||||
}
|
||||
@@ -1091,38 +1056,6 @@ future<> compaction_group::split(compaction::compaction_type_options::split opt,
|
||||
}
|
||||
}
|
||||
|
||||
future<> compaction_group::discard_logstor_segments() {
|
||||
auto& sm = get_logstor_segment_manager();
|
||||
co_await sm.discard_segments(*_logstor_segments);
|
||||
}
|
||||
|
||||
future<> compaction_group::flush_separator(std::optional<size_t> seq_num) {
|
||||
auto units = co_await get_units(_separator_flush_sem, 1);
|
||||
auto pending = std::exchange(_separator_flushes, {});
|
||||
if (_logstor_separator && (!seq_num || _logstor_separator->min_seq_num < *seq_num)) {
|
||||
auto& cm = get_logstor_compaction_manager();
|
||||
auto b = std::move(*_logstor_separator);
|
||||
_logstor_separator.reset();
|
||||
pending.push_back(cm.flush_separator_buffer(std::move(b), *this));
|
||||
}
|
||||
co_await when_all(pending.begin(), pending.end());
|
||||
}
|
||||
|
||||
logstor::separator_buffer& compaction_group::get_separator_buffer(size_t write_size) {
|
||||
if (!_logstor_separator || !_logstor_separator->can_fit(write_size)) {
|
||||
auto& cm = get_logstor_compaction_manager();
|
||||
if (_logstor_separator) {
|
||||
auto b = std::move(*_logstor_separator);
|
||||
_logstor_separator.reset();
|
||||
|
||||
std::erase_if(_separator_flushes, [](future<>& f) { return f.available(); });
|
||||
_separator_flushes.push_back(cm.flush_separator_buffer(std::move(b), *this));
|
||||
}
|
||||
_logstor_separator.emplace(cm.allocate_separator_buffer());
|
||||
}
|
||||
return *_logstor_separator;
|
||||
}
|
||||
|
||||
future<> storage_group::split(compaction::compaction_type_options::split opt, tasks::task_info tablet_split_task_info) {
|
||||
if (set_split_mode()) {
|
||||
co_return;
|
||||
@@ -1289,9 +1222,9 @@ storage_group& table::storage_group_for_id(size_t i) const {
|
||||
}
|
||||
|
||||
compaction_group& tablet_storage_group_manager::compaction_group_for_token(dht::token token) const {
|
||||
auto idx = storage_group_of(token);
|
||||
auto [idx, range_side] = storage_group_of(token);
|
||||
auto& sg = storage_group_for_id(idx);
|
||||
return *sg.select_compaction_group(token, tablet_map());
|
||||
return *sg.select_compaction_group(range_side);
|
||||
}
|
||||
|
||||
compaction_group& table::compaction_group_for_token(dht::token token) const {
|
||||
@@ -1332,8 +1265,8 @@ compaction_group& table::compaction_group_for_key(partition_key_view key, const
|
||||
}
|
||||
|
||||
compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(const sstables::shared_sstable& sst) const {
|
||||
auto first_id = storage_group_of(sst->get_first_decorated_key().token());
|
||||
auto last_id = storage_group_of(sst->get_last_decorated_key().token());
|
||||
auto [first_id, first_range_side] = storage_group_of(sst->get_first_decorated_key().token());
|
||||
auto [last_id, last_range_side] = storage_group_of(sst->get_last_decorated_key().token());
|
||||
|
||||
auto sstable_desc = [] (const sstables::shared_sstable& sst) {
|
||||
auto& identifier_opt = sst->sstable_identifier();
|
||||
@@ -1356,10 +1289,12 @@ compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(con
|
||||
|
||||
try {
|
||||
auto& sg = storage_group_for_id(first_id);
|
||||
return *sg.select_compaction_group(
|
||||
sst->get_first_decorated_key().token(),
|
||||
sst->get_last_decorated_key().token(),
|
||||
tablet_map());
|
||||
|
||||
if (first_range_side != last_range_side) {
|
||||
return *sg.main_compaction_group();
|
||||
}
|
||||
|
||||
return *sg.select_compaction_group(first_range_side);
|
||||
} catch (std::out_of_range& e) {
|
||||
on_internal_error(tlogger, format("Unable to load SSTable {} of tablet {}, due to {}",
|
||||
sstable_desc(sst),
|
||||
@@ -1530,7 +1465,6 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
sstables::offstrategy offstrategy) {
|
||||
std::vector<sstables::shared_sstable> ret, ssts;
|
||||
std::exception_ptr ex;
|
||||
log_level failure_log_level = log_level::error;
|
||||
try {
|
||||
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
@@ -1552,9 +1486,6 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
|
||||
sst = nullptr;
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
@@ -1562,13 +1493,13 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
if (ex) {
|
||||
// on failed split, input sstable is unlinked here.
|
||||
if (new_sst) {
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
co_await new_sst->unlink();
|
||||
}
|
||||
// on failure after successful split, sstables not attached yet will be unlinked
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1582,7 +1513,6 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::function<future<>(sstables::shared_sstable)> on_add) {
|
||||
std::exception_ptr ex;
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
log_level failure_log_level = log_level::error;
|
||||
|
||||
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
|
||||
// so the exception handling below will only have to unlink sstables not processed yet.
|
||||
@@ -1592,17 +1522,14 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::ranges::move(ssts, std::back_inserter(ret));
|
||||
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1641,19 +1568,6 @@ table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector
|
||||
}
|
||||
}
|
||||
|
||||
bool table::add_logstor_segment(logstor::segment_descriptor& seg_desc, dht::token first_token, dht::token last_token) {
|
||||
auto& cg = compaction_group_for_token(first_token);
|
||||
if (&cg != &compaction_group_for_token(last_token)) {
|
||||
return false;
|
||||
}
|
||||
cg.add_logstor_segment(seg_desc);
|
||||
return true;
|
||||
}
|
||||
|
||||
logstor::separator_buffer& table::get_logstor_separator_buffer(dht::token token, size_t write_size) {
|
||||
return compaction_group_for_token(token).get_separator_buffer(write_size);
|
||||
}
|
||||
|
||||
// Handles permit management only, used for situations where we don't want to inform
|
||||
// the compaction manager about backlogs (i.e., tests)
|
||||
class permit_monitor : public sstables::write_monitor {
|
||||
@@ -1851,9 +1765,7 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
|
||||
utils::get_local_injector().inject("table_seal_active_memtable_try_flush", []() {
|
||||
throw std::system_error(ENOSPC, std::system_category(), "Injected error");
|
||||
});
|
||||
co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
// signal a memtable was sealed
|
||||
utils::get_local_injector().receive_message("table_seal_post_flush_waiters");
|
||||
co_return co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
});
|
||||
|
||||
undo_stats.reset();
|
||||
@@ -2109,15 +2021,8 @@ size_t compaction_group::live_sstable_count() const noexcept {
|
||||
return _main_sstables->size() + _maintenance_sstables->size();
|
||||
}
|
||||
|
||||
size_t compaction_group::logstor_disk_space_used() const noexcept {
|
||||
if (!_logstor_segments || !_t.uses_logstor()) {
|
||||
return 0;
|
||||
}
|
||||
return _logstor_segments->segment_count() * _t.get_logstor_segment_manager().get_segment_size();
|
||||
}
|
||||
|
||||
uint64_t compaction_group::live_disk_space_used() const noexcept {
|
||||
return _main_sstables->bytes_on_disk() + _maintenance_sstables->bytes_on_disk() + logstor_disk_space_used();
|
||||
return _main_sstables->bytes_on_disk() + _maintenance_sstables->bytes_on_disk();
|
||||
}
|
||||
|
||||
sstables::file_size_stats compaction_group::live_disk_space_used_full_stats() const noexcept {
|
||||
@@ -2467,12 +2372,6 @@ void table::trigger_compaction() {
|
||||
});
|
||||
}
|
||||
|
||||
void table::trigger_logstor_compaction() {
|
||||
for_each_compaction_group([] (compaction_group& cg) {
|
||||
cg.trigger_logstor_compaction();
|
||||
});
|
||||
}
|
||||
|
||||
void table::try_trigger_compaction(compaction_group& cg) noexcept {
|
||||
try {
|
||||
cg.trigger_compaction();
|
||||
@@ -2481,51 +2380,6 @@ void table::try_trigger_compaction(compaction_group& cg) noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
future<> table::flush_separator(std::optional<size_t> seq_num) {
|
||||
if (!uses_logstor()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
// wait for all previous writes to be written to a separator buffer
|
||||
co_await get_logstor_segment_manager().await_pending_writes();
|
||||
|
||||
// flush separator buffers
|
||||
co_await parallel_foreach_compaction_group([seq_num] (compaction_group& cg) {
|
||||
return cg.flush_separator(seq_num);
|
||||
});
|
||||
}
|
||||
|
||||
future<logstor::table_segment_stats> table::get_logstor_segment_stats() const {
|
||||
logstor::table_segment_stats result;
|
||||
if (!uses_logstor()) {
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
const auto segment_size = get_logstor_segment_manager().get_segment_size();
|
||||
const auto bucket_count = 32;
|
||||
const auto bucket_size = segment_size / bucket_count;
|
||||
|
||||
result.histogram.resize(bucket_count);
|
||||
|
||||
co_await const_cast<table*>(this)->parallel_foreach_compaction_group([&] (const compaction_group& cg) -> future<> {
|
||||
const auto& cg_segments = cg.logstor_segments();
|
||||
|
||||
result.compaction_group_count++;
|
||||
result.segment_count += cg_segments.segment_count();
|
||||
|
||||
for (const auto& desc : cg_segments._segments) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto data_size = desc.net_data_size(segment_size);
|
||||
auto bucket_index = std::min<size_t>(data_size / bucket_size, bucket_count - 1);
|
||||
auto& bucket = result.histogram[bucket_index];
|
||||
bucket.count++;
|
||||
bucket.max_data_size = std::max(bucket.max_data_size, data_size);
|
||||
}
|
||||
});
|
||||
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
void compaction_group::trigger_compaction() {
|
||||
// But not if we're locked out or stopping
|
||||
if (!_async_gate.is_closed()) {
|
||||
@@ -2536,14 +2390,6 @@ void compaction_group::trigger_compaction() {
|
||||
}
|
||||
}
|
||||
|
||||
void compaction_group::trigger_logstor_compaction() {
|
||||
if (!_async_gate.is_closed() && !_t.is_auto_compaction_disabled_by_user()) {
|
||||
if (_logstor_segments) {
|
||||
get_logstor_compaction_manager().submit(*this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void table::trigger_offstrategy_compaction() {
|
||||
// Run in background.
|
||||
// This is safe since the the compaction task is tracked
|
||||
@@ -3000,7 +2846,6 @@ compaction_group::compaction_group(table& t, size_t group_id, dht::token_range t
|
||||
, _async_gate(format("[compaction_group {}.{} {}]", t.schema()->ks_name(), t.schema()->cf_name(), group_id))
|
||||
, _backlog_tracker(t.get_compaction_strategy().make_backlog_tracker())
|
||||
, _repair_sstable_classifier(std::move(repair_classifier))
|
||||
, _logstor_segments(make_lw_shared<logstor::segment_set>())
|
||||
{
|
||||
}
|
||||
|
||||
@@ -3034,13 +2879,9 @@ future<> compaction_group::stop(sstring reason) noexcept {
|
||||
for (auto view : all_views()) {
|
||||
co_await _t._compaction_manager.stop_ongoing_compactions(reason, view);
|
||||
}
|
||||
if (_t.uses_logstor()) {
|
||||
co_await get_logstor_compaction_manager().stop_ongoing_compactions(*this);
|
||||
}
|
||||
co_await _async_gate.close();
|
||||
auto flush_future = co_await seastar::coroutine::as_future(flush());
|
||||
|
||||
co_await flush_separator();
|
||||
co_await _flush_gate.close();
|
||||
co_await _sstable_add_gate.close();
|
||||
// FIXME: indentation
|
||||
@@ -3357,9 +3198,7 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
|
||||
}
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap,
|
||||
const locator::tablet_map& new_tmap) {
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
auto table_id = schema()->id();
|
||||
size_t old_tablet_count = old_tmap.tablet_count();
|
||||
size_t new_tablet_count = new_tmap.tablet_count();
|
||||
@@ -3383,7 +3222,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effec
|
||||
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
|
||||
for (auto& view : new_cg->all_views()) {
|
||||
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
|
||||
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
|
||||
_compaction_reenablers_for_merging.push_back(std::move(cre));
|
||||
}
|
||||
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
|
||||
|
||||
@@ -3416,11 +3255,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effec
|
||||
_merge_completion_event.signal();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::update_effective_replication_map(
|
||||
const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source)
|
||||
{
|
||||
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
|
||||
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
|
||||
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
|
||||
|
||||
@@ -3436,7 +3271,7 @@ void tablet_storage_group_manager::update_effective_replication_map(
|
||||
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
|
||||
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
|
||||
}
|
||||
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
|
||||
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
|
||||
}
|
||||
|
||||
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
|
||||
@@ -3522,7 +3357,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
|
||||
};
|
||||
|
||||
if (uses_tablets()) {
|
||||
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
|
||||
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
|
||||
}
|
||||
if (old_erm) {
|
||||
old_erm->invalidate();
|
||||
@@ -4167,7 +4002,6 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
auto close_lister = deferred_close(lister);
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
@@ -4175,9 +4009,6 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in get_snapshot_details");
|
||||
}).get();
|
||||
}
|
||||
}
|
||||
return all_snapshots;
|
||||
@@ -4197,66 +4028,53 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
|
||||
}).get();
|
||||
|
||||
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await lister.close();
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
|
||||
co_return details;
|
||||
@@ -4443,18 +4261,6 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
|
||||
co_return rp;
|
||||
}
|
||||
|
||||
future<> table::discard_logstor_segments() {
|
||||
if (!uses_logstor()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
_logstor_index->clear();
|
||||
|
||||
co_await parallel_foreach_compaction_group([] (compaction_group& cg) {
|
||||
return cg.discard_logstor_segments();
|
||||
});
|
||||
}
|
||||
|
||||
void table::mark_ready_for_writes(db::commitlog* cl) {
|
||||
if (!_readonly) {
|
||||
on_internal_error(dblog, ::format("table {}.{} is already writable", _schema->ks_name(), _schema->cf_name()));
|
||||
@@ -4465,19 +4271,6 @@ void table::mark_ready_for_writes(db::commitlog* cl) {
|
||||
_readonly = false;
|
||||
}
|
||||
|
||||
void table::init_logstor(logstor::logstor* ls) {
|
||||
_logstor = ls;
|
||||
_logstor_index = std::make_unique<logstor::primary_index>(_schema);
|
||||
}
|
||||
|
||||
size_t table::get_logstor_memory_usage() const {
|
||||
size_t m = 0;
|
||||
if (_logstor_index) {
|
||||
m += _logstor_index->get_memory_usage();
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
db::commitlog* table::commitlog() const {
|
||||
if (_readonly) [[unlikely]] {
|
||||
on_internal_error(dblog, ::format("table {}.{} is readonly", _schema->ks_name(), _schema->cf_name()));
|
||||
@@ -4502,9 +4295,6 @@ void table::set_schema(schema_ptr s) {
|
||||
if (_counter_cell_locks) {
|
||||
_counter_cell_locks->set_schema(s);
|
||||
}
|
||||
if (_logstor_index) {
|
||||
_logstor_index->set_schema(s);
|
||||
}
|
||||
_schema = std::move(s);
|
||||
|
||||
for (auto&& v : _views) {
|
||||
@@ -4732,11 +4522,6 @@ future<> table::apply(const mutation& m, db::rp_handle&& h, db::timeout_clock::t
|
||||
|
||||
auto& cg = compaction_group_for_token(m.token());
|
||||
auto holder = cg.async_gate().hold();
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return _logstor->write(m, cg, std::move(holder));
|
||||
}
|
||||
|
||||
return dirty_memory_region_group().run_when_memory_available([this, &m, h = std::move(h), &cg, holder = std::move(holder)] () mutable {
|
||||
do_apply(cg, std::move(h), m);
|
||||
}, timeout);
|
||||
@@ -4752,10 +4537,6 @@ future<> table::apply(const frozen_mutation& m, schema_ptr m_schema, db::rp_hand
|
||||
auto& cg = compaction_group_for_key(m.key(), m_schema);
|
||||
auto holder = cg.async_gate().hold();
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return _logstor->write(m.unfreeze(m_schema), cg, std::move(holder));
|
||||
}
|
||||
|
||||
return dirty_memory_region_group().run_when_memory_available([this, &m, m_schema = std::move(m_schema), h = std::move(h), &cg, holder = std::move(holder)]() mutable {
|
||||
do_apply(cg, std::move(h), m, m_schema);
|
||||
}, timeout);
|
||||
@@ -4860,14 +4641,13 @@ table::query(schema_ptr query_schema,
|
||||
}
|
||||
|
||||
std::optional<full_position> last_pos;
|
||||
if (querier_opt) {
|
||||
if (querier_opt->current_position()) {
|
||||
last_pos.emplace(*querier_opt->current_position());
|
||||
}
|
||||
if (!saved_querier || (!querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
|
||||
co_await querier_opt->close();
|
||||
querier_opt = {};
|
||||
}
|
||||
if (querier_opt && querier_opt->current_position()) {
|
||||
last_pos.emplace(*querier_opt->current_position());
|
||||
}
|
||||
|
||||
if (!saved_querier || (querier_opt && !querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
|
||||
co_await querier_opt->close();
|
||||
querier_opt = {};
|
||||
}
|
||||
if (saved_querier) {
|
||||
*saved_querier = std::move(querier_opt);
|
||||
@@ -4957,10 +4737,6 @@ table::enable_auto_compaction() {
|
||||
// see table::disable_auto_compaction() notes.
|
||||
_compaction_disabled_by_user = false;
|
||||
trigger_compaction();
|
||||
|
||||
if (uses_logstor()) {
|
||||
trigger_logstor_compaction();
|
||||
}
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -4992,18 +4768,11 @@ table::disable_auto_compaction() {
|
||||
// - it will break computation of major compaction descriptor
|
||||
// for new submissions
|
||||
_compaction_disabled_by_user = true;
|
||||
|
||||
auto holder = _async_gate.hold();
|
||||
|
||||
co_await parallel_foreach_compaction_group_view([this] (compaction::compaction_group_view& view) {
|
||||
return _compaction_manager.stop_ongoing_compactions("disable auto-compaction", &view, compaction::compaction_type::Compaction);
|
||||
});
|
||||
|
||||
if (uses_logstor()) {
|
||||
co_await parallel_foreach_compaction_group([this] (compaction_group& cg) {
|
||||
return get_logstor_compaction_manager().stop_ongoing_compactions(cg);
|
||||
return with_gate(_async_gate, [this] {
|
||||
return parallel_foreach_compaction_group_view([this] (compaction::compaction_group_view& view) {
|
||||
return _compaction_manager.stop_ongoing_compactions("disable auto-compaction", &view, compaction::compaction_type::Compaction);
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void table::set_tombstone_gc_enabled(bool tombstone_gc_enabled) noexcept {
|
||||
@@ -5216,26 +4985,6 @@ const compaction::compaction_manager& compaction_group::get_compaction_manager()
|
||||
return _t.get_compaction_manager();
|
||||
}
|
||||
|
||||
logstor::segment_manager& compaction_group::get_logstor_segment_manager() noexcept {
|
||||
return _t.get_logstor_segment_manager();
|
||||
}
|
||||
|
||||
const logstor::segment_manager& compaction_group::get_logstor_segment_manager() const noexcept {
|
||||
return _t.get_logstor_segment_manager();
|
||||
}
|
||||
|
||||
logstor::compaction_manager& compaction_group::get_logstor_compaction_manager() noexcept {
|
||||
return _t.get_logstor_compaction_manager();
|
||||
}
|
||||
|
||||
const logstor::compaction_manager& compaction_group::get_logstor_compaction_manager() const noexcept {
|
||||
return _t.get_logstor_compaction_manager();
|
||||
}
|
||||
|
||||
logstor::primary_index& compaction_group::get_logstor_index() noexcept {
|
||||
return _t.logstor_index();
|
||||
}
|
||||
|
||||
compaction::compaction_group_view& compaction_group::as_view_for_static_sharding() const {
|
||||
return view_for_unrepaired_data();
|
||||
}
|
||||
|
||||
@@ -592,7 +592,6 @@ bool operator==(const schema::user_properties& lhs, const schema::user_propertie
|
||||
&& lhs.compaction_strategy == rhs.compaction_strategy
|
||||
&& lhs.compaction_strategy_options == rhs.compaction_strategy_options
|
||||
&& lhs.compaction_enabled == rhs.compaction_enabled
|
||||
&& lhs.storage_engine == rhs.storage_engine
|
||||
&& lhs.caching_options == rhs.caching_options
|
||||
&& lhs.tablet_options == rhs.tablet_options
|
||||
&& lhs.get_paxos_grace_seconds() == rhs.get_paxos_grace_seconds()
|
||||
@@ -699,7 +698,6 @@ table_schema_version schema::calculate_digest(const schema::raw_schema& r) {
|
||||
feed_hash(h, r._view_info);
|
||||
feed_hash(h, r._indices_by_name);
|
||||
feed_hash(h, r._is_counter);
|
||||
feed_hash(h, r._props.storage_engine);
|
||||
|
||||
for (auto&& [name, ext] : r._props.extensions) {
|
||||
feed_hash(h, name);
|
||||
@@ -876,9 +874,6 @@ auto fmt::formatter<schema>::format(const schema& s, fmt::format_context& ctx) c
|
||||
out = fmt::format_to(out, ",minIndexInterval={}", s._raw._props.min_index_interval);
|
||||
out = fmt::format_to(out, ",maxIndexInterval={}", s._raw._props.max_index_interval);
|
||||
out = fmt::format_to(out, ",speculativeRetry={}", s._raw._props.speculative_retry.to_sstring());
|
||||
if (s.storage_engine() != storage_engine_type::normal) {
|
||||
out = fmt::format_to(out, ",storage_engine={}", storage_engine_type_to_sstring(s.storage_engine()));
|
||||
}
|
||||
out = fmt::format_to(out, ",tablets={{");
|
||||
if (s._raw._props.tablet_options) {
|
||||
n = 0;
|
||||
@@ -1215,9 +1210,6 @@ fragmented_ostringstream& schema::schema_properties(const schema_describe_helper
|
||||
os << "\n AND memtable_flush_period_in_ms = " << fmt::to_string(memtable_flush_period());
|
||||
os << "\n AND min_index_interval = " << fmt::to_string(min_index_interval());
|
||||
os << "\n AND speculative_retry = '" << speculative_retry().to_sstring() << "'";
|
||||
if (storage_engine() != storage_engine_type::normal) {
|
||||
os << "\n AND storage_engine = '" << storage_engine_type_to_sstring(storage_engine()) << "'";
|
||||
}
|
||||
|
||||
if (has_tablet_options()) {
|
||||
os << "\n AND tablets = {";
|
||||
|
||||
@@ -175,21 +175,6 @@ public:
|
||||
bool operator==(const speculative_retry& other) const = default;
|
||||
};
|
||||
|
||||
enum class storage_engine_type {
|
||||
normal,
|
||||
logstor,
|
||||
};
|
||||
|
||||
inline sstring storage_engine_type_to_sstring(storage_engine_type t) {
|
||||
switch (t) {
|
||||
case storage_engine_type::normal:
|
||||
return "normal";
|
||||
case storage_engine_type::logstor:
|
||||
return "logstor";
|
||||
}
|
||||
throw std::invalid_argument(format("unknown storage engine type: {:d}\n", uint8_t(t)));
|
||||
}
|
||||
|
||||
using index_options_map = std::unordered_map<sstring, sstring>;
|
||||
|
||||
enum class index_metadata_kind {
|
||||
@@ -576,7 +561,6 @@ public:
|
||||
compaction::compaction_strategy_type compaction_strategy = compaction::compaction_strategy_type::incremental;
|
||||
std::map<sstring, sstring> compaction_strategy_options;
|
||||
bool compaction_enabled = true;
|
||||
storage_engine_type storage_engine = storage_engine_type::normal;
|
||||
::caching_options caching_options;
|
||||
std::optional<std::map<sstring, sstring>> tablet_options;
|
||||
|
||||
@@ -792,14 +776,6 @@ public:
|
||||
return _raw._props.compaction_enabled;
|
||||
}
|
||||
|
||||
storage_engine_type storage_engine() const {
|
||||
return _raw._props.storage_engine;
|
||||
}
|
||||
|
||||
bool logstor_enabled() const {
|
||||
return _raw._props.storage_engine == storage_engine_type::logstor;
|
||||
}
|
||||
|
||||
const cdc::options& cdc_options() const {
|
||||
return _raw._props.get_cdc_options();
|
||||
}
|
||||
|
||||
@@ -269,11 +269,6 @@ public:
|
||||
enable_schema_commitlog();
|
||||
}
|
||||
|
||||
schema_builder& set_logstor() {
|
||||
_raw._props.storage_engine = storage_engine_type::logstor;
|
||||
return *this;
|
||||
}
|
||||
|
||||
class default_names {
|
||||
public:
|
||||
default_names(const schema_builder&);
|
||||
|
||||
@@ -22,12 +22,12 @@ static logging::logger slogger("schema_registry");
|
||||
static thread_local schema_registry registry;
|
||||
|
||||
schema_version_not_found::schema_version_not_found(table_schema_version v)
|
||||
: std::runtime_error{format("Schema version {} not found", v)} {
|
||||
}
|
||||
: std::runtime_error{format("Schema version {} not found", v)}
|
||||
{ }
|
||||
|
||||
schema_version_loading_failed::schema_version_loading_failed(table_schema_version v)
|
||||
: std::runtime_error{format("Failed to load schema version {}", v)} {
|
||||
}
|
||||
: std::runtime_error{format("Failed to load schema version {}", v)}
|
||||
{ }
|
||||
|
||||
schema_registry_entry::~schema_registry_entry() {
|
||||
if (_schema) {
|
||||
@@ -39,7 +39,8 @@ schema_registry_entry::schema_registry_entry(table_schema_version v, schema_regi
|
||||
: _state(state::INITIAL)
|
||||
, _version(v)
|
||||
, _registry(r)
|
||||
, _sync_state(sync_state::NOT_SYNCED) {
|
||||
, _sync_state(sync_state::NOT_SYNCED)
|
||||
{
|
||||
_erase_timer.set_callback([this] {
|
||||
slogger.debug("Dropping {}", _version);
|
||||
SCYLLA_ASSERT(!_schema);
|
||||
@@ -70,8 +71,8 @@ void schema_registry::attach_table(schema_registry_entry& e) noexcept {
|
||||
e.set_table(table.weak_from_this());
|
||||
} catch (const replica::no_such_column_family&) {
|
||||
if (slogger.is_enabled(seastar::log_level::debug)) {
|
||||
slogger.debug("No table for schema version {} of {}.{}: {}", e._version, e.get_schema()->ks_name(), e.get_schema()->cf_name(),
|
||||
seastar::current_backtrace());
|
||||
slogger.debug("No table for schema version {} of {}.{}: {}", e._version,
|
||||
e.get_schema()->ks_name(), e.get_schema()->cf_name(), seastar::current_backtrace());
|
||||
}
|
||||
// ignore
|
||||
}
|
||||
@@ -220,7 +221,7 @@ future<schema_ptr> schema_registry_entry::start_loading(async_schema_loader load
|
||||
_state = state::LOADING;
|
||||
slogger.trace("Loading {}", _version);
|
||||
// Move to background.
|
||||
(void)f.then_wrapped([self = shared_from_this(), this](future<extended_frozen_schema>&& f) {
|
||||
(void)f.then_wrapped([self = shared_from_this(), this] (future<extended_frozen_schema>&& f) {
|
||||
_loader = {};
|
||||
if (_state != state::LOADING) {
|
||||
slogger.trace("Loading of {} aborted", _version);
|
||||
@@ -293,8 +294,8 @@ schema_registry& local_schema_registry() {
|
||||
}
|
||||
|
||||
global_schema_ptr::global_schema_ptr(const global_schema_ptr& o)
|
||||
: global_schema_ptr(o.get()) {
|
||||
}
|
||||
: global_schema_ptr(o.get())
|
||||
{ }
|
||||
|
||||
global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
|
||||
auto current = this_shard_id();
|
||||
@@ -331,15 +332,15 @@ schema_ptr global_schema_ptr::get() const {
|
||||
}
|
||||
|
||||
global_schema_ptr::global_schema_ptr(const schema_ptr& ptr)
|
||||
: _cpu_of_origin(this_shard_id()) {
|
||||
: _cpu_of_origin(this_shard_id()) {
|
||||
// _ptr must always have an associated registry entry,
|
||||
// if ptr doesn't, we need to load it into the registry.
|
||||
auto ensure_registry_entry = [](const schema_ptr& s) {
|
||||
auto ensure_registry_entry = [] (const schema_ptr& s) {
|
||||
schema_registry_entry* e = s->registry_entry();
|
||||
if (e) {
|
||||
return s;
|
||||
} else {
|
||||
return local_schema_registry().get_or_load(s->version(), [&s](table_schema_version) -> extended_frozen_schema {
|
||||
return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) -> extended_frozen_schema {
|
||||
return extended_frozen_schema(s);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -952,8 +952,6 @@ class sstring:
|
||||
|
||||
@staticmethod
|
||||
def to_hex(data, size):
|
||||
if size == 0:
|
||||
return ''
|
||||
inf = gdb.selected_inferior()
|
||||
return bytes(inf.read_memory(data, size)).hex()
|
||||
|
||||
@@ -976,8 +974,6 @@ class sstring:
|
||||
return self.ref['u']['external']['str']
|
||||
|
||||
def as_bytes(self):
|
||||
if len(self) == 0:
|
||||
return b''
|
||||
inf = gdb.selected_inferior()
|
||||
return bytes(inf.read_memory(self.data(), len(self)))
|
||||
|
||||
@@ -5640,8 +5636,6 @@ class scylla_sstable_summary(gdb.Command):
|
||||
self.inf = gdb.selected_inferior()
|
||||
|
||||
def to_hex(self, data, size):
|
||||
if size == 0:
|
||||
return ''
|
||||
return bytes(self.inf.read_memory(data, size)).hex()
|
||||
|
||||
def invoke(self, arg, for_tty):
|
||||
@@ -5653,10 +5647,6 @@ class scylla_sstable_summary(gdb.Command):
|
||||
sst = seastar_lw_shared_ptr(arg).get().dereference()
|
||||
else:
|
||||
sst = arg
|
||||
ms_version = int(gdb.parse_and_eval('sstables::sstable_version_types::ms'))
|
||||
if int(sst['_version']) >= ms_version:
|
||||
gdb.write("sstable uses ms format (trie-based index); summary is not populated.\n")
|
||||
return
|
||||
summary = seastar_lw_shared_ptr(sst['_components']['_value']).get().dereference()['summary']
|
||||
|
||||
gdb.write("header: {}\n".format(summary['header']))
|
||||
|
||||
@@ -227,6 +227,8 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
for (const auto& m : modules.entries) {
|
||||
if (m.table == db::system_keyspace::service_levels_v2()->id()) {
|
||||
update_service_levels_cache = true;
|
||||
} else if (m.table == db::system_keyspace::role_members()->id() || m.table == db::system_keyspace::role_attributes()->id()) {
|
||||
update_service_levels_effective_cache = true;
|
||||
} else if (m.table == db::system_keyspace::dicts()->id()) {
|
||||
auto pk_type = db::system_keyspace::dicts()->partition_key_type();
|
||||
auto name_value = pk_type->deserialize_value(m.pk.representation());
|
||||
@@ -245,11 +247,6 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
auto cdc_log_table_id = table_id(value_cast<utils::UUID>(uuid_type->deserialize_value(elements.front())));
|
||||
update_cdc_streams.insert(cdc_log_table_id);
|
||||
} else if (auth::cache::includes_table(m.table)) {
|
||||
if (m.table == db::system_keyspace::role_members()->id() ||
|
||||
m.table == db::system_keyspace::role_attributes()->id()) {
|
||||
update_service_levels_effective_cache = true;
|
||||
}
|
||||
|
||||
auto schema = _ss.get_database().find_schema(m.table);
|
||||
const auto elements = m.pk.explode(*schema);
|
||||
auto role = value_cast<sstring>(schema->partition_key_type()->
|
||||
@@ -258,9 +255,6 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
}
|
||||
}
|
||||
|
||||
if (update_auth_cache_roles.size()) {
|
||||
co_await _ss.auth_cache().load_roles(std::move(update_auth_cache_roles));
|
||||
}
|
||||
if (update_service_levels_cache || update_service_levels_effective_cache) { // this also updates SL effective cache
|
||||
co_await _ss.update_service_levels_cache(qos::update_both_cache_levels(update_service_levels_cache), qos::query_context::group0);
|
||||
}
|
||||
@@ -270,6 +264,9 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
if (update_cdc_streams.size()) {
|
||||
co_await _ss.load_cdc_streams(std::move(update_cdc_streams));
|
||||
}
|
||||
if (update_auth_cache_roles.size()) {
|
||||
co_await _ss.auth_cache().load_roles(std::move(update_auth_cache_roles));
|
||||
}
|
||||
}
|
||||
|
||||
future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merger) {
|
||||
|
||||
@@ -4653,7 +4653,6 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
auto& stats = handler_ptr->stats();
|
||||
auto& handler = *handler_ptr;
|
||||
auto& global_stats = handler._proxy->_global_stats;
|
||||
auto schema = handler_ptr->get_schema();
|
||||
|
||||
if (handler.get_targets().size() == 0) {
|
||||
// Usually we remove the response handler when receiving responses from all targets.
|
||||
@@ -4749,7 +4748,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
}
|
||||
|
||||
// Waited on indirectly.
|
||||
(void)f.handle_exception([response_id, forward_size, coordinator, handler_ptr, p = shared_from_this(), &stats, schema] (std::exception_ptr eptr) {
|
||||
(void)f.handle_exception([response_id, forward_size, coordinator, handler_ptr, p = shared_from_this(), &stats] (std::exception_ptr eptr) {
|
||||
++stats.writes_errors.get_ep_stat(handler_ptr->_effective_replication_map_ptr->get_topology(), coordinator);
|
||||
error err = error::FAILURE;
|
||||
std::optional<sstring> msg;
|
||||
@@ -4763,8 +4762,8 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
// ignore, disconnect will be logged by gossiper
|
||||
} else if (const auto* e = try_catch_nested<seastar::gate_closed_exception>(eptr)) {
|
||||
// may happen during shutdown, log and ignore it
|
||||
slogger.warn("gate_closed_exception during mutation write to {}.{} on {}: {}",
|
||||
schema->ks_name(), schema->cf_name(), coordinator, e->what());
|
||||
slogger.warn("gate_closed_exception during mutation write to {}: {}",
|
||||
coordinator, e->what());
|
||||
} else if (try_catch<timed_out_error>(eptr)) {
|
||||
// from lmutate(). Ignore so that logs are not flooded
|
||||
// database total_writes_timedout counter was incremented.
|
||||
@@ -4775,8 +4774,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
} else if (auto* e = try_catch<replica::critical_disk_utilization_exception>(eptr)) {
|
||||
msg = e->what();
|
||||
} else {
|
||||
slogger.error("exception during mutation write to {}.{} on {}: {}",
|
||||
schema->ks_name(), schema->cf_name(), coordinator, eptr);
|
||||
slogger.error("exception during mutation write to {}: {}", coordinator, eptr);
|
||||
}
|
||||
p->got_failure_response(response_id, coordinator, forward_size + 1, std::nullopt, err, std::move(msg));
|
||||
});
|
||||
|
||||
@@ -910,7 +910,7 @@ future<> storage_service::merge_topology_snapshot(raft_snapshot snp) {
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
|
||||
} else {
|
||||
co_await for_each_split_mutation(std::move(mut), max_size, [&] (mutation m) -> future<> {
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(m));
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -3026,8 +3026,6 @@ future<> storage_service::drain() {
|
||||
}
|
||||
|
||||
future<> storage_service::do_drain() {
|
||||
co_await utils::get_local_injector().inject("storage_service_drain_wait", utils::wait_for_message(60s));
|
||||
|
||||
// Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found.
|
||||
co_await stop_transport();
|
||||
|
||||
@@ -4018,9 +4016,6 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
|
||||
} catch (raft::request_aborted& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (seastar::gate_closed_exception& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (...) {
|
||||
slogger.error("Failed to complete splitting of table {} due to {}, retrying after {} seconds",
|
||||
table, std::current_exception(), split_retry.sleep_time());
|
||||
@@ -4087,58 +4082,6 @@ future<> storage_service::snitch_reconfigured() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::local_topology_barrier() {
|
||||
if (this_shard_id() != 0) {
|
||||
co_await container().invoke_on(0, [] (storage_service& ss) {
|
||||
return ss.local_topology_barrier();
|
||||
});
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto version = _topology_state_machine._topology.version;
|
||||
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
|
||||
"current version {}, stale versions (version: use_count): {}",
|
||||
version, current_version, ss._shared_token_metadata.describe_stale_versions());
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
}
|
||||
|
||||
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, raft_topology_cmd cmd) {
|
||||
raft_topology_cmd_result result;
|
||||
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
|
||||
@@ -4166,6 +4109,12 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
state.last_index = cmd_index;
|
||||
}
|
||||
|
||||
// We capture the topology version right after the checks
|
||||
// above, before any yields. This is crucial since _topology_state_machine._topology
|
||||
// might be altered concurrently while this method is running,
|
||||
// which can cause the fence command to apply an invalid fence version.
|
||||
const auto version = _topology_state_machine._topology.version;
|
||||
|
||||
switch (cmd.cmd) {
|
||||
case raft_topology_cmd::command::barrier: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_fail",
|
||||
@@ -4204,7 +4153,44 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
break;
|
||||
case raft_topology_cmd::command::barrier_and_drain: {
|
||||
co_await local_topology_barrier();
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
|
||||
"current version {}, stale versions (version: use_count): {}",
|
||||
version, current_version, ss._shared_token_metadata.describe_stale_versions());
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
|
||||
auto ks = handler.get("keyspace");
|
||||
|
||||
@@ -813,9 +813,6 @@ public:
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
|
||||
// In particular, waits for non-latest local erms to go die.
|
||||
future<> local_topology_barrier();
|
||||
private:
|
||||
// State machine that is responsible for topology change
|
||||
topology_state_machine& _topology_state_machine;
|
||||
|
||||
@@ -195,9 +195,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
} else if (is_resize_task(task_type)) {
|
||||
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
|
||||
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
} else {
|
||||
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
}
|
||||
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
|
||||
co_return res->status;
|
||||
@@ -312,7 +312,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
}
|
||||
return make_ready_future();
|
||||
});
|
||||
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
} else if (is_migration_task(task_type)) { // Migration task.
|
||||
auto tablet_id = hint.get_tablet_id();
|
||||
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
|
||||
@@ -326,7 +326,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.status.state = tasks::task_manager::task_state::running;
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
co_return res;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2229,19 +2229,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
_tablet_allocator.set_load_stats(reconciled_stats);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the background storage group merge to finish before releasing the state machine.
|
||||
// Background merge holds the old erm, so a successful barrier joins with it.
|
||||
// This guarantees that the background merge doesn't run concurrently with the next merge.
|
||||
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
|
||||
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
|
||||
// The background merge fiber will try to stop a compaction group which is locked, and the lock is held
|
||||
// by the background merge fiber.
|
||||
tm = nullptr;
|
||||
if (!guard) {
|
||||
guard = co_await start_operation();
|
||||
}
|
||||
co_await global_tablet_token_metadata_barrier(std::move(guard));
|
||||
}
|
||||
|
||||
using get_table_ids_func = std::function<std::unordered_set<table_id>(const db::system_keyspace::topology_requests_entry&)>;
|
||||
|
||||
@@ -201,49 +201,95 @@ public:
|
||||
virtual future<std::optional<entry_info>> next_entry() = 0;
|
||||
};
|
||||
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
// Allocated inside LSA.
|
||||
class promoted_index {
|
||||
deletion_time _del_time;
|
||||
uint64_t _promoted_index_start;
|
||||
uint32_t _promoted_index_size;
|
||||
uint32_t _num_blocks;
|
||||
public:
|
||||
promoted_index(const schema& s,
|
||||
deletion_time del_time,
|
||||
uint64_t promoted_index_start,
|
||||
uint32_t promoted_index_size,
|
||||
uint32_t num_blocks)
|
||||
: _del_time{del_time}
|
||||
, _promoted_index_start(promoted_index_start)
|
||||
, _promoted_index_size(promoted_index_size)
|
||||
, _num_blocks(num_blocks)
|
||||
{ }
|
||||
|
||||
using promoted_index = parsed_promoted_index_entry;
|
||||
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
|
||||
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
|
||||
|
||||
// Call under allocating_section.
|
||||
// For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
|
||||
reader_permit,
|
||||
tracing::trace_state_ptr,
|
||||
file_input_stream_options,
|
||||
use_caching);
|
||||
};
|
||||
|
||||
// A partition index element.
|
||||
// Allocated inside LSA.
|
||||
struct [[gnu::packed]] index_entry {
|
||||
mutable int64_t raw_token;
|
||||
uint64_t data_file_offset;
|
||||
uint32_t key_offset;
|
||||
class index_entry {
|
||||
private:
|
||||
managed_bytes _key;
|
||||
mutable std::optional<dht::token> _token;
|
||||
uint64_t _position;
|
||||
managed_ref<promoted_index> _index;
|
||||
|
||||
uint64_t position() const { return data_file_offset; }
|
||||
dht::raw_token token() const { return dht::raw_token(raw_token); }
|
||||
public:
|
||||
|
||||
key_view get_key() const {
|
||||
return key_view{_key};
|
||||
}
|
||||
|
||||
// May allocate so must be called under allocating_section.
|
||||
decorated_key_view get_decorated_key(const schema& s) const {
|
||||
if (!_token) {
|
||||
_token.emplace(s.get_partitioner().get_token(get_key()));
|
||||
}
|
||||
return decorated_key_view(*_token, get_key());
|
||||
}
|
||||
|
||||
uint64_t position() const { return _position; };
|
||||
|
||||
std::optional<deletion_time> get_deletion_time() const {
|
||||
if (_index) {
|
||||
return _index->get_deletion_time();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
|
||||
: _key(std::move(key))
|
||||
, _position(position)
|
||||
, _index(std::move(index))
|
||||
{}
|
||||
|
||||
index_entry(index_entry&&) = default;
|
||||
index_entry& operator=(index_entry&&) = default;
|
||||
|
||||
// Can be nullptr
|
||||
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
|
||||
managed_ref<promoted_index>& get_promoted_index() { return _index; }
|
||||
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
return _key.external_memory_usage() + _index.external_memory_usage();
|
||||
}
|
||||
};
|
||||
|
||||
// Required for optimized LSA migration of storage of managed_vector.
|
||||
static_assert(std::is_trivially_move_assignable_v<index_entry>);
|
||||
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
|
||||
|
||||
// A partition index page.
|
||||
//
|
||||
// Allocated in the standard allocator space but with an LSA allocator as the current allocator.
|
||||
// So the shallow part is in the standard allocator but all indirect objects are inside LSA.
|
||||
class partition_index_page {
|
||||
public:
|
||||
lsa::chunked_managed_vector<index_entry> _entries;
|
||||
managed_bytes _key_storage;
|
||||
|
||||
// Stores promoted index information of index entries.
|
||||
// The i-th element corresponds to the i-th entry in _entries.
|
||||
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
|
||||
// that entry doesn't have a promoted index.
|
||||
// It's not chunked, because promoted index is present only when there are large partitions in the page,
|
||||
// which also means the page will have typically only 1 entry due to summary:data_file size ratio.
|
||||
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
|
||||
// which is typical in workloads with small partitions.
|
||||
managed_vector<promoted_index> _promoted_indexes;
|
||||
lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
|
||||
public:
|
||||
partition_index_page() = default;
|
||||
partition_index_page(partition_index_page&&) noexcept = default;
|
||||
@@ -252,68 +298,15 @@ public:
|
||||
bool empty() const { return _entries.empty(); }
|
||||
size_t size() const { return _entries.size(); }
|
||||
|
||||
stop_iteration clear_gently() {
|
||||
// Vectors have trivial storage, so are fast to destroy.
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
void clear_one_entry() {
|
||||
_entries.pop_back();
|
||||
}
|
||||
|
||||
bool has_promoted_index(size_t i) const {
|
||||
return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
const promoted_index& get_promoted_index(size_t i) const {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
promoted_index& get_promoted_index(size_t i) {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index size for the i-th entry.
|
||||
uint32_t get_promoted_index_size(size_t i) const {
|
||||
return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
|
||||
}
|
||||
|
||||
/// Get deletion_time for partition represented by the i-th entry.
|
||||
/// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
|
||||
/// It has to be read from the data file.
|
||||
std::optional<deletion_time> get_deletion_time(size_t i) const {
|
||||
if (has_promoted_index(i)) {
|
||||
return get_promoted_index(i).del_time;
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
key_view get_key(size_t i) const {
|
||||
auto start = _entries[i].key_offset;
|
||||
auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
|
||||
auto v = managed_bytes_view(_key_storage).prefix(end);
|
||||
v.remove_prefix(start);
|
||||
return key_view(v);
|
||||
}
|
||||
|
||||
decorated_key_view get_decorated_key(const schema& s, size_t i) const {
|
||||
auto key = get_key(i);
|
||||
auto t = _entries[i].token();
|
||||
if (!t) {
|
||||
t = dht::raw_token(s.get_partitioner().get_token(key));
|
||||
_entries[i].raw_token = t.value;
|
||||
}
|
||||
return decorated_key_view(dht::token(t), key);
|
||||
}
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
size_t size = _entries.external_memory_usage();
|
||||
size += _promoted_indexes.external_memory_usage();
|
||||
size += _key_storage.external_memory_usage();
|
||||
for (auto&& e : _entries) {
|
||||
size += sizeof(index_entry) + e->external_memory_usage();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -25,6 +25,14 @@ namespace sstables {
|
||||
extern seastar::logger sstlog;
|
||||
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;
|
||||
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
// Partition index entry information produced by the parser.
|
||||
struct parsed_partition_index_entry {
|
||||
temporary_buffer<char> key;
|
||||
@@ -45,10 +53,9 @@ class index_consumer {
|
||||
schema_ptr _s;
|
||||
logalloc::allocating_section _alloc_section;
|
||||
logalloc::region& _region;
|
||||
utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
|
||||
size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
|
||||
size_t _key_storage_size = 0;
|
||||
public:
|
||||
index_list indexes;
|
||||
|
||||
index_consumer(logalloc::region& r, schema_ptr s)
|
||||
: _s(s)
|
||||
, _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
|
||||
@@ -57,63 +64,36 @@ public:
|
||||
, _region(r)
|
||||
{ }
|
||||
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_key_storage_size += e.key.size();
|
||||
_parsed_entries.emplace_back(std::move(e));
|
||||
if (e.promoted_index) {
|
||||
_max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
|
||||
}
|
||||
~index_consumer() {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.clear_and_release();
|
||||
});
|
||||
}
|
||||
|
||||
future<index_list> finalize() {
|
||||
index_list result;
|
||||
// In case of exception, need to deallocate under region allocator.
|
||||
auto delete_result = seastar::defer([&] {
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
result._entries = {};
|
||||
result._promoted_indexes = {};
|
||||
result._key_storage = {};
|
||||
managed_ref<promoted_index> pi;
|
||||
if (e.promoted_index) {
|
||||
pi = make_managed<promoted_index>(*_s,
|
||||
e.promoted_index->del_time,
|
||||
e.promoted_index->promoted_index_start,
|
||||
e.promoted_index->promoted_index_size,
|
||||
e.promoted_index->num_blocks);
|
||||
}
|
||||
auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
|
||||
indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
|
||||
});
|
||||
});
|
||||
auto i = _parsed_entries.begin();
|
||||
size_t key_offset = 0;
|
||||
while (i != _parsed_entries.end()) {
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
result._entries.reserve(_parsed_entries.size());
|
||||
result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
|
||||
if (result._key_storage.empty()) {
|
||||
result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
|
||||
}
|
||||
managed_bytes_mutable_view key_out(result._key_storage);
|
||||
key_out.remove_prefix(key_offset);
|
||||
while (i != _parsed_entries.end()) {
|
||||
parsed_partition_index_entry& e = *i;
|
||||
if (e.promoted_index) {
|
||||
result._promoted_indexes[result._entries.size()] = *e.promoted_index;
|
||||
}
|
||||
write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
|
||||
result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
|
||||
++i;
|
||||
key_offset += e.key.size();
|
||||
if (need_preempt()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
delete_result.cancel();
|
||||
_parsed_entries.clear();
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
void prepare(uint64_t size) {
|
||||
_max_promoted_index_entry_plus_one = 0;
|
||||
_key_storage_size = 0;
|
||||
_parsed_entries.clear();
|
||||
_parsed_entries.reserve(size);
|
||||
_alloc_section = logalloc::allocating_section();
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.reserve(size);
|
||||
});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -218,14 +198,10 @@ public:
|
||||
|
||||
switch (_state) {
|
||||
// START comes first, to make the handling of the 0-quantity case simpler
|
||||
state_START:
|
||||
case state::START:
|
||||
sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
|
||||
_state = state::KEY_SIZE;
|
||||
if (data.size() == 0) {
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
break;
|
||||
case state::KEY_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
|
||||
_entry_offset = current_pos();
|
||||
@@ -251,16 +227,7 @@ public:
|
||||
case state::PROMOTED_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
|
||||
_position = this->_u64;
|
||||
if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
|
||||
data.trim_front(1);
|
||||
_consumer.consume_entry(parsed_partition_index_entry{
|
||||
.key = std::move(_key),
|
||||
.data_file_offset = _position,
|
||||
.index_offset = _entry_offset,
|
||||
.promoted_index = std::nullopt
|
||||
});
|
||||
goto state_START;
|
||||
} else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::PARTITION_HEADER_LENGTH_1;
|
||||
break;
|
||||
}
|
||||
@@ -372,6 +339,33 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
|
||||
return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
|
||||
}
|
||||
|
||||
inline
|
||||
std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
_promoted_index_start, _promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Less-comparator for lookups in the partition index.
|
||||
class index_comparator {
|
||||
dht::ring_position_comparator_for_sstables _tri_cmp;
|
||||
@@ -382,16 +376,26 @@ public:
|
||||
return _tri_cmp(e.get_decorated_key(), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
|
||||
return operator()(*e, rp);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
|
||||
return operator()(rp, *e);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(), rp) > 0;
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
|
||||
dht::ring_position_comparator_for_sstables tri_cmp(s);
|
||||
return tri_cmp(page.get_decorated_key(s, idx), rp);
|
||||
}
|
||||
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Contains information about index_reader position in the index file
|
||||
struct index_bound {
|
||||
@@ -533,7 +537,7 @@ private:
|
||||
if (ex) {
|
||||
return make_exception_future<index_list>(std::move(ex));
|
||||
}
|
||||
return bound.consumer->finalize();
|
||||
return make_ready_future<index_list>(std::move(bound.consumer->indexes));
|
||||
});
|
||||
});
|
||||
};
|
||||
@@ -546,18 +550,17 @@ private:
|
||||
if (bound.current_list->empty()) {
|
||||
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
|
||||
}
|
||||
bound.data_file_position = bound.current_list->_entries[0].position();
|
||||
bound.data_file_position = bound.current_list->_entries[0]->position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
|
||||
if (sstlog.is_enabled(seastar::log_level::trace)) {
|
||||
sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
|
||||
logalloc::reclaim_lock rl(_region);
|
||||
for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
|
||||
auto& e = bound.current_list->_entries[i];
|
||||
for (auto&& e : bound.current_list->_entries) {
|
||||
auto dk = dht::decorate_key(*_sstable->_schema,
|
||||
bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e.position());
|
||||
e->get_key().to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e->position());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -601,13 +604,7 @@ private:
|
||||
// Valid if partition_data_ready(bound)
|
||||
index_entry& current_partition_entry(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return bound.current_list->_entries[bound.current_index_idx];
|
||||
}
|
||||
|
||||
// Valid if partition_data_ready(bound)
|
||||
partition_index_page& current_page(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list;
|
||||
return *bound.current_list->_entries[bound.current_index_idx];
|
||||
}
|
||||
|
||||
future<> advance_to_next_partition(index_bound& bound) {
|
||||
@@ -620,7 +617,7 @@ private:
|
||||
if (bound.current_index_idx + 1 < bound.current_list->size()) {
|
||||
++bound.current_index_idx;
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
return reset_clustered_cursor(bound);
|
||||
@@ -683,13 +680,9 @@ private:
|
||||
return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
|
||||
sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
|
||||
auto i = _alloc_section(_region, [&] {
|
||||
auto& page = *bound.current_list;
|
||||
auto& s = *_sstable->_schema;
|
||||
auto r = std::views::iota(bound.current_index_idx, page._entries.size());
|
||||
auto it = std::ranges::partition_point(r, [&] (int idx) {
|
||||
return index_entry_tri_cmp(s, page, idx, pos) < 0;
|
||||
});
|
||||
return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
|
||||
auto& entries = bound.current_list->_entries;
|
||||
return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
|
||||
index_comparator(*_sstable->_schema));
|
||||
});
|
||||
// i is valid until next allocation point
|
||||
auto& entries = bound.current_list->_entries;
|
||||
@@ -704,7 +697,7 @@ private:
|
||||
}
|
||||
bound.current_index_idx = std::distance(std::begin(entries), i);
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = (*i).position();
|
||||
bound.data_file_position = (*i)->position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
|
||||
@@ -807,34 +800,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
|
||||
shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
pi.promoted_index_start, pi.promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Ensures that partition_data_ready() returns true.
|
||||
// Can be called only when !eof()
|
||||
future<> read_partition_data() override {
|
||||
@@ -870,10 +835,10 @@ public:
|
||||
clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
|
||||
if (!bound.clustered_cursor) {
|
||||
_alloc_section(_region, [&] {
|
||||
partition_index_page& page = current_page(bound);
|
||||
if (page.has_promoted_index(bound.current_index_idx)) {
|
||||
promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
|
||||
bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
|
||||
index_entry& e = current_partition_entry(bound);
|
||||
promoted_index* pi = e.get_promoted_index().get();
|
||||
if (pi) {
|
||||
bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
|
||||
get_file_input_stream_options(), _use_caching);
|
||||
}
|
||||
});
|
||||
@@ -896,15 +861,15 @@ public:
|
||||
// It may be unavailable for old sstables for which this information was not generated.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<sstables::deletion_time> partition_tombstone() override {
|
||||
return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
|
||||
return current_partition_entry(_lower_bound).get_deletion_time();
|
||||
}
|
||||
|
||||
// Returns the key for current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<partition_key> get_partition_key() override {
|
||||
return _alloc_section(_region, [this] {
|
||||
return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
|
||||
.to_partition_key(*_sstable->_schema);
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_key().to_partition_key(*_sstable->_schema);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -918,8 +883,8 @@ public:
|
||||
// Returns the number of promoted index entries for the current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
uint64_t get_promoted_index_size() {
|
||||
partition_index_page& page = current_page(_lower_bound);
|
||||
return page.get_promoted_index_size(_lower_bound.current_index_idx);
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_promoted_index_size();
|
||||
}
|
||||
|
||||
bool partition_data_ready() const override {
|
||||
@@ -1010,9 +975,9 @@ public:
|
||||
return make_ready_future<bool>(false);
|
||||
}
|
||||
return read_partition_data().then([this, key] {
|
||||
index_comparator cmp(*_sstable->_schema);
|
||||
bool found = _alloc_section(_region, [&] {
|
||||
auto& page = current_page(_lower_bound);
|
||||
return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
|
||||
return cmp(key, current_partition_entry(_lower_bound)) == 0;
|
||||
});
|
||||
return make_ready_future<bool>(found);
|
||||
});
|
||||
|
||||
@@ -189,11 +189,10 @@ public:
|
||||
{}
|
||||
future<std::optional<directory_entry>> get() override {
|
||||
std::filesystem::path dir(_prefix);
|
||||
while (true) {
|
||||
do {
|
||||
if (_pos == _info.size()) {
|
||||
_info.clear();
|
||||
_info = co_await _client->list_objects(_bucket, _prefix, _paging);
|
||||
_pos = 0;
|
||||
}
|
||||
if (_info.empty()) {
|
||||
break;
|
||||
@@ -204,7 +203,7 @@ public:
|
||||
continue;
|
||||
}
|
||||
co_return ent;
|
||||
}
|
||||
} while (false);
|
||||
|
||||
co_return std::nullopt;
|
||||
}
|
||||
@@ -277,7 +276,7 @@ public:
|
||||
co_await f.close();
|
||||
|
||||
auto names = ranges | std::views::transform([](auto& p) { return p.name; }) | std::ranges::to<std::vector<std::string>>();
|
||||
co_await _client->merge_objects(bucket, object, names, {}, as);
|
||||
co_await _client->merge_objects(bucket, object, std::move(names), {}, as);
|
||||
|
||||
co_await parallel_for_each(names, [this, bucket](auto& name) -> future<> {
|
||||
co_await _client->delete_object(bucket, name);
|
||||
|
||||
@@ -257,11 +257,14 @@ public:
|
||||
while (partial_page || i != _cache.end()) {
|
||||
if (partial_page) {
|
||||
auto preempted = with_allocator(_region.allocator(), [&] {
|
||||
while (partial_page->clear_gently() != stop_iteration::yes) {
|
||||
return true;
|
||||
while (!partial_page->empty()) {
|
||||
partial_page->clear_one_entry();
|
||||
if (need_preempt()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
partial_page.reset();
|
||||
return need_preempt();
|
||||
return false;
|
||||
});
|
||||
if (preempted) {
|
||||
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;
|
||||
|
||||
@@ -1132,6 +1132,7 @@ public:
|
||||
|
||||
friend class mc::writer;
|
||||
friend class index_reader;
|
||||
friend class promoted_index;
|
||||
friend class sstables_manager;
|
||||
template <typename DataConsumeRowsContext>
|
||||
friend future<std::unique_ptr<DataConsumeRowsContext>>
|
||||
|
||||
@@ -180,11 +180,18 @@ storage_manager::config_updater::config_updater(const db::config& cfg, storage_m
|
||||
{}
|
||||
|
||||
sstables::sstable::version_types sstables_manager::get_highest_supported_format() const noexcept {
|
||||
if (_features.ms_sstable) {
|
||||
return sstable_version_types::ms;
|
||||
} else {
|
||||
return sstable_version_types::me;
|
||||
}
|
||||
// FIXME: start announcing `ms` here after it becomes the default.
|
||||
// (There are several tests which expect that new sstables are written with
|
||||
// the format reported by this API).
|
||||
//
|
||||
// After `ms` becomes the default, this function look like this:
|
||||
//
|
||||
// if (_features.ms_sstable) {
|
||||
// return sstable_version_types::ms;
|
||||
// } else {
|
||||
// return sstable_version_types::me;
|
||||
// }
|
||||
return sstable_version_types::me;
|
||||
}
|
||||
|
||||
sstables::sstable::version_types sstables_manager::get_preferred_sstable_version() const {
|
||||
|
||||
@@ -221,16 +221,10 @@ private:
|
||||
sst->set_sstable_level(0);
|
||||
auto units = co_await sst_manager.dir_semaphore().get_units(1);
|
||||
sstables::sstable_open_config cfg {
|
||||
.unsealed_sstable = true,
|
||||
.ignore_component_digest_mismatch = db.get_config().ignore_component_digest_mismatch(),
|
||||
};
|
||||
co_await sst->load(table.get_effective_replication_map()->get_sharder(*table.schema()), cfg);
|
||||
co_await table.add_new_sstable_and_update_cache(sst, [&sst_manager, sst] (sstables::shared_sstable loading_sst) -> future<> {
|
||||
if (loading_sst == sst) {
|
||||
auto writer_cfg = sst_manager.configure_writer(loading_sst->get_origin());
|
||||
co_await loading_sst->seal_sstable(writer_cfg.backup);
|
||||
}
|
||||
});
|
||||
co_await table.add_sstable_and_update_cache(sst);
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -301,8 +295,7 @@ private:
|
||||
sstables::sstable_state::normal,
|
||||
sstables::sstable::component_basename(
|
||||
_table.schema()->ks_name(), _table.schema()->cf_name(), descriptor.version, gen, descriptor.format, it->first),
|
||||
sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend(),
|
||||
.leave_unsealed = true});
|
||||
sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend()});
|
||||
auto out = co_await sstable_sink->output(foptions, stream_options);
|
||||
|
||||
input_stream src(co_await [this, &it, sstable, f = files.at(it->first)]() -> future<input_stream<char>> {
|
||||
|
||||
@@ -65,8 +65,9 @@ struct send_info {
|
||||
mutation_fragment_v1_stream reader;
|
||||
noncopyable_function<void(size_t)> update;
|
||||
send_info(netw::messaging_service& ms_, streaming::plan_id plan_id_, lw_shared_ptr<replica::table> tbl_, reader_permit permit_,
|
||||
dht::token_range_vector ranges_, locator::host_id id_, uint32_t dst_cpu_id_, stream_reason reason_, service::frozen_topology_guard topo_guard_,
|
||||
noncopyable_function<void(size_t)> update_fn)
|
||||
dht::token_range_vector ranges_, locator::host_id id_,
|
||||
uint32_t dst_cpu_id_, stream_reason reason_, service::frozen_topology_guard topo_guard_,
|
||||
noncopyable_function<void(size_t)> update_fn)
|
||||
: ms(ms_)
|
||||
, plan_id(plan_id_)
|
||||
, cf_id(tbl_->schema()->id())
|
||||
@@ -78,13 +79,12 @@ struct send_info {
|
||||
, ranges(std::move(ranges_))
|
||||
, prs(dht::to_partition_ranges(ranges))
|
||||
, reader(cf->make_streaming_reader(cf->schema(), std::move(permit_), prs, gc_clock::now()))
|
||||
, update(std::move(update_fn)) {
|
||||
, update(std::move(update_fn))
|
||||
{
|
||||
}
|
||||
future<bool> has_relevant_range_on_this_shard() {
|
||||
return do_with(false, ranges.begin(), [this](bool& found_relevant_range, dht::token_range_vector::iterator& ranges_it) {
|
||||
auto stop_cond = [this, &found_relevant_range, &ranges_it] {
|
||||
return ranges_it == ranges.end() || found_relevant_range;
|
||||
};
|
||||
return do_with(false, ranges.begin(), [this] (bool& found_relevant_range, dht::token_range_vector::iterator& ranges_it) {
|
||||
auto stop_cond = [this, &found_relevant_range, &ranges_it] { return ranges_it == ranges.end() || found_relevant_range; };
|
||||
return do_until(std::move(stop_cond), [this, &found_relevant_range, &ranges_it] {
|
||||
dht::token_range range = *ranges_it++;
|
||||
if (!found_relevant_range) {
|
||||
@@ -113,112 +113,93 @@ struct send_info {
|
||||
};
|
||||
|
||||
future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
|
||||
return si->reader.has_more_fragments().then([si](bool there_is_more) {
|
||||
if (!there_is_more) {
|
||||
// The reader contains no data
|
||||
sslog.info("[Stream #{}] Skip sending ks={}, cf={}, reader contains no data, with new rpc streaming", si->plan_id, si->cf->schema()->ks_name(),
|
||||
si->cf->schema()->cf_name());
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return si->estimate_partitions().then([si](size_t estimated_partitions) {
|
||||
sslog.info("[Stream #{}] Start sending ks={}, cf={}, estimated_partitions={}, with new rpc streaming", si->plan_id, si->cf->schema()->ks_name(),
|
||||
si->cf->schema()->cf_name(), estimated_partitions);
|
||||
return si->ms
|
||||
.make_sink_and_source_for_stream_mutation_fragments(
|
||||
si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->topo_guard, si->id)
|
||||
.then_unpack([si](rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd> sink, rpc::source<int32_t> source) mutable {
|
||||
auto got_error_from_peer = make_lw_shared<bool>(false);
|
||||
auto table_is_dropped = make_lw_shared<bool>(false);
|
||||
return si->reader.has_more_fragments().then([si] (bool there_is_more) {
|
||||
if (!there_is_more) {
|
||||
// The reader contains no data
|
||||
sslog.info("[Stream #{}] Skip sending ks={}, cf={}, reader contains no data, with new rpc streaming",
|
||||
si->plan_id, si->cf->schema()->ks_name(), si->cf->schema()->cf_name());
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return si->estimate_partitions().then([si] (size_t estimated_partitions) {
|
||||
sslog.info("[Stream #{}] Start sending ks={}, cf={}, estimated_partitions={}, with new rpc streaming", si->plan_id, si->cf->schema()->ks_name(), si->cf->schema()->cf_name(), estimated_partitions);
|
||||
return si->ms.make_sink_and_source_for_stream_mutation_fragments(si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->topo_guard, si->id).then_unpack([si] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd> sink, rpc::source<int32_t> source) mutable {
|
||||
auto got_error_from_peer = make_lw_shared<bool>(false);
|
||||
auto table_is_dropped = make_lw_shared<bool>(false);
|
||||
|
||||
auto source_op = [source, got_error_from_peer, table_is_dropped, si]() mutable -> future<> {
|
||||
return repeat([source, got_error_from_peer, table_is_dropped, si]() mutable {
|
||||
return source().then([source, got_error_from_peer, table_is_dropped, si](
|
||||
std::optional<std::tuple<int32_t>> status_opt) mutable {
|
||||
if (status_opt) {
|
||||
auto status = std::get<0>(*status_opt);
|
||||
if (status == -1) {
|
||||
*got_error_from_peer = true;
|
||||
} else if (status == -2) {
|
||||
*got_error_from_peer = true;
|
||||
*table_is_dropped = true;
|
||||
}
|
||||
sslog.debug("Got status code from peer={}, plan_id={}, cf_id={}, status={}", si->id, si->plan_id, si->cf_id, status);
|
||||
// we've got an error from the other side, but we cannot just abandon rpc::source we
|
||||
// need to continue reading until EOS since this will signal that no more work
|
||||
// is left and rpc::source can be destroyed. The sender closes connection immediately
|
||||
// after sending the status, so EOS should arrive shortly.
|
||||
return stop_iteration::no;
|
||||
} else {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
});
|
||||
});
|
||||
}();
|
||||
auto source_op = [source, got_error_from_peer, table_is_dropped, si] () mutable -> future<> {
|
||||
return repeat([source, got_error_from_peer, table_is_dropped, si] () mutable {
|
||||
return source().then([source, got_error_from_peer, table_is_dropped, si] (std::optional<std::tuple<int32_t>> status_opt) mutable {
|
||||
if (status_opt) {
|
||||
auto status = std::get<0>(*status_opt);
|
||||
if (status == -1) {
|
||||
*got_error_from_peer = true;
|
||||
} else if (status == -2) {
|
||||
*got_error_from_peer = true;
|
||||
*table_is_dropped = true;
|
||||
}
|
||||
sslog.debug("Got status code from peer={}, plan_id={}, cf_id={}, status={}", si->id, si->plan_id, si->cf_id, status);
|
||||
// we've got an error from the other side, but we cannot just abandon rpc::source we
|
||||
// need to continue reading until EOS since this will signal that no more work
|
||||
// is left and rpc::source can be destroyed. The sender closes connection immediately
|
||||
// after sending the status, so EOS should arrive shortly.
|
||||
return stop_iteration::no;
|
||||
} else {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
});
|
||||
});
|
||||
}();
|
||||
|
||||
auto sink_op = [sink, si, got_error_from_peer]() mutable -> future<> {
|
||||
mutation_fragment_stream_validator validator(*(si->reader.schema()));
|
||||
return do_with(std::move(sink), std::move(validator),
|
||||
[si, got_error_from_peer](rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd>& sink,
|
||||
mutation_fragment_stream_validator& validator) {
|
||||
return repeat([&sink, &validator, si, got_error_from_peer]() mutable {
|
||||
return si->reader().then(
|
||||
[&sink, &validator, si, s = si->reader.schema(), got_error_from_peer](mutation_fragment_opt mf) mutable {
|
||||
if (*got_error_from_peer) {
|
||||
return make_exception_future<stop_iteration>(std::runtime_error("Got status error code from peer"));
|
||||
}
|
||||
if (mf) {
|
||||
if (!validator(mf->mutation_fragment_kind())) {
|
||||
return make_exception_future<stop_iteration>(std::runtime_error(
|
||||
format("Stream reader mutation_fragment validator failed, previous={}, current={}",
|
||||
validator.previous_mutation_fragment_kind(), mf->mutation_fragment_kind())));
|
||||
}
|
||||
frozen_mutation_fragment fmf = freeze(*s, *mf);
|
||||
auto size = fmf.representation().size();
|
||||
si->update(size);
|
||||
return sink(fmf, stream_mutation_fragments_cmd::mutation_fragment_data).then([] {
|
||||
return stop_iteration::no;
|
||||
});
|
||||
} else {
|
||||
if (!validator.on_end_of_stream()) {
|
||||
return make_exception_future<stop_iteration>(
|
||||
std::runtime_error(format("Stream reader mutation_fragment validator failed on "
|
||||
"end_of_stream, previous={}, current=end_of_stream",
|
||||
validator.previous_mutation_fragment_kind())));
|
||||
}
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
});
|
||||
})
|
||||
.then([&sink]() mutable {
|
||||
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::end_of_stream);
|
||||
})
|
||||
.handle_exception([&sink](std::exception_ptr ep) mutable {
|
||||
// Notify the receiver the sender has failed
|
||||
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::error)
|
||||
.then([ep = std::move(ep)]() mutable {
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
})
|
||||
.finally([&sink]() mutable {
|
||||
return sink.close();
|
||||
});
|
||||
});
|
||||
}();
|
||||
|
||||
return when_all_succeed(std::move(source_op), std::move(sink_op)).then_unpack([got_error_from_peer, table_is_dropped, si] {
|
||||
if (*got_error_from_peer) {
|
||||
if (*table_is_dropped) {
|
||||
sslog.info("[Stream #{}] Skipped streaming the dropped table {}.{}", si->plan_id, si->cf->schema()->ks_name(),
|
||||
si->cf->schema()->cf_name());
|
||||
} else {
|
||||
throw std::runtime_error(
|
||||
format("Peer failed to process mutation_fragment peer={}, plan_id={}, cf_id={}", si->id, si->plan_id, si->cf_id));
|
||||
}
|
||||
auto sink_op = [sink, si, got_error_from_peer] () mutable -> future<> {
|
||||
mutation_fragment_stream_validator validator(*(si->reader.schema()));
|
||||
return do_with(std::move(sink), std::move(validator), [si, got_error_from_peer] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd>& sink, mutation_fragment_stream_validator& validator) {
|
||||
return repeat([&sink, &validator, si, got_error_from_peer] () mutable {
|
||||
return si->reader().then([&sink, &validator, si, s = si->reader.schema(), got_error_from_peer] (mutation_fragment_opt mf) mutable {
|
||||
if (*got_error_from_peer) {
|
||||
return make_exception_future<stop_iteration>(std::runtime_error("Got status error code from peer"));
|
||||
}
|
||||
if (mf) {
|
||||
if (!validator(mf->mutation_fragment_kind())) {
|
||||
return make_exception_future<stop_iteration>(std::runtime_error(format("Stream reader mutation_fragment validator failed, previous={}, current={}",
|
||||
validator.previous_mutation_fragment_kind(), mf->mutation_fragment_kind())));
|
||||
}
|
||||
});
|
||||
frozen_mutation_fragment fmf = freeze(*s, *mf);
|
||||
auto size = fmf.representation().size();
|
||||
si->update(size);
|
||||
return sink(fmf, stream_mutation_fragments_cmd::mutation_fragment_data).then([] { return stop_iteration::no; });
|
||||
} else {
|
||||
if (!validator.on_end_of_stream()) {
|
||||
return make_exception_future<stop_iteration>(std::runtime_error(format("Stream reader mutation_fragment validator failed on end_of_stream, previous={}, current=end_of_stream",
|
||||
validator.previous_mutation_fragment_kind())));
|
||||
}
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
});
|
||||
}).then([&sink] () mutable {
|
||||
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::end_of_stream);
|
||||
}).handle_exception([&sink] (std::exception_ptr ep) mutable {
|
||||
// Notify the receiver the sender has failed
|
||||
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::error).then([ep = std::move(ep)] () mutable {
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
}).finally([&sink] () mutable {
|
||||
return sink.close();
|
||||
});
|
||||
});
|
||||
}();
|
||||
|
||||
return when_all_succeed(std::move(source_op), std::move(sink_op)).then_unpack([got_error_from_peer, table_is_dropped, si] {
|
||||
if (*got_error_from_peer) {
|
||||
if (*table_is_dropped) {
|
||||
sslog.info("[Stream #{}] Skipped streaming the dropped table {}.{}", si->plan_id, si->cf->schema()->ks_name(), si->cf->schema()->cf_name());
|
||||
} else {
|
||||
throw std::runtime_error(format("Peer failed to process mutation_fragment peer={}, plan_id={}, cf_id={}", si->id, si->plan_id, si->cf_id));
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> stream_transfer_task::execute() {
|
||||
@@ -226,55 +207,46 @@ future<> stream_transfer_task::execute() {
|
||||
auto cf_id = this->cf_id;
|
||||
auto id = session->peer;
|
||||
auto& sm = session->manager();
|
||||
auto table_dropped = co_await streaming::with_table_drop_silenced(sm.db(), sm.mm(), cf_id, [this, &sm, cf_id, plan_id, id](const table_id&) {
|
||||
auto table_dropped = co_await streaming::with_table_drop_silenced(sm.db(), sm.mm(), cf_id, [this, &sm, cf_id, plan_id, id] (const table_id &) {
|
||||
auto dst_cpu_id = session->dst_cpu_id;
|
||||
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}", plan_id, cf_id);
|
||||
sort_and_merge_ranges();
|
||||
auto reason = session->get_reason();
|
||||
auto topo_guard = session->topo_guard();
|
||||
return sm.container()
|
||||
.invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges = this->_ranges, reason, topo_guard](stream_manager& sm) mutable {
|
||||
auto tbl = sm.db().find_column_family(cf_id).shared_from_this();
|
||||
return sm.db()
|
||||
.obtain_reader_permit(*tbl, "stream-transfer-task", db::no_timeout, {})
|
||||
.then([&sm, tbl, plan_id, cf_id, id, dst_cpu_id, ranges = std::move(ranges), reason, topo_guard](reader_permit permit) mutable {
|
||||
auto si = make_lw_shared<send_info>(sm.ms(), plan_id, tbl, std::move(permit), std::move(ranges), id, dst_cpu_id, reason,
|
||||
topo_guard, [&sm, plan_id, id](size_t sz) {
|
||||
sm.update_progress(plan_id, id, streaming::progress_info::direction::OUT, sz);
|
||||
});
|
||||
return si->has_relevant_range_on_this_shard()
|
||||
.then([si, plan_id, cf_id](bool has_relevant_range_on_this_shard) {
|
||||
if (!has_relevant_range_on_this_shard) {
|
||||
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}", plan_id, cf_id,
|
||||
this_shard_id());
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return send_mutation_fragments(std::move(si));
|
||||
})
|
||||
.finally([si] {
|
||||
return si->reader.close();
|
||||
});
|
||||
});
|
||||
})
|
||||
.then([this, plan_id, cf_id, id, &sm] {
|
||||
sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
|
||||
return ser::streaming_rpc_verbs::send_stream_mutation_done(&sm.ms(), id, plan_id, _ranges, cf_id, session->dst_cpu_id)
|
||||
.handle_exception([plan_id, id](auto ep) {
|
||||
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION_DONE to {}: {}", plan_id, id, ep);
|
||||
std::rethrow_exception(ep);
|
||||
});
|
||||
})
|
||||
.then([this, id, plan_id] {
|
||||
_mutation_done_sent = true;
|
||||
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id);
|
||||
})
|
||||
.handle_exception([plan_id, id, &sm](std::exception_ptr ep) {
|
||||
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
|
||||
utils::get_local_injector().inject("stream_mutation_fragments_table_dropped", [&sm]() {
|
||||
sm.db().find_column_family(table_id::create_null_id());
|
||||
});
|
||||
std::rethrow_exception(ep);
|
||||
return sm.container().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, reason, topo_guard] (stream_manager& sm) mutable {
|
||||
auto tbl = sm.db().find_column_family(cf_id).shared_from_this();
|
||||
return sm.db().obtain_reader_permit(*tbl, "stream-transfer-task", db::no_timeout, {}).then([&sm, tbl, plan_id, cf_id, id, dst_cpu_id, ranges=std::move(ranges), reason, topo_guard] (reader_permit permit) mutable {
|
||||
auto si = make_lw_shared<send_info>(sm.ms(), plan_id, tbl, std::move(permit), std::move(ranges), id, dst_cpu_id, reason, topo_guard, [&sm, plan_id, id] (size_t sz) {
|
||||
sm.update_progress(plan_id, id, streaming::progress_info::direction::OUT, sz);
|
||||
});
|
||||
return si->has_relevant_range_on_this_shard().then([si, plan_id, cf_id] (bool has_relevant_range_on_this_shard) {
|
||||
if (!has_relevant_range_on_this_shard) {
|
||||
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}",
|
||||
plan_id, cf_id, this_shard_id());
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return send_mutation_fragments(std::move(si));
|
||||
}).finally([si] {
|
||||
return si->reader.close();
|
||||
});
|
||||
});
|
||||
}).then([this, plan_id, cf_id, id, &sm] {
|
||||
sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
|
||||
return ser::streaming_rpc_verbs::send_stream_mutation_done(&sm.ms(), id, plan_id, _ranges,
|
||||
cf_id, session->dst_cpu_id).handle_exception([plan_id, id] (auto ep) {
|
||||
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION_DONE to {}: {}", plan_id, id, ep);
|
||||
std::rethrow_exception(ep);
|
||||
});
|
||||
}).then([this, id, plan_id] {
|
||||
_mutation_done_sent = true;
|
||||
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id);
|
||||
}).handle_exception([plan_id, id, &sm] (std::exception_ptr ep) {
|
||||
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
|
||||
utils::get_local_injector().inject("stream_mutation_fragments_table_dropped", [&sm] () {
|
||||
sm.db().find_column_family(table_id::create_null_id());
|
||||
});
|
||||
std::rethrow_exception(ep);
|
||||
});
|
||||
});
|
||||
// If the table is dropped during streaming, we can ignore the
|
||||
// errors and make the stream successful. This allows user to
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user