Compare commits
144 Commits
next
...
scylla-5.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0354e13718 | ||
|
|
2750d2e94b | ||
|
|
b4383a389b | ||
|
|
f667c5923a | ||
|
|
e4ba0c56df | ||
|
|
329d55cc4f | ||
|
|
b956293f47 | ||
|
|
6a8c2d3f56 | ||
|
|
27a35c7f98 | ||
|
|
d83134a245 | ||
|
|
b844d14829 | ||
|
|
184df0393e | ||
|
|
1b550dd301 | ||
|
|
01ce53d7fb | ||
|
|
e9c7f89b32 | ||
|
|
93f468c12c | ||
|
|
e54ae9efd9 | ||
|
|
ef40e59c0e | ||
|
|
8c56b0b268 | ||
|
|
fc78d88783 | ||
|
|
31a20c4c54 | ||
|
|
7e42bcfd61 | ||
|
|
2107ffe2d2 | ||
|
|
5a97a1060e | ||
|
|
2b0487c900 | ||
|
|
d3b3c53d9f | ||
|
|
50c2c1b1d4 | ||
|
|
aa647a637a | ||
|
|
2c0040fcb3 | ||
|
|
54564adb7c | ||
|
|
839876e8f2 | ||
|
|
36002e2b7c | ||
|
|
91a8f9e09b | ||
|
|
bc29f350dd | ||
|
|
4fe571f470 | ||
|
|
ebf38eaead | ||
|
|
1c82766f33 | ||
|
|
e1f78c33b4 | ||
|
|
0634b5f734 | ||
|
|
6f020b26e1 | ||
|
|
7f8dcc5657 | ||
|
|
20451760fe | ||
|
|
51b031d04e | ||
|
|
82d1446ca9 | ||
|
|
e0acb0766d | ||
|
|
4f26d489a0 | ||
|
|
43cbc5c836 | ||
|
|
f0c521efdf | ||
|
|
b9a61c8e9a | ||
|
|
32aa1e5287 | ||
|
|
da6a126d79 | ||
|
|
d07e902983 | ||
|
|
3c0fc42f84 | ||
|
|
964ccf9192 | ||
|
|
dfdc128faf | ||
|
|
299122e78d | ||
|
|
23a34d7e42 | ||
|
|
67a2f3aa67 | ||
|
|
66e8cf8cea | ||
|
|
35b66c844c | ||
|
|
9e7a1340b9 | ||
|
|
d5a0750ef3 | ||
|
|
618c483c73 | ||
|
|
f10fd1bc12 | ||
|
|
1891f10141 | ||
|
|
b177dacd36 | ||
|
|
283a722923 | ||
|
|
522d0a81e7 | ||
|
|
cd13911db4 | ||
|
|
32423ebc38 | ||
|
|
97054ee691 | ||
|
|
34085c364f | ||
|
|
323521f4c8 | ||
|
|
1ad59d6a7b | ||
|
|
d3045df9c9 | ||
|
|
be48b7aa8b | ||
|
|
3c4688bcfa | ||
|
|
cc22021876 | ||
|
|
c9e79cb4a3 | ||
|
|
f28542a71e | ||
|
|
527a75a4c0 | ||
|
|
df00f8fcfb | ||
|
|
41a00c744f | ||
|
|
2d7b6cd702 | ||
|
|
ff79228178 | ||
|
|
1803124cc6 | ||
|
|
6fcbf66bfb | ||
|
|
e9a3dee234 | ||
|
|
279cd44c7f | ||
|
|
c99f768381 | ||
|
|
89a540d54a | ||
|
|
338edcc02e | ||
|
|
a8eb5164b2 | ||
|
|
9accb44f9c | ||
|
|
8878007106 | ||
|
|
9da666e778 | ||
|
|
aca355dec1 | ||
|
|
efbb2efd3f | ||
|
|
44dc5c4a1d | ||
|
|
6b34ba3a4f | ||
|
|
f1e25cb4a6 | ||
|
|
c9798746ae | ||
|
|
7f70ffc5ce | ||
|
|
551636ec89 | ||
|
|
e1130a01e7 | ||
|
|
b0233cb7c5 | ||
|
|
e480c5bf4d | ||
|
|
7d90f7e93f | ||
|
|
3e6e8579c6 | ||
|
|
3e98e17d18 | ||
|
|
a214f8cf6e | ||
|
|
e8b92fe34d | ||
|
|
fa479c84ac | ||
|
|
40c26dd2c5 | ||
|
|
2c6f069fd1 | ||
|
|
e27dff0c50 | ||
|
|
3f03260ffb | ||
|
|
1315135fca | ||
|
|
f92622e0de | ||
|
|
3bca608db5 | ||
|
|
a93b72d5dd | ||
|
|
d58ca2edbd | ||
|
|
75740ace2a | ||
|
|
d7a1bf6331 | ||
|
|
bbd7d657cc | ||
|
|
f5bf4c81d1 | ||
|
|
02e8336659 | ||
|
|
601812e11b | ||
|
|
ea466320d2 | ||
|
|
25ea831a15 | ||
|
|
8648c79c9e | ||
|
|
7ae4d0e6f8 | ||
|
|
f3564db941 | ||
|
|
97caf12836 | ||
|
|
839d9ef41a | ||
|
|
782bd50f92 | ||
|
|
0a4d971b4a | ||
|
|
22562f767f | ||
|
|
eb80dd1db5 | ||
|
|
51d699ee21 | ||
|
|
83a33bff8c | ||
|
|
273563b9ad | ||
|
|
891990ec09 | ||
|
|
da0cd2b107 |
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
|
||||
@@ -60,7 +60,7 @@ fi
|
||||
|
||||
# Default scylla product/version tags
|
||||
PRODUCT=scylla
|
||||
VERSION=5.0.dev
|
||||
VERSION=5.0.7
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -78,6 +78,11 @@ future<> controller::start_server() {
|
||||
|
||||
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
|
||||
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper)).get();
|
||||
// Note: from this point on, if start_server() throws for any reason,
|
||||
// it must first call stop_server() to stop the executor and server
|
||||
// services we just started - or Scylla will cause an assertion
|
||||
// failure when the controller object is destroyed in the exception
|
||||
// unwinding.
|
||||
std::optional<uint16_t> alternator_port;
|
||||
if (_config.alternator_port()) {
|
||||
alternator_port = _config.alternator_port();
|
||||
@@ -104,7 +109,13 @@ future<> controller::start_server() {
|
||||
}
|
||||
opts.erase("require_client_auth");
|
||||
opts.erase("truststore");
|
||||
utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
|
||||
try {
|
||||
utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
|
||||
} catch(...) {
|
||||
logger.error("Failed to set up Alternator TLS credentials: {}", std::current_exception());
|
||||
stop_server().get();
|
||||
std::throw_with_nested(std::runtime_error("Failed to set up Alternator TLS credentials"));
|
||||
}
|
||||
}
|
||||
bool alternator_enforce_authorization = _config.alternator_enforce_authorization();
|
||||
_server.invoke_on_all(
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "expressions.hh"
|
||||
#include "conditions.hh"
|
||||
#include "cql3/constants.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include <optional>
|
||||
#include "utils/overloaded_functor.hh"
|
||||
#include "seastar/json/json_elements.hh"
|
||||
@@ -46,6 +47,7 @@
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <boost/range/adaptors.hpp>
|
||||
#include <boost/range/algorithm/find_end.hpp>
|
||||
#include <unordered_set>
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "schema_registry.hh"
|
||||
@@ -148,16 +150,16 @@ static void validate_table_name(const std::string& name) {
|
||||
// instead of each component individually as DynamoDB does.
|
||||
// The view_name() function assumes the table_name has already been validated
|
||||
// but validates the legality of index_name and the combination of both.
|
||||
static std::string view_name(const std::string& table_name, const std::string& index_name, const std::string& delim = ":") {
|
||||
static std::string view_name(const std::string& table_name, std::string_view index_name, const std::string& delim = ":") {
|
||||
static const std::regex valid_index_name_chars ("[a-zA-Z0-9_.-]*");
|
||||
if (index_name.length() < 3) {
|
||||
throw api_error::validation("IndexName must be at least 3 characters long");
|
||||
}
|
||||
if (!std::regex_match(index_name.c_str(), valid_index_name_chars)) {
|
||||
if (!std::regex_match(index_name.data(), valid_index_name_chars)) {
|
||||
throw api_error::validation(
|
||||
format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
|
||||
}
|
||||
std::string ret = table_name + delim + index_name;
|
||||
std::string ret = table_name + delim + std::string(index_name);
|
||||
if (ret.length() > max_table_name_length) {
|
||||
throw api_error::validation(
|
||||
format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
|
||||
@@ -166,7 +168,7 @@ static std::string view_name(const std::string& table_name, const std::string& i
|
||||
return ret;
|
||||
}
|
||||
|
||||
static std::string lsi_name(const std::string& table_name, const std::string& index_name) {
|
||||
static std::string lsi_name(const std::string& table_name, std::string_view index_name) {
|
||||
return view_name(table_name, index_name, "!:");
|
||||
}
|
||||
|
||||
@@ -273,16 +275,16 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
|
||||
if (index_name) {
|
||||
if (index_name->IsString()) {
|
||||
orig_table_name = std::move(table_name);
|
||||
table_name = view_name(orig_table_name, index_name->GetString());
|
||||
table_name = view_name(orig_table_name, rjson::to_string_view(*index_name));
|
||||
type = table_or_view_type::gsi;
|
||||
} else {
|
||||
throw api_error::validation(
|
||||
format("Non-string IndexName '{}'", index_name->GetString()));
|
||||
format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
|
||||
}
|
||||
// If no tables for global indexes were found, the index may be local
|
||||
if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
|
||||
type = table_or_view_type::lsi;
|
||||
table_name = lsi_name(orig_table_name, index_name->GetString());
|
||||
table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -432,6 +434,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
|
||||
rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
|
||||
rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
|
||||
rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
|
||||
// In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
|
||||
rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
|
||||
rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
|
||||
rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
|
||||
rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
|
||||
|
||||
std::unordered_map<std::string,std::string> key_attribute_types;
|
||||
// Add base table's KeySchema and collect types for AttributeDefinitions:
|
||||
@@ -453,6 +460,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
|
||||
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
|
||||
// Add indexes's KeySchema and collect types for AttributeDefinitions:
|
||||
describe_key_schema(view_entry, *vptr, key_attribute_types);
|
||||
// Add projection type
|
||||
rjson::value projection = rjson::empty_object();
|
||||
rjson::add(projection, "ProjectionType", "ALL");
|
||||
// FIXME: we have to get ProjectionType from the schema when it is added
|
||||
rjson::add(view_entry, "Projection", std::move(projection));
|
||||
// Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
|
||||
rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
|
||||
rjson::push_back(index_array, std::move(view_entry));
|
||||
@@ -884,17 +896,23 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
|
||||
const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
|
||||
std::vector<schema_builder> view_builders;
|
||||
std::vector<sstring> where_clauses;
|
||||
std::unordered_set<std::string> index_names;
|
||||
if (gsi) {
|
||||
if (!gsi->IsArray()) {
|
||||
co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
|
||||
}
|
||||
for (const rjson::value& g : gsi->GetArray()) {
|
||||
const rjson::value* index_name = rjson::find(g, "IndexName");
|
||||
if (!index_name || !index_name->IsString()) {
|
||||
const rjson::value* index_name_v = rjson::find(g, "IndexName");
|
||||
if (!index_name_v || !index_name_v->IsString()) {
|
||||
co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
|
||||
}
|
||||
std::string vname(view_name(table_name, index_name->GetString()));
|
||||
elogger.trace("Adding GSI {}", index_name->GetString());
|
||||
std::string_view index_name = rjson::to_string_view(*index_name_v);
|
||||
auto [it, added] = index_names.emplace(index_name);
|
||||
if (!added) {
|
||||
co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
|
||||
}
|
||||
std::string vname(view_name(table_name, index_name));
|
||||
elogger.trace("Adding GSI {}", index_name);
|
||||
// FIXME: read and handle "Projection" parameter. This will
|
||||
// require the MV code to copy just parts of the attrs map.
|
||||
schema_builder view_builder(keyspace_name, vname);
|
||||
@@ -927,9 +945,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
|
||||
if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
|
||||
add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
|
||||
}
|
||||
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
|
||||
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
|
||||
if (!view_range_key.empty()) {
|
||||
where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
|
||||
where_clause = format("{} AND {} IS NOT NULL", where_clause,
|
||||
cql3::util::maybe_quote(view_range_key));
|
||||
}
|
||||
where_clauses.push_back(std::move(where_clause));
|
||||
view_builders.emplace_back(std::move(view_builder));
|
||||
@@ -942,12 +961,17 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
|
||||
throw api_error::validation("LocalSecondaryIndexes must be an array.");
|
||||
}
|
||||
for (const rjson::value& l : lsi->GetArray()) {
|
||||
const rjson::value* index_name = rjson::find(l, "IndexName");
|
||||
if (!index_name || !index_name->IsString()) {
|
||||
const rjson::value* index_name_v = rjson::find(l, "IndexName");
|
||||
if (!index_name_v || !index_name_v->IsString()) {
|
||||
throw api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
|
||||
}
|
||||
std::string vname(lsi_name(table_name, index_name->GetString()));
|
||||
elogger.trace("Adding LSI {}", index_name->GetString());
|
||||
std::string_view index_name = rjson::to_string_view(*index_name_v);
|
||||
auto [it, added] = index_names.emplace(index_name);
|
||||
if (!added) {
|
||||
co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
|
||||
}
|
||||
std::string vname(lsi_name(table_name, index_name));
|
||||
elogger.trace("Adding LSI {}", index_name);
|
||||
if (range_key.empty()) {
|
||||
co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
|
||||
}
|
||||
@@ -979,9 +1003,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
|
||||
// Note above we don't need to add virtual columns, as all
|
||||
// base columns were copied to view. TODO: reconsider the need
|
||||
// for virtual columns when we support Projection.
|
||||
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
|
||||
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
|
||||
if (!view_range_key.empty()) {
|
||||
where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
|
||||
where_clause = format("{} AND {} IS NOT NULL", where_clause,
|
||||
cql3::util::maybe_quote(view_range_key));
|
||||
}
|
||||
where_clauses.push_back(std::move(where_clause));
|
||||
view_builders.emplace_back(std::move(view_builder));
|
||||
@@ -2173,6 +2198,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
|
||||
for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
|
||||
attribute_path_map_add("AttributesToGet", ret, it->GetString());
|
||||
}
|
||||
if (ret.empty()) {
|
||||
throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
|
||||
}
|
||||
return ret;
|
||||
} else if (has_projection_expression) {
|
||||
const rjson::value& projection_expression = req["ProjectionExpression"];
|
||||
@@ -2577,8 +2605,8 @@ static bool hierarchy_actions(
|
||||
// attr member so we can use add()
|
||||
rjson::add_with_string_name(v, attr, std::move(*newv));
|
||||
} else {
|
||||
throw api_error::validation(format("Can't remove document path {} - not present in item",
|
||||
subh.get_value()._path));
|
||||
// Removing a.b when a is a map but a.b doesn't exist
|
||||
// is silently ignored. It's not considered an error.
|
||||
}
|
||||
} else {
|
||||
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
|
||||
|
||||
@@ -116,9 +116,6 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
|
||||
|
||||
future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
|
||||
_stats.api_operations.describe_time_to_live++;
|
||||
if (!_proxy.data_dictionary().features().cluster_supports_alternator_ttl()) {
|
||||
co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
|
||||
}
|
||||
schema_ptr schema = get_table(_proxy, request);
|
||||
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
|
||||
rjson::value desc = rjson::empty_object();
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
|
||||
namespace replica {
|
||||
class database;
|
||||
|
||||
@@ -624,7 +624,7 @@
|
||||
},
|
||||
{
|
||||
"name":"kn",
|
||||
"description":"Comma seperated keyspaces name to snapshot",
|
||||
"description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
|
||||
"required":false,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
@@ -632,7 +632,7 @@
|
||||
},
|
||||
{
|
||||
"name":"cf",
|
||||
"description":"the column family to snapshot",
|
||||
"description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
|
||||
"required":false,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
|
||||
@@ -669,19 +669,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
});
|
||||
}));
|
||||
|
||||
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) {
|
||||
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
|
||||
auto keyspace = validate_keyspace(ctx, req->param);
|
||||
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
|
||||
auto &db = ctx.db.local();
|
||||
if (column_families.empty()) {
|
||||
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
|
||||
co_await db.flush_on_all(keyspace);
|
||||
} else {
|
||||
co_await db.flush_on_all(keyspace, std::move(column_families));
|
||||
}
|
||||
return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) {
|
||||
return parallel_for_each(column_families, [&db, keyspace](const sstring& cf) mutable {
|
||||
return db.find_column_family(keyspace, cf).flush();
|
||||
});
|
||||
}).then([]{
|
||||
return make_ready_future<json::json_return_type>(json_void());
|
||||
});
|
||||
co_return json_void();
|
||||
});
|
||||
|
||||
|
||||
@@ -1284,40 +1281,46 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
|
||||
});
|
||||
});
|
||||
|
||||
ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
|
||||
apilog.debug("take_snapshot: {}", req->query_parameters);
|
||||
ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
|
||||
apilog.info("take_snapshot: {}", req->query_parameters);
|
||||
auto tag = req->get_query_param("tag");
|
||||
auto column_families = split(req->get_query_param("cf"), ",");
|
||||
auto sfopt = req->get_query_param("sf");
|
||||
auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);
|
||||
|
||||
std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
|
||||
|
||||
auto resp = make_ready_future<>();
|
||||
if (column_families.empty()) {
|
||||
resp = snap_ctl.local().take_snapshot(tag, keynames, sf);
|
||||
} else {
|
||||
if (keynames.empty()) {
|
||||
throw httpd::bad_param_exception("The keyspace of column families must be specified");
|
||||
try {
|
||||
if (column_families.empty()) {
|
||||
co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
|
||||
} else {
|
||||
if (keynames.empty()) {
|
||||
throw httpd::bad_param_exception("The keyspace of column families must be specified");
|
||||
}
|
||||
if (keynames.size() > 1) {
|
||||
throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
|
||||
}
|
||||
co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
|
||||
}
|
||||
if (keynames.size() > 1) {
|
||||
throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
|
||||
}
|
||||
resp = snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
|
||||
co_return json_void();
|
||||
} catch (...) {
|
||||
apilog.error("take_snapshot failed: {}", std::current_exception());
|
||||
throw;
|
||||
}
|
||||
return resp.then([] {
|
||||
return make_ready_future<json::json_return_type>(json_void());
|
||||
});
|
||||
});
|
||||
|
||||
ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
|
||||
ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
|
||||
apilog.info("del_snapshot: {}", req->query_parameters);
|
||||
auto tag = req->get_query_param("tag");
|
||||
auto column_family = req->get_query_param("cf");
|
||||
|
||||
std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
|
||||
return snap_ctl.local().clear_snapshot(tag, keynames, column_family).then([] {
|
||||
return make_ready_future<json::json_return_type>(json_void());
|
||||
});
|
||||
try {
|
||||
co_await snap_ctl.local().clear_snapshot(tag, keynames, column_family);
|
||||
co_return json_void();
|
||||
} catch (...) {
|
||||
apilog.error("del_snapshot failed: {}", std::current_exception());
|
||||
throw;
|
||||
}
|
||||
});
|
||||
|
||||
ss::true_snapshots_size.set(r, [&snap_ctl](std::unique_ptr<request> req) {
|
||||
@@ -1354,7 +1357,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
|
||||
if (!req_param<bool>(*req, "disable_snapshot", false)) {
|
||||
auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
|
||||
f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
|
||||
return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag);
|
||||
return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag, db::snapshot_ctl::skip_flush::no, db::snapshot_ctl::allow_view_snapshots::yes);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -87,19 +87,24 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
|
||||
// prefer expiring cells.
|
||||
return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
|
||||
}
|
||||
if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
|
||||
return left.expiry() <=> right.expiry();
|
||||
if (left.is_live_and_has_ttl()) {
|
||||
if (left.expiry() != right.expiry()) {
|
||||
return left.expiry() <=> right.expiry();
|
||||
} else {
|
||||
// prefer the cell that was written later,
|
||||
// so it survives longer after it expires, until purged.
|
||||
return right.ttl() <=> left.ttl();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Both are deleted
|
||||
if (left.deletion_time() != right.deletion_time()) {
|
||||
// Origin compares big-endian serialized deletion time. That's because it
|
||||
// delegates to AbstractCell.reconcile() which compares values after
|
||||
// comparing timestamps, which in case of deleted cells will hold
|
||||
// serialized expiry.
|
||||
return (uint64_t) left.deletion_time().time_since_epoch().count()
|
||||
<=> (uint64_t) right.deletion_time().time_since_epoch().count();
|
||||
}
|
||||
|
||||
// Origin compares big-endian serialized deletion time. That's because it
|
||||
// delegates to AbstractCell.reconcile() which compares values after
|
||||
// comparing timestamps, which in case of deleted cells will hold
|
||||
// serialized expiry.
|
||||
return (uint64_t) left.deletion_time().time_since_epoch().count()
|
||||
<=> (uint64_t) right.deletion_time().time_since_epoch().count();
|
||||
}
|
||||
return std::strong_ordering::equal;
|
||||
}
|
||||
|
||||
20
cdc/log.cc
20
cdc/log.cc
@@ -59,7 +59,7 @@ using namespace std::chrono_literals;
|
||||
logging::logger cdc_log("cdc");
|
||||
|
||||
namespace cdc {
|
||||
static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
|
||||
static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
|
||||
}
|
||||
|
||||
static constexpr auto cdc_group_name = "cdc";
|
||||
@@ -206,7 +206,7 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
|
||||
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
|
||||
|
||||
auto log_mut = log_schema
|
||||
? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
|
||||
@@ -484,7 +484,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
|
||||
return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
|
||||
}
|
||||
|
||||
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
|
||||
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
|
||||
schema_builder b(s.ks_name(), log_name(s.cf_name()));
|
||||
b.with_partitioner("com.scylladb.dht.CDCPartitioner");
|
||||
b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
|
||||
@@ -571,6 +571,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
|
||||
b.set_uuid(*uuid);
|
||||
}
|
||||
|
||||
/**
|
||||
* #10473 - if we are redefining the log table, we need to ensure any dropped
|
||||
* columns are registered in "dropped_columns" table, otherwise clients will not
|
||||
* be able to read data older than now.
|
||||
*/
|
||||
if (old) {
|
||||
// not super efficient, but we don't do this often.
|
||||
for (auto& col : old->all_columns()) {
|
||||
if (!b.has_column({col.name(), col.name_as_text() })) {
|
||||
b.without_column(col.name_as_text(), col.type, api::new_timestamp());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return b.build();
|
||||
}
|
||||
|
||||
|
||||
@@ -1281,6 +1281,13 @@ private:
|
||||
|
||||
const auto& key = _validator.previous_partition_key();
|
||||
|
||||
if (_validator.current_tombstone()) {
|
||||
throw compaction_aborted_exception(
|
||||
_schema->ks_name(),
|
||||
_schema->cf_name(),
|
||||
"scrub compaction cannot handle invalid fragments with an active range tombstone change");
|
||||
}
|
||||
|
||||
// If the unexpected fragment is a partition end, we just drop it.
|
||||
// The only case a partition end is invalid is when it comes after
|
||||
// another partition end, and we can just drop it in that case.
|
||||
|
||||
@@ -317,9 +317,9 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
|
||||
|
||||
auto job_ptr = std::make_unique<noncopyable_function<future<>(sstables::compaction_data&)>>(std::move(job));
|
||||
|
||||
task->compaction_done = with_semaphore(_maintenance_ops_sem, 1, [this, task, &job = *job_ptr] () mutable {
|
||||
// take read lock for table, so major compaction and resharding can't proceed in parallel.
|
||||
return with_lock(task->compaction_state.lock.for_read(), [this, task, &job] () mutable {
|
||||
task->compaction_done = with_semaphore(_custom_jobs_sem, 1, [this, task, &job = *job_ptr] () mutable {
|
||||
// We don't need to take task->compaction_state.lock.for_read() as it only serializes minor and major
|
||||
|
||||
// Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
|
||||
if (task->stopping) {
|
||||
throw sstables::compaction_stopped_exception(task->compacting_table->schema()->ks_name(), task->compacting_table->schema()->cf_name(),
|
||||
@@ -335,7 +335,6 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
|
||||
// no need to register shared sstables because they're excluded from non-resharding
|
||||
// compaction and some of them may not even belong to current shard.
|
||||
return job(task->compaction_data);
|
||||
});
|
||||
}).then_wrapped([this, task, job_ptr = std::move(job_ptr), type] (future<> f) {
|
||||
_stats.active_tasks--;
|
||||
_tasks.remove(task);
|
||||
@@ -353,32 +352,50 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
|
||||
return task->compaction_done.get_future().then([task] {});
|
||||
}
|
||||
|
||||
compaction_manager::compaction_reenabler::compaction_reenabler(compaction_manager& cm, replica::table* t)
|
||||
: _cm(cm)
|
||||
, _table(t)
|
||||
, _compaction_state(cm.get_compaction_state(_table))
|
||||
, _holder(_compaction_state.gate.hold())
|
||||
{
|
||||
_compaction_state.compaction_disabled_counter++;
|
||||
cmlog.debug("Temporarily disabled compaction for {}.{}. compaction_disabled_counter={}",
|
||||
_table->schema()->ks_name(), _table->schema()->cf_name(), _compaction_state.compaction_disabled_counter);
|
||||
}
|
||||
|
||||
compaction_manager::compaction_reenabler::compaction_reenabler(compaction_reenabler&& o) noexcept
|
||||
: _cm(o._cm)
|
||||
, _table(std::exchange(o._table, nullptr))
|
||||
, _compaction_state(o._compaction_state)
|
||||
, _holder(std::move(o._holder))
|
||||
{}
|
||||
|
||||
compaction_manager::compaction_reenabler::~compaction_reenabler() {
|
||||
// submit compaction request if we're the last holder of the gate which is still opened.
|
||||
if (_table && --_compaction_state.compaction_disabled_counter == 0 && !_compaction_state.gate.is_closed()) {
|
||||
cmlog.debug("Reenabling compaction for {}.{}",
|
||||
_table->schema()->ks_name(), _table->schema()->cf_name());
|
||||
try {
|
||||
_cm.submit(_table);
|
||||
} catch (...) {
|
||||
cmlog.warn("compaction_reenabler could not reenable compaction for {}.{}: {}",
|
||||
_table->schema()->ks_name(), _table->schema()->cf_name(), std::current_exception());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
future<compaction_manager::compaction_reenabler>
|
||||
compaction_manager::stop_and_disable_compaction(replica::table* t) {
|
||||
compaction_reenabler cre(*this, t);
|
||||
co_await stop_ongoing_compactions("user-triggered operation", t);
|
||||
co_return cre;
|
||||
}
|
||||
|
||||
future<>
|
||||
compaction_manager::run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func) {
|
||||
auto& c_state = _compaction_state[t];
|
||||
auto holder = c_state.gate.hold();
|
||||
compaction_reenabler cre = co_await stop_and_disable_compaction(t);
|
||||
|
||||
c_state.compaction_disabled_counter++;
|
||||
|
||||
std::exception_ptr err;
|
||||
try {
|
||||
co_await stop_ongoing_compactions("user-triggered operation", t);
|
||||
co_await func();
|
||||
} catch (...) {
|
||||
err = std::current_exception();
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
assert(_compaction_state.contains(t));
|
||||
#endif
|
||||
// submit compaction request if we're the last holder of the gate which is still opened.
|
||||
if (--c_state.compaction_disabled_counter == 0 && !c_state.gate.is_closed()) {
|
||||
submit(t);
|
||||
}
|
||||
if (err) {
|
||||
std::rethrow_exception(err);
|
||||
}
|
||||
co_return;
|
||||
co_await func();
|
||||
}
|
||||
|
||||
void compaction_manager::task::setup_new_compaction() {
|
||||
@@ -584,16 +601,11 @@ future<> compaction_manager::stop() {
|
||||
}
|
||||
}
|
||||
|
||||
void compaction_manager::really_do_stop() {
|
||||
if (_state == state::none || _state == state::stopped) {
|
||||
return;
|
||||
}
|
||||
|
||||
_state = state::stopped;
|
||||
future<> compaction_manager::really_do_stop() {
|
||||
cmlog.info("Asked to stop");
|
||||
// Reset the metrics registry
|
||||
_metrics.clear();
|
||||
_stop_future.emplace(stop_ongoing_compactions("shutdown").then([this] () mutable {
|
||||
return stop_ongoing_compactions("shutdown").then([this] () mutable {
|
||||
reevaluate_postponed_compactions();
|
||||
return std::move(_waiting_reevalution);
|
||||
}).then([this] {
|
||||
@@ -601,12 +613,34 @@ void compaction_manager::really_do_stop() {
|
||||
_compaction_submission_timer.cancel();
|
||||
cmlog.info("Stopped");
|
||||
return _compaction_controller.shutdown();
|
||||
}));
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Ex>
|
||||
requires std::is_base_of_v<std::exception, Ex> &&
|
||||
requires (const Ex& ex) {
|
||||
{ ex.code() } noexcept -> std::same_as<const std::error_code&>;
|
||||
}
|
||||
auto swallow_enospc(const Ex& ex) noexcept {
|
||||
if (ex.code().value() != ENOSPC) {
|
||||
return make_exception_future<>(std::make_exception_ptr(ex));
|
||||
}
|
||||
|
||||
cmlog.warn("Got ENOSPC on stop, ignoring...");
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
void compaction_manager::do_stop() noexcept {
|
||||
if (_state == state::none || _state == state::stopped) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
really_do_stop();
|
||||
_state = state::stopped;
|
||||
_stop_future = really_do_stop()
|
||||
.handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
|
||||
.handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
|
||||
;
|
||||
} catch (...) {
|
||||
try {
|
||||
cmlog.error("Failed to stop the manager: {}", std::current_exception());
|
||||
@@ -742,6 +776,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
|
||||
_stats.active_tasks++;
|
||||
task->setup_new_compaction();
|
||||
|
||||
return with_scheduling_group(_maintenance_sg.cpu, [this, task, t] {
|
||||
return t->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task, schema = t->schema()] (future<> f) mutable {
|
||||
_stats.active_tasks--;
|
||||
task->finish_compaction();
|
||||
@@ -763,6 +798,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
|
||||
}
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}).finally([this, task] {
|
||||
@@ -810,7 +846,8 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
|
||||
auto sstable_level = sst->get_sstable_level();
|
||||
auto run_identifier = sst->run_identifier();
|
||||
auto sstable_set_snapshot = can_purge ? std::make_optional(t.get_sstable_set()) : std::nullopt;
|
||||
auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
|
||||
// FIXME: this compaction should run with maintenance priority.
|
||||
auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
|
||||
sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);
|
||||
|
||||
// Releases reference to cleaned sstable such that respective used disk space can be freed.
|
||||
@@ -819,8 +856,9 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
|
||||
};
|
||||
|
||||
auto maintenance_permit = co_await seastar::get_units(_maintenance_ops_sem, 1);
|
||||
// Take write lock for table to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
|
||||
auto write_lock_holder = co_await _compaction_state[&t].lock.hold_write_lock();
|
||||
// FIXME: acquiring the read lock is not needed after acquiring the _maintenance_ops_sem
|
||||
// only major compaction needs to acquire the write lock to synchronize with regular compaction.
|
||||
auto lock_holder = co_await _compaction_state[&t].lock.hold_read_lock();
|
||||
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
@@ -852,7 +890,7 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
|
||||
};
|
||||
|
||||
compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
|
||||
completed = co_await with_scheduling_group(_maintenance_sg.cpu, std::ref(perform_rewrite));
|
||||
completed = co_await with_scheduling_group(_compaction_controller.sg(), std::ref(perform_rewrite));
|
||||
} while (!completed);
|
||||
};
|
||||
|
||||
|
||||
@@ -147,6 +147,8 @@ private:
|
||||
// If the operation must be serialized with regular, then the per-table write lock must be taken.
|
||||
seastar::named_semaphore _maintenance_ops_sem = {1, named_semaphore_exception_factory{"maintenance operation"}};
|
||||
|
||||
seastar::named_semaphore _custom_jobs_sem = {1, named_semaphore_exception_factory{"custom jobs"}};
|
||||
|
||||
std::function<void()> compaction_submission_callback();
|
||||
// all registered tables are reevaluated at a constant interval.
|
||||
// Submission is a NO-OP when there's nothing to do, so it's fine to call it regularly.
|
||||
@@ -233,7 +235,7 @@ public:
|
||||
|
||||
// Stop all fibers, without waiting. Safe to be called multiple times.
|
||||
void do_stop() noexcept;
|
||||
void really_do_stop();
|
||||
future<> really_do_stop();
|
||||
|
||||
// Submit a table to be compacted.
|
||||
void submit(replica::table* t);
|
||||
@@ -269,6 +271,31 @@ public:
|
||||
// parameter job is a function that will carry the operation
|
||||
future<> run_custom_job(replica::table* t, sstables::compaction_type type, noncopyable_function<future<>(sstables::compaction_data&)> job);
|
||||
|
||||
class compaction_reenabler {
|
||||
compaction_manager& _cm;
|
||||
replica::table* _table;
|
||||
compaction_state& _compaction_state;
|
||||
gate::holder _holder;
|
||||
|
||||
public:
|
||||
compaction_reenabler(compaction_manager&, replica::table*);
|
||||
compaction_reenabler(compaction_reenabler&&) noexcept;
|
||||
|
||||
~compaction_reenabler();
|
||||
|
||||
replica::table* compacting_table() const noexcept {
|
||||
return _table;
|
||||
}
|
||||
|
||||
const compaction_state& compaction_state() const noexcept {
|
||||
return _compaction_state;
|
||||
}
|
||||
};
|
||||
|
||||
// Disable compaction temporarily for a table t.
|
||||
// Caller should call the compaction_reenabler::reenable
|
||||
future<compaction_reenabler> stop_and_disable_compaction(replica::table* t);
|
||||
|
||||
// Run a function with compaction temporarily disabled for a table T.
|
||||
future<> run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func);
|
||||
|
||||
|
||||
@@ -69,7 +69,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(tabl
|
||||
}
|
||||
|
||||
void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
|
||||
if (removed.empty() || added.empty()) {
|
||||
// All the update here is only relevant for regular compaction's round-robin picking policy, and if
|
||||
// last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
|
||||
// therefore we can skip the updates here until regular runs for the first time. Once it runs,
|
||||
// it will be able to generate last_compacted_keys correctly by looking at metadata of files.
|
||||
if (removed.empty() || added.empty() || !_last_compacted_keys) {
|
||||
return;
|
||||
}
|
||||
auto min_level = std::numeric_limits<uint32_t>::max();
|
||||
|
||||
@@ -217,6 +217,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
|
||||
auto compaction_time = gc_clock::now();
|
||||
|
||||
if (candidates.empty()) {
|
||||
_estimated_remaining_tasks = 0;
|
||||
return compaction_descriptor();
|
||||
}
|
||||
|
||||
|
||||
@@ -615,6 +615,8 @@ arg_parser.add_argument('--static-yaml-cpp', dest='staticyamlcpp', action='store
|
||||
help='Link libyaml-cpp statically')
|
||||
arg_parser.add_argument('--tests-debuginfo', action='store', dest='tests_debuginfo', type=int, default=0,
|
||||
help='Enable(1)/disable(0)compiler debug information generation for tests')
|
||||
arg_parser.add_argument('--perf-tests-debuginfo', action='store', dest='perf_tests_debuginfo', type=int, default=0,
|
||||
help='Enable(1)/disable(0)compiler debug information generation for perf tests')
|
||||
arg_parser.add_argument('--python', action='store', dest='python', default='python3',
|
||||
help='Python3 path')
|
||||
arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true', default=False,
|
||||
@@ -1377,6 +1379,7 @@ linker_flags = linker_flags(compiler=args.cxx)
|
||||
|
||||
dbgflag = '-g -gz' if args.debuginfo else ''
|
||||
tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
|
||||
perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'
|
||||
|
||||
# Strip if debuginfo is disabled, otherwise we end up with partial
|
||||
# debug info from the libraries we static link with
|
||||
@@ -1901,7 +1904,8 @@ with open(buildfile_tmp, 'w') as f:
|
||||
# So we strip the tests by default; The user can very
|
||||
# quickly re-link the test unstripped by adding a "_g"
|
||||
# to the test name, e.g., "ninja build/release/testname_g"
|
||||
f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
|
||||
link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
|
||||
f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
|
||||
f.write(' libs = {}\n'.format(local_libs))
|
||||
f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
|
||||
f.write(' libs = {}\n'.format(local_libs))
|
||||
|
||||
@@ -1386,7 +1386,7 @@ serviceLevelOrRoleName returns [sstring name]
|
||||
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
|
||||
| t=STRING_LITERAL { $name = sstring($t.text); }
|
||||
| t=QUOTED_NAME { $name = sstring($t.text); }
|
||||
| k=unreserved_keyword { $name = sstring($t.text);
|
||||
| k=unreserved_keyword { $name = k;
|
||||
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
|
||||
| QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
|
||||
;
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
#include "cql3_type.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "ut_name.hh"
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "data_dictionary/user_types_metadata.hh"
|
||||
@@ -436,7 +437,20 @@ sstring maybe_quote(const sstring& identifier) {
|
||||
}
|
||||
|
||||
if (!need_quotes) {
|
||||
return identifier;
|
||||
// A seemingly valid identifier matching [a-z][a-z0-9_]* may still
|
||||
// need quoting if it is a CQL keyword, e.g., "to" (see issue #9450).
|
||||
// While our parser Cql.g has different production rules for different
|
||||
// types of identifiers (column names, table names, etc.), all of
|
||||
// these behave identically for alphanumeric strings: they exclude
|
||||
// many keywords but allow keywords listed as "unreserved keywords".
|
||||
// So we can use any of them, for example cident.
|
||||
try {
|
||||
cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
|
||||
return identifier;
|
||||
} catch(exceptions::syntax_exception&) {
|
||||
// This alphanumeric string is not a valid identifier, so fall
|
||||
// through to have it quoted:
|
||||
}
|
||||
}
|
||||
if (num_quotes == 0) {
|
||||
return make_sstring("\"", identifier, "\"");
|
||||
|
||||
@@ -81,9 +81,7 @@ public:
|
||||
virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
|
||||
execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;
|
||||
|
||||
virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
|
||||
|
||||
virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;
|
||||
|
||||
virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;
|
||||
|
||||
|
||||
@@ -103,10 +103,50 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
|
||||
if (!col_type->is_map()) {
|
||||
throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
|
||||
}
|
||||
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
|
||||
int32_t index = data.sel.index_of(*cdef);
|
||||
if (index == -1) {
|
||||
throw std::runtime_error(
|
||||
format("Column definition {} does not match any column in the query selection",
|
||||
cdef->name_as_text()));
|
||||
}
|
||||
const managed_bytes_opt& serialized = data.other_columns[index];
|
||||
if (!serialized) {
|
||||
// For null[i] we return null.
|
||||
return std::nullopt;
|
||||
}
|
||||
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
|
||||
const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
|
||||
const auto key = evaluate(*col.sub, options);
|
||||
auto&& key_type = col_type->name_comparator();
|
||||
if (key.is_null()) {
|
||||
// For m[null] return null.
|
||||
// This is different from Cassandra - which treats m[null]
|
||||
// as an invalid request error. But m[null] -> null is more
|
||||
// consistent with our usual null treatment (e.g., both
|
||||
// null[2] and null < 2 return null). It will also allow us
|
||||
// to support non-constant subscripts (e.g., m[a]) where "a"
|
||||
// may be null in some rows and non-null in others, and it's
|
||||
// not an error.
|
||||
return std::nullopt;
|
||||
}
|
||||
if (key.is_unset_value()) {
|
||||
// An m[?] with ? bound to UNSET_VALUE is an invalid query.
|
||||
// We could have detected it earlier while binding, but since
|
||||
// we currently don't, we must protect the following code
|
||||
// which can't work with an UNSET_VALUE. Note that the
|
||||
// placement of this check here means that in an empty table,
|
||||
// where we never need to evaluate the filter expression, this
|
||||
// error will not be detected.
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Unsupported unset map key for column {}",
|
||||
cdef->name_as_text()));
|
||||
}
|
||||
if (key.type != key_type) {
|
||||
// This can't happen, we always verify the index type earlier.
|
||||
throw std::logic_error(
|
||||
format("Tried to evaluate expression with wrong type for subscript of {}",
|
||||
cdef->name_as_text()));
|
||||
}
|
||||
const auto found = key.view().with_linearized([&] (bytes_view key_bv) {
|
||||
using entry = std::pair<data_value, data_value>;
|
||||
return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
|
||||
@@ -121,8 +161,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
|
||||
case column_kind::clustering_key:
|
||||
return managed_bytes(data.clustering_key[cdef->id]);
|
||||
case column_kind::static_column:
|
||||
case column_kind::regular_column:
|
||||
return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
|
||||
[[fallthrough]];
|
||||
case column_kind::regular_column: {
|
||||
int32_t index = data.sel.index_of(*cdef);
|
||||
if (index == -1) {
|
||||
throw std::runtime_error(
|
||||
format("Column definition {} does not match any column in the query selection",
|
||||
cdef->name_as_text()));
|
||||
}
|
||||
return managed_bytes_opt(data.other_columns[index]);
|
||||
}
|
||||
default:
|
||||
throw exceptions::unsupported_operation_exception("Unknown column kind");
|
||||
}
|
||||
|
||||
@@ -953,7 +953,7 @@ bool query_processor::migration_subscriber::should_invalidate(
|
||||
sstring ks_name,
|
||||
std::optional<sstring> cf_name,
|
||||
::shared_ptr<cql_statement> statement) {
|
||||
return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
|
||||
return statement->depends_on(ks_name, cf_name);
|
||||
}
|
||||
|
||||
future<> query_processor::query_internal(
|
||||
|
||||
@@ -514,7 +514,7 @@ statement_restrictions::statement_restrictions(data_dictionary::database db,
|
||||
}
|
||||
|
||||
if (!_nonprimary_key_restrictions->empty()) {
|
||||
if (_has_queriable_regular_index) {
|
||||
if (_has_queriable_regular_index && _partition_range_is_simple) {
|
||||
_uses_secondary_indexing = true;
|
||||
} else if (!allow_filtering) {
|
||||
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
|
||||
|
||||
@@ -165,7 +165,7 @@ public:
|
||||
|
||||
template<typename RowComparator>
|
||||
void sort(const RowComparator& cmp) {
|
||||
std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
|
||||
std::sort(_rows.begin(), _rows.end(), cmp);
|
||||
}
|
||||
|
||||
metadata& get_metadata();
|
||||
|
||||
@@ -422,11 +422,16 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
|
||||
}
|
||||
|
||||
auto clustering_columns_restrictions = _restrictions->get_clustering_columns_restrictions();
|
||||
if (dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions)) {
|
||||
bool has_multi_col_clustering_restrictions =
|
||||
dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions) != nullptr;
|
||||
if (has_multi_col_clustering_restrictions) {
|
||||
clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
|
||||
return expr::is_satisfied_by(
|
||||
bool multi_col_clustering_satisfied = expr::is_satisfied_by(
|
||||
clustering_columns_restrictions->expression,
|
||||
partition_key, clustering_key, static_row, row, selection, _options);
|
||||
if (!multi_col_clustering_satisfied) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
@@ -474,6 +479,13 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
|
||||
if (_skip_ck_restrictions) {
|
||||
continue;
|
||||
}
|
||||
if (has_multi_col_clustering_restrictions) {
|
||||
// Mixing multi column and single column restrictions on clustering
|
||||
// key columns is forbidden.
|
||||
// Since there are multi column restrictions we have to skip
|
||||
// evaluating single column restrictions or we will get an error.
|
||||
continue;
|
||||
}
|
||||
auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
|
||||
auto restr_it = clustering_key_restrictions_map.find(cdef);
|
||||
if (restr_it == clustering_key_restrictions_map.end()) {
|
||||
|
||||
@@ -18,13 +18,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool cql3::statements::authentication_statement::depends_on_keyspace(
|
||||
const sstring& ks_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool cql3::statements::authentication_statement::depends_on_column_family(
|
||||
const sstring& cf_name) const {
|
||||
bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -27,9 +27,7 @@ public:
|
||||
|
||||
uint32_t get_bound_terms() const override;
|
||||
|
||||
bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
future<> check_access(query_processor& qp, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -20,13 +20,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool cql3::statements::authorization_statement::depends_on_keyspace(
|
||||
const sstring& ks_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool cql3::statements::authorization_statement::depends_on_column_family(
|
||||
const sstring& cf_name) const {
|
||||
bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -31,9 +31,7 @@ public:
|
||||
|
||||
uint32_t get_bound_terms() const override;
|
||||
|
||||
bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
future<> check_access(query_processor& qp, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -70,14 +70,9 @@ batch_statement::batch_statement(type type_,
|
||||
{
|
||||
}
|
||||
|
||||
bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
|
||||
bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool batch_statement::depends_on_column_family(const sstring& cf_name) const
|
||||
{
|
||||
return false;
|
||||
return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
|
||||
}
|
||||
|
||||
uint32_t batch_statement::get_bound_terms() const
|
||||
|
||||
@@ -88,9 +88,7 @@ public:
|
||||
std::unique_ptr<attributes> attrs,
|
||||
cql_stats& stats);
|
||||
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "gms/feature_service.hh"
|
||||
#include "tombstone_gc_extension.hh"
|
||||
#include "tombstone_gc.hh"
|
||||
#include "utils/bloom_calculations.hh"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
|
||||
@@ -145,6 +146,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
|
||||
}
|
||||
|
||||
if (get_simple(KW_BF_FP_CHANCE)) {
|
||||
double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
|
||||
double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
|
||||
if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
|
||||
throw exceptions::configuration_exception(format(
|
||||
"{} must be larger than {} and less than or equal to 1.0 (got {})",
|
||||
KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
|
||||
}
|
||||
}
|
||||
|
||||
speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
|
||||
}
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
|
||||
#include "cql3/statements/cf_prop_defs.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
|
||||
@@ -110,9 +110,6 @@ future<> modification_statement::check_access(query_processor& qp, const service
|
||||
|
||||
future<std::vector<mutation>>
|
||||
modification_statement::get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, service::query_state& qs) const {
|
||||
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
|
||||
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
|
||||
}
|
||||
auto cl = options.get_consistency();
|
||||
auto json_cache = maybe_prepare_json_cache(options);
|
||||
auto keys = build_partition_keys(options, json_cache);
|
||||
@@ -255,6 +252,10 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
|
||||
inc_cql_stats(qs.get_client_state().is_internal());
|
||||
|
||||
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
|
||||
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
|
||||
}
|
||||
|
||||
if (has_conditions()) {
|
||||
return execute_with_condition(qp, qs, options);
|
||||
}
|
||||
@@ -539,12 +540,8 @@ modification_statement::validate(query_processor&, const service::client_state&
|
||||
}
|
||||
}
|
||||
|
||||
bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
|
||||
return keyspace() == ks_name;
|
||||
}
|
||||
|
||||
bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
|
||||
return column_family() == cf_name;
|
||||
bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
|
||||
}
|
||||
|
||||
void modification_statement::add_operation(::shared_ptr<operation> op) {
|
||||
|
||||
@@ -137,9 +137,7 @@ public:
|
||||
// Validate before execute, using client state and current schema
|
||||
void validate(query_processor&, const service::client_state& state) const override;
|
||||
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
void add_operation(::shared_ptr<operation> op);
|
||||
|
||||
|
||||
@@ -45,12 +45,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
|
||||
bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -53,9 +53,7 @@ protected:
|
||||
*/
|
||||
virtual future<> grant_permissions_to_creator(const service::client_state&) const;
|
||||
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
|
||||
@@ -167,12 +167,8 @@ void select_statement::validate(query_processor&, const service::client_state& s
|
||||
// Nothing to do, all validation has been done by raw_statement::prepare()
|
||||
}
|
||||
|
||||
bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
|
||||
return keyspace() == ks_name;
|
||||
}
|
||||
|
||||
bool select_statement::depends_on_column_family(const sstring& cf_name) const {
|
||||
return column_family() == cf_name;
|
||||
bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
|
||||
}
|
||||
|
||||
const sstring& select_statement::keyspace() const {
|
||||
|
||||
@@ -100,8 +100,7 @@ public:
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
|
||||
virtual void validate(query_processor&, const service::client_state& state) const override;
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
|
||||
service::query_state& state, const query_options& options) const override;
|
||||
|
||||
@@ -17,13 +17,7 @@ uint32_t service_level_statement::get_bound_terms() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool service_level_statement::depends_on_keyspace(
|
||||
const sstring &ks_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool service_level_statement::depends_on_column_family(
|
||||
const sstring &cf_name) const {
|
||||
bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -43,9 +43,7 @@ public:
|
||||
|
||||
uint32_t get_bound_terms() const override;
|
||||
|
||||
bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
future<> check_access(query_processor& qp, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ void sl_prop_defs::validate() {
|
||||
data_value v = duration_type->deserialize(duration_type->from_string(*repr));
|
||||
cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
|
||||
if (duration.months || duration.days) {
|
||||
throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
|
||||
throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
|
||||
}
|
||||
if (duration.nanoseconds % 1'000'000 != 0) {
|
||||
throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
|
||||
|
||||
@@ -39,12 +39,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(data_dictionary:
|
||||
return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
|
||||
}
|
||||
|
||||
bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
|
||||
bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -30,9 +30,7 @@ public:
|
||||
|
||||
virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
|
||||
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -46,12 +46,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(data_dictionary::data
|
||||
|
||||
}
|
||||
|
||||
bool use_statement::depends_on_keyspace(const sstring& ks_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool use_statement::depends_on_column_family(const sstring& cf_name) const
|
||||
bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -31,9 +31,7 @@ public:
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual seastar::future<> check_access(query_processor& qp, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
#include "types/listlike_partial_deserializing_iterator.hh"
|
||||
#include "utils/managed_bytes.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include <boost/algorithm/string/trim_all.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
static inline bool is_control_char(char c) {
|
||||
return c >= 0 && c <= 0x1F;
|
||||
@@ -78,8 +80,35 @@ static int64_t to_int64_t(const rjson::value& value) {
|
||||
return value.GetInt();
|
||||
} else if (value.IsUint()) {
|
||||
return value.GetUint();
|
||||
} else if (value.GetUint64()) {
|
||||
} else if (value.IsUint64()) {
|
||||
return value.GetUint64(); //NOTICE: large uint64_t values will get overflown
|
||||
} else if (value.IsDouble()) {
|
||||
// We allow specifying integer constants
|
||||
// using scientific notation (for example 1.3e8)
|
||||
// and floating-point numbers ending with .0 (for example 12.0),
|
||||
// but not floating-point numbers with fractional part (12.34).
|
||||
//
|
||||
// The reason is that JSON standard does not have separate
|
||||
// types for integers and floating-point numbers, only
|
||||
// a single "number" type. Some serializers may
|
||||
// produce an integer in that floating-point format.
|
||||
double double_value = value.GetDouble();
|
||||
|
||||
// Check if the value contains disallowed fractional part (.34 from 12.34).
|
||||
// With RapidJSON and an integer value in range [-(2^53)+1, (2^53)-1],
|
||||
// the fractional part will be zero as the entire value
|
||||
// fits in 53-bit significand. RapidJSON's parsing code does not lose accuracy:
|
||||
// when parsing a number like 12.34e8, it accumulates 1234 to a int64_t number,
|
||||
// then converts it to double and multiplies by a power of 10, never having any
|
||||
// digit in fractional part.
|
||||
double integral;
|
||||
double fractional = std::modf(double_value, &integral);
|
||||
if (fractional != 0.0 && fractional != -0.0) {
|
||||
throw marshal_exception(format("Incorrect JSON floating-point value "
|
||||
"for int64 type: {} (it should not contain fractional part {})", value, fractional));
|
||||
}
|
||||
|
||||
return double_value;
|
||||
}
|
||||
throw marshal_exception(format("Incorrect JSON value for int64 type: {}", value));
|
||||
}
|
||||
@@ -197,6 +226,17 @@ struct from_json_object_visitor {
|
||||
}
|
||||
bytes operator()(const boolean_type_impl& t) {
|
||||
if (!value.IsBool()) {
|
||||
if (value.IsString()) {
|
||||
std::string str(rjson::to_string_view(value));
|
||||
boost::trim_all(str);
|
||||
boost::to_lower(str);
|
||||
|
||||
if (str == "true") {
|
||||
return t.decompose(true);
|
||||
} else if (str == "false") {
|
||||
return t.decompose(false);
|
||||
}
|
||||
}
|
||||
throw marshal_exception(format("Invalid JSON object {}", value));
|
||||
}
|
||||
return t.decompose(value.GetBool());
|
||||
|
||||
@@ -74,6 +74,13 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
|
||||
/// forbids non-alpha-numeric characters in identifier names.
|
||||
/// Quoting involves wrapping the string in double-quotes ("). A double-quote
|
||||
/// character itself is quoted by doubling it.
|
||||
/// maybe_quote() also quotes reserved CQL keywords (e.g., "to", "where")
|
||||
/// but doesn't quote *unreserved* keywords (like ttl, int or as).
|
||||
/// Note that this means that if new reserved keywords are added to the
|
||||
/// parser, a saved output of maybe_quote() may no longer be parsable by
|
||||
/// the parser. To avoid this forward-compatibility issue, use quote() instead
|
||||
/// of maybe_quote() - to unconditionally quote an identifier even if it is
|
||||
/// lowercase and not (yet) a keyword.
|
||||
sstring maybe_quote(const sstring& s);
|
||||
|
||||
// Check whether timestamp is not too far in the future as this probably
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
*/
|
||||
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
#include <seastar/core/future-util.hh>
|
||||
#include <seastar/core/do_with.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
@@ -247,6 +248,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
|
||||
} catch (data_dictionary::no_such_keyspace& ex) {
|
||||
// should probably ignore and drop the batch
|
||||
} catch (...) {
|
||||
blogger.warn("Replay failed (will retry): {}", std::current_exception());
|
||||
// timeout, overload etc.
|
||||
// Do _not_ remove the batch, assuming we got a node write error.
|
||||
// Since we don't have hints (which origin is satisfied with),
|
||||
|
||||
25
db/config.cc
25
db/config.cc
@@ -65,6 +65,25 @@ hinted_handoff_enabled_to_json(const db::config::hinted_handoff_enabled_type& h)
|
||||
return value_to_json(h.to_configuration_string());
|
||||
}
|
||||
|
||||
// Convert a value that can be printed with operator<<, or a vector of
|
||||
// such values, to JSON. An example is enum_option<T>, because enum_option<T>
|
||||
// has a operator<<.
|
||||
template <typename T>
|
||||
static json::json_return_type
|
||||
printable_to_json(const T& e) {
|
||||
return value_to_json(format("{}", e));
|
||||
}
|
||||
template <typename T>
|
||||
static json::json_return_type
|
||||
printable_vector_to_json(const std::vector<T>& e) {
|
||||
std::vector<sstring> converted;
|
||||
converted.reserve(e.size());
|
||||
for (const auto& option : e) {
|
||||
converted.push_back(format("{}", option));
|
||||
}
|
||||
return value_to_json(converted);
|
||||
}
|
||||
|
||||
template <>
|
||||
const config_type config_type_for<bool> = config_type("bool", value_to_json<bool>);
|
||||
|
||||
@@ -109,11 +128,11 @@ const config_type config_type_for<db::seed_provider_type> = config_type("seed pr
|
||||
|
||||
template <>
|
||||
const config_type config_type_for<std::vector<enum_option<db::experimental_features_t>>> = config_type(
|
||||
"experimental features", value_to_json<std::vector<sstring>>);
|
||||
"experimental features", printable_vector_to_json<enum_option<db::experimental_features_t>>);
|
||||
|
||||
template <>
|
||||
const config_type config_type_for<enum_option<db::tri_mode_restriction_t>> = config_type(
|
||||
"restriction mode", value_to_json<sstring>);
|
||||
"restriction mode", printable_to_json<enum_option<db::tri_mode_restriction_t>>);
|
||||
|
||||
template <>
|
||||
const config_type config_type_for<db::config::hinted_handoff_enabled_type> = config_type("hinted handoff enabled", hinted_handoff_enabled_to_json);
|
||||
@@ -862,6 +881,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
|
||||
, restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
|
||||
, restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
|
||||
, cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
|
||||
"Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
|
||||
, default_log_level(this, "default_log_level", value_status::Used)
|
||||
, logger_log_level(this, "logger_log_level", value_status::Used)
|
||||
, log_to_stdout(this, "log_to_stdout", value_status::Used)
|
||||
|
||||
@@ -365,6 +365,9 @@ public:
|
||||
named_value<tri_mode_restriction> restrict_replication_simplestrategy;
|
||||
named_value<tri_mode_restriction> restrict_dtcs;
|
||||
|
||||
|
||||
named_value<bool> cache_index_pages;
|
||||
|
||||
seastar::logging_settings logging_settings(const log_cli::options&) const;
|
||||
|
||||
const db::extensions& extensions() const;
|
||||
|
||||
@@ -574,12 +574,8 @@ public:
|
||||
}
|
||||
|
||||
future<> flush_schemas() {
|
||||
return _qp.proxy().get_db().invoke_on_all([this] (replica::database& db) {
|
||||
return parallel_for_each(db::schema_tables::all_table_names(schema_features::full()), [this, &db](const sstring& cf_name) {
|
||||
auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
|
||||
return cf.flush();
|
||||
});
|
||||
});
|
||||
auto& db = _qp.db().real_database();
|
||||
return db.flush_on_all(db::schema_tables::NAME, db::schema_tables::all_table_names(schema_features::full()));
|
||||
}
|
||||
|
||||
future<> migrate() {
|
||||
|
||||
@@ -1042,12 +1042,9 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
|
||||
co_await proxy.local().mutate_locally(std::move(mutations), tracing::trace_state_ptr());
|
||||
|
||||
if (do_flush) {
|
||||
co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> {
|
||||
auto& cfs = column_families;
|
||||
co_await parallel_for_each(cfs.begin(), cfs.end(), [&] (const utils::UUID& id) -> future<> {
|
||||
auto& cf = db.find_column_family(id);
|
||||
co_await cf.flush();
|
||||
});
|
||||
auto& db = proxy.local().local_db();
|
||||
co_await parallel_for_each(column_families, [&db] (const utils::UUID& id) -> future<> {
|
||||
return db.flush_on_all(id);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
*/
|
||||
|
||||
#include <boost/range/adaptors.hpp>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include "db/snapshot-ctl.hh"
|
||||
#include "replica/database.hh"
|
||||
|
||||
@@ -59,24 +61,21 @@ future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_
|
||||
boost::copy(_db.local().get_keyspaces() | boost::adaptors::map_keys, std::back_inserter(keyspace_names));
|
||||
};
|
||||
|
||||
return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] {
|
||||
return parallel_for_each(keyspace_names, [tag, this] (auto& ks_name) {
|
||||
return check_snapshot_not_exist(ks_name, tag);
|
||||
}).then([this, tag, keyspace_names, sf] {
|
||||
return _db.invoke_on_all([tag = std::move(tag), keyspace_names, sf] (replica::database& db) {
|
||||
return parallel_for_each(keyspace_names, [&db, tag = std::move(tag), sf] (auto& ks_name) {
|
||||
auto& ks = db.find_keyspace(ks_name);
|
||||
return parallel_for_each(ks.metadata()->cf_meta_data(), [&db, tag = std::move(tag), sf] (auto& pair) {
|
||||
auto& cf = db.find_column_family(pair.second);
|
||||
return cf.snapshot(db, tag, bool(sf));
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] () mutable {
|
||||
return do_take_snapshot(std::move(tag), std::move(keyspace_names), sf);
|
||||
});
|
||||
}
|
||||
|
||||
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
|
||||
// Takes a snapshot named `tag` of every keyspace listed in `keyspace_names`.
//
// Runs in two awaited phases: check_snapshot_not_exist() is invoked for each
// keyspace first, and only after that has been awaited is snapshot_on_all()
// invoked for each keyspace. `sf` is forwarded to snapshot_on_all() as a bool.
future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
    // Phase 1: the tag is copied into the lambda because it is needed again below.
    co_await parallel_for_each(keyspace_names, [this, tag] (const auto& keyspace) {
        return check_snapshot_not_exist(keyspace, tag);
    });

    // Phase 2: this is the last use of the tag, so it is moved into the lambda.
    co_await parallel_for_each(keyspace_names, [this, sf, tag = std::move(tag)] (const auto& keyspace) {
        return _db.local().snapshot_on_all(keyspace, tag, static_cast<bool>(sf));
    });
}
|
||||
|
||||
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
|
||||
if (ks_name.empty()) {
|
||||
throw std::runtime_error("You must supply a keyspace name");
|
||||
}
|
||||
@@ -87,25 +86,25 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
|
||||
throw std::runtime_error("You must supply a snapshot name.");
|
||||
}
|
||||
|
||||
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] {
|
||||
return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
|
||||
return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
|
||||
return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
|
||||
if (table_name.find(".") != sstring::npos) {
|
||||
throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
|
||||
}
|
||||
return _db.invoke_on_all([ks_name, table_name, tag, sf] (replica::database &db) {
|
||||
auto& cf = db.find_column_family(ks_name, table_name);
|
||||
return cf.snapshot(db, tag, bool(sf));
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf, av] () mutable {
|
||||
return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf, av);
|
||||
});
|
||||
}
|
||||
|
||||
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf) {
|
||||
return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf);
|
||||
// Takes a snapshot named `tag` of the given tables of keyspace `ks_name`.
//
// check_snapshot_not_exist() is awaited first. Every table is then looked up
// in the local database; if a table's schema is a view and `av` is not set,
// std::invalid_argument is thrown before any snapshot is taken. Finally the
// tables are handed to snapshot_on_all(), with `sf` forwarded as a bool.
future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
    co_await check_snapshot_not_exist(ks_name, tag, tables);

    for (const auto& table_name : tables) {
        // The lookup happens for every table, whether or not views are allowed.
        auto& table = _db.local().find_column_family(ks_name, table_name);
        const bool view_rejected = table.schema()->is_view() && !av;
        if (view_rejected) {
            throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
        }
    }

    // Last use of `tables` and `tag`: hand them over instead of copying.
    co_await _db.local().snapshot_on_all(ks_name, std::move(tables), std::move(tag), static_cast<bool>(sf));
}
|
||||
|
||||
// Convenience overload for snapshotting a single table: wraps `cf_name` in a
// one-element vector and delegates to the multi-table overload.
//
// All string arguments are received by value and are not used again here, so
// they are moved into the callee rather than copied a second time.
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf, allow_view_snapshots av) {
    // Built with push_back rather than a braced initializer list, because
    // initializer_list elements are const and would force a copy of cf_name.
    std::vector<sstring> tables;
    tables.push_back(std::move(cf_name));
    return take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf, av);
}
|
||||
|
||||
future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {
|
||||
|
||||
@@ -27,6 +27,7 @@ namespace db {
|
||||
class snapshot_ctl : public peering_sharded_service<snapshot_ctl> {
|
||||
public:
|
||||
using skip_flush = bool_class<class skip_flush_tag>;
|
||||
using allow_view_snapshots = bool_class<class allow_view_snapsots_tag>;
|
||||
|
||||
struct snapshot_details {
|
||||
int64_t live;
|
||||
@@ -64,7 +65,7 @@ public:
|
||||
* @param tables a vector of tables names to snapshot
|
||||
* @param tag the tag given to the snapshot; may not be null or empty
|
||||
*/
|
||||
future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
|
||||
future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
|
||||
|
||||
/**
|
||||
* Takes the snapshot of a specific column family. A snapshot name must be specified.
|
||||
@@ -73,7 +74,7 @@ public:
|
||||
* @param columnFamilyName the column family to snapshot
|
||||
* @param tag the tag given to the snapshot; may not be null or empty
|
||||
*/
|
||||
future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no);
|
||||
future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
|
||||
|
||||
/**
|
||||
* Remove the snapshot with the given name from the given keyspaces.
|
||||
@@ -97,6 +98,9 @@ private:
|
||||
|
||||
template <typename Func>
|
||||
std::result_of_t<Func()> run_snapshot_list_operation(Func&&);
|
||||
|
||||
future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
|
||||
future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -2482,10 +2482,14 @@ class db_config_table final : public streaming_virtual_table {
|
||||
for (auto& c_ref : cfg.values()) {
|
||||
auto& c = c_ref.get();
|
||||
if (c.name() == name) {
|
||||
if (c.set_value(value, utils::config_file::config_source::CQL)) {
|
||||
return cfg.broadcast_to_all_shards();
|
||||
} else {
|
||||
return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
|
||||
try {
|
||||
if (c.set_value(value, utils::config_file::config_source::CQL)) {
|
||||
return cfg.broadcast_to_all_shards();
|
||||
} else {
|
||||
return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
|
||||
}
|
||||
} catch (boost::bad_lexical_cast&) {
|
||||
return make_exception_future<>(virtual_table_update_exception("cannot parse option value"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include <seastar/core/seastar.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/reactor.hh>
|
||||
#include <utility>
|
||||
#include <optional>
|
||||
#include "dht/token.hh"
|
||||
|
||||
@@ -121,6 +121,9 @@ const column_definition* view_info::view_column(const column_definition& base_de
|
||||
|
||||
// Installs a new base info for this view and drops the cached select
// statement and partition slice, since those cached objects may refer to
// the base schema.
void view_info::set_base_info(db::view::base_info_ptr base_info) {
    _base_info = std::move(base_info);

    // Invalidate the caches only after the new base info is in place.
    _select_statement = nullptr;
    _partition_slice.reset();
}
|
||||
|
||||
// A constructor for a base info that can facilitate reads and writes from the materialized view.
|
||||
@@ -322,7 +325,11 @@ public:
|
||||
view_filter_checking_visitor(const schema& base, const view_info& view)
|
||||
: _base(base)
|
||||
, _view(view)
|
||||
, _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
|
||||
, _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
|
||||
boost::copy_range<std::vector<const column_definition*>>(
|
||||
_base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
|
||||
)
|
||||
)
|
||||
{}
|
||||
|
||||
void accept_new_partition(const partition_key& key, uint64_t row_count) {
|
||||
@@ -859,13 +866,18 @@ void view_updates::generate_update(
|
||||
bool same_row = true;
|
||||
for (auto col_id : col_ids) {
|
||||
auto* after = update.cells().find_cell(col_id);
|
||||
// Note: multi-cell columns can't be part of the primary key.
|
||||
auto& cdef = _base->regular_column_at(col_id);
|
||||
if (existing) {
|
||||
auto* before = existing->cells().find_cell(col_id);
|
||||
// Note that this cell is necessarily atomic, because col_ids are
|
||||
// view key columns, and keys must be atomic.
|
||||
if (before && before->as_atomic_cell(cdef).is_live()) {
|
||||
if (after && after->as_atomic_cell(cdef).is_live()) {
|
||||
auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
|
||||
// We need to compare just the values of the keys, not
|
||||
// metadata like the timestamp. This is because below,
|
||||
// if the old and new view row have the same key, we need
|
||||
// to be sure to reach the update_entry() case.
|
||||
auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
|
||||
if (cmp != 0) {
|
||||
same_row = false;
|
||||
}
|
||||
@@ -885,7 +897,13 @@ void view_updates::generate_update(
|
||||
if (same_row) {
|
||||
update_entry(base_key, update, *existing, now);
|
||||
} else {
|
||||
replace_entry(base_key, update, *existing, now);
|
||||
// This code doesn't work if the old and new view row have the
|
||||
// same key, because if they do we get both data and tombstone
|
||||
// for the same timestamp (now) and the tombstone wins. This
|
||||
// is why we need the "same_row" case above - it's not just a
|
||||
// performance optimization.
|
||||
delete_old_entry(base_key, *existing, update, now);
|
||||
create_entry(base_key, update, now);
|
||||
}
|
||||
} else {
|
||||
delete_old_entry(base_key, *existing, update, now);
|
||||
@@ -1293,7 +1311,7 @@ future<> mutate_MV(
|
||||
auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
|
||||
tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
|
||||
mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
|
||||
local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
|
||||
local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
|
||||
[s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
|
||||
units = sem_units.split(sem_units.count())] (future<>&& f) {
|
||||
--stats.writes;
|
||||
@@ -2031,15 +2049,21 @@ public:
|
||||
// Called in the context of a seastar::thread.
|
||||
void view_builder::execute(build_step& step, exponential_backoff_retry r) {
|
||||
gc_clock::time_point now = gc_clock::now();
|
||||
auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(
|
||||
auto compaction_state = make_lw_shared<compact_for_query_state<emit_only_live_rows::yes>>(
|
||||
*step.reader.schema(),
|
||||
now,
|
||||
step.pslice,
|
||||
batch_size,
|
||||
query::max_partitions,
|
||||
view_builder::consumer{*this, step, now});
|
||||
consumer.consume_new_partition(step.current_key); // Initialize the state in case we're resuming a partition
|
||||
query::max_partitions);
|
||||
auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(compaction_state, view_builder::consumer{*this, step, now});
|
||||
auto built = step.reader.consume_in_thread(std::move(consumer));
|
||||
if (auto ds = std::move(*compaction_state).detach_state()) {
|
||||
auto& range_tombstones = std::get<std::deque<range_tombstone>>(ds->range_tombstones);
|
||||
for (auto& rt : range_tombstones) {
|
||||
step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(rt)));
|
||||
}
|
||||
step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(ds->partition_start)));
|
||||
}
|
||||
|
||||
_as.check();
|
||||
|
||||
|
||||
@@ -154,10 +154,7 @@ private:
|
||||
void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
|
||||
void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
|
||||
void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
|
||||
void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
|
||||
create_entry(base_key, update, now);
|
||||
delete_old_entry(base_key, existing, update, now);
|
||||
}
|
||||
void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
|
||||
};
|
||||
|
||||
class view_update_builder {
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "db/view/row_locking.hh"
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include "mutation.hh"
|
||||
#include <seastar/core/circular_buffer.hh>
|
||||
|
||||
class evictable_reader_handle;
|
||||
|
||||
|
||||
@@ -202,6 +202,12 @@ public:
|
||||
});
|
||||
}
|
||||
|
||||
// Requests _max_background_work units from _background_work_flush_serializer
// and, once they are granted, passes them to get_flush_permit() to obtain
// the resulting flush permit.
future<flush_permit> get_all_flush_permits() {
    auto all_units = get_units(_background_work_flush_serializer, _max_background_work);
    return all_units.then([this] (auto&& granted) {
        return this->get_flush_permit(std::move(granted));
    });
}
|
||||
|
||||
bool has_extraneous_flushes_requested() const {
|
||||
return _extraneous_flushes > 0;
|
||||
}
|
||||
|
||||
6
dist/common/scripts/scylla_coredump_setup
vendored
6
dist/common/scripts/scylla_coredump_setup
vendored
@@ -123,10 +123,14 @@ WantedBy=multi-user.target
|
||||
# - Storage: /path/to/file (inaccessible)
|
||||
# - Storage: /path/to/file
|
||||
#
|
||||
# After systemd-v248, available coredump file output changed like this:
|
||||
# - Storage: /path/to/file (present)
|
||||
# We need to support both versions.
|
||||
#
|
||||
# reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913
|
||||
|
||||
corefail = False
|
||||
res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
|
||||
res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
|
||||
# v232 or later
|
||||
if res:
|
||||
corepath = res[0]
|
||||
|
||||
12
dist/common/scripts/scylla_sysconfig_setup
vendored
12
dist/common/scripts/scylla_sysconfig_setup
vendored
@@ -70,7 +70,17 @@ if __name__ == '__main__':
|
||||
network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')
|
||||
|
||||
if args.setup_nic_and_disks:
|
||||
rps_cpus = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
|
||||
res = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout
|
||||
# we need to extract CPU mask from output, since perftune.py may also print warning messages (#10082)
|
||||
match = re.match('(.*\n)?(0x[0-9a-f]+(?:,0x[0-9a-f]+)*)', res, re.DOTALL)
|
||||
try:
|
||||
warning = match.group(1)
|
||||
rps_cpus = match.group(2)
|
||||
except:
|
||||
raise Exception(f'Failed to retrive CPU mask: {res}')
|
||||
# print warning message if available
|
||||
if warning:
|
||||
print(warning.strip())
|
||||
if len(rps_cpus) > 0:
|
||||
cpuset = hex2list(rps_cpus)
|
||||
run('/opt/scylladb/scripts/scylla_cpuset_setup --cpuset {}'.format(cpuset), shell=True, check=True)
|
||||
|
||||
6
dist/common/supervisor/scylla_util.sh
vendored
6
dist/common/supervisor/scylla_util.sh
vendored
@@ -6,12 +6,16 @@ is_nonroot() {
|
||||
[ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
|
||||
}
|
||||
|
||||
# Succeeds when the marker file SCYLLA-CONTAINER-FILE exists as a regular
# file under "$scylladir".
is_container() {
    test -f "$scylladir"/SCYLLA-CONTAINER-FILE
}
|
||||
|
||||
# Succeeds when the effective user id is 0. EUID is used when set, otherwise
# UID. The expansion is quoted so that, if neither variable is set, the test
# is a well-formed (false) string comparison instead of the malformed
# expression `[ = 0 ]`.
is_privileged() {
    [ "${EUID:-${UID}}" = 0 ]
}
|
||||
|
||||
execsudo() {
|
||||
if is_nonroot; then
|
||||
if is_nonroot || is_container; then
|
||||
exec "$@"
|
||||
else
|
||||
exec sudo -u scylla -g scylla "$@"
|
||||
|
||||
4
dist/docker/debian/build_docker.sh
vendored
4
dist/docker/debian/build_docker.sh
vendored
@@ -82,15 +82,17 @@ run bash -ec "echo 'debconf debconf/frontend select Noninteractive' | debconf-se
|
||||
run bash -ec "rm -rf /etc/rsyslog.conf"
|
||||
run apt-get -y install hostname supervisor openssh-server openssh-client openjdk-11-jre-headless python python-yaml curl rsyslog locales sudo
|
||||
run locale-gen en_US.UTF-8
|
||||
run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF_8
|
||||
run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8
|
||||
run bash -ec "dpkg -i packages/*.deb"
|
||||
run apt-get -y clean all
|
||||
run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
|
||||
run mkdir -p /etc/supervisor.conf.d
|
||||
run mkdir -p /var/log/scylla
|
||||
run chown -R scylla:scylla /var/lib/scylla
|
||||
run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server
|
||||
|
||||
run mkdir -p /opt/scylladb/supervisor
|
||||
run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
|
||||
bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
|
||||
bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
|
||||
bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
[program:scylla-server]
|
||||
[program:scylla]
|
||||
command=/opt/scylladb/supervisor/scylla-server.sh
|
||||
stdout_logfile=/dev/stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
|
||||
41
dist/docker/etc/sysconfig/scylla-server
vendored
41
dist/docker/etc/sysconfig/scylla-server
vendored
@@ -1,41 +0,0 @@
|
||||
# choose following mode: virtio, dpdk, posix
|
||||
NETWORK_MODE=posix
|
||||
|
||||
# tap device name(virtio)
|
||||
TAP=tap0
|
||||
|
||||
# bridge device name (virtio)
|
||||
BRIDGE=virbr0
|
||||
|
||||
# ethernet device name
|
||||
IFNAME=eth0
|
||||
|
||||
# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
|
||||
SET_NIC_AND_DISKS=no
|
||||
|
||||
# ethernet device driver (dpdk)
|
||||
ETHDRV=
|
||||
|
||||
# ethernet device PCI ID (dpdk)
|
||||
ETHPCIID=
|
||||
|
||||
# number of hugepages
|
||||
NR_HUGEPAGES=64
|
||||
|
||||
# user for process (must be root for dpdk)
|
||||
USER=scylla
|
||||
|
||||
# group for process
|
||||
GROUP=scylla
|
||||
|
||||
# scylla home dir
|
||||
SCYLLA_HOME=/var/lib/scylla
|
||||
|
||||
# scylla config dir
|
||||
SCYLLA_CONF=/etc/scylla
|
||||
|
||||
# scylla arguments
|
||||
SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
|
||||
|
||||
# setup as AMI instance
|
||||
AMI=no
|
||||
@@ -32,7 +32,7 @@
|
||||
logging::logger fmr_logger("flat_mutation_reader");
|
||||
|
||||
flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o) noexcept {
|
||||
if (_impl) {
|
||||
if (_impl && _impl->is_close_required()) {
|
||||
impl* ip = _impl.get();
|
||||
// Abort to enforce calling close() before readers are closed
|
||||
// to prevent leaks and potential use-after-free due to background
|
||||
@@ -45,7 +45,7 @@ flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o)
|
||||
}
|
||||
|
||||
flat_mutation_reader::~flat_mutation_reader() {
|
||||
if (_impl) {
|
||||
if (_impl && _impl->is_close_required()) {
|
||||
impl* ip = _impl.get();
|
||||
// Abort to enforce calling close() before readers are closed
|
||||
// to prevent leaks and potential use-after-free due to background
|
||||
@@ -1580,6 +1580,9 @@ bool mutation_fragment_stream_validator::operator()(dht::token t) {
|
||||
}
|
||||
|
||||
bool mutation_fragment_stream_validator::operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos) {
|
||||
if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
|
||||
return false;
|
||||
}
|
||||
if (_prev_kind == mutation_fragment_v2::kind::partition_end) {
|
||||
const bool valid = (kind == mutation_fragment_v2::kind::partition_start);
|
||||
if (valid) {
|
||||
@@ -1607,7 +1610,11 @@ bool mutation_fragment_stream_validator::operator()(mutation_fragment::kind kind
|
||||
}
|
||||
|
||||
bool mutation_fragment_stream_validator::operator()(const mutation_fragment_v2& mf) {
|
||||
return (*this)(mf.mutation_fragment_kind(), mf.position());
|
||||
const auto valid = (*this)(mf.mutation_fragment_kind(), mf.position());
|
||||
if (valid && mf.is_range_tombstone_change()) {
|
||||
_current_tombstone = mf.as_range_tombstone_change().tombstone();
|
||||
}
|
||||
return valid;
|
||||
}
|
||||
bool mutation_fragment_stream_validator::operator()(const mutation_fragment& mf) {
|
||||
return (*this)(to_mutation_fragment_kind_v2(mf.mutation_fragment_kind()), mf.position());
|
||||
@@ -1646,11 +1653,17 @@ void mutation_fragment_stream_validator::reset(dht::decorated_key dk) {
|
||||
_prev_partition_key = dk;
|
||||
_prev_pos = position_in_partition::for_partition_start();
|
||||
_prev_kind = mutation_fragment_v2::kind::partition_start;
|
||||
_current_tombstone = {};
|
||||
}
|
||||
|
||||
void mutation_fragment_stream_validator::reset(const mutation_fragment_v2& mf) {
|
||||
_prev_pos = mf.position();
|
||||
_prev_kind = mf.mutation_fragment_kind();
|
||||
if (mf.is_range_tombstone_change()) {
|
||||
_current_tombstone = mf.as_range_tombstone_change().tombstone();
|
||||
} else {
|
||||
_current_tombstone = {};
|
||||
}
|
||||
}
|
||||
void mutation_fragment_stream_validator::reset(const mutation_fragment& mf) {
|
||||
_prev_pos = mf.position();
|
||||
@@ -1719,6 +1732,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2
|
||||
|
||||
fmr_logger.debug("[validator {}] {}:{}", static_cast<void*>(this), kind, pos);
|
||||
|
||||
if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
|
||||
on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected active tombstone at partition-end: partition key {}: tombstone {}",
|
||||
static_cast<void*>(this), _name, _validator.previous_partition_key(), _current_tombstone));
|
||||
}
|
||||
|
||||
if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
|
||||
valid = _validator(kind, pos);
|
||||
} else {
|
||||
@@ -1745,7 +1763,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment::k
|
||||
}
|
||||
|
||||
bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment_v2& mv) {
|
||||
return (*this)(mv.mutation_fragment_kind(), mv.position());
|
||||
auto valid = (*this)(mv.mutation_fragment_kind(), mv.position());
|
||||
if (valid && mv.is_range_tombstone_change()) {
|
||||
_current_tombstone = mv.as_range_tombstone_change().tombstone();
|
||||
}
|
||||
return valid;
|
||||
}
|
||||
bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment& mv) {
|
||||
return (*this)(to_mutation_fragment_kind_v2(mv.mutation_fragment_kind()), mv.position());
|
||||
@@ -1764,7 +1786,7 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() {
|
||||
}
|
||||
|
||||
flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept {
|
||||
if (_impl) {
|
||||
if (_impl && _impl->is_close_required()) {
|
||||
impl* ip = _impl.get();
|
||||
// Abort to enforce calling close() before readers are closed
|
||||
// to prevent leaks and potential use-after-free due to background
|
||||
@@ -1777,7 +1799,7 @@ flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader
|
||||
}
|
||||
|
||||
flat_mutation_reader_v2::~flat_mutation_reader_v2() {
|
||||
if (_impl) {
|
||||
if (_impl && _impl->is_close_required()) {
|
||||
impl* ip = _impl.get();
|
||||
// Abort to enforce calling close() before readers are closed
|
||||
// to prevent leaks and potential use-after-free due to background
|
||||
|
||||
@@ -132,6 +132,7 @@ public:
|
||||
private:
|
||||
tracked_buffer _buffer;
|
||||
size_t _buffer_size = 0;
|
||||
bool _close_required = false;
|
||||
protected:
|
||||
size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
|
||||
bool _end_of_stream = false;
|
||||
@@ -167,6 +168,8 @@ public:
|
||||
bool is_end_of_stream() const { return _end_of_stream; }
|
||||
bool is_buffer_empty() const { return _buffer.empty(); }
|
||||
bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
|
||||
bool is_close_required() const { return _close_required; }
|
||||
void set_close_required() { _close_required = true; }
|
||||
static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }
|
||||
|
||||
mutation_fragment pop_mutation_fragment() {
|
||||
@@ -504,9 +507,15 @@ public:
|
||||
//
|
||||
// Can be used to skip over entire partitions if interleaved with
|
||||
// `operator()()` calls.
|
||||
future<> next_partition() { return _impl->next_partition(); }
|
||||
future<> next_partition() {
|
||||
_impl->set_close_required();
|
||||
return _impl->next_partition();
|
||||
}
|
||||
|
||||
future<> fill_buffer() { return _impl->fill_buffer(); }
|
||||
future<> fill_buffer() {
|
||||
_impl->set_close_required();
|
||||
return _impl->fill_buffer();
|
||||
}
|
||||
|
||||
// Changes the range of partitions to pr. The range can only be moved
|
||||
// forwards. pr.begin() needs to be larger than pr.end() of the previously
|
||||
@@ -515,6 +524,7 @@ public:
|
||||
// pr needs to be valid until the reader is destroyed or fast_forward_to()
|
||||
// is called again.
|
||||
future<> fast_forward_to(const dht::partition_range& pr) {
|
||||
_impl->set_close_required();
|
||||
return _impl->fast_forward_to(pr);
|
||||
}
|
||||
// Skips to a later range of rows.
|
||||
@@ -544,6 +554,7 @@ public:
|
||||
// In particular one must first enter a partition by fetching a `partition_start`
|
||||
// fragment before calling `fast_forward_to`.
|
||||
future<> fast_forward_to(position_range cr) {
|
||||
_impl->set_close_required();
|
||||
return _impl->fast_forward_to(std::move(cr));
|
||||
}
|
||||
// Closes the reader.
|
||||
|
||||
@@ -164,6 +164,7 @@ public:
|
||||
private:
|
||||
tracked_buffer _buffer;
|
||||
size_t _buffer_size = 0;
|
||||
bool _close_required = false;
|
||||
protected:
|
||||
size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
|
||||
|
||||
@@ -205,6 +206,8 @@ public:
|
||||
bool is_end_of_stream() const { return _end_of_stream; }
|
||||
bool is_buffer_empty() const { return _buffer.empty(); }
|
||||
bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
|
||||
bool is_close_required() const { return _close_required; }
|
||||
void set_close_required() { _close_required = true; }
|
||||
static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }
|
||||
|
||||
mutation_fragment_v2 pop_mutation_fragment() {
|
||||
@@ -542,9 +545,15 @@ public:
|
||||
//
|
||||
// Can be used to skip over entire partitions if interleaved with
|
||||
// `operator()()` calls.
|
||||
future<> next_partition() { return _impl->next_partition(); }
|
||||
future<> next_partition() {
|
||||
_impl->set_close_required();
|
||||
return _impl->next_partition();
|
||||
}
|
||||
|
||||
future<> fill_buffer() { return _impl->fill_buffer(); }
|
||||
future<> fill_buffer() {
|
||||
_impl->set_close_required();
|
||||
return _impl->fill_buffer();
|
||||
}
|
||||
|
||||
// Changes the range of partitions to pr. The range can only be moved
|
||||
// forwards. pr.begin() needs to be larger than pr.end() of the previously
|
||||
@@ -553,6 +562,7 @@ public:
|
||||
// pr needs to be valid until the reader is destroyed or fast_forward_to()
|
||||
// is called again.
|
||||
future<> fast_forward_to(const dht::partition_range& pr) {
|
||||
_impl->set_close_required();
|
||||
return _impl->fast_forward_to(pr);
|
||||
}
|
||||
// Skips to a later range of rows.
|
||||
@@ -582,6 +592,7 @@ public:
|
||||
// In particular one must first enter a partition by fetching a `partition_start`
|
||||
// fragment before calling `fast_forward_to`.
|
||||
future<> fast_forward_to(position_range cr) {
|
||||
_impl->set_close_required();
|
||||
return _impl->fast_forward_to(std::move(cr));
|
||||
}
|
||||
// Closes the reader.
|
||||
|
||||
18
install.sh
18
install.sh
@@ -143,7 +143,7 @@ export LD_LIBRARY_PATH="$prefix/libreloc"
|
||||
export UBSAN_OPTIONS="${UBSAN_OPTIONS:+$UBSAN_OPTIONS:}suppressions=$prefix/libexec/ubsan-suppressions.supp"
|
||||
exec -a "\$0" "$prefix/libexec/$bin" "\$@"
|
||||
EOF
|
||||
chmod +x "$root/$prefix/bin/$bin"
|
||||
chmod 755 "$root/$prefix/bin/$bin"
|
||||
}
|
||||
|
||||
relocate_python3() {
|
||||
@@ -156,11 +156,11 @@ relocate_python3() {
|
||||
local pythonpath="$(dirname "$pythoncmd")"
|
||||
|
||||
if [ ! -x "$script" ]; then
|
||||
cp "$script" "$install"
|
||||
install -m755 "$script" "$install"
|
||||
return
|
||||
fi
|
||||
mkdir -p "$relocateddir"
|
||||
cp "$script" "$relocateddir"
|
||||
install -d -m755 "$relocateddir"
|
||||
install -m755 "$script" "$relocateddir"
|
||||
cat > "$install"<<EOF
|
||||
#!/usr/bin/env bash
|
||||
[[ -z "\$LD_PRELOAD" ]] || { echo "\$0: not compatible with LD_PRELOAD" >&2; exit 110; }
|
||||
@@ -178,7 +178,7 @@ if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
|
||||
fi
|
||||
PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
|
||||
EOF
|
||||
chmod +x "$install"
|
||||
chmod 755 "$install"
|
||||
}
|
||||
|
||||
install() {
|
||||
@@ -392,6 +392,7 @@ install -d -m755 -d "$rprefix"/scyllatop
|
||||
cp -r tools/scyllatop/* "$rprefix"/scyllatop
|
||||
install -d -m755 -d "$rprefix"/scripts
|
||||
cp -r dist/common/scripts/* "$rprefix"/scripts
|
||||
chmod 755 "$rprefix"/scripts/*
|
||||
ln -srf "$rprefix/scyllatop/scyllatop.py" "$rprefix/bin/scyllatop"
|
||||
if $supervisor; then
|
||||
install -d -m755 "$rprefix"/supervisor
|
||||
@@ -508,8 +509,13 @@ relocate_python3 "$rprefix"/scripts fix_system_distributed_tables.py
|
||||
if $supervisor; then
|
||||
install -d -m755 `supervisor_dir $retc`
|
||||
for service in scylla-server scylla-jmx scylla-node-exporter; do
|
||||
if [ "$service" = "scylla-server" ]; then
|
||||
program="scylla"
|
||||
else
|
||||
program=$service
|
||||
fi
|
||||
cat << EOS > `supervisor_conf $retc $service`
|
||||
[program:$service]
|
||||
[program:$program]
|
||||
directory=$rprefix
|
||||
command=/bin/bash -c './supervisor/$service.sh'
|
||||
EOS
|
||||
|
||||
@@ -34,6 +34,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
|
||||
}
|
||||
|
||||
future<> azure_snitch::load_config() {
|
||||
if (this_shard_id() != io_cpu_id()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
|
||||
sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#include "locator/ec2_snitch.hh"
|
||||
#include <seastar/core/seastar.hh>
|
||||
#include <seastar/core/sleep.hh>
|
||||
#include <seastar/core/do_with.hh>
|
||||
|
||||
#include <boost/algorithm/string/classification.hpp>
|
||||
#include <boost/algorithm/string/split.hpp>
|
||||
@@ -67,6 +69,30 @@ future<> ec2_snitch::start() {
|
||||
}
|
||||
|
||||
// Queries the AWS metadata server, retrying on failure.
//
// Each attempt is delegated to aws_api_call_once(). When an attempt fails with
// a std::system_error, the error is logged and the call is repeated after
// AWS_API_CALL_RETRY_INTERVAL. Once the attempt counter reaches
// AWS_API_CALL_RETRIES - 1 the original exception is rethrown instead.
// Exceptions of any other type are not caught here.
//
// addr: address of the metadata server
// port: port of the metadata server
// cmd:  request forwarded unchanged to aws_api_call_once()
//
// Resolves to the value produced by the first successful attempt.
future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cmd) {
    // `i` counts the attempts started so far; do_with() keeps it alive until
    // the whole retry loop has resolved.
    return do_with(int(0), [this, addr, port, cmd] (int& i) {
        return repeat_until_value([this, addr, port, cmd, &i]() -> future<std::optional<sstring>> {
            ++i;
            return aws_api_call_once(addr, port, cmd).then([] (auto res) {
                // An engaged optional stops repeat_until_value().
                return make_ready_future<std::optional<sstring>>(std::move(res));
            }).handle_exception([&i] (auto ep) {
                try {
                    std::rethrow_exception(ep);
                } catch (const std::system_error& e) {
                    // Pass what() as an argument, never as the format string:
                    // the message may itself contain '{' or '}'.
                    logger().error("{}", e.what());
                    if (i >= AWS_API_CALL_RETRIES - 1) {
                        logger().error("Maximum number of retries exceeded");
                        // Rethrow the in-flight exception rather than a copy of
                        // `e`, so types derived from std::system_error are not
                        // sliced down to the base class.
                        throw;
                    }
                }
                // A disengaged optional asks repeat_until_value() for another round.
                return sleep(AWS_API_CALL_RETRY_INTERVAL).then([] {
                    return make_ready_future<std::optional<sstring>>(std::nullopt);
                });
            });
        });
    });
}
|
||||
|
||||
future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd) {
|
||||
return connect(socket_address(inet_address{addr}, port))
|
||||
.then([this, addr, cmd] (connected_socket fd) {
|
||||
_sd = std::move(fd);
|
||||
|
||||
@@ -16,6 +16,8 @@ public:
|
||||
static constexpr const char* ZONE_NAME_QUERY_REQ = "/latest/meta-data/placement/availability-zone";
|
||||
static constexpr const char* AWS_QUERY_SERVER_ADDR = "169.254.169.254";
|
||||
static constexpr uint16_t AWS_QUERY_SERVER_PORT = 80;
|
||||
static constexpr int AWS_API_CALL_RETRIES = 5;
|
||||
static constexpr auto AWS_API_CALL_RETRY_INTERVAL = std::chrono::seconds{5};
|
||||
|
||||
ec2_snitch(const sstring& fname = "", unsigned io_cpu_id = 0);
|
||||
virtual future<> start() override;
|
||||
@@ -32,5 +34,6 @@ private:
|
||||
output_stream<char> _out;
|
||||
http_response_parser _parser;
|
||||
sstring _zone_req;
|
||||
future<sstring> aws_api_call_once(sstring addr, uint16_t port, const sstring cmd);
|
||||
};
|
||||
} // namespace locator
|
||||
|
||||
39
main.cc
39
main.cc
@@ -367,11 +367,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
|
||||
startlog.info("Shutting down {}", what);
|
||||
try {
|
||||
func();
|
||||
startlog.info("Shutting down {} was successful", what);
|
||||
} catch (...) {
|
||||
startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
|
||||
throw;
|
||||
auto ex = std::current_exception();
|
||||
bool do_abort = true;
|
||||
try {
|
||||
std::rethrow_exception(ex);
|
||||
} catch (const std::system_error& e) {
|
||||
// System error codes we consider "environmental",
|
||||
// i.e. not scylla's fault, therefore there is no point in
|
||||
// aborting and dumping core.
|
||||
for (int i : {EIO, EACCES, ENOSPC}) {
|
||||
if (e.code() == std::error_code(i, std::system_category())) {
|
||||
do_abort = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
}
|
||||
auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
|
||||
if (do_abort) {
|
||||
startlog.error("{}: aborting", msg);
|
||||
abort();
|
||||
} else {
|
||||
startlog.error("{}: exiting, at {}", msg, current_backtrace());
|
||||
|
||||
// Call _exit() rather than exit() to exit immediately
|
||||
// without calling exit handlers, avoiding
|
||||
// boost::intrusive::detail::destructor_impl assert failure
|
||||
// from ~segment_pool exit handler.
|
||||
_exit(255);
|
||||
}
|
||||
}
|
||||
startlog.info("Shutting down {} was successful", what);
|
||||
};
|
||||
|
||||
auto ret = deferred_action(std::move(vfunc));
|
||||
@@ -547,6 +574,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
|
||||
cfg->broadcast_to_all_shards().get();
|
||||
|
||||
// We pass this piece of config through a global as a temporary hack.
|
||||
// See the comment at the definition of sstables::global_cache_index_pages.
|
||||
smp::invoke_on_all([&cfg] {
|
||||
sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
|
||||
}).get();
|
||||
|
||||
::sighup_handler sighup_handler(opts, *cfg);
|
||||
auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
|
||||
sighup_handler.stop().get();
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "sstables/shared_sstable.hh"
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/io_priority_class.hh>
|
||||
#include "reader_permit.hh"
|
||||
|
||||
class memtable;
|
||||
class flat_mutation_reader;
|
||||
|
||||
@@ -438,6 +438,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
|
||||
// should not be blocked by any data requests.
|
||||
case messaging_verb::GROUP0_PEER_EXCHANGE:
|
||||
case messaging_verb::GROUP0_MODIFY_CONFIG:
|
||||
// ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
|
||||
// setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
|
||||
return 0;
|
||||
case messaging_verb::PREPARE_MESSAGE:
|
||||
case messaging_verb::PREPARE_DONE_MESSAGE:
|
||||
@@ -695,7 +697,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
}();
|
||||
|
||||
auto must_tcp_nodelay = [&] {
|
||||
if (idx == 1) {
|
||||
if (idx == 0) {
|
||||
return true; // gossip
|
||||
}
|
||||
if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {
|
||||
@@ -716,10 +718,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
}
|
||||
opts.tcp_nodelay = must_tcp_nodelay;
|
||||
opts.reuseaddr = true;
|
||||
// We send cookies only for non-default statement tenant clients.
|
||||
if (idx > 3) {
|
||||
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
|
||||
}
|
||||
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
|
||||
|
||||
auto client = must_encrypt ?
|
||||
::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
|
||||
|
||||
@@ -272,8 +272,8 @@ public:
|
||||
|
||||
future<> lookup_readers(db::timeout_clock::time_point timeout);
|
||||
|
||||
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
|
||||
std::optional<clustering_key_prefix> last_ckey);
|
||||
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
|
||||
dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey);
|
||||
|
||||
future<> stop();
|
||||
};
|
||||
@@ -580,19 +580,22 @@ future<> read_context::lookup_readers(db::timeout_clock::time_point timeout) {
|
||||
});
|
||||
}
|
||||
|
||||
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
|
||||
std::optional<clustering_key_prefix> last_ckey) {
|
||||
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
|
||||
dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey) {
|
||||
if (_cmd.query_uuid == utils::UUID{}) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
auto last_pkey = compaction_state.partition_start.key();
|
||||
|
||||
const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
|
||||
tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);
|
||||
|
||||
const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
|
||||
tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
|
||||
auto cs_stats = dismantle_buffer_stats{};
|
||||
if (compaction_state) {
|
||||
cs_stats = dismantle_compaction_state(std::move(*compaction_state));
|
||||
tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
|
||||
} else {
|
||||
tracing::trace(_trace_state, "No compaction state to dismantle, partition exhausted", cs_stats);
|
||||
}
|
||||
|
||||
return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
|
||||
const std::optional<clustering_key_prefix>& last_ckey) {
|
||||
@@ -745,16 +748,18 @@ future<typename ResultBuilder::result_type> do_query(
|
||||
ResultBuilder&& result_builder) {
|
||||
auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);
|
||||
|
||||
co_await ctx->lookup_readers(timeout);
|
||||
|
||||
std::exception_ptr ex;
|
||||
|
||||
try {
|
||||
co_await ctx->lookup_readers(timeout);
|
||||
|
||||
auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
|
||||
std::move(result_builder));
|
||||
|
||||
if (compaction_state->are_limits_reached() || result.is_short_read()) {
|
||||
co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_ckey));
|
||||
// Must be called before calling `detach_state()`.
|
||||
auto last_pkey = *compaction_state->current_partition();
|
||||
co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_pkey), std::move(last_ckey));
|
||||
}
|
||||
|
||||
co_await ctx->stop();
|
||||
|
||||
@@ -167,6 +167,9 @@ class compact_mutation_state {
|
||||
std::unique_ptr<mutation_compactor_garbage_collector> _collector;
|
||||
|
||||
compaction_stats _stats;
|
||||
|
||||
// Remember if we requested to stop mid-partition.
|
||||
stop_iteration _stop = stop_iteration::no;
|
||||
private:
|
||||
template <typename Consumer, typename GCConsumer>
|
||||
requires CompactedFragmentsConsumer<Consumer> && CompactedFragmentsConsumer<GCConsumer>
|
||||
@@ -304,6 +307,7 @@ public:
|
||||
}
|
||||
|
||||
void consume_new_partition(const dht::decorated_key& dk) {
|
||||
_stop = stop_iteration::no;
|
||||
auto& pk = dk.key();
|
||||
_dk = &dk;
|
||||
_return_static_content_on_partition_with_no_rows =
|
||||
@@ -370,9 +374,9 @@ public:
|
||||
_static_row_live = is_live;
|
||||
if (is_live || (!only_live() && !sr.empty())) {
|
||||
partition_is_not_empty(consumer);
|
||||
return consumer.consume(std::move(sr), current_tombstone, is_live);
|
||||
_stop = consumer.consume(std::move(sr), current_tombstone, is_live);
|
||||
}
|
||||
return stop_iteration::no;
|
||||
return _stop;
|
||||
}
|
||||
|
||||
template <typename Consumer, typename GCConsumer>
|
||||
@@ -424,22 +428,21 @@ public:
|
||||
};
|
||||
|
||||
if (only_live() && is_live) {
|
||||
auto stop = consume_row();
|
||||
_stop = consume_row();
|
||||
if (++_rows_in_current_partition == _current_partition_limit) {
|
||||
return stop_iteration::yes;
|
||||
_stop = stop_iteration::yes;
|
||||
}
|
||||
return stop;
|
||||
return _stop;
|
||||
} else if (!only_live()) {
|
||||
auto stop = stop_iteration::no;
|
||||
if (!cr.empty()) {
|
||||
stop = consume_row();
|
||||
_stop = consume_row();
|
||||
}
|
||||
if (!sstable_compaction() && is_live && ++_rows_in_current_partition == _current_partition_limit) {
|
||||
return stop_iteration::yes;
|
||||
_stop = stop_iteration::yes;
|
||||
}
|
||||
return stop;
|
||||
return _stop;
|
||||
}
|
||||
return stop_iteration::no;
|
||||
return _stop;
|
||||
}
|
||||
|
||||
template <typename Consumer, typename GCConsumer>
|
||||
@@ -448,7 +451,8 @@ public:
|
||||
++_stats.range_tombstones;
|
||||
_range_tombstones.apply(rt);
|
||||
// FIXME: drop tombstone if it is fully covered by other range tombstones
|
||||
return do_consume(std::move(rt), consumer, gc_consumer);
|
||||
_stop = do_consume(std::move(rt), consumer, gc_consumer);
|
||||
return _stop;
|
||||
}
|
||||
|
||||
template <typename Consumer, typename GCConsumer>
|
||||
@@ -459,9 +463,9 @@ public:
|
||||
_rt_assembler.emplace();
|
||||
}
|
||||
if (auto rt_opt = _rt_assembler->consume(_schema, std::move(rtc))) {
|
||||
return do_consume(std::move(*rt_opt), consumer, gc_consumer);
|
||||
_stop = do_consume(std::move(*rt_opt), consumer, gc_consumer);
|
||||
}
|
||||
return stop_iteration::no;
|
||||
return _stop;
|
||||
}
|
||||
|
||||
template <typename Consumer, typename GCConsumer>
|
||||
@@ -562,16 +566,31 @@ public:
|
||||
/// compactor will result in the new compactor being in the same state *this
|
||||
/// is (given the same outside parameters of course). Practically this
|
||||
/// allows the compaction state to be stored in the compacted reader.
|
||||
detached_compaction_state detach_state() && {
|
||||
/// If the currently compacted partition is exhausted a disengaged optional
|
||||
/// is returned -- in this case there is no state to detach.
|
||||
std::optional<detached_compaction_state> detach_state() && {
|
||||
// If we exhausted the partition, there is no need to detach-restore the
|
||||
// compaction state.
|
||||
// We exhausted the partition if `consume_partition_end()` was called
|
||||
// without us requesting the consumption to stop (remembered in _stop)
|
||||
// from one of the consume() overloads.
|
||||
// The consume algorithm calls `consume_partition_end()` in two cases:
|
||||
// * on a partition-end fragment
|
||||
// * consume() requested to stop
|
||||
// In the latter case, the partition is not exhausted. Even if the next
|
||||
// fragment to process is a partition-end, it will not be consumed.
|
||||
if (!_stop) {
|
||||
return {};
|
||||
}
|
||||
partition_start ps(std::move(_last_dk), _range_tombstones.get_partition_tombstone());
|
||||
if (_rt_assembler) {
|
||||
if (_current_tombstone) {
|
||||
return {std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
|
||||
return detached_compaction_state{std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
|
||||
} else {
|
||||
return {std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
|
||||
return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
|
||||
}
|
||||
}
|
||||
return {std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
|
||||
return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
|
||||
}
|
||||
|
||||
const compaction_stats& stats() const { return _stats; }
|
||||
|
||||
@@ -28,6 +28,7 @@ class mutation_fragment_stream_validator {
|
||||
mutation_fragment_v2::kind _prev_kind;
|
||||
position_in_partition _prev_pos;
|
||||
dht::decorated_key _prev_partition_key;
|
||||
tombstone _current_tombstone;
|
||||
public:
|
||||
explicit mutation_fragment_stream_validator(const schema& s);
|
||||
|
||||
@@ -122,6 +123,12 @@ public:
|
||||
const position_in_partition& previous_position() const {
|
||||
return _prev_pos;
|
||||
}
|
||||
/// Get the current effective tombstone
|
||||
///
|
||||
/// Not meaningful when operator()(mutation_fragment_v2) is not used.
|
||||
tombstone current_tombstone() const {
|
||||
return _current_tombstone;
|
||||
}
|
||||
/// The previous valid partition key.
|
||||
///
|
||||
/// Only valid if `operator()(const dht::decorated_key&)` or
|
||||
@@ -151,6 +158,7 @@ class mutation_fragment_stream_validating_filter {
|
||||
mutation_fragment_stream_validator _validator;
|
||||
sstring _name;
|
||||
mutation_fragment_stream_validation_level _validation_level;
|
||||
tombstone _current_tombstone;
|
||||
|
||||
public:
|
||||
/// Constructor.
|
||||
|
||||
@@ -826,6 +826,7 @@ public:
|
||||
|
||||
void apply(tombstone deleted_at) {
|
||||
_deleted_at.apply(deleted_at);
|
||||
maybe_shadow();
|
||||
}
|
||||
|
||||
void apply(shadowable_tombstone deleted_at) {
|
||||
|
||||
@@ -1581,11 +1581,7 @@ private:
|
||||
tracing::global_trace_state_ptr _trace_state;
|
||||
const mutation_reader::forwarding _fwd_mr;
|
||||
reader_concurrency_semaphore::inactive_read_handle _irh;
|
||||
bool _drop_partition_start = false;
|
||||
bool _drop_static_row = false;
|
||||
// Validate the partition key of the first emitted partition, set after the
|
||||
// reader was recreated.
|
||||
bool _validate_partition_key = false;
|
||||
bool _reader_recreated = false; // set if reader was recreated since last operation
|
||||
position_in_partition::tri_compare _tri_cmp;
|
||||
|
||||
std::optional<dht::decorated_key> _last_pkey;
|
||||
@@ -1606,10 +1602,9 @@ private:
|
||||
void adjust_partition_slice();
|
||||
flat_mutation_reader_v2 recreate_reader();
|
||||
future<flat_mutation_reader_v2> resume_or_create_reader();
|
||||
void maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer);
|
||||
void validate_partition_start(const partition_start& ps);
|
||||
void validate_position_in_partition(position_in_partition_view pos) const;
|
||||
bool should_drop_fragment(const mutation_fragment_v2& mf);
|
||||
future<> do_fill_buffer();
|
||||
void examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3);
|
||||
|
||||
public:
|
||||
evictable_reader_v2(
|
||||
@@ -1725,9 +1720,6 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
|
||||
_range_override.reset();
|
||||
_slice_override.reset();
|
||||
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
|
||||
if (_last_pkey) {
|
||||
bool partition_range_is_inclusive = true;
|
||||
|
||||
@@ -1736,11 +1728,8 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
|
||||
partition_range_is_inclusive = false;
|
||||
break;
|
||||
case partition_region::static_row:
|
||||
_drop_partition_start = true;
|
||||
break;
|
||||
case partition_region::clustered:
|
||||
_drop_partition_start = true;
|
||||
_drop_static_row = true;
|
||||
adjust_partition_slice();
|
||||
slice = &*_slice_override;
|
||||
break;
|
||||
@@ -1763,7 +1752,7 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
|
||||
_range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, partition_range_is_inclusive)}, _pr->end());
|
||||
range = &*_range_override;
|
||||
|
||||
_validate_partition_key = true;
|
||||
_reader_recreated = true;
|
||||
}
|
||||
|
||||
return _ms.make_reader_v2(
|
||||
@@ -1788,41 +1777,33 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
|
||||
co_return recreate_reader();
|
||||
}
|
||||
|
||||
void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer) {
|
||||
if (!_validate_partition_key || buffer.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If this is set we can assume the first fragment is a partition-start.
|
||||
const auto& ps = buffer.front().as_partition_start();
|
||||
void evictable_reader_v2::validate_partition_start(const partition_start& ps) {
|
||||
const auto tri_cmp = dht::ring_position_comparator(*_schema);
|
||||
// If we recreated the reader after fast-forwarding it we won't have
|
||||
// _last_pkey set. In this case it is enough to check if the partition
|
||||
// is in range.
|
||||
if (_last_pkey) {
|
||||
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
|
||||
if (_drop_partition_start) { // we expect to continue from the same partition
|
||||
if (_next_position_in_partition.region() != partition_region::partition_start) { // we expect to continue from the same partition
|
||||
// We cannot assume the partition we stopped the read at is still alive
|
||||
// when we recreate the reader. It might have been compacted away in the
|
||||
// meanwhile, so allow for a larger partition too.
|
||||
require(
|
||||
cmp_res <= 0,
|
||||
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {}, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
// Reset drop flags and next pos if we are not continuing from the same partition
|
||||
// Reset next pos if we are not continuing from the same partition
|
||||
if (cmp_res < 0) {
|
||||
// Close previous partition, we are not going to continue it.
|
||||
push_mutation_fragment(*_schema, _permit, partition_end{});
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
_next_position_in_partition = position_in_partition::for_partition_start();
|
||||
}
|
||||
} else { // should be a larger partition
|
||||
require(
|
||||
cmp_res < 0,
|
||||
"{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
|
||||
"{}(): validation failed, expected partition with key larger than _last_pkey {}, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
@@ -1836,8 +1817,6 @@ void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_rea
|
||||
__FUNCTION__,
|
||||
prange,
|
||||
ps.key());
|
||||
|
||||
_validate_partition_key = false;
|
||||
}
|
||||
|
||||
void evictable_reader_v2::validate_position_in_partition(position_in_partition_view pos) const {
|
||||
@@ -1860,7 +1839,12 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
|
||||
const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
|
||||
// TODO: somehow avoid this copy
|
||||
auto range = position_range(cr);
|
||||
return range.contains(*_schema, pos);
|
||||
// We cannot use range.contains() because that treats range as a
|
||||
// [a, b) range, meaning a range tombstone change with position
|
||||
// after_key(b) will be considered outside of it. Such range
|
||||
// tombstone changes can be emitted however when recreating the
|
||||
// reader on clustering range edge.
|
||||
return _tri_cmp(range.start(), pos) <= 0 && _tri_cmp(pos, range.end()) <= 0;
|
||||
});
|
||||
require(
|
||||
any_contains,
|
||||
@@ -1871,42 +1855,40 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
|
||||
}
|
||||
}
|
||||
|
||||
bool evictable_reader_v2::should_drop_fragment(const mutation_fragment_v2& mf) {
|
||||
if (_drop_partition_start && mf.is_partition_start()) {
|
||||
_drop_partition_start = false;
|
||||
return true;
|
||||
void evictable_reader_v2::examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3) {
|
||||
if (!mf1) {
|
||||
return; // the reader is at EOS
|
||||
}
|
||||
// Unlike partition-start above, a partition is not guaranteed to have a
|
||||
// static row fragment. So reset the flag regardless of whether we could
|
||||
// drop one or not.
|
||||
// We are guaranteed to get here only right after dropping a partition-start,
|
||||
// so if we are not seeing a static row here, the partition doesn't have one.
|
||||
if (_drop_static_row) {
|
||||
_drop_static_row = false;
|
||||
return mf.is_static_row();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
future<> evictable_reader_v2::do_fill_buffer() {
|
||||
if (!_drop_partition_start && !_drop_static_row) {
|
||||
auto fill_buf_fut = _reader->fill_buffer();
|
||||
if (_validate_partition_key) {
|
||||
fill_buf_fut = fill_buf_fut.then([this] {
|
||||
maybe_validate_partition_start(_reader->buffer());
|
||||
});
|
||||
}
|
||||
return fill_buf_fut;
|
||||
// If engaged, the first fragment is always a partition-start.
|
||||
validate_partition_start(mf1->as_partition_start());
|
||||
if (_tri_cmp(mf1->position(), _next_position_in_partition) < 0) {
|
||||
mf1 = {}; // drop mf1
|
||||
}
|
||||
|
||||
const auto continue_same_partition = _next_position_in_partition.region() != partition_region::partition_start;
|
||||
|
||||
// If we have a first fragment, we are guaranteed to have a second one -- if not else, a partition-end.
|
||||
if (mf2->is_end_of_partition()) {
|
||||
return; // no further fragments, nothing to do
|
||||
}
|
||||
|
||||
// We want to validate the position of the first non-dropped fragment.
|
||||
// If mf2 is a static row and we need to drop it, this will be mf3.
|
||||
if (mf2->is_static_row() && _tri_cmp(mf2->position(), _next_position_in_partition) < 0) {
|
||||
mf2 = {}; // drop mf2
|
||||
} else {
|
||||
if (continue_same_partition) {
|
||||
validate_position_in_partition(mf2->position());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (mf3->is_end_of_partition()) {
|
||||
return; // no further fragments, nothing to do
|
||||
} else if (continue_same_partition) {
|
||||
validate_position_in_partition(mf3->position());
|
||||
}
|
||||
return repeat([this] {
|
||||
return _reader->fill_buffer().then([this] {
|
||||
maybe_validate_partition_start(_reader->buffer());
|
||||
while (!_reader->is_buffer_empty() && should_drop_fragment(_reader->peek_buffer())) {
|
||||
_reader->pop_mutation_fragment();
|
||||
}
|
||||
return stop_iteration(_reader->is_buffer_full() || _reader->is_end_of_stream());
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
evictable_reader_v2::evictable_reader_v2(
|
||||
@@ -1935,10 +1917,62 @@ future<> evictable_reader_v2::fill_buffer() {
|
||||
co_return;
|
||||
}
|
||||
_reader = co_await resume_or_create_reader();
|
||||
co_await do_fill_buffer();
|
||||
|
||||
if (_reader_recreated) {
|
||||
// Recreating the reader breaks snapshot isolation and creates all sorts
|
||||
// of complications around the continuity of range tombstone changes,
|
||||
// e.g. a range tombstone started by the previous reader object
|
||||
// might not exist anymore with the new reader object.
|
||||
// To avoid complications we reset the tombstone state on each reader
|
||||
// recreation by emitting a null tombstone change, if we read at least
|
||||
// one clustering fragment from the partition.
|
||||
if (_next_position_in_partition.region() == partition_region::clustered
|
||||
&& _tri_cmp(_next_position_in_partition, position_in_partition::before_all_clustered_rows()) > 0) {
|
||||
push_mutation_fragment(*_schema, _permit, range_tombstone_change{position_in_partition_view::before_key(_next_position_in_partition), {}});
|
||||
}
|
||||
auto mf1 = co_await (*_reader)();
|
||||
auto mf2 = co_await (*_reader)();
|
||||
auto mf3 = co_await (*_reader)();
|
||||
examine_first_fragments(mf1, mf2, mf3);
|
||||
if (mf3) {
|
||||
_reader->unpop_mutation_fragment(std::move(*mf3));
|
||||
}
|
||||
if (mf2) {
|
||||
_reader->unpop_mutation_fragment(std::move(*mf2));
|
||||
}
|
||||
if (mf1) {
|
||||
_reader->unpop_mutation_fragment(std::move(*mf1));
|
||||
}
|
||||
_reader_recreated = false;
|
||||
} else {
|
||||
co_await _reader->fill_buffer();
|
||||
}
|
||||
|
||||
_reader->move_buffer_content_to(*this);
|
||||
|
||||
// Ensure that each buffer represents forward progress. Only a concern when
|
||||
// the last fragment in the buffer is range tombstone change. In this case
|
||||
// ensure that:
|
||||
// * buffer().back().position() > _next_position_in_partition;
|
||||
// * _reader.peek()->position() > buffer().back().position();
|
||||
if (!is_buffer_empty() && buffer().back().is_range_tombstone_change()) {
|
||||
auto* next_mf = co_await _reader->peek();
|
||||
|
||||
// First make sure we've made progress w.r.t. _next_position_in_partition.
|
||||
while (next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
|
||||
push_mutation_fragment(_reader->pop_mutation_fragment());
|
||||
next_mf = co_await _reader->peek();
|
||||
}
|
||||
|
||||
const auto last_pos = position_in_partition(buffer().back().position());
|
||||
while (next_mf && _tri_cmp(last_pos, next_mf->position()) == 0) {
|
||||
push_mutation_fragment(_reader->pop_mutation_fragment());
|
||||
next_mf = co_await _reader->peek();
|
||||
}
|
||||
}
|
||||
|
||||
update_next_position();
|
||||
_end_of_stream = _reader->is_end_of_stream() && _reader->is_buffer_empty();
|
||||
_end_of_stream = _reader->is_end_of_stream();
|
||||
maybe_pause(std::move(*_reader));
|
||||
}
|
||||
|
||||
|
||||
@@ -292,14 +292,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
|
||||
const std::optional<position_in_partition>& last_row,
|
||||
const std::optional<position_in_partition>& last_rts,
|
||||
position_in_partition_view pos) {
|
||||
if (!_rt_stream.empty()) {
|
||||
return _rt_stream.get_next(std::move(pos));
|
||||
}
|
||||
return in_alloc_section([&] () -> mutation_fragment_opt {
|
||||
maybe_refresh_state(ck_range_snapshot, last_row, last_rts);
|
||||
|
||||
position_in_partition::less_compare rt_less(_query_schema);
|
||||
|
||||
// The while below moves range tombstones from partition versions
|
||||
// into _rt_stream, just enough to produce the next range tombstone
|
||||
// The main goal behind moving to _rt_stream is to deoverlap range tombstones
|
||||
// which have the same starting position. This is not in order to satisfy
|
||||
// flat_mutation_reader stream requirements, the reader can emit range tombstones
|
||||
// which have the same position incrementally. This is to guarantee forward
|
||||
// progress in the case iterators get invalidated and maybe_refresh_state()
|
||||
// above needs to restore them. It does so using last_rts, which tracks
|
||||
// the position of the last emitted range tombstone. All range tombstones
|
||||
// with positions <= than last_rts are skipped on refresh. To make progress,
|
||||
// we need to make sure that all range tombstones with duplicated positions
|
||||
// are emitted before maybe_refresh_state().
|
||||
while (has_more_range_tombstones()
|
||||
&& !rt_less(pos, peek_range_tombstone().position())
|
||||
&& (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {
|
||||
|
||||
@@ -444,7 +444,7 @@ public:
|
||||
// When throws, the cursor is invalidated and its position is not changed.
|
||||
bool advance_to(position_in_partition_view lower_bound) {
|
||||
maybe_advance_to(lower_bound);
|
||||
return no_clustering_row_between(_schema, lower_bound, position());
|
||||
return no_clustering_row_between_weak(_schema, lower_bound, position());
|
||||
}
|
||||
|
||||
// Call only when valid.
|
||||
|
||||
@@ -567,6 +567,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if and only if there can't be any clustering_row with position >= a and < b.
|
||||
// It is assumed that a <= b.
|
||||
inline
|
||||
bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
|
||||
clustering_key_prefix::equality eq(s);
|
||||
if (a.has_key() && b.has_key()) {
|
||||
return eq(a.key(), b.key())
|
||||
&& (a.get_bound_weight() == bound_weight::after_all_prefixed
|
||||
|| b.get_bound_weight() != bound_weight::after_all_prefixed);
|
||||
} else {
|
||||
return !a.has_key() && !b.has_key();
|
||||
}
|
||||
}
|
||||
|
||||
// Includes all position_in_partition objects "p" for which: start <= p < end
|
||||
// And only those.
|
||||
class position_range {
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include "seastarx.hh"
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/net/socket_defs.hh>
|
||||
#include <vector>
|
||||
|
||||
// Abstraction for a server serving some kind of user-facing protocol.
|
||||
|
||||
19
querier.cc
19
querier.cc
@@ -414,25 +414,6 @@ future<bool> querier_cache::evict_one() noexcept {
|
||||
co_return false;
|
||||
}
|
||||
|
||||
future<> querier_cache::evict_all_for_table(const utils::UUID& schema_id) noexcept {
|
||||
for (auto ip : {&_data_querier_index, &_mutation_querier_index, &_shard_mutation_querier_index}) {
|
||||
auto& idx = *ip;
|
||||
for (auto it = idx.begin(); it != idx.end();) {
|
||||
if (it->second->schema().id() == schema_id) {
|
||||
auto reader_opt = it->second->permit().semaphore().unregister_inactive_read(querier_utils::get_inactive_read_handle(*it->second));
|
||||
it = idx.erase(it);
|
||||
--_stats.population;
|
||||
if (reader_opt) {
|
||||
co_await reader_opt->close();
|
||||
}
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return;
|
||||
}
|
||||
|
||||
future<> querier_cache::stop() noexcept {
|
||||
co_await _closing_gate.close();
|
||||
|
||||
|
||||
@@ -476,11 +476,6 @@ public:
|
||||
/// is empty).
|
||||
future<bool> evict_one() noexcept;
|
||||
|
||||
/// Evict all queriers that belong to a table.
|
||||
///
|
||||
/// Should be used when dropping a table.
|
||||
future<> evict_all_for_table(const utils::UUID& schema_id) noexcept;
|
||||
|
||||
/// Close all queriers and wait on background work.
|
||||
///
|
||||
/// Should be used before destroying the querier_cache.
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <boost/range/adaptor/reversed.hpp>
|
||||
#include "range_tombstone_list.hh"
|
||||
#include "utils/allocation_strategy.hh"
|
||||
#include "utils/amortized_reserve.hh"
|
||||
#include <seastar/util/variant_utils.hh>
|
||||
|
||||
range_tombstone_list::range_tombstone_list(const range_tombstone_list& x)
|
||||
@@ -96,7 +97,7 @@ void range_tombstone_list::insert_from(const schema& s,
|
||||
if (cmp(end, it->position()) < 0) {
|
||||
// not overlapping
|
||||
if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
|
||||
rev.update(it, {std::move(start), std::move(start), tomb});
|
||||
rev.update(it, {std::move(start), std::move(end), tomb});
|
||||
} else {
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
|
||||
rev.insert(it, *rt);
|
||||
@@ -375,13 +376,13 @@ range_tombstone_list::reverter::insert(range_tombstones_type::iterator it, range
|
||||
|
||||
range_tombstone_list::range_tombstones_type::iterator
|
||||
range_tombstone_list::reverter::erase(range_tombstones_type::iterator it) {
|
||||
_ops.reserve(_ops.size() + 1);
|
||||
amortized_reserve(_ops, _ops.size() + 1);
|
||||
_ops.emplace_back(erase_undo_op(*it));
|
||||
return _dst._tombstones.erase(it);
|
||||
}
|
||||
|
||||
void range_tombstone_list::reverter::update(range_tombstones_type::iterator it, range_tombstone&& new_rt) {
|
||||
_ops.reserve(_ops.size() + 1);
|
||||
amortized_reserve(_ops, _ops.size() + 1);
|
||||
swap(it->tombstone(), new_rt);
|
||||
_ops.emplace_back(update_undo_op(std::move(new_rt), *it));
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "range_tombstone.hh"
|
||||
#include "query-request.hh"
|
||||
#include "utils/preempt.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include <iosfwd>
|
||||
#include <variant>
|
||||
|
||||
@@ -106,7 +107,7 @@ class range_tombstone_list final {
|
||||
class reverter {
|
||||
private:
|
||||
using op = std::variant<erase_undo_op, insert_undo_op, update_undo_op>;
|
||||
std::vector<op> _ops;
|
||||
utils::chunked_vector<op> _ops;
|
||||
const schema& _s;
|
||||
protected:
|
||||
range_tombstone_list& _dst;
|
||||
|
||||
@@ -743,6 +743,25 @@ void reader_concurrency_semaphore::clear_inactive_reads() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> reader_concurrency_semaphore::evict_inactive_reads_for_table(utils::UUID id) noexcept {
|
||||
inactive_reads_type evicted_readers;
|
||||
auto it = _inactive_reads.begin();
|
||||
while (it != _inactive_reads.end()) {
|
||||
auto& ir = *it;
|
||||
++it;
|
||||
if (ir.reader.schema()->id() == id) {
|
||||
do_detach_inactive_reader(ir, evict_reason::manual);
|
||||
ir.ttl_timer.cancel();
|
||||
ir.unlink();
|
||||
evicted_readers.push_back(ir);
|
||||
}
|
||||
}
|
||||
while (!evicted_readers.empty()) {
|
||||
std::unique_ptr<inactive_read> irp(&evicted_readers.front());
|
||||
co_await irp->reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
std::runtime_error reader_concurrency_semaphore::stopped_exception() {
|
||||
return std::runtime_error(format("{} was stopped", _name));
|
||||
}
|
||||
@@ -765,11 +784,9 @@ future<> reader_concurrency_semaphore::stop() noexcept {
|
||||
co_return;
|
||||
}
|
||||
|
||||
flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
|
||||
auto reader = std::move(ir.reader);
|
||||
void reader_concurrency_semaphore::do_detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
|
||||
ir.detach();
|
||||
reader.permit()._impl->on_evicted();
|
||||
std::unique_ptr<inactive_read> irp(&ir);
|
||||
ir.reader.permit()._impl->on_evicted();
|
||||
try {
|
||||
if (ir.notify_handler) {
|
||||
ir.notify_handler(reason);
|
||||
@@ -788,7 +805,12 @@ flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(ina
|
||||
break;
|
||||
}
|
||||
--_stats.inactive_reads;
|
||||
return reader;
|
||||
}
|
||||
|
||||
flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
|
||||
std::unique_ptr<inactive_read> irp(&ir);
|
||||
do_detach_inactive_reader(ir, reason);
|
||||
return std::move(irp->reader);
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::evict(inactive_read& ir, evict_reason reason) noexcept {
|
||||
|
||||
@@ -186,6 +186,7 @@ private:
|
||||
std::optional<future<>> _execution_loop_future;
|
||||
|
||||
private:
|
||||
void do_detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
|
||||
[[nodiscard]] flat_mutation_reader_v2 detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
|
||||
void evict(inactive_read&, evict_reason reason) noexcept;
|
||||
|
||||
@@ -301,6 +302,9 @@ public:
|
||||
|
||||
/// Clear all inactive reads.
|
||||
void clear_inactive_reads();
|
||||
|
||||
/// Evict all inactive reads the belong to the table designated by the id.
|
||||
future<> evict_inactive_reads_for_table(utils::UUID id) noexcept;
|
||||
private:
|
||||
// The following two functions are extension points for
|
||||
// future inheriting classes that needs to run some stop
|
||||
|
||||
194
repair/repair.cc
194
repair/repair.cc
@@ -25,6 +25,7 @@
|
||||
#include "utils/bit_cast.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "partition_range_compat.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/algorithm/string/split.hpp>
|
||||
@@ -41,6 +42,7 @@
|
||||
#include <seastar/core/sleep.hh>
|
||||
|
||||
#include <cfloat>
|
||||
#include <algorithm>
|
||||
|
||||
#include "idl/partition_checksum.dist.hh"
|
||||
|
||||
@@ -118,6 +120,13 @@ std::ostream& operator<<(std::ostream& out, row_level_diff_detect_algorithm algo
|
||||
return out << "unknown";
|
||||
}
|
||||
|
||||
static size_t get_nr_tables(const replica::database& db, const sstring& keyspace) {
|
||||
auto& m = db.get_column_families_mapping();
|
||||
return std::count_if(m.begin(), m.end(), [&keyspace] (auto& e) {
|
||||
return e.first.first == keyspace;
|
||||
});
|
||||
}
|
||||
|
||||
static std::vector<sstring> list_column_families(const replica::database& db, const sstring& keyspace) {
|
||||
std::vector<sstring> ret;
|
||||
for (auto &&e : db.get_column_families_mapping()) {
|
||||
@@ -443,7 +452,7 @@ float tracker::report_progress(streaming::stream_reason reason) {
|
||||
for (auto& x : _repairs) {
|
||||
auto& ri = x.second;
|
||||
if (ri->reason == reason) {
|
||||
nr_ranges_total += ri->nr_ranges_total;
|
||||
nr_ranges_total += ri->ranges_size();
|
||||
nr_ranges_finished += ri->nr_ranges_finished;
|
||||
}
|
||||
}
|
||||
@@ -555,8 +564,8 @@ void repair_info::check_failed_ranges() {
|
||||
rlogger.info("repair id {} on shard {} stats: repair_reason={}, keyspace={}, tables={}, ranges_nr={}, {}",
|
||||
id, shard, reason, keyspace, table_names(), ranges.size(), _stats.get_stats());
|
||||
if (nr_failed_ranges) {
|
||||
rlogger.warn("repair id {} on shard {} failed - {} out of {} ranges failed", id, shard, nr_failed_ranges, ranges.size());
|
||||
throw std::runtime_error(format("repair id {} on shard {} failed to repair {} out of {} ranges", id, shard, nr_failed_ranges, ranges.size()));
|
||||
rlogger.warn("repair id {} on shard {} failed - {} out of {} ranges failed", id, shard, nr_failed_ranges, ranges_size());
|
||||
throw std::runtime_error(format("repair id {} on shard {} failed to repair {} out of {} ranges", id, shard, nr_failed_ranges, ranges_size()));
|
||||
} else {
|
||||
if (dropped_tables.size()) {
|
||||
rlogger.warn("repair id {} on shard {} completed successfully, keyspace={}, ignoring dropped tables={}", id, shard, keyspace, dropped_tables);
|
||||
@@ -582,14 +591,18 @@ repair_neighbors repair_info::get_repair_neighbors(const dht::token_range& range
|
||||
neighbors[range];
|
||||
}
|
||||
|
||||
size_t repair_info::ranges_size() {
|
||||
return ranges.size() * table_ids.size();
|
||||
}
|
||||
|
||||
// Repair a single local range, multiple column families.
|
||||
// Comparable to RepairSession in Origin
|
||||
future<> repair_info::repair_range(const dht::token_range& range) {
|
||||
future<> repair_info::repair_range(const dht::token_range& range, utils::UUID table_id) {
|
||||
check_in_shutdown();
|
||||
check_in_abort();
|
||||
ranges_index++;
|
||||
repair_neighbors neighbors = get_repair_neighbors(range);
|
||||
return do_with(std::move(neighbors.all), std::move(neighbors.mandatory), [this, range] (auto& neighbors, auto& mandatory_neighbors) {
|
||||
return do_with(std::move(neighbors.all), std::move(neighbors.mandatory), [this, range, table_id] (auto& neighbors, auto& mandatory_neighbors) {
|
||||
auto live_neighbors = boost::copy_range<std::vector<gms::inet_address>>(neighbors |
|
||||
boost::adaptors::filtered([this] (const gms::inet_address& node) { return gossiper.is_alive(node); }));
|
||||
for (auto& node : mandatory_neighbors) {
|
||||
@@ -598,7 +611,7 @@ future<> repair_info::repair_range(const dht::token_range& range) {
|
||||
nr_failed_ranges++;
|
||||
auto status = format("failed: mandatory neighbor={} is not alive", node);
|
||||
rlogger.error("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
|
||||
ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
|
||||
ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
|
||||
abort();
|
||||
return make_exception_future<>(std::runtime_error(format("Repair mandatory neighbor={} is not alive, keyspace={}, mandatory_neighbors={}",
|
||||
node, keyspace, mandatory_neighbors)));
|
||||
@@ -608,7 +621,7 @@ future<> repair_info::repair_range(const dht::token_range& range) {
|
||||
nr_failed_ranges++;
|
||||
auto status = live_neighbors.empty() ? "skipped" : "partial";
|
||||
rlogger.warn("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
|
||||
ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
|
||||
ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
|
||||
if (live_neighbors.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -617,13 +630,12 @@ future<> repair_info::repair_range(const dht::token_range& range) {
|
||||
if (neighbors.empty()) {
|
||||
auto status = "skipped_no_followers";
|
||||
rlogger.warn("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
|
||||
ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
|
||||
ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}",
|
||||
ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors);
|
||||
return mm.sync_schema(db.local(), neighbors).then([this, &neighbors, range] {
|
||||
return do_for_each(table_ids.begin(), table_ids.end(), [this, &neighbors, range] (utils::UUID table_id) {
|
||||
ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors);
|
||||
return mm.sync_schema(db.local(), neighbors).then([this, &neighbors, range, table_id] {
|
||||
sstring cf;
|
||||
try {
|
||||
cf = db.local().find_column_family(table_id).schema()->cf_name();
|
||||
@@ -641,7 +653,6 @@ future<> repair_info::repair_range(const dht::token_range& range) {
|
||||
nr_failed_ranges++;
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -914,27 +925,55 @@ private:
|
||||
|
||||
|
||||
static future<> do_repair_ranges(lw_shared_ptr<repair_info> ri) {
|
||||
// repair all the ranges in limited parallelism
|
||||
return parallel_for_each(ri->ranges, [ri] (auto&& range) {
|
||||
return with_semaphore(ri->rs.repair_tracker().range_parallelism_semaphore(), 1, [ri, &range] {
|
||||
return ri->repair_range(range).then([ri] {
|
||||
if (ri->reason == streaming::stream_reason::bootstrap) {
|
||||
ri->rs.get_metrics().bootstrap_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::replace) {
|
||||
ri->rs.get_metrics().replace_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::rebuild) {
|
||||
ri->rs.get_metrics().rebuild_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::decommission) {
|
||||
ri->rs.get_metrics().decommission_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::removenode) {
|
||||
ri->rs.get_metrics().removenode_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::repair) {
|
||||
ri->rs.get_metrics().repair_finished_ranges_sum++;
|
||||
ri->nr_ranges_finished++;
|
||||
}
|
||||
// Repair tables in the keyspace one after another
|
||||
assert(ri->table_names().size() == ri->table_ids.size());
|
||||
for (int idx = 0; idx < ri->table_ids.size(); idx++) {
|
||||
auto table_id = ri->table_ids[idx];
|
||||
auto table_name = ri->table_names()[idx];
|
||||
// repair all the ranges in limited parallelism
|
||||
rlogger.info("repair[{}]: Started to repair {} out of {} tables in keyspace={}, table={}, table_id={}, repair_reason={}",
|
||||
ri->id.uuid, idx + 1, ri->table_ids.size(), ri->keyspace, table_name, table_id, ri->reason);
|
||||
co_await parallel_for_each(ri->ranges, [ri, table_id] (auto&& range) {
|
||||
return with_semaphore(ri->rs.repair_tracker().range_parallelism_semaphore(), 1, [ri, &range, table_id] {
|
||||
return ri->repair_range(range, table_id).then([ri] {
|
||||
if (ri->reason == streaming::stream_reason::bootstrap) {
|
||||
ri->rs.get_metrics().bootstrap_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::replace) {
|
||||
ri->rs.get_metrics().replace_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::rebuild) {
|
||||
ri->rs.get_metrics().rebuild_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::decommission) {
|
||||
ri->rs.get_metrics().decommission_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::removenode) {
|
||||
ri->rs.get_metrics().removenode_finished_ranges++;
|
||||
} else if (ri->reason == streaming::stream_reason::repair) {
|
||||
ri->rs.get_metrics().repair_finished_ranges_sum++;
|
||||
ri->nr_ranges_finished++;
|
||||
}
|
||||
rlogger.debug("repair[{}]: node ops progress bootstrap={}, replace={}, rebuild={}, decommission={}, removenode={}, repair={}",
|
||||
ri->id.uuid,
|
||||
ri->rs.get_metrics().bootstrap_finished_percentage(),
|
||||
ri->rs.get_metrics().replace_finished_percentage(),
|
||||
ri->rs.get_metrics().rebuild_finished_percentage(),
|
||||
ri->rs.get_metrics().decommission_finished_percentage(),
|
||||
ri->rs.get_metrics().removenode_finished_percentage(),
|
||||
ri->rs.get_metrics().repair_finished_percentage());
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
if (ri->reason != streaming::stream_reason::repair) {
|
||||
try {
|
||||
auto& table = ri->db.local().find_column_family(table_id);
|
||||
rlogger.debug("repair[{}]: Trigger off-strategy compaction for keyspace={}, table={}",
|
||||
ri->id.uuid, table.schema()->ks_name(), table.schema()->cf_name());
|
||||
table.trigger_offstrategy_compaction();
|
||||
} catch (replica::no_such_column_family&) {
|
||||
// Ignore dropped table
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return;
|
||||
}
|
||||
|
||||
// repair_ranges repairs a list of token ranges, each assumed to be a token
|
||||
@@ -1060,33 +1099,48 @@ int repair_service::do_repair_start(sstring keyspace, std::unordered_map<sstring
|
||||
cfs = std::move(cfs), ranges = std::move(ranges), options = std::move(options), ignore_nodes = std::move(ignore_nodes)] () mutable {
|
||||
auto uuid = id.uuid;
|
||||
|
||||
auto waiting_nodes = db.local().get_token_metadata().get_all_endpoints();
|
||||
std::erase_if(waiting_nodes, [&] (const auto& addr) {
|
||||
return ignore_nodes.contains(addr);
|
||||
});
|
||||
auto participants = get_hosts_participating_in_repair(db.local(), keyspace, ranges, options.data_centers, options.hosts, ignore_nodes).get();
|
||||
auto hints_timeout = std::chrono::seconds(300);
|
||||
auto batchlog_timeout = std::chrono::seconds(300);
|
||||
repair_flush_hints_batchlog_request req{id.uuid, participants, hints_timeout, batchlog_timeout};
|
||||
bool needs_flush_before_repair = false;
|
||||
if (db.local().features().cluster_supports_tombstone_gc_options()) {
|
||||
for (auto& table: cfs) {
|
||||
auto s = db.local().find_column_family(keyspace, table).schema();
|
||||
const auto& options = s->tombstone_gc_options();
|
||||
if (options.mode() == tombstone_gc_mode::repair) {
|
||||
needs_flush_before_repair = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool hints_batchlog_flushed = false;
|
||||
try {
|
||||
parallel_for_each(waiting_nodes, [this, uuid, &req, &participants] (gms::inet_address node) -> future<> {
|
||||
rlogger.info("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, started",
|
||||
uuid, node, participants);
|
||||
try {
|
||||
auto& ms = get_messaging();
|
||||
auto resp = co_await ser::partition_checksum_rpc_verbs::send_repair_flush_hints_batchlog(&ms, netw::msg_addr(node), req);
|
||||
} catch (...) {
|
||||
rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, failed: {}",
|
||||
uuid, node, participants, std::current_exception());
|
||||
throw;
|
||||
}
|
||||
}).get();
|
||||
hints_batchlog_flushed = true;
|
||||
} catch (...) {
|
||||
rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to participants={} failed, continue to run repair",
|
||||
uuid, participants);
|
||||
auto participants = get_hosts_participating_in_repair(db.local(), keyspace, ranges, options.data_centers, options.hosts, ignore_nodes).get();
|
||||
if (needs_flush_before_repair) {
|
||||
auto waiting_nodes = db.local().get_token_metadata().get_all_endpoints();
|
||||
std::erase_if(waiting_nodes, [&] (const auto& addr) {
|
||||
return ignore_nodes.contains(addr);
|
||||
});
|
||||
auto hints_timeout = std::chrono::seconds(300);
|
||||
auto batchlog_timeout = std::chrono::seconds(300);
|
||||
repair_flush_hints_batchlog_request req{id.uuid, participants, hints_timeout, batchlog_timeout};
|
||||
|
||||
try {
|
||||
parallel_for_each(waiting_nodes, [this, uuid, &req, &participants] (gms::inet_address node) -> future<> {
|
||||
rlogger.info("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, started",
|
||||
uuid, node, participants);
|
||||
try {
|
||||
auto& ms = get_messaging();
|
||||
auto resp = co_await ser::partition_checksum_rpc_verbs::send_repair_flush_hints_batchlog(&ms, netw::msg_addr(node), req);
|
||||
} catch (...) {
|
||||
rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, failed: {}",
|
||||
uuid, node, participants, std::current_exception());
|
||||
throw;
|
||||
}
|
||||
}).get();
|
||||
hints_batchlog_flushed = true;
|
||||
} catch (...) {
|
||||
rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to participants={} failed, continue to run repair",
|
||||
uuid, participants);
|
||||
}
|
||||
} else {
|
||||
rlogger.info("repair[{}]: Skipped sending repair_flush_hints_batchlog to nodes={}", uuid, participants);
|
||||
}
|
||||
|
||||
std::vector<future<>> repair_results;
|
||||
@@ -1288,7 +1342,8 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
|
||||
auto& strat = ks.get_replication_strategy();
|
||||
dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip).get0();
|
||||
seastar::thread::maybe_yield();
|
||||
nr_ranges_total += desired_ranges.size();
|
||||
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
|
||||
nr_ranges_total += desired_ranges.size() * nr_tables;
|
||||
}
|
||||
container().invoke_on_all([nr_ranges_total] (repair_service& rs) {
|
||||
rs.get_metrics().bootstrap_finished_ranges = 0;
|
||||
@@ -1320,7 +1375,8 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
|
||||
//Collects the source that will have its range moved to the new node
|
||||
std::unordered_map<dht::token_range, repair_neighbors> range_sources;
|
||||
|
||||
rlogger.info("bootstrap_with_repair: started with keyspace={}, nr_ranges={}", keyspace_name, desired_ranges.size());
|
||||
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
|
||||
rlogger.info("bootstrap_with_repair: started with keyspace={}, nr_ranges={}", keyspace_name, desired_ranges.size() * nr_tables);
|
||||
for (auto& desired_range : desired_ranges) {
|
||||
for (auto& x : range_addresses) {
|
||||
const range<dht::token>& src_range = x.first;
|
||||
@@ -1461,7 +1517,8 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m
|
||||
}
|
||||
auto& ks = db.local().find_keyspace(keyspace_name);
|
||||
dht::token_range_vector ranges = ks.get_effective_replication_map()->get_ranges(leaving_node);
|
||||
nr_ranges_total += ranges.size();
|
||||
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
|
||||
nr_ranges_total += ranges.size() * nr_tables;
|
||||
}
|
||||
if (reason == streaming::stream_reason::decommission) {
|
||||
container().invoke_on_all([nr_ranges_total] (repair_service& rs) {
|
||||
@@ -1485,8 +1542,9 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m
|
||||
auto erm = ks.get_effective_replication_map();
|
||||
// First get all ranges the leaving node is responsible for
|
||||
dht::token_range_vector ranges = erm->get_ranges(leaving_node);
|
||||
rlogger.info("{}: started with keyspace={}, leaving_node={}, nr_ranges={}", op, keyspace_name, leaving_node, ranges.size());
|
||||
size_t nr_ranges_total = ranges.size();
|
||||
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
|
||||
rlogger.info("{}: started with keyspace={}, leaving_node={}, nr_ranges={}", op, keyspace_name, leaving_node, ranges.size() * nr_tables);
|
||||
size_t nr_ranges_total = ranges.size() * nr_tables;
|
||||
size_t nr_ranges_skipped = 0;
|
||||
std::unordered_map<dht::token_range, inet_address_vector_replica_set> current_replica_endpoints;
|
||||
// Find (for each range) all nodes that store replicas for these ranges as well
|
||||
@@ -1677,7 +1735,8 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
|
||||
auto& strat = ks.get_replication_strategy();
|
||||
// Okay to yield since tm is immutable
|
||||
dht::token_range_vector ranges = strat.get_ranges(myip, tmptr).get0();
|
||||
nr_ranges_total += ranges.size();
|
||||
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
|
||||
nr_ranges_total += ranges.size() * nr_tables;
|
||||
|
||||
}
|
||||
if (reason == streaming::stream_reason::rebuild) {
|
||||
@@ -1702,7 +1761,8 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
|
||||
auto& strat = ks.get_replication_strategy();
|
||||
dht::token_range_vector ranges = strat.get_ranges(myip, tmptr).get0();
|
||||
std::unordered_map<dht::token_range, repair_neighbors> range_sources;
|
||||
rlogger.info("{}: started with keyspace={}, source_dc={}, nr_ranges={}, ignore_nodes={}", op, keyspace_name, source_dc, ranges.size(), ignore_nodes);
|
||||
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
|
||||
rlogger.info("{}: started with keyspace={}, source_dc={}, nr_ranges={}, ignore_nodes={}", op, keyspace_name, source_dc, ranges.size() * nr_tables, ignore_nodes);
|
||||
for (auto it = ranges.begin(); it != ranges.end();) {
|
||||
auto& r = *it;
|
||||
seastar::thread::maybe_yield();
|
||||
@@ -1730,12 +1790,12 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
|
||||
}
|
||||
}
|
||||
if (reason == streaming::stream_reason::rebuild) {
|
||||
container().invoke_on_all([nr_ranges_skipped] (repair_service& rs) {
|
||||
rs.get_metrics().rebuild_finished_ranges += nr_ranges_skipped;
|
||||
container().invoke_on_all([nr_ranges_skipped, nr_tables] (repair_service& rs) {
|
||||
rs.get_metrics().rebuild_finished_ranges += nr_ranges_skipped * nr_tables;
|
||||
}).get();
|
||||
} else if (reason == streaming::stream_reason::replace) {
|
||||
container().invoke_on_all([nr_ranges_skipped] (repair_service& rs) {
|
||||
rs.get_metrics().replace_finished_ranges += nr_ranges_skipped;
|
||||
container().invoke_on_all([nr_ranges_skipped, nr_tables] (repair_service& rs) {
|
||||
rs.get_metrics().replace_finished_ranges += nr_ranges_skipped * nr_tables;
|
||||
}).get();
|
||||
}
|
||||
auto nr_ranges = ranges.size();
|
||||
|
||||
@@ -200,7 +200,9 @@ public:
|
||||
return _hints_batchlog_flushed;
|
||||
}
|
||||
|
||||
future<> repair_range(const dht::token_range& range);
|
||||
future<> repair_range(const dht::token_range& range, utils::UUID table_id);
|
||||
|
||||
size_t ranges_size();
|
||||
};
|
||||
|
||||
// The repair_tracker tracks ongoing repair operations and their progress.
|
||||
|
||||
@@ -347,9 +347,9 @@ private:
|
||||
// Only needed for local readers, the multishard reader takes care
|
||||
// of pinning tables on used shards.
|
||||
std::optional<utils::phased_barrier::operation> _local_read_op;
|
||||
std::optional<evictable_reader_handle> _reader_handle;
|
||||
// Local reader or multishard reader to read the range
|
||||
flat_mutation_reader _reader;
|
||||
std::optional<evictable_reader_handle> _reader_handle;
|
||||
// Current partition read from disk
|
||||
lw_shared_ptr<const decorated_key_with_hash> _current_dk;
|
||||
uint64_t _reads_issued = 0;
|
||||
|
||||
@@ -67,6 +67,7 @@ public:
|
||||
uint64_t repair_finished_ranges_sum{0};
|
||||
private:
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
public:
|
||||
float bootstrap_finished_percentage();
|
||||
float replace_finished_percentage();
|
||||
float rebuild_finished_percentage();
|
||||
|
||||
@@ -910,10 +910,9 @@ bool database::update_column_family(schema_ptr new_schema) {
|
||||
return columns_changed;
|
||||
}
|
||||
|
||||
future<> database::remove(const column_family& cf) noexcept {
|
||||
void database::remove(const table& cf) noexcept {
|
||||
auto s = cf.schema();
|
||||
auto& ks = find_keyspace(s->ks_name());
|
||||
co_await _querier_cache.evict_all_for_table(s->id());
|
||||
_column_families.erase(s->id());
|
||||
ks.metadata()->remove_column_family(s);
|
||||
_ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
|
||||
@@ -937,13 +936,22 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
|
||||
on_internal_error(dblog, fmt::format("drop_column_family {}.{}: UUID={} not found", ks_name, cf_name, uuid));
|
||||
}
|
||||
dblog.debug("Dropping {}.{}", ks_name, cf_name);
|
||||
co_await remove(*cf);
|
||||
remove(*cf);
|
||||
cf->clear_views();
|
||||
co_return co_await cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
|
||||
return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
|
||||
return cf->stop();
|
||||
});
|
||||
}).finally([cf] {});
|
||||
co_await cf->await_pending_ops();
|
||||
for (auto* sem : {&_read_concurrency_sem, &_streaming_concurrency_sem, &_compaction_concurrency_sem, &_system_read_concurrency_sem}) {
|
||||
co_await sem->evict_inactive_reads_for_table(uuid);
|
||||
}
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
co_await truncate(ks, *cf, std::move(tsf), snapshot);
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await cf->stop();
|
||||
if (ex) {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
}
|
||||
}
|
||||
|
||||
const utils::UUID& database::find_uuid(std::string_view ks, std::string_view cf) const {
|
||||
@@ -2054,6 +2062,53 @@ future<> database::flush(const sstring& ksname, const sstring& cfname) {
|
||||
return cf.flush();
|
||||
}
|
||||
|
||||
// Flush the table identified by `id` on all shards.
//
// The per-shard work is dispatched through container().invoke_on_all():
// each shard's database instance looks the table up by id and calls
// flush() on it. The returned future is the one produced by invoke_on_all().
future<> database::flush_on_all(utils::UUID id) {
    auto flush_local_table = [id] (replica::database& shard_db) {
        auto& local_table = shard_db.find_column_family(id);
        return local_table.flush();
    };
    return container().invoke_on_all(std::move(flush_local_table));
}
|
||||
|
||||
// Flush a single table, named by keyspace and table name, on all shards.
//
// The (ks_name, table_name) pair is resolved to a table id with find_uuid()
// and the work is delegated to the id-based flush_on_all() overload.
future<> database::flush_on_all(std::string_view ks_name, std::string_view table_name) {
    const auto& table_id = find_uuid(ks_name, table_name);
    return flush_on_all(table_id);
}
|
||||
|
||||
// Flush a list of tables belonging to keyspace `ks_name` on all shards.
//
// Every entry of `table_names` is handed to the (keyspace, table) overload
// of flush_on_all() through parallel_for_each().
future<> database::flush_on_all(std::string_view ks_name, std::vector<sstring> table_names) {
    auto flush_one_table = [this, ks_name] (const auto& table_name) {
        return flush_on_all(ks_name, table_name);
    };
    return parallel_for_each(table_names, std::move(flush_one_table));
}
|
||||
|
||||
// Flush all tables of keyspace `ks_name` on all shards.
//
// Walks the keyspace metadata's cf_meta_data() map and calls the id-based
// flush_on_all() overload for each table found there.
future<> database::flush_on_all(std::string_view ks_name) {
    auto& ks = find_keyspace(ks_name);
    // Hold the metadata object in a local so the map it exposes stays
    // referenced for the duration of the parallel_for_each() call.
    auto ks_meta = ks.metadata();
    auto flush_entry = [this] (auto& entry) {
        return flush_on_all(entry.second->id());
    };
    return parallel_for_each(ks_meta->cf_meta_data(), std::move(flush_entry));
}
|
||||
|
||||
// Snapshot a list of tables in keyspace `ks_name` on all shards.
//
// For every name in `table_names`:
//   1. unless `skip_flush` is set, the table is first flushed on all shards
//      via flush_on_all(ks_name, table_name);
//   2. then each shard looks the table up and calls table::snapshot() with
//      `tag` as the snapshot name.
//
// `table_names` and `tag` are taken by value, so they live in this
// coroutine's frame for as long as the per-table work below is running.
future<> database::snapshot_on_all(std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush) {
    co_await parallel_for_each(table_names, [this, ks_name, tag = std::move(tag), skip_flush] (const auto& table_name) -> future<> {
        if (!skip_flush) {
            co_await flush_on_all(ks_name, table_name);
        }
        // `table_name` refers to an element of `table_names`, which is owned by
        // the enclosing coroutine and is only read by the per-shard callback.
        // `skip_flush` is not needed on the remote shards, so it is not captured.
        co_await container().invoke_on_all([ks_name, &table_name, tag] (replica::database& db) {
            auto& t = db.find_column_family(ks_name, table_name);
            return t.snapshot(db, tag);
        });
    });
}
|
||||
|
||||
// Snapshot all tables of keyspace `ks_name` on all shards.
//
// For every table listed in the keyspace metadata's cf_meta_data():
//   1. unless `skip_flush` is set, the table is first flushed on all shards
//      via flush_on_all(id);
//   2. then each shard looks the table up by id and calls table::snapshot()
//      with `tag` as the snapshot name.
future<> database::snapshot_on_all(std::string_view ks_name, sstring tag, bool skip_flush) {
    auto& ks = find_keyspace(ks_name);
    co_await parallel_for_each(ks.metadata()->cf_meta_data(), [this, tag = std::move(tag), skip_flush] (const auto& pair) -> future<> {
        // Copy the table id before the first suspension point. `pair` is a
        // reference into the keyspace metadata map, so it is not touched again
        // after a co_await (NOTE(review): entries are presumably erased when a
        // table is dropped -- confirm). Capturing the plain id, rather than the
        // schema pointer held in pair.second, also keeps a shard-local pointer
        // from being handed to the other shards.
        const auto id = pair.second->id();
        if (!skip_flush) {
            co_await flush_on_all(id);
        }
        co_await container().invoke_on_all([id, tag] (replica::database& db) {
            auto& t = db.find_column_family(id);
            return t.snapshot(db, tag);
        });
    });
}
|
||||
|
||||
future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf) {
|
||||
auto& ks = find_keyspace(ksname);
|
||||
auto& cf = find_column_family(ksname, cfname);
|
||||
@@ -2062,80 +2117,77 @@ future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf)
|
||||
|
||||
future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf, bool with_snapshot) {
|
||||
dblog.debug("Truncating {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name());
|
||||
return with_gate(cf.async_gate(), [this, &ks, &cf, tsf = std::move(tsf), with_snapshot] () mutable -> future<> {
|
||||
const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();
|
||||
const auto should_flush = auto_snapshot;
|
||||
auto holder = cf.async_gate().hold();
|
||||
|
||||
// Force mutations coming in to re-acquire higher rp:s
|
||||
// This creates a "soft" ordering, in that we will guarantee that
|
||||
// any sstable written _after_ we issue the flush below will
|
||||
// only have higher rp:s than we will get from the discard_sstable
|
||||
// call.
|
||||
auto low_mark = cf.set_low_replay_position_mark();
|
||||
const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();
|
||||
const auto should_flush = auto_snapshot;
|
||||
|
||||
const auto uuid = cf.schema()->id();
|
||||
// Force mutations coming in to re-acquire higher rp:s
|
||||
// This creates a "soft" ordering, in that we will guarantee that
|
||||
// any sstable written _after_ we issue the flush below will
|
||||
// only have higher rp:s than we will get from the discard_sstable
|
||||
// call.
|
||||
auto low_mark = cf.set_low_replay_position_mark();
|
||||
|
||||
return _compaction_manager->run_with_compaction_disabled(&cf, [this, &cf, should_flush, auto_snapshot, tsf = std::move(tsf), low_mark]() mutable {
|
||||
future<> f = make_ready_future<>();
|
||||
bool did_flush = false;
|
||||
if (should_flush && cf.can_flush()) {
|
||||
// TODO:
|
||||
// this is not really a guarantee at all that we've actually
|
||||
// gotten all things to disk. Again, need queue-ish or something.
|
||||
f = cf.flush();
|
||||
did_flush = true;
|
||||
} else {
|
||||
f = cf.clear();
|
||||
}
|
||||
return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush, did_flush] {
|
||||
dblog.debug("Discarding sstable data for truncated CF + indexes");
|
||||
// TODO: notify truncation
|
||||
const auto uuid = cf.schema()->id();
|
||||
|
||||
return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush, did_flush](db_clock::time_point truncated_at) {
|
||||
future<> f = make_ready_future<>();
|
||||
if (auto_snapshot) {
|
||||
auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
|
||||
f = cf.snapshot(*this, name);
|
||||
}
|
||||
return f.then([this, &cf, truncated_at, low_mark, should_flush, did_flush] {
|
||||
return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush, did_flush](db::replay_position rp) {
|
||||
// TODO: indexes.
|
||||
// Note: since discard_sstables was changed to only count tables owned by this shard,
|
||||
// we can get zero rp back. Changed assert, and ensure we save at least low_mark.
|
||||
// #6995 - the assert below was broken in c2c6c71 and remained so for many years.
|
||||
// We nowadays do not flush tables with sstables but autosnapshot=false. This means
|
||||
// the low_mark assertion does not hold, because we maybe/probably never got around to
|
||||
// creating the sstables that would create them.
|
||||
assert(!did_flush || low_mark <= rp || rp == db::replay_position());
|
||||
rp = std::max(low_mark, rp);
|
||||
return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
|
||||
// save_truncation_record() may actually fail after we cached the truncation time
|
||||
// but this is no worse than failing without caching: at least the correct time
|
||||
// will be available until next reboot and a client will have to retry truncation anyway.
|
||||
cf.cache_truncation_record(truncated_at);
|
||||
return db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}).then([this, uuid] {
|
||||
drop_repair_history_map_for_table(uuid);
|
||||
});
|
||||
});
|
||||
}
|
||||
std::vector<compaction_manager::compaction_reenabler> cres;
|
||||
cres.reserve(1 + cf.views().size());
|
||||
|
||||
future<> database::truncate_views(const column_family& base, db_clock::time_point truncated_at, bool should_flush) {
|
||||
return parallel_for_each(base.views(), [this, truncated_at, should_flush] (view_ptr v) {
|
||||
cres.emplace_back(co_await _compaction_manager->stop_and_disable_compaction(&cf));
|
||||
co_await parallel_for_each(cf.views(), [&, this] (view_ptr v) -> future<> {
|
||||
auto& vcf = find_column_family(v);
|
||||
return _compaction_manager->run_with_compaction_disabled(&vcf, [&vcf, truncated_at, should_flush] {
|
||||
return (should_flush ? vcf.flush() : vcf.clear()).then([&vcf, truncated_at, should_flush] {
|
||||
return vcf.discard_sstables(truncated_at).then([&vcf, truncated_at, should_flush](db::replay_position rp) {
|
||||
return db::system_keyspace::save_truncation_record(vcf, truncated_at, rp);
|
||||
});
|
||||
});
|
||||
});
|
||||
cres.emplace_back(co_await _compaction_manager->stop_and_disable_compaction(&vcf));
|
||||
});
|
||||
|
||||
bool did_flush = false;
|
||||
if (should_flush && cf.can_flush()) {
|
||||
// TODO:
|
||||
// this is not really a guarantee at all that we've actually
|
||||
// gotten all things to disk. Again, need queue-ish or something.
|
||||
co_await cf.flush();
|
||||
did_flush = true;
|
||||
} else {
|
||||
co_await cf.clear();
|
||||
}
|
||||
|
||||
dblog.debug("Discarding sstable data for truncated CF + indexes");
|
||||
// TODO: notify truncation
|
||||
|
||||
db_clock::time_point truncated_at = co_await tsf();
|
||||
|
||||
if (auto_snapshot) {
|
||||
auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
|
||||
co_await cf.snapshot(*this, name);
|
||||
}
|
||||
|
||||
db::replay_position rp = co_await cf.discard_sstables(truncated_at);
|
||||
// TODO: indexes.
|
||||
// Note: since discard_sstables was changed to only count tables owned by this shard,
|
||||
// we can get zero rp back. Changed assert, and ensure we save at least low_mark.
|
||||
// #6995 - the assert below was broken in c2c6c71 and remained so for many years.
|
||||
// We nowadays do not flush tables with sstables but autosnapshot=false. This means
|
||||
// the low_mark assertion does not hold, because we maybe/probably never got around to
|
||||
// creating the sstables that would create them.
|
||||
assert(!did_flush || low_mark <= rp || rp == db::replay_position());
|
||||
rp = std::max(low_mark, rp);
|
||||
co_await parallel_for_each(cf.views(), [this, truncated_at, should_flush] (view_ptr v) -> future<> {
|
||||
auto& vcf = find_column_family(v);
|
||||
if (should_flush) {
|
||||
co_await vcf.flush();
|
||||
} else {
|
||||
co_await vcf.clear();
|
||||
}
|
||||
db::replay_position rp = co_await vcf.discard_sstables(truncated_at);
|
||||
co_await db::system_keyspace::save_truncation_record(vcf, truncated_at, rp);
|
||||
});
|
||||
// save_truncation_record() may actually fail after we cached the truncation time
|
||||
// but this is no worse than failing without caching: at least the correct time
|
||||
// will be available until next reboot and a client will have to retry truncation anyway.
|
||||
cf.cache_truncation_record(truncated_at);
|
||||
co_await db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
|
||||
|
||||
drop_repair_history_map_for_table(uuid);
|
||||
}
|
||||
|
||||
const sstring& database::get_snitch_name() const {
|
||||
|
||||
@@ -839,7 +839,11 @@ public:
|
||||
|
||||
db::replay_position set_low_replay_position_mark();
|
||||
|
||||
future<> snapshot(database& db, sstring name, bool skip_flush = false);
|
||||
private:
|
||||
future<> snapshot(database& db, sstring name);
|
||||
|
||||
friend class database;
|
||||
public:
|
||||
future<std::unordered_map<sstring, snapshot_details>> get_snapshot_details();
|
||||
|
||||
/*!
|
||||
@@ -1217,7 +1221,7 @@ struct string_pair_eq {
|
||||
// local metadata reads
|
||||
// use shard_of() for data
|
||||
|
||||
class database {
|
||||
class database : public peering_sharded_service<database> {
|
||||
friend class ::database_test;
|
||||
public:
|
||||
enum class table_kind {
|
||||
@@ -1371,6 +1375,7 @@ private:
|
||||
Future update_write_metrics(Future&& f);
|
||||
void update_write_metrics_for_timed_out_write();
|
||||
future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, locator::effective_replication_map_factory& erm_factory, bool is_bootstrap, system_keyspace system);
|
||||
void remove(const table&) noexcept;
|
||||
public:
|
||||
static utils::UUID empty_version;
|
||||
|
||||
@@ -1560,6 +1565,17 @@ public:
|
||||
|
||||
future<> flush_all_memtables();
|
||||
future<> flush(const sstring& ks, const sstring& cf);
|
||||
// flush a table identified by the given id on all shards.
|
||||
future<> flush_on_all(utils::UUID id);
|
||||
// flush a single table in a keyspace on all shards.
|
||||
future<> flush_on_all(std::string_view ks_name, std::string_view table_name);
|
||||
// flush a list of tables in a keyspace on all shards.
|
||||
future<> flush_on_all(std::string_view ks_name, std::vector<sstring> table_names);
|
||||
// flush all tables in a keyspace on all shards.
|
||||
future<> flush_on_all(std::string_view ks_name);
|
||||
|
||||
future<> snapshot_on_all(std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush);
|
||||
future<> snapshot_on_all(std::string_view ks_name, sstring tag, bool skip_flush);
|
||||
|
||||
// See #937. Truncation now requires a callback to get a time stamp
|
||||
// that must be guaranteed to be the same for all shards.
|
||||
@@ -1568,11 +1584,9 @@ public:
|
||||
/** Truncates the given column family */
|
||||
future<> truncate(sstring ksname, sstring cfname, timestamp_func);
|
||||
future<> truncate(const keyspace& ks, column_family& cf, timestamp_func, bool with_snapshot = true);
|
||||
future<> truncate_views(const column_family& base, db_clock::time_point truncated_at, bool should_flush);
|
||||
|
||||
bool update_column_family(schema_ptr s);
|
||||
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
|
||||
future<> remove(const column_family&) noexcept;
|
||||
|
||||
const logalloc::region_group& dirty_memory_region_group() const {
|
||||
return _dirty_memory_manager.region_group();
|
||||
|
||||
@@ -454,12 +454,13 @@ future<> distributed_loader::handle_sstables_pending_delete(sstring pending_dele
|
||||
});
|
||||
}
|
||||
|
||||
future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, bool must_exist) {
|
||||
return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), must_exist] {
|
||||
future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
|
||||
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
|
||||
return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), do_allow_offstrategy_compaction, dir_must_exist] {
|
||||
assert(this_shard_id() == 0);
|
||||
|
||||
if (!file_exists(sstdir).get0()) {
|
||||
if (must_exist) {
|
||||
if (dir_must_exist) {
|
||||
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", ks, cf, sstdir));
|
||||
}
|
||||
return;
|
||||
@@ -529,12 +530,14 @@ future<> distributed_loader::populate_column_family(distributed<replica::databas
|
||||
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
|
||||
}, eligible_for_reshape_on_boot).get();
|
||||
|
||||
directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot] (sstables::sstable_directory& dir) {
|
||||
return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot] (sstables::shared_sstable sst) {
|
||||
auto requires_offstrategy = sstables::offstrategy(!eligible_for_reshape_on_boot(sst));
|
||||
directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) {
|
||||
return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
|
||||
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
|
||||
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
|
||||
}).then([&global_table] {
|
||||
}).then([&global_table, do_allow_offstrategy_compaction] {
|
||||
if (do_allow_offstrategy_compaction) {
|
||||
global_table->trigger_offstrategy_compaction();
|
||||
}
|
||||
});
|
||||
}).get();
|
||||
});
|
||||
@@ -560,11 +563,11 @@ future<> distributed_loader::populate_keyspace(distributed<replica::database>& d
|
||||
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
|
||||
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
|
||||
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
|
||||
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname);
|
||||
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname, allow_offstrategy_compaction::no);
|
||||
}).then([&db, sstdir, ks_name, cfname] {
|
||||
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, false /* must_exist */);
|
||||
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, allow_offstrategy_compaction::no, must_exist::no);
|
||||
}).then([&db, sstdir, uuid, ks_name, cfname] {
|
||||
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
|
||||
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname, allow_offstrategy_compaction::yes);
|
||||
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
|
||||
std::string msg =
|
||||
format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <seastar/core/distributed.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/file.hh>
|
||||
#include <seastar/util/bool_class.hh>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
#include <filesystem>
|
||||
@@ -67,7 +68,9 @@ class distributed_loader {
|
||||
static future<size_t> make_sstables_available(sstables::sstable_directory& dir,
|
||||
sharded<replica::database>& db, sharded<db::view::view_update_generator>& view_update_generator,
|
||||
std::filesystem::path datadir, sstring ks, sstring cf);
|
||||
static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, bool must_exist = true);
|
||||
using allow_offstrategy_compaction = bool_class<struct allow_offstrategy_compaction_tag>;
|
||||
using must_exist = bool_class<struct must_exist_tag>;
|
||||
static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction, must_exist = must_exist::yes);
|
||||
static future<> populate_keyspace(distributed<replica::database>& db, sstring datadir, sstring ks_name);
|
||||
static future<> cleanup_column_family_temp_sst_dirs(sstring sstdir);
|
||||
static future<> handle_sstables_pending_delete(sstring pending_deletes_dir);
|
||||
|
||||
163
replica/table.cc
163
replica/table.cc
@@ -9,6 +9,7 @@
|
||||
#include <seastar/core/seastar.hh>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <seastar/coroutine/exception.hh>
|
||||
#include <seastar/util/closeable.hh>
|
||||
|
||||
#include "replica/database.hh"
|
||||
@@ -662,11 +663,21 @@ table::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old, sstable_write_
|
||||
[] (const dht::decorated_key&) { return api::min_timestamp; });
|
||||
}
|
||||
|
||||
mutation_fragment* fragment = co_await reader.peek();
|
||||
if (!fragment) {
|
||||
std::exception_ptr err;
|
||||
try {
|
||||
mutation_fragment* fragment = co_await reader.peek();
|
||||
if (!fragment) {
|
||||
co_await reader.close();
|
||||
_memtables->erase(old);
|
||||
co_return stop_iteration::yes;
|
||||
}
|
||||
} catch (...) {
|
||||
err = std::current_exception();
|
||||
}
|
||||
if (err) {
|
||||
tlogger.error("failed to flush memtable for {}.{}: {}", old->schema()->ks_name(), old->schema()->cf_name(), err);
|
||||
co_await reader.close();
|
||||
_memtables->erase(old);
|
||||
co_return stop_iteration::yes;
|
||||
co_return stop_iteration(_async_gate.is_closed());
|
||||
}
|
||||
|
||||
auto f = consumer(upgrade_to_v2(std::move(reader)));
|
||||
@@ -1426,70 +1437,86 @@ future<> table::write_schema_as_cql(database& db, sstring dir) const {
|
||||
|
||||
}
|
||||
|
||||
future<> table::snapshot(database& db, sstring name, bool skip_flush) {
|
||||
future<> table::snapshot(database& db, sstring name) {
|
||||
auto jsondir = _config.datadir + "/snapshots/" + name;
|
||||
tlogger.debug("snapshot {}: skip_flush={}", jsondir, skip_flush);
|
||||
auto f = skip_flush ? make_ready_future<>() : flush();
|
||||
return f.then([this, &db, jsondir = std::move(jsondir)]() {
|
||||
return with_semaphore(_sstable_deletion_sem, 1, [this, &db, jsondir = std::move(jsondir)]() {
|
||||
auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
|
||||
return do_with(std::move(tables), std::move(jsondir), [this, &db] (std::vector<sstables::shared_sstable>& tables, const sstring& jsondir) {
|
||||
return io_check([&jsondir] { return recursive_touch_directory(jsondir); }).then([this, &db, &jsondir, &tables] {
|
||||
return max_concurrent_for_each(tables, db.get_config().initial_sstable_loading_concurrency(), [&db, &jsondir] (sstables::shared_sstable sstable) {
|
||||
return with_semaphore(db.get_sharded_sst_dir_semaphore().local(), 1, [&jsondir, sstable] {
|
||||
return io_check([sstable, &dir = jsondir] {
|
||||
return sstable->create_links(dir);
|
||||
});
|
||||
});
|
||||
});
|
||||
}).then([&jsondir, &tables] {
|
||||
return io_check(sync_directory, jsondir);
|
||||
}).finally([this, &tables, &db, &jsondir] {
|
||||
auto shard = std::hash<sstring>()(jsondir) % smp::count;
|
||||
std::unordered_set<sstring> table_names;
|
||||
for (auto& sst : tables) {
|
||||
auto f = sst->get_filename();
|
||||
auto rf = f.substr(sst->get_dir().size() + 1);
|
||||
table_names.insert(std::move(rf));
|
||||
}
|
||||
return smp::submit_to(shard, [requester = this_shard_id(), &jsondir, this, &db,
|
||||
tables = std::move(table_names), datadir = _config.datadir] {
|
||||
tlogger.debug("snapshot {}", jsondir);
|
||||
|
||||
if (!pending_snapshots.contains(jsondir)) {
|
||||
pending_snapshots.emplace(jsondir, make_lw_shared<snapshot_manager>());
|
||||
}
|
||||
auto snapshot = pending_snapshots.at(jsondir);
|
||||
for (auto&& sst: tables) {
|
||||
snapshot->files.insert(std::move(sst));
|
||||
}
|
||||
auto sstable_deletion_guard = co_await get_units(_sstable_deletion_sem, 1);
|
||||
std::exception_ptr ex;
|
||||
|
||||
snapshot->requests.signal(1);
|
||||
auto my_work = make_ready_future<>();
|
||||
if (requester == this_shard_id()) {
|
||||
my_work = snapshot->requests.wait(smp::count).then([&jsondir,
|
||||
&db, snapshot, this] {
|
||||
// this_shard_id() here == requester == this_shard_id() before submit_to() above,
|
||||
// so the db reference is still local
|
||||
return write_schema_as_cql(db, jsondir).handle_exception([&jsondir](std::exception_ptr ptr) {
|
||||
tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
|
||||
return make_ready_future<>();
|
||||
}).finally([&jsondir, snapshot] () mutable {
|
||||
return seal_snapshot(jsondir).handle_exception([&jsondir] (std::exception_ptr ex) {
|
||||
tlogger.error("Failed to seal snapshot in {}: {}. Ignored.", jsondir, ex);
|
||||
}).then([snapshot] {
|
||||
snapshot->manifest_write.signal(smp::count);
|
||||
return make_ready_future<>();
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
return my_work.finally([snapshot] {
|
||||
return snapshot->manifest_write.wait(1);
|
||||
}).then([snapshot] {});
|
||||
std::vector<sstables::shared_sstable> tables;
|
||||
try {
|
||||
tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
|
||||
co_await io_check([&jsondir] { return recursive_touch_directory(jsondir); });
|
||||
co_await max_concurrent_for_each(tables, db.get_config().initial_sstable_loading_concurrency(), [&db, &jsondir] (sstables::shared_sstable sstable) {
|
||||
return with_semaphore(db.get_sharded_sst_dir_semaphore().local(), 1, [&jsondir, sstable] {
|
||||
return io_check([sstable, &dir = jsondir] {
|
||||
return sstable->create_links(dir);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
co_await io_check(sync_directory, jsondir);
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
auto shard = std::hash<sstring>()(jsondir) % smp::count;
|
||||
std::unordered_set<sstring> table_names;
|
||||
try {
|
||||
for (auto& sst : tables) {
|
||||
auto f = sst->get_filename();
|
||||
auto rf = f.substr(sst->get_dir().size() + 1);
|
||||
table_names.insert(std::move(rf));
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await smp::submit_to(shard, [requester = this_shard_id(), &jsondir, this, &db,
|
||||
tables = std::move(table_names), datadir = _config.datadir, ex = std::move(ex)] () mutable -> future<> {
|
||||
if (!pending_snapshots.contains(jsondir)) {
|
||||
try {
|
||||
pending_snapshots.emplace(jsondir, make_lw_shared<snapshot_manager>());
|
||||
} catch (...) {
|
||||
// abort since the process will hang if we can't coordinate
|
||||
// snapshot across shards, similar to failing to allocate a continuation.
|
||||
tlogger.error("Failed allocating snapshot_manager: {}. Aborting.", std::current_exception());
|
||||
abort();
|
||||
}
|
||||
}
|
||||
auto snapshot = pending_snapshots.at(jsondir);
|
||||
try {
|
||||
for (auto&& sst: tables) {
|
||||
snapshot->files.insert(std::move(sst));
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
tlogger.debug("snapshot {}: signal requests", jsondir);
|
||||
snapshot->requests.signal(1);
|
||||
if (requester == this_shard_id()) {
|
||||
tlogger.debug("snapshot {}: waiting for all shards", jsondir);
|
||||
co_await snapshot->requests.wait(smp::count);
|
||||
// this_shard_id() here == requester == this_shard_id() before submit_to() above,
|
||||
// so the db reference is still local
|
||||
tlogger.debug("snapshot {}: writing schema.cql", jsondir);
|
||||
co_await write_schema_as_cql(db, jsondir).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
tlogger.debug("snapshot {}: seal_snapshot", jsondir);
|
||||
co_await seal_snapshot(jsondir).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed to seal snapshot in {}: {}.", jsondir, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
snapshot->manifest_write.signal(smp::count);
|
||||
}
|
||||
tlogger.debug("snapshot {}: waiting for manifest on behalf of shard {}", jsondir, requester);
|
||||
co_await snapshot->manifest_write.wait(1);
|
||||
tlogger.debug("snapshot {}: done: error={}", jsondir, ex);
|
||||
if (ex) {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1571,13 +1598,14 @@ bool table::can_flush() const {
|
||||
}
|
||||
|
||||
future<> table::clear() {
|
||||
auto permits = co_await _config.dirty_memory_manager->get_all_flush_permits();
|
||||
if (_commitlog) {
|
||||
for (auto& t : *_memtables) {
|
||||
_commitlog->discard_completed_segments(_schema->id(), t->get_and_discard_rp_set());
|
||||
}
|
||||
}
|
||||
_memtables->clear_and_add();
|
||||
return _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
|
||||
co_await _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
|
||||
}
|
||||
|
||||
// NOTE: does not need to be futurized, but might eventually, depending on
|
||||
@@ -2235,7 +2263,7 @@ std::chrono::milliseconds table::get_coordinator_read_latency_percentile(double
|
||||
|
||||
void
|
||||
table::enable_auto_compaction() {
|
||||
// FIXME: unmute backlog. turn table backlog back on.
|
||||
// XXX: unmute backlog. turn table backlog back on.
|
||||
// see table::disable_auto_compaction() notes.
|
||||
_compaction_disabled_by_user = false;
|
||||
trigger_compaction();
|
||||
@@ -2243,7 +2271,7 @@ table::enable_auto_compaction() {
|
||||
|
||||
future<>
|
||||
table::disable_auto_compaction() {
|
||||
// FIXME: mute backlog. When we disable background compactions
|
||||
// XXX: mute backlog. When we disable background compactions
|
||||
// for the table, we must also disable current backlog of the
|
||||
// table compaction strategy that contributes to the scheduling
|
||||
// group resources prioritization.
|
||||
@@ -2270,9 +2298,8 @@ table::disable_auto_compaction() {
|
||||
// - it will break computation of major compaction descriptor
|
||||
// for new submissions
|
||||
_compaction_disabled_by_user = true;
|
||||
return with_gate(_async_gate, [this] {
|
||||
return compaction_manager().stop_ongoing_compactions("disable auto-compaction", this, sstables::compaction_type::Compaction);
|
||||
});
|
||||
// FIXME: stop ongoing compactions
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
flat_mutation_reader
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user