Compare commits

167 commits in the range `scylla-5.0...next-5.0`.
| SHA1 |
|---|
| f6c2624c86 |
| f7d9afd209 |
| b011cc2e78 |
| fb466dd7b7 |
| 697e090659 |
| 2c518f3131 |
| e941a5ac34 |
| 3a7ce5e8aa |
| efa4f312f5 |
| fb4b71ea02 |
| 7387922a29 |
| cb78c3bf2c |
| aeac63a3ee |
| e7b50fb8d3 |
| 6b21f2a351 |
| 0db8e627a5 |
| f1121d2149 |
| a0ca8abe42 |
| 8bceac1713 |
| 6bcc7c6ed5 |
| 67f85875cc |
| 8b874cd4e4 |
| b08c582134 |
| 41556b5f63 |
| 23e7e594c0 |
| e6ac13314d |
| 382d815459 |
| a867b2c0e5 |
| 846edf78c6 |
| 0ccc07322b |
| 0b170192a1 |
| fd4b2a3319 |
| 416929fb2a |
| 9d8d7048eb |
| bae4155ab2 |
| d6e2a326cf |
| 15645ff40b |
| a808fc7172 |
| dd260bfa82 |
| c46935ed5c |
| 985d6bc4c2 |
| 7673ff4ae3 |
| c441eebf46 |
| bf4fa80dd7 |
| 2010231fe9 |
| 0a51eb55e3 |
| d9c6c6283b |
| 90a5344261 |
| 68da667288 |
| 9adb1a8fdd |
| 7623fe01b7 |
| 3b0a0c4876 |
| 019d5cde1b |
| a2e255833a |
| f4aa5cacb1 |
| 8ea9a16f9e |
| 1aa5283a38 |
| 2e7b1858ad |
| 2542b57ddc |
| 01a9871fc3 |
| 6bb7fac8d8 |
| 5dff7489b1 |
| 2775b1d136 |
| 2ae5675c0f |
| d507ad9424 |
| 413af945c0 |
| 9a71680dc7 |
| 94b8baa797 |
| e372a5fe0a |
| 692e5ed175 |
| 5a299f65ff |
| f4ae2fa5f9 |
| 07c20bdfea |
| 8a36c4be54 |
| bf92c2b44c |
| 0e388d2140 |
| 288eb9d231 |
| 9219a59802 |
| f9cea4dc51 |
| 081b2b76cc |
| dfb229a18a |
| 60da855c2d |
| 1718861e94 |
| e03e9b1abe |
| 26c51025c1 |
| 5c39a4524a |
| 9823e8d9c5 |
| b48c9cae95 |
| 14077d2def |
| 25508705a8 |
| 347da028e9 |
| 874fa15202 |
| 99c03cb2af |
| 6c35d3c5cd |
| 707622ce15 |
| bab36b604c |
| 8840711e79 |
| af18bb3fe9 |
| 6003cba7a8 |
| e9afd076eb |
| c5f732d42a |
| 13a1408135 |
| 6685e00dd4 |
| 350bb57291 |
| e186ad5b6c |
| 139e9afc89 |
| a42c6f190c |
| 2b8f0cbd97 |
| a2a762e18d |
| aa973e2b9e |
| e0777f1112 |
| cc6311cbc7 |
| 0354e13718 |
| 2750d2e94b |
| b4383a389b |
| f667c5923a |
| e4ba0c56df |
| 329d55cc4f |
| b956293f47 |
| 6a8c2d3f56 |
| 27a35c7f98 |
| d83134a245 |
| b844d14829 |
| 184df0393e |
| 1b550dd301 |
| 01ce53d7fb |
| e9c7f89b32 |
| 93f468c12c |
| e54ae9efd9 |
| ef40e59c0e |
| 8c56b0b268 |
| fc78d88783 |
| 31a20c4c54 |
| 7e42bcfd61 |
| 2107ffe2d2 |
| 5a97a1060e |
| 2b0487c900 |
| d3b3c53d9f |
| 50c2c1b1d4 |
| aa647a637a |
| 2c0040fcb3 |
| 54564adb7c |
| 839876e8f2 |
| 36002e2b7c |
| 91a8f9e09b |
| bc29f350dd |
| 4fe571f470 |
| ebf38eaead |
| 1c82766f33 |
| e1f78c33b4 |
| 0634b5f734 |
| 6f020b26e1 |
| 7f8dcc5657 |
| 20451760fe |
| 51b031d04e |
| 82d1446ca9 |
| e0acb0766d |
| 4f26d489a0 |
| 43cbc5c836 |
| f0c521efdf |
| b9a61c8e9a |
| 32aa1e5287 |
| da6a126d79 |
| d07e902983 |
| 3c0fc42f84 |
| 964ccf9192 |
| dfdc128faf |
@@ -60,7 +60,7 @@ fi
 
 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.0.2
+VERSION=5.0.13
 
 if test -f version
 then
@@ -78,6 +78,11 @@ future<> controller::start_server() {
 
     _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
     _server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper)).get();
+    // Note: from this point on, if start_server() throws for any reason,
+    // it must first call stop_server() to stop the executor and server
+    // services we just started - or Scylla will cause an assertion
+    // failure when the controller object is destroyed in the exception
+    // unwinding.
     std::optional<uint16_t> alternator_port;
     if (_config.alternator_port()) {
        alternator_port = _config.alternator_port();
@@ -104,7 +109,13 @@ future<> controller::start_server() {
        }
        opts.erase("require_client_auth");
        opts.erase("truststore");
-        utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
+        try {
+            utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
+        } catch(...) {
+            logger.error("Failed to set up Alternator TLS credentials: {}", std::current_exception());
+            stop_server().get();
+            std::throw_with_nested(std::runtime_error("Failed to set up Alternator TLS credentials"));
+        }
     }
     bool alternator_enforce_authorization = _config.alternator_enforce_authorization();
     _server.invoke_on_all(
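The pattern in that hunk, log the failure, roll back the partial startup, then rethrow with `std::throw_with_nested`, keeps the original error available to callers. A minimal standalone sketch of the same idiom (the names here are illustrative, not Scylla's API):

```cpp
#include <exception>
#include <iostream>
#include <stdexcept>

// Hypothetical startup step that may fail.
void configure_tls() { throw std::runtime_error("bad certificate"); }

void start_server() {
    try {
        configure_tls();
    } catch (...) {
        // Roll back partial startup here, then rethrow with added context,
        // keeping the in-flight exception as the nested cause.
        std::throw_with_nested(std::runtime_error("failed to set up TLS"));
    }
}

// Print an exception and all of its nested causes, outermost first.
void print_chain(const std::exception& e, int depth = 0) {
    std::cerr << std::string(depth * 2, ' ') << e.what() << '\n';
    try {
        std::rethrow_if_nested(e);
    } catch (const std::exception& nested) {
        print_chain(nested, depth + 1);
    }
}

int main() {
    try {
        start_server();
    } catch (const std::exception& e) {
        print_chain(e); // "failed to set up TLS", then "bad certificate"
    }
}
```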
@@ -34,6 +34,7 @@
 #include "expressions.hh"
 #include "conditions.hh"
 #include "cql3/constants.hh"
+#include "cql3/util.hh"
 #include <optional>
 #include "utils/overloaded_functor.hh"
 #include "seastar/json/json_elements.hh"
@@ -46,6 +47,7 @@
 #include <seastar/core/coroutine.hh>
 #include <boost/range/adaptors.hpp>
 #include <boost/range/algorithm/find_end.hpp>
+#include <unordered_set>
 #include "service/storage_proxy.hh"
 #include "gms/gossiper.hh"
 #include "schema_registry.hh"
@@ -148,16 +150,16 @@ static void validate_table_name(const std::string& name) {
 // instead of each component individually as DynamoDB does.
 // The view_name() function assumes the table_name has already been validated
 // but validates the legality of index_name and the combination of both.
-static std::string view_name(const std::string& table_name, const std::string& index_name, const std::string& delim = ":") {
+static std::string view_name(const std::string& table_name, std::string_view index_name, const std::string& delim = ":") {
     static const std::regex valid_index_name_chars ("[a-zA-Z0-9_.-]*");
     if (index_name.length() < 3) {
         throw api_error::validation("IndexName must be at least 3 characters long");
     }
-    if (!std::regex_match(index_name.c_str(), valid_index_name_chars)) {
+    if (!std::regex_match(index_name.data(), valid_index_name_chars)) {
         throw api_error::validation(
                 format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
     }
-    std::string ret = table_name + delim + index_name;
+    std::string ret = table_name + delim + std::string(index_name);
     if (ret.length() > max_table_name_length) {
         throw api_error::validation(
                 format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
@@ -166,7 +168,7 @@ static std::string view_name(const std::string& table_name, const std::string& i
     return ret;
 }
 
-static std::string lsi_name(const std::string& table_name, const std::string& index_name) {
+static std::string lsi_name(const std::string& table_name, std::string_view index_name) {
     return view_name(table_name, index_name, "!:");
 }
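One subtlety worth noting in the new signature: `std::regex_match(index_name.data(), ...)` treats its argument as a NUL-terminated C string, which a `std::string_view` does not guarantee in general. A sketch of the safer iterator-pair overload (generic C++, not Scylla code):

```cpp
#include <regex>
#include <string_view>

// Match an entire string_view without assuming NUL termination.
bool valid_index_name(std::string_view name) {
    static const std::regex valid_chars("[a-zA-Z0-9_.-]*");
    // The iterator-pair overload bounds the match to [begin, end), so it
    // also works for a view into the middle of a larger buffer, unlike
    // regex_match(name.data(), ...).
    return std::regex_match(name.begin(), name.end(), valid_chars);
}

int main() {
    return valid_index_name("by_customer") ? 0 : 1;
}
```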
@@ -273,16 +275,16 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
     if (index_name) {
         if (index_name->IsString()) {
             orig_table_name = std::move(table_name);
-            table_name = view_name(orig_table_name, index_name->GetString());
+            table_name = view_name(orig_table_name, rjson::to_string_view(*index_name));
             type = table_or_view_type::gsi;
         } else {
             throw api_error::validation(
-                    format("Non-string IndexName '{}'", index_name->GetString()));
+                    format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
         }
         // If no tables for global indexes were found, the index may be local
         if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
             type = table_or_view_type::lsi;
-            table_name = lsi_name(orig_table_name, index_name->GetString());
+            table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name));
         }
     }
@@ -432,6 +434,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
     rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
     rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
     rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
+    // In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
+    rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
+    rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
 
     std::unordered_map<std::string,std::string> key_attribute_types;
     // Add base table's KeySchema and collect types for AttributeDefinitions:
@@ -453,6 +460,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
         rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
         // Add indexes's KeySchema and collect types for AttributeDefinitions:
         describe_key_schema(view_entry, *vptr, key_attribute_types);
+        // Add projection type
+        rjson::value projection = rjson::empty_object();
+        rjson::add(projection, "ProjectionType", "ALL");
+        // FIXME: we have to get ProjectionType from the schema when it is added
+        rjson::add(view_entry, "Projection", std::move(projection));
         // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
         rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
         rjson::push_back(index_array, std::move(view_entry));
@@ -884,17 +896,23 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
     const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
     std::vector<schema_builder> view_builders;
     std::vector<sstring> where_clauses;
+    std::unordered_set<std::string> index_names;
     if (gsi) {
         if (!gsi->IsArray()) {
             co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
         }
         for (const rjson::value& g : gsi->GetArray()) {
-            const rjson::value* index_name = rjson::find(g, "IndexName");
-            if (!index_name || !index_name->IsString()) {
+            const rjson::value* index_name_v = rjson::find(g, "IndexName");
+            if (!index_name_v || !index_name_v->IsString()) {
                 co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
             }
-            std::string vname(view_name(table_name, index_name->GetString()));
-            elogger.trace("Adding GSI {}", index_name->GetString());
+            std::string_view index_name = rjson::to_string_view(*index_name_v);
+            auto [it, added] = index_names.emplace(index_name);
+            if (!added) {
+                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
+            }
+            std::string vname(view_name(table_name, index_name));
+            elogger.trace("Adding GSI {}", index_name);
             // FIXME: read and handle "Projection" parameter. This will
             // require the MV code to copy just parts of the attrs map.
             schema_builder view_builder(keyspace_name, vname);
@@ -927,9 +945,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
             if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
                 add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
             }
-            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
+            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
             if (!view_range_key.empty()) {
-                where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
+                where_clause = format("{} AND {} IS NOT NULL", where_clause,
+                        cql3::util::maybe_quote(view_range_key));
             }
             where_clauses.push_back(std::move(where_clause));
             view_builders.emplace_back(std::move(view_builder));
@@ -942,12 +961,17 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
             throw api_error::validation("LocalSecondaryIndexes must be an array.");
         }
         for (const rjson::value& l : lsi->GetArray()) {
-            const rjson::value* index_name = rjson::find(l, "IndexName");
-            if (!index_name || !index_name->IsString()) {
+            const rjson::value* index_name_v = rjson::find(l, "IndexName");
+            if (!index_name_v || !index_name_v->IsString()) {
                 throw api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
             }
-            std::string vname(lsi_name(table_name, index_name->GetString()));
-            elogger.trace("Adding LSI {}", index_name->GetString());
+            std::string_view index_name = rjson::to_string_view(*index_name_v);
+            auto [it, added] = index_names.emplace(index_name);
+            if (!added) {
+                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
+            }
+            std::string vname(lsi_name(table_name, index_name));
+            elogger.trace("Adding LSI {}", index_name);
             if (range_key.empty()) {
                 co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
             }
@@ -979,9 +1003,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
             // Note above we don't need to add virtual columns, as all
             // base columns were copied to view. TODO: reconsider the need
             // for virtual columns when we support Projection.
-            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
+            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
             if (!view_range_key.empty()) {
-                where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
+                where_clause = format("{} AND {} IS NOT NULL", where_clause,
+                        cql3::util::maybe_quote(view_range_key));
             }
             where_clauses.push_back(std::move(where_clause));
             view_builders.emplace_back(std::move(view_builder));
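The duplicate-IndexName check above leans on `unordered_set::emplace`, whose boolean result says whether an insertion actually happened. A self-contained sketch of the same idiom:

```cpp
#include <iostream>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

int main() {
    std::vector<std::string_view> requested = {"by_date", "by_user", "by_date"};
    std::unordered_set<std::string> seen;
    for (std::string_view name : requested) {
        // emplace() returns {iterator, inserted}; inserted == false means
        // the name was already present, i.e. a duplicate in the request.
        auto [it, inserted] = seen.emplace(name);
        if (!inserted) {
            std::cerr << "Duplicate IndexName '" << name << "'\n";
            return 1;
        }
    }
    std::cout << "all index names unique\n";
}
```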
@@ -143,19 +143,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
     auto table = find_table(_proxy, request);
     auto db = _proxy.data_dictionary();
     auto cfs = db.get_tables();
-    auto i = cfs.begin();
-    auto e = cfs.end();
 
     if (limit < 1) {
         throw api_error::validation("Limit must be 1 or more");
     }
 
-    // TODO: the unordered_map here is not really well suited for partial
-    // querying - we're sorting on local hash order, and creating a table
-    // between queries may or may not miss info. But that should be rare,
-    // and we can probably expect this to be a single call.
+    // #12601 (maybe?) - sort the set of tables on ID. This should ensure we never
+    // generate duplicates in a paged listing here. Can obviously miss things if they
+    // are added between paged calls and end up with a "smaller" UUID/ARN, but that
+    // is to be expected.
+    std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
+        return t1.schema()->id() < t2.schema()->id();
+    });
+
+    auto i = cfs.begin();
+    auto e = cfs.end();
 
     if (streams_start) {
-        i = std::find_if(i, e, [&](data_dictionary::table t) {
+        i = std::find_if(i, e, [&](const data_dictionary::table& t) {
             return t.schema()->id() == streams_start
                 && cdc::get_base_table(db.real_database(), *t.schema())
                 && is_alternator_keyspace(t.schema()->ks_name())
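Sorting on an immutable ID gives a paged listing a stable order, so a resume token (the last ID seen) lands in a deterministic spot on the next call. A minimal sketch of that resume pattern, with a hypothetical `Table` type standing in for Scylla's:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct Table { uint64_t id; };

// Return up to `limit` tables, in id order, starting after `resume_after`.
std::vector<Table> list_page(std::vector<Table> tables,
                             std::optional<uint64_t> resume_after,
                             size_t limit) {
    // Stable, creation-independent order: hash-map iteration order would
    // shuffle entries between calls and could duplicate or skip rows.
    std::sort(tables.begin(), tables.end(),
              [](const Table& a, const Table& b) { return a.id < b.id; });
    auto i = tables.begin();
    if (resume_after) {
        i = std::find_if(i, tables.end(),
                         [&](const Table& t) { return t.id > *resume_after; });
    }
    auto e = i + std::min<size_t>(limit, tables.end() - i);
    return {i, e};
}

int main() {
    std::vector<Table> all = {{7}, {3}, {12}, {5}};
    auto page1 = list_page(all, std::nullopt, 2);    // ids 3, 5
    auto page2 = list_page(all, page1.back().id, 2); // ids 7, 12
    std::cout << page2.front().id << '\n';           // 7
}
```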
@@ -12,6 +12,7 @@
 #include <seastar/core/sharded.hh>
 #include <seastar/core/abort_source.hh>
 #include <seastar/core/semaphore.hh>
+#include "data_dictionary/data_dictionary.hh"
 
 namespace replica {
 class database;
@@ -593,6 +593,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         if (column_families.empty()) {
             column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
         }
+        apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, column_families);
         return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
             auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
                 return db.find_uuid(keyspace, cf_name);
@@ -617,6 +618,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         if (column_families.empty()) {
             column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
         }
+        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, column_families);
         return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
                 column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
             if (!is_cleanup_allowed) {
@@ -635,7 +637,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
             // as a table can be dropped during loop below, let's find it before issuing the cleanup request.
             for (auto& id : table_ids) {
                 replica::table& t = db.find_column_family(id);
-                co_await cm.perform_cleanup(db, &t);
+                co_await t.perform_cleanup_compaction(db);
             }
             co_return;
         }).then([]{
@@ -645,6 +647,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     });
 
     ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
+        apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, tables);
         co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
             bool needed = false;
             for (const auto& table : tables) {
@@ -658,6 +661,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
         bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
 
+        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, column_families, exclude_current_version);
         return ctx.db.invoke_on_all([=] (replica::database& db) {
             return do_for_each(column_families, [=, &db](sstring cfname) {
                 auto& cm = db.get_compaction_manager();
@@ -672,6 +676,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
         auto keyspace = validate_keyspace(ctx, req->param);
         auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
+        apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
         auto &db = ctx.db.local();
         if (column_families.empty()) {
             co_await db.flush_on_all(keyspace);
@@ -683,6 +688,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
 
     ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("decommission");
         return ss.local().decommission().then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });
@@ -698,6 +704,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
         auto host_id = req->get_query_param("host_id");
         std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
+        apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
         auto ignore_nodes = std::list<gms::inet_address>();
         for (std::string n : ignore_nodes_strs) {
             try {
@@ -770,6 +777,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     });
 
     ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("drain");
         return ss.local().drain().then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });
@@ -802,12 +810,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     });
 
     ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("stop_gossiping");
         return ss.local().stop_gossiping().then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });
     });
 
     ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("start_gossiping");
        return ss.local().start_gossiping().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -904,6 +914,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
 
     ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
         auto source_dc = req->get_query_param("source_dc");
+        apilog.info("rebuild: source_dc={}", source_dc);
         return ss.local().rebuild(std::move(source_dc)).then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });
@@ -940,6 +951,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         // FIXME: We should truncate schema tables if more than one node in the cluster.
         auto& sp = service::get_storage_proxy();
         auto& fs = sp.local().features();
+        apilog.info("reset_local_schema");
         return db::schema_tables::recalculate_schema_version(sp, fs).then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });
@@ -947,6 +959,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
 
     ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
         auto probability = req->get_query_param("probability");
+        apilog.info("set_trace_probability: probability={}", probability);
         return futurize_invoke([probability] {
             double real_prob = std::stod(probability.c_str());
             return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
@@ -984,6 +997,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         auto ttl = req->get_query_param("ttl");
         auto threshold = req->get_query_param("threshold");
         auto fast = req->get_query_param("fast");
+        apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
         try {
             return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
                 if (threshold != "") {
@@ -1010,6 +1024,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         auto keyspace = validate_keyspace(ctx, req->param);
         auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
 
+        apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
         return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, true);
     });
 
@@ -1017,6 +1032,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         auto keyspace = validate_keyspace(ctx, req->param);
         auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
 
+        apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
         return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, false);
     });
 
@@ -1357,7 +1373,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
     if (!req_param<bool>(*req, "disable_snapshot", false)) {
         auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
         f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
-            return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag);
+            return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag, db::snapshot_ctl::skip_flush::no, db::snapshot_ctl::allow_view_snapshots::yes);
         });
     }
@@ -601,16 +601,11 @@ future<> compaction_manager::stop() {
     }
 }
 
-void compaction_manager::really_do_stop() {
-    if (_state == state::none || _state == state::stopped) {
-        return;
-    }
-
-    _state = state::stopped;
+future<> compaction_manager::really_do_stop() {
     cmlog.info("Asked to stop");
     // Reset the metrics registry
     _metrics.clear();
-    _stop_future.emplace(stop_ongoing_compactions("shutdown").then([this] () mutable {
+    return stop_ongoing_compactions("shutdown").then([this] () mutable {
         reevaluate_postponed_compactions();
         return std::move(_waiting_reevalution);
     }).then([this] {
@@ -618,12 +613,34 @@ void compaction_manager::really_do_stop() {
         _compaction_submission_timer.cancel();
         cmlog.info("Stopped");
         return _compaction_controller.shutdown();
-    }));
+    });
 }
 
+template <typename Ex>
+requires std::is_base_of_v<std::exception, Ex> &&
+requires (const Ex& ex) {
+    { ex.code() } noexcept -> std::same_as<const std::error_code&>;
+}
+auto swallow_enospc(const Ex& ex) noexcept {
+    if (ex.code().value() != ENOSPC) {
+        return make_exception_future<>(std::make_exception_ptr(ex));
+    }
+
+    cmlog.warn("Got ENOSPC on stop, ignoring...");
+    return make_ready_future<>();
+}
+
 void compaction_manager::do_stop() noexcept {
     if (_state == state::none || _state == state::stopped) {
         return;
     }
 
     try {
-        really_do_stop();
+        _state = state::stopped;
+        _stop_future = really_do_stop()
+            .handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
+            .handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
+        ;
     } catch (...) {
         try {
             cmlog.error("Failed to stop the manager: {}", std::current_exception());
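`swallow_enospc` above is constrained by a requires-clause: it accepts any exception type that exposes a `noexcept` `code()` returning a `std::error_code`, which covers both `std::system_error` and a project-specific type like `storage_io_error`. A compilable sketch of the same constraint technique, using plain C++ instead of Seastar futures:

```cpp
#include <cerrno>
#include <concepts>
#include <iostream>
#include <system_error>

// Accept any exception type that carries a std::error_code.
template <typename Ex>
requires std::derived_from<Ex, std::exception> &&
requires (const Ex& ex) {
    { ex.code() } noexcept -> std::same_as<const std::error_code&>;
}
bool is_enospc(const Ex& ex) noexcept {
    return ex.code().value() == ENOSPC;
}

int main() {
    std::system_error e(std::error_code(ENOSPC, std::generic_category()), "write");
    // Out-of-disk during shutdown is often not actionable, so a stop path
    // may reasonably log and swallow it instead of failing the shutdown.
    std::cout << (is_enospc(e) ? "ignore on stop" : "propagate") << '\n';
}
```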
@@ -235,7 +235,7 @@ public:
 
     // Stop all fibers, without waiting. Safe to be called multiple times.
     void do_stop() noexcept;
-    void really_do_stop();
+    future<> really_do_stop();
 
     // Submit a table to be compacted.
     void submit(replica::table* t);
@@ -615,6 +615,8 @@ arg_parser.add_argument('--static-yaml-cpp', dest='staticyamlcpp', action='store
                         help='Link libyaml-cpp statically')
 arg_parser.add_argument('--tests-debuginfo', action='store', dest='tests_debuginfo', type=int, default=0,
                         help='Enable(1)/disable(0)compiler debug information generation for tests')
+arg_parser.add_argument('--perf-tests-debuginfo', action='store', dest='perf_tests_debuginfo', type=int, default=0,
+                        help='Enable(1)/disable(0)compiler debug information generation for perf tests')
 arg_parser.add_argument('--python', action='store', dest='python', default='python3',
                         help='Python3 path')
 arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true', default=False,
@@ -1377,6 +1379,7 @@ linker_flags = linker_flags(compiler=args.cxx)
 
 dbgflag = '-g -gz' if args.debuginfo else ''
 tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
+perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'
 
 # Strip if debuginfo is disabled, otherwise we end up with partial
 # debug info from the libraries we static link with
@@ -1901,7 +1904,8 @@ with open(buildfile_tmp, 'w') as f:
             # So we strip the tests by default; The user can very
             # quickly re-link the test unstripped by adding a "_g"
             # to the test name, e.g., "ninja build/release/testname_g"
-            f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
+            link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
+            f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
             f.write('  libs = {}\n'.format(local_libs))
             f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
             f.write('  libs = {}\n'.format(local_libs))
@@ -2004,7 +2008,8 @@ with open(buildfile_tmp, 'w') as f:
                 f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
                 if cc.endswith('Parser.cpp'):
                     # Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
-                    flags = '-O1'
+                    flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
+
                     if has_sanitize_address_use_after_scope:
                         flags += ' -fno-sanitize-address-use-after-scope'
                     f.write('  obj_cxxflags = %s\n' % flags)
@@ -1386,7 +1386,7 @@ serviceLevelOrRoleName returns [sstring name]
                           std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
     | t=STRING_LITERAL { $name = sstring($t.text); }
     | t=QUOTED_NAME { $name = sstring($t.text); }
-    | k=unreserved_keyword { $name = sstring($t.text);
+    | k=unreserved_keyword { $name = k;
                           std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
     | QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
     ;
@@ -12,6 +12,7 @@
 
 #include "cql3_type.hh"
 #include "cql3/util.hh"
+#include "exceptions/exceptions.hh"
 #include "ut_name.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "data_dictionary/user_types_metadata.hh"
@@ -436,7 +437,20 @@ sstring maybe_quote(const sstring& identifier) {
     }
 
     if (!need_quotes) {
-        return identifier;
+        // A seemingly valid identifier matching [a-z][a-z0-9_]* may still
+        // need quoting if it is a CQL keyword, e.g., "to" (see issue #9450).
+        // While our parser Cql.g has different production rules for different
+        // types of identifiers (column names, table names, etc.), all of
+        // these behave identically for alphanumeric strings: they exclude
+        // many keywords but allow keywords listed as "unreserved keywords".
+        // So we can use any of them, for example cident.
+        try {
+            cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
+            return identifier;
+        } catch(exceptions::syntax_exception&) {
+            // This alphanumeric string is not a valid identifier, so fall
+            // through to have it quoted:
+        }
     }
     if (num_quotes == 0) {
         return make_sstring("\"", identifier, "\"");
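The fix above validates an all-lowercase identifier by actually parsing it with the grammar, falling back to quoting when the parse fails. A generic sketch of that "parse to validate" fallback; the keyword list here is illustrative, not the real CQL grammar:

```cpp
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>

// Stand-in for running the real grammar rule: reject reserved words.
void parse_identifier(const std::string& s) {
    static const std::set<std::string> reserved = {"to", "where", "select"};
    if (reserved.count(s)) {
        throw std::runtime_error("syntax error: keyword");
    }
}

std::string maybe_quote(const std::string& identifier) {
    try {
        parse_identifier(identifier); // parses cleanly, safe to emit bare
        return identifier;
    } catch (const std::runtime_error&) {
        return "\"" + identifier + "\""; // keyword, so quote it
    }
}

int main() {
    std::cout << maybe_quote("ttl") << '\n'; // ttl   (unreserved, left bare)
    std::cout << maybe_quote("to") << '\n';  // "to"  (reserved, quoted)
}
```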
@@ -1293,7 +1293,7 @@ expression search_and_replace(const expression& e,
                 };
             },
             [&] (const binary_operator& oper) -> expression {
-                return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
+                return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
             },
             [&] (const column_mutation_attribute& cma) -> expression {
                 return column_mutation_attribute{cma.kind, recurse(cma.column)};
@@ -83,7 +83,7 @@ public:
 
     virtual sstring assignment_testable_source_context() const override {
         auto&& name = _type->field_name(_field);
-        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
+        auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
         return format("{}.{}", _selected, sname);
     }
@@ -422,11 +422,16 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
     }
 
     auto clustering_columns_restrictions = _restrictions->get_clustering_columns_restrictions();
-    if (dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions)) {
+    bool has_multi_col_clustering_restrictions =
+            dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions) != nullptr;
+    if (has_multi_col_clustering_restrictions) {
         clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
-        return expr::is_satisfied_by(
+        bool multi_col_clustering_satisfied = expr::is_satisfied_by(
                 clustering_columns_restrictions->expression,
                 partition_key, clustering_key, static_row, row, selection, _options);
+        if (!multi_col_clustering_satisfied) {
+            return false;
+        }
     }
 
     auto static_row_iterator = static_row.iterator();
@@ -474,6 +479,13 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
         if (_skip_ck_restrictions) {
             continue;
         }
+        if (has_multi_col_clustering_restrictions) {
+            // Mixing multi column and single column restrictions on clustering
+            // key columns is forbidden.
+            // Since there are multi column restrictions we have to skip
+            // evaluating single column restrictions or we will get an error.
+            continue;
+        }
         auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
         auto restr_it = clustering_key_restrictions_map.find(cdef);
         if (restr_it == clustering_key_restrictions_map.end()) {
@@ -254,6 +254,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
     if (options.getSerialConsistency() == null)
         throw new InvalidRequestException("Invalid empty serial consistency level");
 #endif
+    for (size_t i = 0; i < _statements.size(); ++i) {
+        _statements[i].statement->validate_primary_key_restrictions(options.for_statement(i));
+    }
 
     if (_has_conditions) {
         ++_stats.cas_batches;
         _stats.statements_in_cas_batches += _statements.size();
@@ -121,6 +121,9 @@ std::optional<mutation> cas_request::apply(foreign_ptr<lw_shared_ptr<query::resu
 
 const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas_row_update& op) const {
     static const clustering_key empty_ckey = clustering_key::make_empty();
+    if (_key.empty()) {
+        throw exceptions::invalid_request_exception("partition key ranges empty - probably caused by an unset value");
+    }
     const partition_key& pkey = _key.front().start()->value().key().value();
     // If a statement has only static columns conditions, we must ignore its clustering columns
     // restriction when choosing a row to check the conditions, i.e. choose any partition row,
@@ -134,6 +137,9 @@ const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas
     // Another case when we pass an empty clustering key prefix is apparently when the table
     // doesn't have any clustering key columns and the clustering key range is empty (open
     // ended on both sides).
+    if (op.ranges.empty()) {
+        throw exceptions::invalid_request_exception("clustering key ranges empty - probably caused by an unset value");
+    }
     const clustering_key& ckey = !op.statement.has_only_static_column_conditions() && op.ranges.front().start() ?
             op.ranges.front().start()->value() : empty_ckey;
     return _rows.find_row(pkey, ckey);
@@ -20,6 +20,7 @@
 #include "gms/feature_service.hh"
 #include "tombstone_gc_extension.hh"
 #include "tombstone_gc.hh"
+#include "utils/bloom_calculations.hh"
 
 #include <boost/algorithm/string/predicate.hpp>
 
@@ -145,6 +146,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
         throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
     }
 
+    if (get_simple(KW_BF_FP_CHANCE)) {
+        double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
+        double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
+        if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
+            throw exceptions::configuration_exception(format(
+                "{} must be larger than {} and less than or equal to 1.0 (got {})",
+                KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
+        }
+    }
+
     speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
 }
@@ -13,6 +13,7 @@
 
 #include "cql3/statements/cf_prop_defs.hh"
 #include "cql3/column_identifier.hh"
+#include "data_dictionary/data_dictionary.hh"
 
 namespace cql3 {
@@ -110,9 +110,6 @@ future<> modification_statement::check_access(query_processor& qp, const service
 
 future<std::vector<mutation>>
 modification_statement::get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, service::query_state& qs) const {
-    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
-        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
-    }
     auto cl = options.get_consistency();
     auto json_cache = maybe_prepare_json_cache(options);
     auto keys = build_partition_keys(options, json_cache);
@@ -245,6 +242,12 @@ modification_statement::execute(query_processor& qp, service::query_state& qs, c
     return modify_stage(this, seastar::ref(qp), seastar::ref(qs), seastar::cref(options));
 }
 
+void modification_statement::validate_primary_key_restrictions(const query_options& options) const {
+    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
+        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
+    }
+}
+
 future<::shared_ptr<cql_transport::messages::result_message>>
 modification_statement::do_execute(query_processor& qp, service::query_state& qs, const query_options& options) const {
     if (has_conditions() && options.get_protocol_version() == 1) {
@@ -255,6 +258,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
 
     inc_cql_stats(qs.get_client_state().is_internal());
 
+    validate_primary_key_restrictions(options);
+
     if (has_conditions()) {
         return execute_with_condition(qp, qs, options);
     }
@@ -231,6 +231,8 @@ public:
     // True if this statement needs to read only static column values to check if it can be applied.
     bool has_only_static_column_conditions() const { return !_has_regular_column_conditions && _has_static_column_conditions; }
 
+    void validate_primary_key_restrictions(const query_options& options) const;
+
     virtual future<::shared_ptr<cql_transport::messages::result_message>>
     execute(query_processor& qp, service::query_state& qs, const query_options& options) const override;
@@ -30,7 +30,7 @@ void sl_prop_defs::validate() {
         data_value v = duration_type->deserialize(duration_type->from_string(*repr));
         cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
         if (duration.months || duration.days) {
-            throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
+            throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
         }
         if (duration.nanoseconds % 1'000'000 != 0) {
             throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
@@ -18,6 +18,8 @@
 #include "types/listlike_partial_deserializing_iterator.hh"
 #include "utils/managed_bytes.hh"
 #include "exceptions/exceptions.hh"
+#include <boost/algorithm/string/trim_all.hpp>
+#include <boost/algorithm/string.hpp>
 
 static inline bool is_control_char(char c) {
     return c >= 0 && c <= 0x1F;
@@ -78,8 +80,35 @@ static int64_t to_int64_t(const rjson::value& value) {
         return value.GetInt();
     } else if (value.IsUint()) {
         return value.GetUint();
-    } else if (value.GetUint64()) {
+    } else if (value.IsUint64()) {
         return value.GetUint64(); //NOTICE: large uint64_t values will get overflown
+    } else if (value.IsDouble()) {
+        // We allow specifing integer constants
+        // using scientific notation (for example 1.3e8)
+        // and floating-point numbers ending with .0 (for example 12.0),
+        // but not floating-point numbers with fractional part (12.34).
+        //
+        // The reason is that JSON standard does not have separate
+        // types for integers and floating-point numbers, only
+        // a single "number" type. Some serializers may
+        // produce an integer in that floating-point format.
+        double double_value = value.GetDouble();
+
+        // Check if the value contains disallowed fractional part (.34 from 12.34).
+        // With RapidJSON and an integer value in range [-(2^53)+1, (2^53)-1],
+        // the fractional part will be zero as the entire value
+        // fits in 53-bit significand. RapidJSON's parsing code does not lose accuracy:
+        // when parsing a number like 12.34e8, it accumulates 1234 to a int64_t number,
+        // then converts it to double and multiples by power of 10, never having any
+        // digit in fractional part.
+        double integral;
+        double fractional = std::modf(double_value, &integral);
+        if (fractional != 0.0 && fractional != -0.0) {
+            throw marshal_exception(format("Incorrect JSON floating-point value "
+                "for int64 type: {} (it should not contain fractional part {})", value, fractional));
+        }
+
+        return double_value;
     }
     throw marshal_exception(format("Incorrect JSON value for int64 type: {}", value));
 }
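The core of the new `IsDouble()` branch is `std::modf`, which splits a double into integral and fractional parts, so 12.0 and 1.3e8 pass while 12.34 is rejected. A tiny standalone demonstration:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    for (double v : {12.0, 1.3e8, 12.34}) {
        double integral;
        double fractional = std::modf(v, &integral);
        // A nonzero fractional part means the JSON number is not an exact
        // integer, so a converter like to_int64_t() rejects it.
        std::printf("%g -> fractional=%g -> %s\n", v, fractional,
                    fractional == 0.0 ? "accepted" : "rejected");
    }
}
```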
@@ -189,7 +218,7 @@ struct from_json_object_visitor {
             throw marshal_exception("bytes_type must be represented as string");
         }
         std::string_view string_v = rjson::to_string_view(value);
-        if (string_v.size() < 2 && string_v[0] != '0' && string_v[1] != 'x') {
+        if (string_v.size() < 2 || string_v[0] != '0' || string_v[1] != 'x') {
            throw marshal_exception("Blob JSON strings must start with 0x");
        }
        string_v.remove_prefix(2);
@@ -197,6 +226,17 @@ struct from_json_object_visitor {
     }
     bytes operator()(const boolean_type_impl& t) {
         if (!value.IsBool()) {
+            if (value.IsString()) {
+                std::string str(rjson::to_string_view(value));
+                boost::trim_all(str);
+                boost::to_lower(str);
+
+                if (str == "true") {
+                    return t.decompose(true);
+                } else if (str == "false") {
+                    return t.decompose(false);
+                }
+            }
             throw marshal_exception(format("Invalid JSON object {}", value));
         }
         return t.decompose(value.GetBool());
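The blob-prefix fix is a classic De Morgan slip: rejecting "not (size >= 2 and s[0] == '0' and s[1] == 'x')" requires `||` between the negated terms, and the short-circuit of `||` also keeps `s[0]`/`s[1]` from being read on too-short input. A quick sketch:

```cpp
#include <cassert>
#include <string_view>

bool has_0x_prefix(std::string_view s) {
    // Correct: the negation of (size >= 2 && s[0] == '0' && s[1] == 'x').
    // With '&&' instead of '||' the guard passes almost everything, and
    // s[0]/s[1] could be evaluated even when size() < 2.
    if (s.size() < 2 || s[0] != '0' || s[1] != 'x') {
        return false;
    }
    return true;
}

int main() {
    assert(has_0x_prefix("0xdeadbeef"));
    assert(!has_0x_prefix("deadbeef"));
    assert(!has_0x_prefix("0"));
}
```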
@@ -74,6 +74,13 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
 /// forbids non-alpha-numeric characters in identifier names.
 /// Quoting involves wrapping the string in double-quotes ("). A double-quote
 /// character itself is quoted by doubling it.
+/// maybe_quote() also quotes reserved CQL keywords (e.g., "to", "where")
+/// but doesn't quote *unreserved* keywords (like ttl, int or as).
+/// Note that this means that if new reserved keywords are added to the
+/// parser, a saved output of maybe_quote() may no longer be parsable by
+/// parser. To avoid this forward-compatibility issue, use quote() instead
+/// of maybe_quote() - to unconditionally quote an identifier even if it is
+/// lowercase and not (yet) a keyword.
 sstring maybe_quote(const sstring& s);
 
 // Check whether timestamp is not too far in the future as this probably
@@ -11,6 +11,7 @@
 */
 
 #include <chrono>
+#include <exception>
 #include <seastar/core/future-util.hh>
 #include <seastar/core/do_with.hh>
 #include <seastar/core/semaphore.hh>
@@ -247,6 +248,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
         } catch (data_dictionary::no_such_keyspace& ex) {
             // should probably ignore and drop the batch
         } catch (...) {
+            blogger.warn("Replay failed (will retry): {}", std::current_exception());
             // timeout, overload etc.
             // Do _not_ remove the batch, assuning we got a node write error.
             // Since we don't have hints (which origin is satisfied with),
@@ -881,6 +881,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
         "Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
     , restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
     , restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
+    , cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
+        "Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
     , default_log_level(this, "default_log_level", value_status::Used)
     , logger_log_level(this, "logger_log_level", value_status::Used)
     , log_to_stdout(this, "log_to_stdout", value_status::Used)
@@ -365,6 +365,9 @@ public:
     named_value<tri_mode_restriction> restrict_replication_simplestrategy;
     named_value<tri_mode_restriction> restrict_dtcs;
 
+
+    named_value<bool> cache_index_pages;
+
     seastar::logging_settings logging_settings(const log_cli::options&) const;
 
     const db::extensions& extensions() const;
@@ -75,7 +75,7 @@ future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspa
     });
 }
 
-future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
+future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
     if (ks_name.empty()) {
         throw std::runtime_error("You must supply a keyspace name");
     }
@@ -86,25 +86,25 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
         throw std::runtime_error("You must supply a snapshot name.");
     }
 
-    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] () mutable {
-        return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf);
+    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf, av] () mutable {
+        return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf, av);
     });
 }
 
-future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
+future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
     co_await check_snapshot_not_exist(ks_name, tag, tables);
 
     for (const auto& table_name : tables) {
         auto& cf = _db.local().find_column_family(ks_name, table_name);
-        if (cf.schema()->is_view()) {
+        if (cf.schema()->is_view() && !av) {
             throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
         }
     }
     co_await _db.local().snapshot_on_all(ks_name, std::move(tables), std::move(tag), bool(sf));
 }
 
-future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf) {
-    return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf);
+future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf, allow_view_snapshots av) {
+    return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf, av);
 }
 
 future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {
@@ -27,6 +27,7 @@ namespace db {
 class snapshot_ctl : public peering_sharded_service<snapshot_ctl> {
 public:
     using skip_flush = bool_class<class skip_flush_tag>;
+    using allow_view_snapshots = bool_class<class allow_view_snapsots_tag>;
 
     struct snapshot_details {
         int64_t live;
@@ -64,7 +65,7 @@ public:
      * @param tables a vector of tables names to snapshot
      * @param tag the tag given to the snapshot; may not be null or empty
      */
-    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
+    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
 
     /**
      * Takes the snapshot of a specific column family. A snapshot name must be specified.
@@ -73,7 +74,7 @@ public:
      * @param columnFamilyName the column family to snapshot
      * @param tag the tag given to the snapshot; may not be null or empty
      */
-    future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no);
+    future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
 
     /**
      * Remove the snapshot with the given name from the given keyspaces.
@@ -99,7 +100,7 @@ private:
     std::result_of_t<Func()> run_snapshot_list_operation(Func&&);
 
     future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
-    future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
+    future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
 };
 
 }
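`allow_view_snapshots` is a Seastar `bool_class`: a strongly typed boolean whose tag type keeps it from being mixed up with another flag (like `skip_flush`) at a call site. A small sketch of the idiom, using a plain C++ stand-in for `seastar::bool_class` so it runs without Seastar:

```cpp
#include <iostream>

// Minimal stand-in for seastar::bool_class<Tag>: one distinct type per tag.
template <typename Tag>
class bool_class {
    bool _v;
public:
    constexpr explicit bool_class(bool v) : _v(v) {}
    constexpr explicit operator bool() const { return _v; }
    static const bool_class yes;
    static const bool_class no;
};
template <typename Tag> const bool_class<Tag> bool_class<Tag>::yes{true};
template <typename Tag> const bool_class<Tag> bool_class<Tag>::no{false};

using skip_flush = bool_class<class skip_flush_tag>;
using allow_view_snapshots = bool_class<class allow_view_snapshots_tag>;

void take_snapshot(skip_flush sf, allow_view_snapshots av) {
    std::cout << "skip_flush=" << bool(sf) << " allow_views=" << bool(av) << '\n';
}

int main() {
    take_snapshot(skip_flush::no, allow_view_snapshots::yes);
    // take_snapshot(allow_view_snapshots::yes, skip_flush::no);
    // ^ would not compile: swapped arguments are a type error,
    //   unlike two raw bools.
}
```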
@@ -2482,10 +2482,14 @@ class db_config_table final : public streaming_virtual_table {
         for (auto& c_ref : cfg.values()) {
             auto& c = c_ref.get();
             if (c.name() == name) {
-                if (c.set_value(value, utils::config_file::config_source::CQL)) {
-                    return cfg.broadcast_to_all_shards();
-                } else {
-                    return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
+                try {
+                    if (c.set_value(value, utils::config_file::config_source::CQL)) {
+                        return cfg.broadcast_to_all_shards();
+                    } else {
+                        return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
+                    }
+                } catch (boost::bad_lexical_cast&) {
+                    return make_exception_future<>(virtual_table_update_exception("cannot parse option value"));
                 }
             }
         }
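`boost::lexical_cast` throws `boost::bad_lexical_cast` on unparsable input, so a setter that parses user-supplied strings needs the try/catch above to turn a parse failure into a clean error rather than an unhandled exception. A sketch:

```cpp
#include <boost/lexical_cast.hpp>
#include <iostream>
#include <stdexcept>
#include <string>

// Turn a user-supplied string into a typed config value, with a clean error.
int parse_option(const std::string& value) {
    try {
        return boost::lexical_cast<int>(value);
    } catch (const boost::bad_lexical_cast&) {
        throw std::invalid_argument("cannot parse option value: " + value);
    }
}

int main() {
    std::cout << parse_option("42") << '\n';
    try {
        parse_option("not-a-number");
    } catch (const std::invalid_argument& e) {
        std::cout << e.what() << '\n';
    }
}
```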
@@ -3068,11 +3072,11 @@ mutation system_keyspace::make_group0_history_state_id_mutation(
     using namespace std::chrono;
     assert(*gc_older_than >= gc_clock::duration{0});
 
-    auto ts_millis = duration_cast<milliseconds>(microseconds{ts});
-    auto gc_older_than_millis = duration_cast<milliseconds>(*gc_older_than);
-    assert(gc_older_than_millis < ts_millis);
+    auto ts_micros = microseconds{ts};
+    auto gc_older_than_micros = duration_cast<microseconds>(*gc_older_than);
+    assert(gc_older_than_micros < ts_micros);
 
-    auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_millis - gc_older_than_millis);
+    auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_micros - gc_older_than_micros);
     // We want to delete all entries with IDs smaller than `tomb_upper_bound`
     // but the deleted range is of the form (x, +inf) since the schema is reversed.
     auto range = query::clustering_range::make_starting_with({
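The millis-to-micros change matters because `duration_cast` to a coarser unit truncates: microsecond timestamps inside the same millisecond collapse to one value, shifting the computed bound by up to a millisecond. A tiny demonstration:

```cpp
#include <chrono>
#include <iostream>

int main() {
    using namespace std::chrono;
    microseconds ts{1'500};          // 1.5 ms
    microseconds gc_older_than{700}; // 0.7 ms

    // Truncating to milliseconds first loses the sub-millisecond parts:
    auto coarse = duration_cast<milliseconds>(ts)
                - duration_cast<milliseconds>(gc_older_than);
    auto exact = ts - gc_older_than;

    std::cout << duration_cast<microseconds>(coarse).count() << "us\n"; // 1000us
    std::cout << exact.count() << "us\n";                               // 800us
}
```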
@@ -10,6 +10,7 @@
 
 #include <seastar/core/seastar.hh>
 #include <seastar/core/sstring.hh>
+#include <seastar/core/reactor.hh>
 #include <utility>
 #include <optional>
 #include "dht/token.hh"
@@ -10,8 +10,6 @@
 #include "log.hh"
 #include "utils/latency.hh"
 
-#include <seastar/core/when_all.hh>
-
 static logging::logger mylog("row_locking");
 
 row_locker::row_locker(schema_ptr s)

@@ -76,35 +74,32 @@ row_locker::lock_pk(const dht::decorated_key& pk, bool exclusive, db::timeout_cl
future<row_locker::lock_holder>
row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& cpk, bool exclusive, db::timeout_clock::time_point timeout, stats& stats) {
    mylog.debug("taking shared lock on partition {}, and {} lock on row {} in it", pk, (exclusive ? "exclusive" : "shared"), cpk);
    auto ck = cpk;
    // Create a two-level lock entry for the partition if it doesn't exist already.
    auto i = _two_level_locks.try_emplace(pk, this).first;
    // The two-level lock entry we've just created is guaranteed to be kept alive as long as it's locked.
    // Initiating read locking in the background below ensures that even if the two-level lock is currently
    // write-locked, releasing the write-lock will synchronously engage any waiting
    // locks and will keep the entry alive.
    future<lock_type::holder> lock_partition = i->second._partition_lock.hold_read_lock(timeout);
    auto j = i->second._row_locks.find(cpk);
    if (j == i->second._row_locks.end()) {
        // Not yet locked, need to create the lock. This makes a copy of cpk.
        try {
            j = i->second._row_locks.emplace(cpk, lock_type()).first;
        } catch(...) {
            // If this emplace() failed, e.g., out of memory, we fail. We
            // could do nothing - the partition lock we already started
            // taking will be unlocked automatically after being locked.
            // But it's better form to wait for the work we started, and it
            // will also allow us to remove the hash-table row we added.
            return lock_partition.then([ex = std::current_exception()] (auto lock) {
                // The lock is automatically released when "lock" goes out of scope.
                // TODO: unlock (lock = {}) now, search for the partition in the
                // hash table (we know it's still there, because we held the lock until
                // now) and remove the unused lock from the hash table if still unused.
                return make_exception_future<row_locker::lock_holder>(std::current_exception());
            });
        }
    }
    single_lock_stats &single_lock_stats = exclusive ? stats.exclusive_row : stats.shared_row;
    single_lock_stats.operations_currently_waiting_for_lock++;
    utils::latency_counter waiting_latency;
    waiting_latency.start();
    future<lock_type::holder> lock_row = exclusive ? j->second.hold_write_lock(timeout) : j->second.hold_read_lock(timeout);
    return when_all_succeed(std::move(lock_partition), std::move(lock_row))
        .then_unpack([this, pk = &i->first, cpk = &j->first, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency)] (auto lock1, auto lock2) mutable {
    return lock_partition.then([this, pk = &i->first, row_locks = &i->second._row_locks, ck = std::move(ck), exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
        auto j = row_locks->find(ck);
        if (j == row_locks->end()) {
            // Not yet locked, need to create the lock.
            j = row_locks->emplace(std::move(ck), lock_type()).first;
        }
        auto* cpk = &j->first;
        auto& row_lock = j->second;
        // Like the two-level lock entry above, the row_lock entry we've just created
        // is guaranteed to be kept alive as long as it's locked.
        // Initiating read/write locking in the background below ensures that.
        auto lock_row = exclusive ? row_lock.hold_write_lock(timeout) : row_lock.hold_read_lock(timeout);
        return lock_row.then([this, pk, cpk, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), lock1 = std::move(lock1)] (auto lock2) mutable {
            // FIXME: indentation
            lock1.release();
            lock2.release();
            waiting_latency.stop();

@@ -112,6 +107,7 @@ row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& c
            single_lock_stats.lock_acquisitions++;
            single_lock_stats.operations_currently_waiting_for_lock--;
            return lock_holder(this, pk, cpk, exclusive);
        });
    });
}
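
For reference, a minimal synchronous analogue of the two-level scheme using std::shared_mutex in place of Seastar's asynchronous locks. Names are illustrative; Scylla's version runs per-shard, so the map itself needs no extra protection there.

#include <iostream>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>

struct two_level_lock {
    std::shared_mutex partition_lock;   // shared: row ops, exclusive: whole-partition ops
    std::map<std::string, std::shared_mutex> row_locks;
};

int main() {
    two_level_lock tl;
    {
        std::shared_lock partition_guard(tl.partition_lock); // shared lock on the partition
        // Created on demand, like row_locks->emplace() above. With real
        // threads the map itself would need separate protection.
        auto& row_lock = tl.row_locks["row-1"];
        std::unique_lock row_guard(row_lock);                // exclusive lock on the row
        std::cout << "holding shared partition lock + exclusive row lock\n";
    } // guards release the row lock first, then the partition lock
    return 0;
}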

@@ -121,6 +121,9 @@ const column_definition* view_info::view_column(const column_definition& base_de

void view_info::set_base_info(db::view::base_info_ptr base_info) {
    _base_info = std::move(base_info);
    // Forget the cached objects which may refer to the base schema.
    _select_statement = nullptr;
    _partition_slice = std::nullopt;
}

// A constructor for a base info that can facilitate reads and writes from the materialized view.

@@ -863,13 +866,18 @@ void view_updates::generate_update(
    bool same_row = true;
    for (auto col_id : col_ids) {
        auto* after = update.cells().find_cell(col_id);
        // Note: multi-cell columns can't be part of the primary key.
        auto& cdef = _base->regular_column_at(col_id);
        if (existing) {
            auto* before = existing->cells().find_cell(col_id);
            // Note that this cell is necessarily atomic, because col_ids are
            // view key columns, and keys must be atomic.
            if (before && before->as_atomic_cell(cdef).is_live()) {
                if (after && after->as_atomic_cell(cdef).is_live()) {
                    auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
                    // We need to compare just the values of the keys, not
                    // metadata like the timestamp. This is because below,
                    // if the old and new view row have the same key, we need
                    // to be sure to reach the update_entry() case.
                    auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
                    if (cmp != 0) {
                        same_row = false;
                    }
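
The replacement of compare_atomic_cell_for_merge() with compare_unsigned() is the heart of the fix: only the key's value matters for deciding whether the view row moved. A toy model of the difference, with illustrative types that are not Scylla's:

#include <iostream>
#include <string>

// A cell carries a value and write metadata; deciding whether the view key
// changed must look at the value alone.
struct cell { std::string value; long timestamp; };

bool same_key(const cell& before, const cell& after) {
    return before.value == after.value; // timestamp deliberately ignored
}

bool same_cell_for_merge(const cell& before, const cell& after) {
    return before.value == after.value && before.timestamp == after.timestamp;
}

int main() {
    cell before{"k1", 100}, after{"k1", 200}; // same key, rewritten later
    std::cout << std::boolalpha
              << "same key:  " << same_key(before, after) << '\n'             // true
              << "same cell: " << same_cell_for_merge(before, after) << '\n'; // false
}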

@@ -889,7 +897,13 @@ void view_updates::generate_update(
        if (same_row) {
            update_entry(base_key, update, *existing, now);
        } else {
            replace_entry(base_key, update, *existing, now);
            // This code doesn't work if the old and new view row have the
            // same key, because if they do we get both data and tombstone
            // for the same timestamp (now) and the tombstone wins. This
            // is why we need the "same_row" case above - it's not just a
            // performance optimization.
            delete_old_entry(base_key, *existing, update, now);
            create_entry(base_key, update, now);
        }
    } else {
        delete_old_entry(base_key, *existing, update, now);
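
The comment relies on the reconciliation rule that a deletion beats a write at an equal timestamp; that is why create_entry() plus delete_old_entry() at the same `now` would wipe a row whose key did not change. A minimal model of that rule:

#include <iostream>

// Tie-breaking between a write and a deletion with equal timestamps:
// the tombstone wins. Illustrative model, not Scylla's actual types.
struct versioned {
    long timestamp;
    bool is_tombstone;
};

versioned reconcile(versioned a, versioned b) {
    if (a.timestamp != b.timestamp) {
        return a.timestamp > b.timestamp ? a : b;
    }
    return a.is_tombstone ? a : b; // tie: tombstone wins
}

int main() {
    versioned data{42, false}, del{42, true};
    std::cout << (reconcile(data, del).is_tombstone ? "deleted\n" : "live\n");
}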

@@ -933,8 +947,12 @@ future<stop_iteration> view_update_builder::stop() const {
    return make_ready_future<stop_iteration>(stop_iteration::yes);
}

future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::build_some() {
future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> view_update_builder::build_some() {
    return advance_all().then([this] (stop_iteration ignored) {
        if (!_update && !_existing) {
            // Tell the caller there is no more data to build.
            return make_ready_future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>>(std::nullopt);
        }
        bool do_advance_updates = false;
        bool do_advance_existings = false;
        if (_update && _update->is_partition_start()) {

@@ -946,22 +964,23 @@ future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::b
            _existing_tombstone_tracker.set_partition_tombstone(_existing->as_partition_start().partition_tombstone());
            do_advance_existings = true;
        }
        future<stop_iteration> f = make_ready_future<stop_iteration>(stop_iteration::no);
        if (do_advance_updates) {
            return do_advance_existings ? advance_all() : advance_updates();
            f = do_advance_existings ? advance_all() : advance_updates();
        } else if (do_advance_existings) {
            return advance_existings();
            f = advance_existings();
        }
        return make_ready_future<stop_iteration>(stop_iteration::no);
    }).then([this] (stop_iteration ignored) {
        return repeat([this] {
            return this->on_results();
        return std::move(f).then([this] (stop_iteration ignored) {
            return repeat([this] {
                return this->on_results();
            });
        }).then([this] {
            utils::chunked_vector<frozen_mutation_and_schema> mutations;
            for (auto& update : _view_updates) {
                update.move_to(mutations);
            }
            return std::make_optional(mutations);
        });
    }).then([this] {
        utils::chunked_vector<frozen_mutation_and_schema> mutations;
        for (auto& update : _view_updates) {
            update.move_to(mutations);
        }
        return mutations;
    });
}

@@ -2035,15 +2054,21 @@ public:
// Called in the context of a seastar::thread.
void view_builder::execute(build_step& step, exponential_backoff_retry r) {
    gc_clock::time_point now = gc_clock::now();
    auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(
    auto compaction_state = make_lw_shared<compact_for_query_state<emit_only_live_rows::yes>>(
        *step.reader.schema(),
        now,
        step.pslice,
        batch_size,
        query::max_partitions,
        view_builder::consumer{*this, step, now});
    consumer.consume_new_partition(step.current_key); // Initialize the state in case we're resuming a partition
        query::max_partitions);
    auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(compaction_state, view_builder::consumer{*this, step, now});
    auto built = step.reader.consume_in_thread(std::move(consumer));
    if (auto ds = std::move(*compaction_state).detach_state()) {
        auto& range_tombstones = std::get<std::deque<range_tombstone>>(ds->range_tombstones);
        for (auto& rt : range_tombstones) {
            step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(rt)));
        }
        step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(ds->partition_start)));
    }

    _as.check();

@@ -2125,24 +2150,28 @@ update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog bac
    return std::max(backlog, _max.load(std::memory_order_relaxed));
}

future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name) {
    return sys_dist_ks.view_status(ks_name, cf_name).then([] (std::unordered_map<utils::UUID, sstring>&& view_statuses) {
        return boost::algorithm::any_of(view_statuses | boost::adaptors::map_values, [] (const sstring& view_status) {
            return view_status == "STARTED";
future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const sstring& ks_name,
        const sstring& cf_name) {
    using view_statuses_type = std::unordered_map<utils::UUID, sstring>;
    return sys_dist_ks.view_status(ks_name, cf_name).then([&tm] (view_statuses_type&& view_statuses) {
        return boost::algorithm::any_of(view_statuses, [&tm] (const view_statuses_type::value_type& view_status) {
            // Only consider status of known hosts.
            return view_status.second == "STARTED" && tm.get_endpoint_for_host_id(view_status.first);
        });
    });
}

future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason) {
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
        streaming::stream_reason reason) {
    if (is_internal_keyspace(t.schema()->ks_name())) {
        return make_ready_future<bool>(false);
    }
    if (reason == streaming::stream_reason::repair && !t.views().empty()) {
        return make_ready_future<bool>(true);
    }
    return do_with(t.views(), [&sys_dist_ks] (auto& views) {
    return do_with(t.views(), [&sys_dist_ks, &tm] (auto& views) {
        return map_reduce(views,
            [&sys_dist_ks] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, view->ks_name(), view->cf_name()); },
            [&sys_dist_ks, &tm] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, tm, view->ks_name(), view->cf_name()); },
            false,
            std::logical_or<bool>());
    });

@@ -154,10 +154,7 @@ private:
    void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
    void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
        create_entry(base_key, update, now);
        delete_old_entry(base_key, existing, update, now);
    }
    void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
};

class view_update_builder {

@@ -188,7 +185,15 @@ public:
    }
    view_update_builder(view_update_builder&& other) noexcept = default;

    future<utils::chunked_vector<frozen_mutation_and_schema>> build_some();

    // build_some() works on batches of 100 (max_rows_for_view_updates)
    // updated rows, but can_skip_view_updates() can decide that some of
    // these rows do not affect the view, and as a result build_some() can
    // return fewer than 100 rows - in extreme cases even zero (see issue #12297).
    // So we can't use an empty returned vector to signify that the view
    // update building is done - and we wrap the return value in an
    // std::optional, which is disengaged when the iteration is done.
    future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> build_some();

    future<> close() noexcept;
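
A standalone analogue of the new contract, with plain values instead of futures and illustrative names: an empty batch is a legal intermediate result, so only a disengaged optional ends the loop.

#include <iostream>
#include <optional>
#include <vector>

std::optional<std::vector<int>> next_batch(int& step) {
    if (step >= 3) {
        return std::nullopt;       // iteration done
    }
    if (step++ == 1) {
        return std::vector<int>{}; // legitimate empty batch (cf. issue #12297)
    }
    return std::vector<int>{step};
}

int main() {
    int step = 0;
    while (auto batch = next_batch(step)) {
        // An emptiness check here would end the loop one batch too early.
        std::cout << "got batch of " << batch->size() << " updates\n";
    }
    std::cout << "done\n";
}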

@@ -22,9 +22,13 @@ class system_distributed_keyspace;

}

namespace locator {
class token_metadata;
}

namespace db::view {

future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name);
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason);
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
        streaming::stream_reason reason);

}

@@ -83,10 +83,10 @@ future<> view_update_generator::start() {
                    service::get_local_streaming_priority(),
                    nullptr,
                    ::mutation_reader::forwarding::no);
            auto close_sr = deferred_close(staging_sstable_reader);

            inject_failure("view_update_generator_consume_staging_sstable");
            auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle));
            staging_sstable_reader.close().get();
            if (result == stop_iteration::yes) {
                break;
            }

@@ -16,6 +16,7 @@
#include "db/view/row_locking.hh"
#include <seastar/core/abort_source.hh>
#include "mutation.hh"
#include <seastar/core/circular_buffer.hh>

class evictable_reader_handle;

@@ -15,11 +15,18 @@

namespace dht {

// Note: Cassandra has a special case where for an empty key it returns
// minimum_token() instead of 0 (the naturally-calculated hash function for
// an empty string). Their thinking was that empty partition keys are not
// allowed anyway. However, they *are* allowed in materialized views, so the
// empty-key partition should get a real token, not an invalid token, so
// we dropped this special case. Since we don't support migrating sstables of
// materialized-views from Cassandra, this Cassandra-Scylla incompatibility
// will not cause problems in practice.
// Note that get_token(const schema& s, partition_key_view key) below must
// use exactly the same algorithm as this function.
token
murmur3_partitioner::get_token(bytes_view key) const {
    if (key.empty()) {
        return minimum_token();
    }
    std::array<uint64_t, 2> hash;
    utils::murmur_hash::hash3_x64_128(key, 0, hash);
    return get_token(hash[0]);

dist/common/scripts/scylla_coredump_setup
@@ -42,7 +42,8 @@ if __name__ == '__main__':
    if systemd_unit.available('systemd-coredump@.service'):
        dropin = '''
[Service]
TimeoutStartSec=infinity
RuntimeMaxSec=infinity
TimeoutSec=infinity
'''[1:-1]
        os.makedirs('/etc/systemd/system/systemd-coredump@.service.d', exist_ok=True)
        with open('/etc/systemd/system/systemd-coredump@.service.d/timeout.conf', 'w') as f:

dist/common/scripts/scylla_raid_setup
@@ -16,7 +16,7 @@ import stat
import distro
from pathlib import Path
from scylla_util import *
from subprocess import run
from subprocess import run, SubprocessError

if __name__ == '__main__':
    if os.getuid() > 0:

@@ -137,7 +137,9 @@ if __name__ == '__main__':
    # stalling. The minimum block size for crc enabled filesystems is 1024,
    # and it also cannot be smaller than the sector size.
    block_size = max(1024, sector_size)
    run('udevadm settle', shell=True, check=True)
    run(f'mkfs.xfs -b size={block_size} {fsdev} -f -K', shell=True, check=True)
    run('udevadm settle', shell=True, check=True)

    if is_debian_variant():
        confpath = '/etc/mdadm/mdadm.conf'

@@ -153,6 +155,11 @@ if __name__ == '__main__':
    os.makedirs(mount_at, exist_ok=True)

    uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
    if not uuid:
        raise Exception(f'Failed to get UUID of {fsdev}')

    uuidpath = f'/dev/disk/by-uuid/{uuid}'

    after = 'local-fs.target'
    wants = ''
    if raid and args.raid_level != '0':

@@ -169,7 +176,7 @@ After={after}{wants}
DefaultDependencies=no

[Mount]
What=/dev/disk/by-uuid/{uuid}
What={uuidpath}
Where={mount_at}
Type=xfs
Options=noatime{opt_discard}

@@ -191,8 +198,16 @@ WantedBy=multi-user.target
    systemd_unit.reload()
    if args.raid_level != '0':
        md_service.start()
    mount = systemd_unit(mntunit_bn)
    mount.start()
    try:
        mount = systemd_unit(mntunit_bn)
        mount.start()
    except SubprocessError as e:
        if not os.path.exists(uuidpath):
            print(f'\nERROR: {uuidpath} is not found\n')
        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
            print(f'\nERROR: {uuidpath} is not block device\n')
        raise e

    if args.enable_on_nextboot:
        mount.enable()
    uid = pwd.getpwnam('scylla').pw_uid

dist/common/scripts/scylla_setup
@@ -214,7 +214,7 @@ if __name__ == '__main__':
                        help='skip raid setup')
    parser.add_argument('--raid-level-5', action='store_true', default=False,
                        help='use RAID5 for RAID volume')
    parser.add_argument('--online-discard', default=True,
    parser.add_argument('--online-discard', default=1, choices=[0, 1], type=int,
                        help='Configure XFS to discard unused blocks as soon as files are deleted')
    parser.add_argument('--nic',
                        help='specify NIC')

@@ -458,7 +458,7 @@ if __name__ == '__main__':
    args.no_raid_setup = not raid_setup
    if raid_setup:
        level = '5' if raid_level_5 else '0'
        run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={int(online_discard)}')
        run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={online_discard}')

    coredump_setup = interactive_ask_service('Do you want to enable coredumps?', 'Yes - sets up coredump to allow a post-mortem analysis of the Scylla state just prior to a crash. No - skips this step.', coredump_setup)
    args.no_coredump_setup = not coredump_setup

dist/docker/scyllasetup.py
@@ -68,7 +68,12 @@ class ScyllaSetup:

    def cqlshrc(self):
        home = os.environ['HOME']
        hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
        if self._rpcAddress:
            hostname = self._rpcAddress
        elif self._listenAddress:
            hostname = self._listenAddress
        else:
            hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
        with open("%s/.cqlshrc" % home, "w") as cqlshrc:
            cqlshrc.write("[connection]\nhostname = %s\n" % hostname)

dist/redhat/scylla.spec
@@ -7,7 +7,7 @@ Group: Applications/Databases
License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{reloc_pkg}
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
Requires: %{product}-server = %{version}-%{release} %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release} %{product}-kernel-conf = %{version}-%{release} %{product}-jmx = %{version}-%{release} %{product}-tools = %{version}-%{release} %{product}-tools-core = %{version}-%{release} %{product}-node-exporter = %{version}-%{release}
Obsoletes: scylla-server < 1.1

%global _debugsource_template %{nil}

@@ -54,7 +54,7 @@ Group: Applications/Databases
Summary: The Scylla database server
License: AGPLv3
URL: http://www.scylladb.com/
Requires: %{product}-conf = %{version} %{product}-python3 = %{version}
Requires: %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release}
Conflicts: abrt
AutoReqProv: no

@@ -32,7 +32,7 @@
logging::logger fmr_logger("flat_mutation_reader");

flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o) noexcept {
    if (_impl) {
    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background

@@ -45,7 +45,7 @@ flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o)
}

flat_mutation_reader::~flat_mutation_reader() {
    if (_impl) {
    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background

@@ -774,11 +774,14 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
    std::optional<mutation_consume_cookie> _cookie;

private:
    void flush_tombstones(position_in_partition_view pos) {
    void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
        _rt_gen.flush(pos, [&] (range_tombstone_change rt) {
            _current_rt = rt.tombstone();
            push_mutation_fragment(*_schema, _permit, std::move(rt));
        });
        if (emit_end && _current_rt) {
            push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
        }
    }
    void maybe_emit_partition_start() {
        if (_dk) {

@@ -815,10 +818,7 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
            return stop_iteration::yes;
        }
        maybe_emit_partition_start();
        flush_tombstones(position_in_partition::after_all_clustered_rows());
        if (_current_rt) {
            push_mutation_fragment(*_schema, _permit, range_tombstone_change(position_in_partition::after_all_clustered_rows(), {}));
        }
        flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
        push_mutation_fragment(*_schema, _permit, partition_end{});
        return stop_iteration::no;
    }

@@ -1786,7 +1786,7 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() {
}

flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept {
    if (_impl) {
    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background

@@ -1799,7 +1799,7 @@ flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader
}

flat_mutation_reader_v2::~flat_mutation_reader_v2() {
    if (_impl) {
    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background

@@ -1986,11 +1986,14 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
    tombstone _current_rt;
    std::optional<position_range> _pr;
public:
    void flush_tombstones(position_in_partition_view pos) {
    void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
        _rt_gen.flush(pos, [&] (range_tombstone_change rt) {
            _current_rt = rt.tombstone();
            push_mutation_fragment(*_schema, _permit, std::move(rt));
        });
        if (emit_end && _current_rt) {
            push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
        }
    }
    void consume(static_row mf) {
        push_mutation_fragment(*_schema, _permit, std::move(mf));

@@ -2015,11 +2018,9 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
        push_mutation_fragment(*_schema, _permit, std::move(mf));
    }
    void consume(partition_end mf) {
        flush_tombstones(position_in_partition::after_all_clustered_rows());
        flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
        if (_current_rt) {
            assert(!_pr);
            push_mutation_fragment(*_schema, _permit, range_tombstone_change(
                position_in_partition::after_all_clustered_rows(), {}));
        }
        push_mutation_fragment(*_schema, _permit, std::move(mf));
    }

@@ -2042,10 +2043,7 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
        if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) {
            if (_pr) {
                // If !_pr we should flush on partition_end
                flush_tombstones(_pr->end());
                if (_current_rt) {
                    push_mutation_fragment(*_schema, _permit, range_tombstone_change(_pr->end(), {}));
                }
                flush_tombstones(_pr->end(), true);
            }
            _end_of_stream = true;
        }

@@ -132,6 +132,7 @@ public:
private:
    tracked_buffer _buffer;
    size_t _buffer_size = 0;
    bool _close_required = false;
protected:
    size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
    bool _end_of_stream = false;

@@ -167,6 +168,8 @@ public:
    bool is_end_of_stream() const { return _end_of_stream; }
    bool is_buffer_empty() const { return _buffer.empty(); }
    bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
    bool is_close_required() const { return _close_required; }
    void set_close_required() { _close_required = true; }
    static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }

    mutation_fragment pop_mutation_fragment() {

@@ -504,9 +507,15 @@ public:
    //
    // Can be used to skip over entire partitions if interleaved with
    // `operator()()` calls.
    future<> next_partition() { return _impl->next_partition(); }
    future<> next_partition() {
        _impl->set_close_required();
        return _impl->next_partition();
    }

    future<> fill_buffer() { return _impl->fill_buffer(); }
    future<> fill_buffer() {
        _impl->set_close_required();
        return _impl->fill_buffer();
    }
    // Changes the range of partitions to pr. The range can only be moved
    // forwards. pr.begin() needs to be larger than pr.end() of the previous

@@ -515,6 +524,7 @@ public:
    // pr needs to be valid until the reader is destroyed or fast_forward_to()
    // is called again.
    future<> fast_forward_to(const dht::partition_range& pr) {
        _impl->set_close_required();
        return _impl->fast_forward_to(pr);
    }
    // Skips to a later range of rows.

@@ -544,6 +554,7 @@ public:
    // In particular one must first enter a partition by fetching a `partition_start`
    // fragment before calling `fast_forward_to`.
    future<> fast_forward_to(position_range cr) {
        _impl->set_close_required();
        return _impl->fast_forward_to(std::move(cr));
    }
    // Closes the reader.
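
The pattern in these hunks is: every operation that may leave background work marks the reader, and the destructor aborts if such a reader was never close()d. A compact standard-C++ sketch of the same guard, with illustrative names and no Seastar:

#include <cstdlib>
#include <iostream>

class reader_like {
    bool _close_required = false;
    bool _closed = false;
public:
    void fill_buffer() { _close_required = true; /* ...may start async work... */ }
    void close() { _closed = true; }
    ~reader_like() {
        if (_close_required && !_closed) {
            // Mirrors the abort described in the comments above: better to
            // crash loudly than leak or use-after-free in background work.
            std::cerr << "reader destroyed without close()\n";
            std::abort();
        }
    }
};

int main() {
    reader_like r;
    r.fill_buffer();
    r.close(); // forgetting this would abort at scope exit
}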

@@ -164,6 +164,7 @@ public:
private:
    tracked_buffer _buffer;
    size_t _buffer_size = 0;
    bool _close_required = false;
protected:
    size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();

@@ -205,6 +206,8 @@ public:
    bool is_end_of_stream() const { return _end_of_stream; }
    bool is_buffer_empty() const { return _buffer.empty(); }
    bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
    bool is_close_required() const { return _close_required; }
    void set_close_required() { _close_required = true; }
    static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }

    mutation_fragment_v2 pop_mutation_fragment() {

@@ -542,9 +545,15 @@ public:
    //
    // Can be used to skip over entire partitions if interleaved with
    // `operator()()` calls.
    future<> next_partition() { return _impl->next_partition(); }
    future<> next_partition() {
        _impl->set_close_required();
        return _impl->next_partition();
    }

    future<> fill_buffer() { return _impl->fill_buffer(); }
    future<> fill_buffer() {
        _impl->set_close_required();
        return _impl->fill_buffer();
    }

    // Changes the range of partitions to pr. The range can only be moved
    // forwards. pr.begin() needs to be larger than pr.end() of the previous

@@ -553,6 +562,7 @@ public:
    // pr needs to be valid until the reader is destroyed or fast_forward_to()
    // is called again.
    future<> fast_forward_to(const dht::partition_range& pr) {
        _impl->set_close_required();
        return _impl->fast_forward_to(pr);
    }
    // Skips to a later range of rows.

@@ -582,6 +592,7 @@ public:
    // In particular one must first enter a partition by fetching a `partition_start`
    // fragment before calling `fast_forward_to`.
    future<> fast_forward_to(position_range cr) {
        _impl->set_close_required();
        return _impl->fast_forward_to(std::move(cr));
    }
    // Closes the reader.

@@ -1012,10 +1012,10 @@ std::set<inet_address> gossiper::get_live_members() {

std::set<inet_address> gossiper::get_live_token_owners() {
    std::set<inet_address> token_owners;
    for (auto& member : get_live_members()) {
        auto es = get_endpoint_state_for_endpoint_ptr(member);
        if (es && !is_dead_state(*es) && get_token_metadata_ptr()->is_member(member)) {
            token_owners.insert(member);
    auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
    for (auto& node: normal_token_owners) {
        if (is_alive(node)) {
            token_owners.insert(node);
        }
    }
    return token_owners;

@@ -1023,10 +1023,10 @@ std::set<inet_address> gossiper::get_live_token_owners() {

std::set<inet_address> gossiper::get_unreachable_token_owners() {
    std::set<inet_address> token_owners;
    for (auto&& x : _unreachable_endpoints) {
        auto& endpoint = x.first;
        if (get_token_metadata_ptr()->is_member(endpoint)) {
            token_owners.insert(endpoint);
    auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
    for (auto& node: normal_token_owners) {
        if (!is_alive(node)) {
            token_owners.insert(node);
        }
    }
    return token_owners;

@@ -215,22 +215,6 @@ effective_replication_map::get_primary_ranges_within_dc(inet_address ep) const {
    });
}

future<std::unordered_multimap<inet_address, dht::token_range>>
abstract_replication_strategy::get_address_ranges(const token_metadata& tm) const {
    std::unordered_multimap<inet_address, dht::token_range> ret;
    for (auto& t : tm.sorted_tokens()) {
        dht::token_range_vector r = tm.get_primary_ranges_for(t);
        auto eps = co_await calculate_natural_endpoints(t, tm);
        rslogger.debug("token={}, primary_range={}, address={}", t, r, eps);
        for (auto ep : eps) {
            for (auto&& rng : r) {
                ret.emplace(ep, rng);
            }
        }
    }
    co_return ret;
}

future<std::unordered_multimap<inet_address, dht::token_range>>
abstract_replication_strategy::get_address_ranges(const token_metadata& tm, inet_address endpoint) const {
    std::unordered_multimap<inet_address, dht::token_range> ret;

@@ -112,7 +112,6 @@ public:
    future<dht::token_range_vector> get_ranges(inet_address ep, token_metadata_ptr tmptr) const;

public:
    future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm) const;
    future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm, inet_address endpoint) const;

    // Caller must ensure that token_metadata will not change throughout the call.

@@ -15,6 +15,7 @@
#include <seastar/core/coroutine.hh>
#include <seastar/core/seastar.hh>
#include <seastar/http/response_parser.hh>
#include <seastar/http/reply.hh>
#include <seastar/net/api.hh>
#include <seastar/net/dns.hh>

@@ -47,7 +48,8 @@ future<> azure_snitch::load_config() {

    logger().info("AzureSnitch using region: {}, zone: {}.", azure_region, azure_zone);

    _my_rack = azure_zone;
    // Zoneless regions return empty zone
    _my_rack = (azure_zone != "" ? azure_zone : azure_region);
    _my_dc = azure_region;

    co_return co_await _my_distributed->invoke_on_all([this] (snitch_ptr& local_s) {

@@ -90,6 +92,10 @@ future<sstring> azure_snitch::azure_api_call(sstring path) {

    // Read HTTP response header first
    auto rsp = parser.get_parsed_response();
    if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
        throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
    }

    auto it = rsp->_headers.find("Content-Length");
    if (it == rsp->_headers.end()) {
        throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");

@@ -1,5 +1,8 @@
#include "locator/ec2_snitch.hh"
#include <seastar/core/seastar.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/do_with.hh>
#include <seastar/http/reply.hh>

#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>

@@ -67,6 +70,30 @@ future<> ec2_snitch::start() {
}

future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cmd) {
    return do_with(int(0), [this, addr, port, cmd] (int& i) {
        return repeat_until_value([this, addr, port, cmd, &i]() -> future<std::optional<sstring>> {
            ++i;
            return aws_api_call_once(addr, port, cmd).then([] (auto res) {
                return make_ready_future<std::optional<sstring>>(std::move(res));
            }).handle_exception([&i] (auto ep) {
                try {
                    std::rethrow_exception(ep);
                } catch (const std::system_error &e) {
                    logger().error(e.what());
                    if (i >= AWS_API_CALL_RETRIES - 1) {
                        logger().error("Maximum number of retries exceeded");
                        throw e;
                    }
                }
                return sleep(AWS_API_CALL_RETRY_INTERVAL).then([] {
                    return make_ready_future<std::optional<sstring>>(std::nullopt);
                });
            });
        });
    });
}
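
A synchronous standard-library sketch of the retry loop just added. The constants mirror AWS_API_CALL_RETRIES = 5 and the 5-second interval from the header hunk further below; the always-failing call is a stand-in:

#include <chrono>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>

constexpr int kRetries = 5;
constexpr auto kRetryInterval = std::chrono::seconds{5};

std::string api_call_once() {
    throw std::runtime_error("connection refused"); // stand-in failure
}

std::string api_call_with_retries() {
    for (int i = 0; i < kRetries; ++i) {
        try {
            return api_call_once();
        } catch (const std::exception& e) {
            std::cerr << e.what() << '\n';
            if (i == kRetries - 1) {
                std::cerr << "Maximum number of retries exceeded\n";
                throw; // rethrow once attempts are exhausted
            }
        }
        std::this_thread::sleep_for(kRetryInterval);
    }
    return {}; // unreachable
}

int main() {
    try { api_call_with_retries(); } catch (...) { return 1; }
}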

future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd) {
    return connect(socket_address(inet_address{addr}, port))
    .then([this, addr, cmd] (connected_socket fd) {
        _sd = std::move(fd);

@@ -88,6 +115,9 @@ future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cm

    // Read HTTP response header first
    auto _rsp = _parser.get_parsed_response();
    if (_rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
        return make_exception_future<sstring>(std::runtime_error(format("Error: HTTP response status {}", _rsp->_status_code)));
    }
    auto it = _rsp->_headers.find("Content-Length");
    if (it == _rsp->_headers.end()) {
        return make_exception_future<sstring>("Error: HTTP response does not contain: Content-Length\n");

@@ -16,6 +16,8 @@ public:
    static constexpr const char* ZONE_NAME_QUERY_REQ = "/latest/meta-data/placement/availability-zone";
    static constexpr const char* AWS_QUERY_SERVER_ADDR = "169.254.169.254";
    static constexpr uint16_t AWS_QUERY_SERVER_PORT = 80;
    static constexpr int AWS_API_CALL_RETRIES = 5;
    static constexpr auto AWS_API_CALL_RETRY_INTERVAL = std::chrono::seconds{5};

    ec2_snitch(const sstring& fname = "", unsigned io_cpu_id = 0);
    virtual future<> start() override;

@@ -32,5 +34,6 @@ private:
    output_stream<char> _out;
    http_response_parser _parser;
    sstring _zone_req;
    future<sstring> aws_api_call_once(sstring addr, uint16_t port, const sstring cmd);
};
} // namespace locator

@@ -14,6 +14,7 @@
#include <seastar/net/dns.hh>
#include <seastar/core/seastar.hh>
#include "locator/gce_snitch.hh"
#include <seastar/http/reply.hh>

#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>

@@ -106,6 +107,10 @@ future<sstring> gce_snitch::gce_api_call(sstring addr, sstring cmd) {

    // Read HTTP response header first
    auto rsp = parser.get_parsed_response();
    if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
        throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
    }

    auto it = rsp->_headers.find("Content-Length");
    if (it == rsp->_headers.end()) {
        throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");

@@ -786,13 +786,12 @@ void token_metadata_impl::calculate_pending_ranges_for_leaving(
        const abstract_replication_strategy& strategy,
        std::unordered_multimap<range<token>, inet_address>& new_pending_ranges,
        mutable_token_metadata_ptr all_left_metadata) const {
    std::unordered_multimap<inet_address, dht::token_range> address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
    // get all ranges that will be affected by leaving nodes
    std::unordered_set<range<token>> affected_ranges;
    for (auto endpoint : _leaving_endpoints) {
        auto r = address_ranges.equal_range(endpoint);
        for (auto x = r.first; x != r.second; x++) {
            affected_ranges.emplace(x->second);
        auto r = strategy.get_address_ranges(unpimplified_this, endpoint).get0();
        for (const auto& x : r) {
            affected_ranges.emplace(x.second);
        }
    }
    // for each of those ranges, find what new nodes will be responsible for the range when

@@ -826,16 +825,14 @@ void token_metadata_impl::calculate_pending_ranges_for_replacing(
    if (_replacing_endpoints.empty()) {
        return;
    }
    auto address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
    for (const auto& node : _replacing_endpoints) {
        auto existing_node = node.first;
        auto replacing_node = node.second;
        auto address_ranges = strategy.get_address_ranges(unpimplified_this, existing_node).get0();
        for (const auto& x : address_ranges) {
            seastar::thread::maybe_yield();
            if (x.first == existing_node) {
                tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
                new_pending_ranges.emplace(x.second, replacing_node);
            }
            tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
            new_pending_ranges.emplace(x.second, replacing_node);
        }
    }
}

main.cc
@@ -383,6 +383,8 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
            break;
        }
    }
} catch (const storage_io_error& e) {
    do_abort = false;
} catch (...) {
}
auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);

@@ -425,6 +427,39 @@ static int scylla_main(int ac, char** av) {
        exit(1);
    }

    // Even in an environment that causes errors during Scylla initialization,
    // "scylla --version" should be able to run without error.
    // To do so, we need to parse and execute these options before
    // initializing Scylla/Seastar classes.
    bpo::options_description preinit_description("Scylla options");
    bpo::variables_map preinit_vm;
    preinit_description.add_options()
        ("version", bpo::bool_switch(), "print version number and exit")
        ("build-id", bpo::bool_switch(), "print build-id and exit")
        ("build-mode", bpo::bool_switch(), "print build mode and exit")
        ("list-tools", bpo::bool_switch(), "list included tools and exit");
    auto preinit_parsed_opts = bpo::command_line_parser(ac, av).options(preinit_description).allow_unregistered().run();
    bpo::store(preinit_parsed_opts, preinit_vm);
    if (preinit_vm["version"].as<bool>()) {
        fmt::print("{}\n", scylla_version());
        return 0;
    }
    if (preinit_vm["build-id"].as<bool>()) {
        fmt::print("{}\n", get_build_id());
        return 0;
    }
    if (preinit_vm["build-mode"].as<bool>()) {
        fmt::print("{}\n", scylla_build_mode());
        return 0;
    }
    if (preinit_vm["list-tools"].as<bool>()) {
        fmt::print(
            "types - a command-line tool to examine values belonging to scylla types\n"
            "sstable - a multifunctional command-line tool to examine the content of sstables\n"
        );
        return 0;
    }

    try {
        runtime::init_uptime();
        std::setvbuf(stdout, nullptr, _IOLBF, 1000);
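
A minimal compilable sketch of the pre-init pattern above: parse a tiny option set with allow_unregistered() before any heavy initialization, so informational flags keep working even when full startup would fail. The version string is made up.

#include <boost/program_options.hpp>
#include <iostream>

namespace bpo = boost::program_options;

int main(int ac, char** av) {
    bpo::options_description preinit("preinit options");
    preinit.add_options()
        ("version", bpo::bool_switch(), "print version and exit");
    bpo::variables_map vm;
    bpo::store(bpo::command_line_parser(ac, av)
                   .options(preinit)
                   .allow_unregistered() // ignore options registered later
                   .run(),
               vm);
    if (vm["version"].as<bool>()) {
        std::cout << "1.0.0-example\n"; // illustrative version string
        return 0;
    }
    std::cout << "continuing full startup...\n";
}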

@@ -479,26 +514,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
    bpo::variables_map vm;
    auto parsed_opts = bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run();
    bpo::store(parsed_opts, vm);
    if (vm["version"].as<bool>()) {
        fmt::print("{}\n", scylla_version());
        return 0;
    }
    if (vm["build-id"].as<bool>()) {
        fmt::print("{}\n", get_build_id());
        return 0;
    }
    if (vm["build-mode"].as<bool>()) {
        fmt::print("{}\n", scylla_build_mode());
        return 0;
    }
    if (vm["list-tools"].as<bool>()) {
        fmt::print(
            "types - a command-line tool to examine values belonging to scylla types\n"
            "sstable - a multifunctional command-line tool to examine the content of sstables\n"
        );
        return 0;
    }

    print_starting_message(ac, av, parsed_opts);

    sharded<locator::shared_token_metadata> token_metadata;

@@ -574,6 +589,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

    cfg->broadcast_to_all_shards().get();

    // We pass this piece of config through a global as a temporary hack.
    // See the comment at the definition of sstables::global_cache_index_pages.
    smp::invoke_on_all([&cfg] {
        sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
    }).get();

    ::sighup_handler sighup_handler(opts, *cfg);
    auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
        sighup_handler.stop().get();

@@ -1116,7 +1137,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
    // ATTN -- sharded repair reference already sits on storage_service and if
    // it calls repair.local() before this place it'll crash (now it doesn't do
    // both)
    supervisor::notify("starting messaging service");
    supervisor::notify("starting repair service");
    auto max_memory_repair = memory::stats().total_memory() * 0.1;
    repair.start(std::ref(gossiper), std::ref(messaging), std::ref(db), std::ref(proxy), std::ref(bm), std::ref(sys_dist_ks), std::ref(view_update_generator), std::ref(mm), max_memory_repair).get();
    auto stop_repair_service = defer_verbose_shutdown("repair service", [&repair] {

@@ -15,6 +15,7 @@
#include "sstables/shared_sstable.hh"
#include <seastar/core/future.hh>
#include <seastar/core/io_priority_class.hh>
#include "reader_permit.hh"

class memtable;
class flat_mutation_reader;

@@ -438,6 +438,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    // should not be blocked by any data requests.
    case messaging_verb::GROUP0_PEER_EXCHANGE:
    case messaging_verb::GROUP0_MODIFY_CONFIG:
    // ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
    // setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
        return 0;
    case messaging_verb::PREPARE_MESSAGE:
    case messaging_verb::PREPARE_DONE_MESSAGE:

@@ -695,7 +697,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }();

    auto must_tcp_nodelay = [&] {
        if (idx == 1) {
        if (idx == 0) {
            return true; // gossip
        }
        if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {

@@ -272,8 +272,8 @@ public:

    future<> lookup_readers(db::timeout_clock::time_point timeout);

    future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
            std::optional<clustering_key_prefix> last_ckey);
    future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
            dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey);

    future<> stop();
};

@@ -580,19 +580,22 @@ future<> read_context::lookup_readers(db::timeout_clock::time_point timeout) {
    });
}

future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
        std::optional<clustering_key_prefix> last_ckey) {
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
        dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey) {
    if (_cmd.query_uuid == utils::UUID{}) {
        return make_ready_future<>();
    }

    auto last_pkey = compaction_state.partition_start.key();

    const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
    tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);

    const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
    tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
    auto cs_stats = dismantle_buffer_stats{};
    if (compaction_state) {
        cs_stats = dismantle_compaction_state(std::move(*compaction_state));
        tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
    } else {
        tracing::trace(_trace_state, "No compaction state to dismantle, partition exhausted", cs_stats);
    }

    return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
            const std::optional<clustering_key_prefix>& last_ckey) {

@@ -754,7 +757,9 @@ future<typename ResultBuilder::result_type> do_query(
        std::move(result_builder));

    if (compaction_state->are_limits_reached() || result.is_short_read()) {
        co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_ckey));
        // Must be called before calling `detach_state()`.
        auto last_pkey = *compaction_state->current_partition();
        co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_pkey), std::move(last_ckey));
    }

    co_await ctx->stop();

@@ -167,6 +167,9 @@ class compact_mutation_state {
    std::unique_ptr<mutation_compactor_garbage_collector> _collector;

    compaction_stats _stats;

    // Remember if we requested to stop mid-partition.
    stop_iteration _stop = stop_iteration::no;
private:
    template <typename Consumer, typename GCConsumer>
    requires CompactedFragmentsConsumer<Consumer> && CompactedFragmentsConsumer<GCConsumer>

@@ -304,6 +307,7 @@ public:
    }

    void consume_new_partition(const dht::decorated_key& dk) {
        _stop = stop_iteration::no;
        auto& pk = dk.key();
        _dk = &dk;
        _return_static_content_on_partition_with_no_rows =

@@ -370,9 +374,9 @@ public:
        _static_row_live = is_live;
        if (is_live || (!only_live() && !sr.empty())) {
            partition_is_not_empty(consumer);
            return consumer.consume(std::move(sr), current_tombstone, is_live);
            _stop = consumer.consume(std::move(sr), current_tombstone, is_live);
        }
        return stop_iteration::no;
        return _stop;
    }

    template <typename Consumer, typename GCConsumer>

@@ -424,22 +428,21 @@ public:
        };

        if (only_live() && is_live) {
            auto stop = consume_row();
            _stop = consume_row();
            if (++_rows_in_current_partition == _current_partition_limit) {
                return stop_iteration::yes;
                _stop = stop_iteration::yes;
            }
            return stop;
            return _stop;
        } else if (!only_live()) {
            auto stop = stop_iteration::no;
            if (!cr.empty()) {
                stop = consume_row();
                _stop = consume_row();
            }
            if (!sstable_compaction() && is_live && ++_rows_in_current_partition == _current_partition_limit) {
                return stop_iteration::yes;
                _stop = stop_iteration::yes;
            }
            return stop;
            return _stop;
        }
        return stop_iteration::no;
        return _stop;
    }

    template <typename Consumer, typename GCConsumer>

@@ -448,7 +451,8 @@ public:
        ++_stats.range_tombstones;
        _range_tombstones.apply(rt);
        // FIXME: drop tombstone if it is fully covered by other range tombstones
        return do_consume(std::move(rt), consumer, gc_consumer);
        _stop = do_consume(std::move(rt), consumer, gc_consumer);
        return _stop;
    }

    template <typename Consumer, typename GCConsumer>

@@ -459,9 +463,9 @@ public:
            _rt_assembler.emplace();
        }
        if (auto rt_opt = _rt_assembler->consume(_schema, std::move(rtc))) {
            return do_consume(std::move(*rt_opt), consumer, gc_consumer);
            _stop = do_consume(std::move(*rt_opt), consumer, gc_consumer);
        }
        return stop_iteration::no;
        return _stop;
    }

    template <typename Consumer, typename GCConsumer>

@@ -490,8 +494,16 @@ public:
        _partition_limit -= _rows_in_current_partition > 0;
        auto stop = consumer.consume_end_of_partition();
        if (!sstable_compaction()) {
            return _row_limit && _partition_limit && stop != stop_iteration::yes
            stop = _row_limit && _partition_limit && stop != stop_iteration::yes
                ? stop_iteration::no : stop_iteration::yes;
            // If we decided to stop earlier but decide to continue now, we
            // are in effect skipping the partition. Do not leave `_stop` at
            // `stop_iteration::yes` in this case, reset it back to
            // `stop_iteration::no` as if we exhausted the partition.
            if (_stop && !stop) {
                _stop = stop_iteration::no;
            }
            return stop;
        }
    }
    return stop_iteration::no;

@@ -536,6 +548,7 @@ public:
        _current_partition_limit = std::min(_row_limit, _partition_row_limit);
        _query_time = query_time;
        _stats = {};
        _stop = stop_iteration::no;

        noop_compacted_fragments_consumer nc;

@@ -562,16 +575,31 @@ public:
    /// compactor will result in the new compactor being in the same state *this
    /// is (given the same outside parameters of course). Practically this
    /// allows the compaction state to be stored in the compacted reader.
    detached_compaction_state detach_state() && {
    /// If the currently compacted partition is exhausted a disengaged optional
    /// is returned -- in this case there is no state to detach.
    std::optional<detached_compaction_state> detach_state() && {
        // If we exhausted the partition, there is no need to detach-restore the
        // compaction state.
        // We exhausted the partition if `consume_partition_end()` was called
        // without us requesting the consumption to stop (remembered in _stop)
        // from one of the consume() overloads.
        // The consume algorithm calls `consume_partition_end()` in two cases:
        // * on a partition-end fragment
        // * consume() requested to stop
        // In the latter case, the partition is not exhausted. Even if the next
        // fragment to process is a partition-end, it will not be consumed.
        if (!_stop) {
            return {};
        }
        partition_start ps(std::move(_last_dk), _range_tombstones.get_partition_tombstone());
        if (_rt_assembler) {
            if (_current_tombstone) {
                return {std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
                return detached_compaction_state{std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
            } else {
                return {std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
                return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
            }
        }
        return {std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
        return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
    }

    const compaction_stats& stats() const { return _stats; }
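
The comment block above encodes the rule "detach only when we stopped mid-partition". In miniature, with illustrative names:

#include <iostream>
#include <optional>
#include <string>

// State is only worth detaching when consumption stopped mid-partition;
// if the partition was exhausted, a disengaged optional is returned.
struct compactor_like {
    bool stopped_mid_partition = false;
    std::string partition = "pk1";

    std::optional<std::string> detach_state() && {
        if (!stopped_mid_partition) {
            return std::nullopt; // partition exhausted: nothing to restore
        }
        return std::move(partition);
    }
};

int main() {
    compactor_like a;               // ran to the partition's end
    compactor_like b;
    b.stopped_mid_partition = true; // paused mid-partition
    std::cout << std::boolalpha
              << std::move(a).detach_state().has_value() << '\n'  // false
              << std::move(b).detach_state().has_value() << '\n'; // true
}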

@@ -826,6 +826,7 @@ public:

    void apply(tombstone deleted_at) {
        _deleted_at.apply(deleted_at);
        maybe_shadow();
    }

    void apply(shadowable_tombstone deleted_at) {

@@ -1240,7 +1240,10 @@ future<flat_mutation_reader> evictable_reader::resume_or_create_reader() {
    if (auto reader_opt = try_resume()) {
        co_return std::move(*reader_opt);
    }
    co_await _permit.maybe_wait_readmission();
    // See evictable_reader_v2::resume_or_create_reader()
    if (_permit.needs_readmission()) {
        co_await _permit.wait_readmission();
    }
    co_return recreate_reader();
}
@@ -1581,11 +1584,7 @@ private:
|
||||
tracing::global_trace_state_ptr _trace_state;
|
||||
const mutation_reader::forwarding _fwd_mr;
|
||||
reader_concurrency_semaphore::inactive_read_handle _irh;
|
||||
bool _drop_partition_start = false;
|
||||
bool _drop_static_row = false;
|
||||
// Validate the partition key of the first emitted partition, set after the
|
||||
// reader was recreated.
|
||||
bool _validate_partition_key = false;
|
||||
bool _reader_recreated = false; // set if reader was recreated since last operation
|
||||
position_in_partition::tri_compare _tri_cmp;
|
||||
|
||||
std::optional<dht::decorated_key> _last_pkey;
|
||||
@@ -1606,10 +1605,9 @@ private:
|
||||
void adjust_partition_slice();
|
||||
flat_mutation_reader_v2 recreate_reader();
|
||||
future<flat_mutation_reader_v2> resume_or_create_reader();
|
||||
void maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer);
|
||||
void validate_partition_start(const partition_start& ps);
|
||||
void validate_position_in_partition(position_in_partition_view pos) const;
|
||||
bool should_drop_fragment(const mutation_fragment_v2& mf);
|
||||
future<> do_fill_buffer();
|
||||
void examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3);
|
||||
|
||||
public:
|
||||
evictable_reader_v2(
|
||||
@@ -1725,9 +1723,6 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
|
||||
_range_override.reset();
|
||||
_slice_override.reset();
|
||||
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
|
||||
if (_last_pkey) {
|
||||
bool partition_range_is_inclusive = true;
|
||||
|
||||
@@ -1736,11 +1731,8 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
|
||||
partition_range_is_inclusive = false;
|
||||
break;
|
||||
case partition_region::static_row:
|
||||
_drop_partition_start = true;
|
||||
break;
|
||||
case partition_region::clustered:
|
||||
_drop_partition_start = true;
|
||||
_drop_static_row = true;
|
||||
adjust_partition_slice();
|
||||
slice = &*_slice_override;
|
||||
break;
|
||||
@@ -1763,7 +1755,7 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
|
||||
_range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, partition_range_is_inclusive)}, _pr->end());
|
||||
range = &*_range_override;
|
||||
|
||||
_validate_partition_key = true;
|
||||
_reader_recreated = true;
|
||||
}
|
||||
|
||||
return _ms.make_reader_v2(
|
||||
@@ -1784,45 +1776,48 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
|
||||
if (auto reader_opt = try_resume()) {
|
||||
co_return std::move(*reader_opt);
|
||||
}
|
||||
co_await _permit.maybe_wait_readmission();
|
||||
// When the reader is created the first time and we are actually resuming a
|
||||
// saved reader in `recreate_reader()`, we have two cases here:
|
||||
// * the reader is still alive (in inactive state)
|
||||
// * the reader was evicted
|
||||
// We check for this below with `needs_readmission()` and it is very
|
||||
// important to not allow for preemption between said check and
|
||||
// `recreate_reader()`, otherwise the reader might be evicted between the
|
||||
// check and `recreate_reader()` and the latter will recreate it without
|
||||
// waiting for re-admission.
|
||||
if (_permit.needs_readmission()) {
|
||||
co_await _permit.wait_readmission();
|
||||
}
|
||||
co_return recreate_reader();
|
||||
}
|
||||
|
||||
void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer) {
|
||||
if (!_validate_partition_key || buffer.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If this is set we can assume the first fragment is a partition-start.
|
||||
const auto& ps = buffer.front().as_partition_start();
|
||||
void evictable_reader_v2::validate_partition_start(const partition_start& ps) {
|
||||
const auto tri_cmp = dht::ring_position_comparator(*_schema);
|
||||
// If we recreated the reader after fast-forwarding it we won't have
|
||||
// _last_pkey set. In this case it is enough to check if the partition
|
||||
// is in range.
|
||||
if (_last_pkey) {
|
||||
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
|
||||
if (_drop_partition_start) { // we expect to continue from the same partition
|
||||
if (_next_position_in_partition.region() != partition_region::partition_start) { // we expect to continue from the same partition
|
||||
// We cannot assume the partition we stopped the read at is still alive
|
||||
// when we recreate the reader. It might have been compacted away in the
|
||||
// meanwhile, so allow for a larger partition too.
|
||||
require(
|
||||
cmp_res <= 0,
|
||||
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {}, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
// Reset drop flags and next pos if we are not continuing from the same partition
|
||||
// Reset next pos if we are not continuing from the same partition
|
||||
if (cmp_res < 0) {
|
||||
// Close previous partition, we are not going to continue it.
|
||||
push_mutation_fragment(*_schema, _permit, partition_end{});
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
_next_position_in_partition = position_in_partition::for_partition_start();
|
||||
}
|
||||
} else { // should be a larger partition
|
||||
require(
|
||||
cmp_res < 0,
|
||||
"{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
|
||||
"{}(): validation failed, expected partition with key larger than _last_pkey {}, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
@@ -1836,8 +1831,6 @@ void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_rea
|
||||
__FUNCTION__,
|
||||
prange,
|
||||
ps.key());
|
||||
|
||||
_validate_partition_key = false;
|
||||
}
|
||||
|
||||
void evictable_reader_v2::validate_position_in_partition(position_in_partition_view pos) const {
|
||||
@@ -1860,7 +1853,12 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
|
||||
const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
|
||||
// TODO: somehow avoid this copy
|
||||
auto range = position_range(cr);
|
||||
return range.contains(*_schema, pos);
|
||||
// We cannot use range.contains() because that treats range as a
|
||||
// [a, b) range, meaning a range tombstone change with position
|
||||
// after_key(b) will be considered outside of it. Such range
|
||||
// tombstone changes can be emitted however when recreating the
|
||||
// reader on clustering range edge.
|
||||
return _tri_cmp(range.start(), pos) <= 0 && _tri_cmp(pos, range.end()) <= 0;
|
||||
});
|
||||
require(
|
||||
any_contains,
|
||||
@@ -1871,42 +1869,40 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
|
||||
}
|
||||
}
|
||||
|
||||
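The closed-interval check above is the crux of that hunk. A sketch of the difference on a simplified model, where a position is a row index plus a bound weight (-1 = before the row, 0 = at the row, +1 = after the row) — hypothetical types, not the real `position_in_partition`:

```cpp
#include <cassert>

struct pos {
    int row;
    int weight; // -1 before, 0 at, +1 after the row
};

int tri_cmp(pos a, pos b) {
    if (a.row != b.row) return a.row - b.row;
    return a.weight - b.weight;
}

// Half-open [start, end): rejects anything at or past `end`.
bool contains_half_open(pos start, pos end, pos p) {
    return tri_cmp(start, p) <= 0 && tri_cmp(p, end) < 0;
}

// Closed variant used by the validator above: also accepts `end`
// itself, e.g. a range tombstone change emitted at after_key(end).
bool contains_closed(pos start, pos end, pos p) {
    return tri_cmp(start, p) <= 0 && tri_cmp(p, end) <= 0;
}

int main() {
    pos start{0, -1}, end{10, +1}; // clustering range covering rows 0..10
    pos rtc_at_edge{10, +1};       // like after_key(10)
    assert(!contains_half_open(start, end, rtc_at_edge));
    assert(contains_closed(start, end, rtc_at_edge));
}
```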
-bool evictable_reader_v2::should_drop_fragment(const mutation_fragment_v2& mf) {
-    if (_drop_partition_start && mf.is_partition_start()) {
-        _drop_partition_start = false;
-        return true;
+void evictable_reader_v2::examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3) {
+    if (!mf1) {
+        return; // the reader is at EOS
    }
-    // Unlike partition-start above, a partition is not guaranteed to have a
-    // static row fragment. So reset the flag regardless of whether we could
-    // drop one or not.
+    // We are guaranteed to get here only right after dropping a partition-start,
+    // so if we are not seeing a static row here, the partition doesn't have one.
-    if (_drop_static_row) {
-        _drop_static_row = false;
-        return mf.is_static_row();
-    }
-    return false;
-}

-future<> evictable_reader_v2::do_fill_buffer() {
-    if (!_drop_partition_start && !_drop_static_row) {
-        auto fill_buf_fut = _reader->fill_buffer();
-        if (_validate_partition_key) {
-            fill_buf_fut = fill_buf_fut.then([this] {
-                maybe_validate_partition_start(_reader->buffer());
-            });
-        }
-        return fill_buf_fut;
+    // If engaged, the first fragment is always a partition-start.
+    validate_partition_start(mf1->as_partition_start());
+    if (_tri_cmp(mf1->position(), _next_position_in_partition) < 0) {
+        mf1 = {}; // drop mf1
    }

+   const auto continue_same_partition = _next_position_in_partition.region() != partition_region::partition_start;

+   // If we have a first fragment, we are guaranteed to have a second one -- if nothing else, a partition-end.
+   if (mf2->is_end_of_partition()) {
+       return; // no further fragments, nothing to do
    }

+   // We want to validate the position of the first non-dropped fragment.
+   // If mf2 is a static row and we need to drop it, this will be mf3.
+   if (mf2->is_static_row() && _tri_cmp(mf2->position(), _next_position_in_partition) < 0) {
+       mf2 = {}; // drop mf2
+   } else {
+       if (continue_same_partition) {
+           validate_position_in_partition(mf2->position());
        }
+       return;
    }

+   if (mf3->is_end_of_partition()) {
+       return; // no further fragments, nothing to do
+   } else if (continue_same_partition) {
+       validate_position_in_partition(mf3->position());
    }
-   return repeat([this] {
-       return _reader->fill_buffer().then([this] {
-           maybe_validate_partition_start(_reader->buffer());
-           while (!_reader->is_buffer_empty() && should_drop_fragment(_reader->peek_buffer())) {
-               _reader->pop_mutation_fragment();
-           }
-           return stop_iteration(_reader->is_buffer_full() || _reader->is_end_of_stream());
-       });
-   });
}

evictable_reader_v2::evictable_reader_v2(
@@ -1935,10 +1931,64 @@ future<> evictable_reader_v2::fill_buffer() {
        co_return;
    }
    _reader = co_await resume_or_create_reader();
-   co_await do_fill_buffer();

+   if (_reader_recreated) {
+       // Recreating the reader breaks snapshot isolation and creates all sorts
+       // of complications around the continuity of range tombstone changes,
+       // e.g. a range tombstone started by the previous reader object
+       // might not exist anymore with the new reader object.
+       // To avoid complications we reset the tombstone state on each reader
+       // recreation by emitting a null tombstone change, if we read at least
+       // one clustering fragment from the partition.
+       if (_next_position_in_partition.region() == partition_region::clustered
+               && _tri_cmp(_next_position_in_partition, position_in_partition::before_all_clustered_rows()) > 0) {
+           push_mutation_fragment(*_schema, _permit, range_tombstone_change{position_in_partition_view::before_key(_next_position_in_partition), {}});
+       }
+       auto mf1 = co_await (*_reader)();
+       auto mf2 = co_await (*_reader)();
+       auto mf3 = co_await (*_reader)();
+       examine_first_fragments(mf1, mf2, mf3);
+       if (mf3) {
+           _reader->unpop_mutation_fragment(std::move(*mf3));
+       }
+       if (mf2) {
+           _reader->unpop_mutation_fragment(std::move(*mf2));
+       }
+       if (mf1) {
+           _reader->unpop_mutation_fragment(std::move(*mf1));
+       }
+       _reader_recreated = false;
+   } else {
+       co_await _reader->fill_buffer();
+   }

    _reader->move_buffer_content_to(*this);

+   // Ensure that each buffer represents forward progress. Only a concern when
+   // the last fragment in the buffer is a range tombstone change. In this case
+   // ensure that:
+   // * buffer().back().position() > _next_position_in_partition;
+   // * _reader.peek()->position() > buffer().back().position();
+   if (!is_buffer_empty() && buffer().back().is_range_tombstone_change()) {
+       auto* next_mf = co_await _reader->peek();

+       // First make sure we've made progress w.r.t. _next_position_in_partition.
+       // This loop becomes infinite when next pos is a partition start.
+       // In that case progress is guaranteed anyway, so skip this loop entirely.
+       while (!_next_position_in_partition.is_partition_start() && next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
+           push_mutation_fragment(_reader->pop_mutation_fragment());
+           next_mf = co_await _reader->peek();
+       }

+       const auto last_pos = position_in_partition(buffer().back().position());
+       while (next_mf && _tri_cmp(last_pos, next_mf->position()) == 0) {
+           push_mutation_fragment(_reader->pop_mutation_fragment());
+           next_mf = co_await _reader->peek();
+       }
+   }

    update_next_position();
-   _end_of_stream = _reader->is_end_of_stream() && _reader->is_buffer_empty();
+   _end_of_stream = _reader->is_end_of_stream();
    maybe_pause(std::move(*_reader));
}


@@ -444,7 +444,7 @@ public:
    // When throws, the cursor is invalidated and its position is not changed.
    bool advance_to(position_in_partition_view lower_bound) {
        maybe_advance_to(lower_bound);
-       return no_clustering_row_between(_schema, lower_bound, position());
+       return no_clustering_row_between_weak(_schema, lower_bound, position());
    }

    // Call only when valid.

@@ -567,6 +567,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
    }
}

+// Returns true if and only if there can't be any clustering_row with position >= a and < b.
+// It is assumed that a <= b.
+inline
+bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
+    clustering_key_prefix::equality eq(s);
+    if (a.has_key() && b.has_key()) {
+        return eq(a.key(), b.key())
+               && (a.get_bound_weight() == bound_weight::after_all_prefixed
+                   || b.get_bound_weight() != bound_weight::after_all_prefixed);
+    } else {
+        return !a.has_key() && !b.has_key();
+    }
+}
+
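The shape of the predicate added above, restated on a simplified model (a single-component key plus a bound weight — hypothetical types, not the real `position_in_partition_view`): a window [a, b) on the same key can only cover the row itself when it opens before the row and closes after it.

```cpp
#include <cassert>
#include <optional>

enum class weight { before_all_prefixed, at, after_all_prefixed };

struct position {
    std::optional<int> key; // single-component key, for illustration
    weight w = weight::at;
};

// True iff no clustering row can sit at a position p with a <= p < b.
// Mirrors the structure of no_clustering_row_between_weak: equal keys
// are fine unless a opens before the row while b closes after it.
bool no_row_between_weak(position a, position b) {
    if (a.key && b.key) {
        return *a.key == *b.key
            && (a.w == weight::after_all_prefixed
                || b.w != weight::after_all_prefixed);
    }
    return !a.key && !b.key;
}

int main() {
    // Window from before row 5 to after row 5 covers the row itself.
    assert(!no_row_between_weak({5, weight::before_all_prefixed},
                                {5, weight::after_all_prefixed}));
    // Window from before row 5 up to the row's own position does not.
    assert(no_row_between_weak({5, weight::before_all_prefixed},
                               {5, weight::at}));
}
```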
// Includes all position_in_partition objects "p" for which: start <= p < end
// And only those.
class position_range {


@@ -10,6 +10,7 @@

#include "seastarx.hh"
#include <seastar/core/future.hh>
+#include <seastar/net/socket_defs.hh>
#include <vector>

// Abstraction for a server serving some kind of user-facing protocol.


querier.cc (19 changed lines)

@@ -414,25 +414,6 @@ future<bool> querier_cache::evict_one() noexcept {
    co_return false;
}

-future<> querier_cache::evict_all_for_table(const utils::UUID& schema_id) noexcept {
-    for (auto ip : {&_data_querier_index, &_mutation_querier_index, &_shard_mutation_querier_index}) {
-        auto& idx = *ip;
-        for (auto it = idx.begin(); it != idx.end();) {
-            if (it->second->schema().id() == schema_id) {
-                auto reader_opt = it->second->permit().semaphore().unregister_inactive_read(querier_utils::get_inactive_read_handle(*it->second));
-                it = idx.erase(it);
-                --_stats.population;
-                if (reader_opt) {
-                    co_await reader_opt->close();
-                }
-            } else {
-                ++it;
-            }
-        }
-    }
-    co_return;
-}
-
future<> querier_cache::stop() noexcept {
    co_await _closing_gate.close();


@@ -476,11 +476,6 @@ public:
    /// is empty).
    future<bool> evict_one() noexcept;

-   /// Evict all queriers that belong to a table.
-   ///
-   /// Should be used when dropping a table.
-   future<> evict_all_for_table(const utils::UUID& schema_id) noexcept;
-
    /// Close all queriers and wait on background work.
    ///
    /// Should be used before destroying the querier_cache.

query.cc (5 changed lines)

@@ -92,14 +92,13 @@ void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& range
}

void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
-   if (key.is_full(s)) {
+   if (key.is_full(s) || reversed) {
        return trim_clustering_row_ranges_to(s, ranges,
                reversed ? position_in_partition_view::before_key(key) : position_in_partition_view::after_key(key), reversed);
    }
    auto full_key = key;
    clustering_key::make_full(s, full_key);
-   return trim_clustering_row_ranges_to(s, ranges,
-           reversed ? position_in_partition_view::after_key(full_key) : position_in_partition_view::before_key(full_key), reversed);
+   return trim_clustering_row_ranges_to(s, ranges, position_in_partition_view::before_key(full_key), reversed);
}


@@ -68,22 +68,33 @@ public:
    // for accumulated range tombstones.
    // After this, only range_tombstones with positions >= upper_bound may be added,
    // which guarantees that they won't affect the output of this flush.
+   //
+   // If upper_bound == position_in_partition::after_all_clustered_rows(),
+   // emits all remaining range_tombstone_changes.
+   // No range_tombstones may be added after this.
+   //
    // FIXME: respect preemption
    template<RangeTombstoneChangeConsumer C>
-   void flush(position_in_partition_view upper_bound, C consumer) {
-       position_in_partition::less_compare less(_schema);
-       std::optional<range_tombstone> prev;
+   void flush(const position_in_partition_view upper_bound, C consumer) {
+       if (_range_tombstones.empty()) {
+           _lower_bound = upper_bound;
+           return;
+       }

-       while (!_range_tombstones.empty() && less(_range_tombstones.begin()->end_position(), upper_bound)) {
+       position_in_partition::tri_compare cmp(_schema);
+       std::optional<range_tombstone> prev;
+       bool flush_all = cmp(upper_bound, position_in_partition::after_all_clustered_rows()) == 0;
+
+       while (!_range_tombstones.empty() && (flush_all || (cmp(_range_tombstones.begin()->end_position(), upper_bound) < 0))) {
            auto rt = _range_tombstones.pop(_range_tombstones.begin());

-           if (prev && less(prev->end_position(), rt.position())) { // [1]
+           if (prev && (cmp(prev->end_position(), rt.position()) < 0)) { // [1]
                // previous range tombstone not adjacent, emit gap.
                consumer(range_tombstone_change(prev->end_position(), tombstone()));
            }

            // Check if start of rt was already emitted, emit if not.
-           if (!less(rt.position(), _lower_bound)) {
+           if (cmp(rt.position(), _lower_bound) >= 0) {
                consumer(range_tombstone_change(rt.position(), rt.tomb));
            }

@@ -95,15 +106,15 @@ public:
        // It cannot get adjacent later because prev->end_position() < upper_bound,
        // so nothing == prev->end_position() can be added after this invocation.
        if (prev && (_range_tombstones.empty()
-               || less(prev->end_position(), _range_tombstones.begin()->position()))) {
+               || (cmp(prev->end_position(), _range_tombstones.begin()->position()) < 0))) {
            consumer(range_tombstone_change(prev->end_position(), tombstone())); // [2]
        }

        // Emit the fragment for start bound of a range_tombstone which is overlapping with upper_bound,
        // unless no such fragment or already emitted.
        if (!_range_tombstones.empty()
-               && less(_range_tombstones.begin()->position(), upper_bound)
-               && (!less(_range_tombstones.begin()->position(), _lower_bound))) {
+               && (cmp(_range_tombstones.begin()->position(), upper_bound) < 0)
+               && (cmp(_range_tombstones.begin()->position(), _lower_bound) >= 0)) {
            consumer(range_tombstone_change(
                    _range_tombstones.begin()->position(), _range_tombstones.begin()->tombstone().tomb));
        }

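A reduced model of the `flush()` contract introduced above: tombstones with [start, end) extents are drained in order up to an upper bound, and a sentinel "maximum" bound drains everything, which is what comparing against `after_all_clustered_rows()` achieves. This is a sketch only — it omits the gap/adjacency emission logic marked [1] and [2] in the real code.

```cpp
#include <cstdio>
#include <limits>
#include <map>

constexpr int max_bound = std::numeric_limits<int>::max();

void flush(std::map<int, int>& tombstones /* start -> end */, int upper_bound) {
    const bool flush_all = upper_bound == max_bound;
    while (!tombstones.empty()
           && (flush_all || tombstones.begin()->second < upper_bound)) {
        auto [start, end] = *tombstones.begin();
        tombstones.erase(tombstones.begin());
        std::printf("emit change at %d, close at %d\n", start, end);
    }
}

int main() {
    std::map<int, int> ts{{1, 3}, {4, 8}, {9, 12}};
    flush(ts, 8);         // emits only [1, 3): neither 8 nor 12 is < 8
    flush(ts, max_bound); // drains the rest regardless of end positions
}
```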
@@ -9,6 +9,7 @@

#include <boost/range/adaptor/reversed.hpp>
#include "range_tombstone_list.hh"
#include "utils/allocation_strategy.hh"
+#include "utils/amortized_reserve.hh"
#include <seastar/util/variant_utils.hh>

range_tombstone_list::range_tombstone_list(const range_tombstone_list& x)
@@ -375,13 +376,13 @@ range_tombstone_list::reverter::insert(range_tombstones_type::iterator it, range

range_tombstone_list::range_tombstones_type::iterator
range_tombstone_list::reverter::erase(range_tombstones_type::iterator it) {
-   _ops.reserve(_ops.size() + 1);
+   amortized_reserve(_ops, _ops.size() + 1);
    _ops.emplace_back(erase_undo_op(*it));
    return _dst._tombstones.erase(it);
}

void range_tombstone_list::reverter::update(range_tombstones_type::iterator it, range_tombstone&& new_rt) {
-   _ops.reserve(_ops.size() + 1);
+   amortized_reserve(_ops, _ops.size() + 1);
    swap(it->tombstone(), new_rt);
    _ops.emplace_back(update_undo_op(std::move(new_rt), *it));
}

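The point of the change above: `reserve(size() + 1)` in a loop pins capacity to the exact size, so every insertion can reallocate and copy, degrading to quadratic time. An amortized helper grows geometrically instead. A sketch of the assumed semantics — the real `utils/amortized_reserve.hh` may differ in details:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Grow to at least n, but never by less than a doubling, so a loop of
// "amortized_reserve(v, v.size() + 1); v.push_back(...)" stays
// amortized O(1) per element. (Assumed semantics, for illustration.)
template <typename Container>
void amortized_reserve(Container& c, std::size_t n) {
    if (n <= c.capacity()) {
        return; // enough head-room, no reallocation
    }
    c.reserve(std::max(n, c.capacity() * 2));
}

int main() {
    std::vector<int> v;
    for (int i = 0; i < 1000; ++i) {
        amortized_reserve(v, v.size() + 1); // reallocates O(log n) times
        v.push_back(i);
    }
}
```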
@@ -12,6 +12,7 @@

#include "range_tombstone.hh"
#include "query-request.hh"
#include "utils/preempt.hh"
+#include "utils/chunked_vector.hh"
#include <iosfwd>
#include <variant>

@@ -106,7 +107,7 @@ class range_tombstone_list final {
    class reverter {
    private:
        using op = std::variant<erase_undo_op, insert_undo_op, update_undo_op>;
-       std::vector<op> _ops;
+       utils::chunked_vector<op> _ops;
        const schema& _s;
    protected:
        range_tombstone_list& _dst;

@@ -294,10 +294,11 @@ public:
        }
    }

-   future<> maybe_wait_readmission() {
-       if (_state != reader_permit::state::evicted) {
-           return make_ready_future<>();
-       }
+   bool needs_readmission() const {
+       return _state == reader_permit::state::evicted;
+   }

    future<> wait_readmission() {
        return _semaphore.do_wait_admission(shared_from_this());
    }

@@ -360,8 +361,16 @@ reader_concurrency_semaphore& reader_permit::semaphore() {
    return _impl->semaphore();
}

-future<> reader_permit::maybe_wait_readmission() {
-    return _impl->maybe_wait_readmission();
+reader_permit::state reader_permit::get_state() const {
+    return _impl->get_state();
}

+bool reader_permit::needs_readmission() const {
+    return _impl->needs_readmission();
+}
+
+future<> reader_permit::wait_readmission() {
+    return _impl->wait_readmission();
+}
+
void reader_permit::consume(reader_resources res) {
@@ -661,11 +670,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(flat_mutation_reader_v2 reader) noexcept {
    auto& permit_impl = *reader.permit()._impl;
    permit_impl.on_register_as_inactive();
-   // Implies _inactive_reads.empty(), we don't queue new readers before
-   // evicting all inactive reads.
-   // Checking the _wait_list covers the count resources only, so check memory
-   // separately.
-   if (_wait_list.empty() && _resources.memory > 0) {
+   if (!should_evict_inactive_read()) {
        try {
            auto irp = std::make_unique<inactive_read>(std::move(reader));
            auto& ir = *irp;
@@ -736,10 +741,24 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read(evict_reason reas

void reader_concurrency_semaphore::clear_inactive_reads() {
    while (!_inactive_reads.empty()) {
-       auto& ir = _inactive_reads.front();
-       close_reader(std::move(ir.reader));
-       // Destroying the read unlinks it too.
-       std::unique_ptr<inactive_read> _(&*_inactive_reads.begin());
+       evict(_inactive_reads.front(), evict_reason::manual);
    }
}

+future<> reader_concurrency_semaphore::evict_inactive_reads_for_table(utils::UUID id) noexcept {
+    inactive_reads_type evicted_readers;
+    auto it = _inactive_reads.begin();
+    while (it != _inactive_reads.end()) {
+        auto& ir = *it;
+        ++it;
+        if (ir.reader.schema()->id() == id) {
+            do_detach_inactive_reader(ir, evict_reason::manual);
+            evicted_readers.push_back(ir);
+        }
+    }
+    while (!evicted_readers.empty()) {
+        std::unique_ptr<inactive_read> irp(&evicted_readers.front());
+        co_await irp->reader.close();
+    }
+}
+
@@ -765,11 +784,11 @@ future<> reader_concurrency_semaphore::stop() noexcept {
    co_return;
}

-flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
-    auto reader = std::move(ir.reader);
+void reader_concurrency_semaphore::do_detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
    ir.unlink();
    ir.ttl_timer.cancel();
    ir.detach();
-   reader.permit()._impl->on_evicted();
    std::unique_ptr<inactive_read> irp(&ir);
+   ir.reader.permit()._impl->on_evicted();
    try {
        if (ir.notify_handler) {
            ir.notify_handler(reason);
@@ -788,7 +807,12 @@ flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(ina
        break;
    }
    --_stats.inactive_reads;
-   return reader;
}

+flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
+    std::unique_ptr<inactive_read> irp(&ir);
+    do_detach_inactive_reader(ir, reason);
+    return std::move(irp->reader);
+}
+
void reader_concurrency_semaphore::evict(inactive_read& ir, evict_reason reason) noexcept {
@@ -836,35 +860,89 @@ future<> reader_concurrency_semaphore::enqueue_waiter(reader_permit permit, read
}

void reader_concurrency_semaphore::evict_readers_in_background() {
+   if (_evicting) {
+       return;
+   }
+   _evicting = true;
    // Evict inactive readers in the background while wait list isn't empty.
    // This is safe since stop() closes _gate.
    (void)with_gate(_close_readers_gate, [this] {
-       return do_until([this] { return _wait_list.empty() || _inactive_reads.empty(); }, [this] {
-           return detach_inactive_reader(_inactive_reads.front(), evict_reason::permit).close();
+       return repeat([this] {
+           if (_inactive_reads.empty() || !should_evict_inactive_read()) {
+               _evicting = false;
+               return make_ready_future<stop_iteration>(stop_iteration::yes);
+           }
+           return detach_inactive_reader(_inactive_reads.front(), evict_reason::permit).close().then([] {
+               return stop_iteration::no;
+           });
        });
    });
}
}

+reader_concurrency_semaphore::admit_result
+reader_concurrency_semaphore::can_admit_read(const reader_permit& permit) const noexcept {
+    if (!_ready_list.empty()) {
+        return {can_admit::no, reason::ready_list};
+    }
+
+    if (!all_used_permits_are_stalled()) {
+        return {can_admit::no, reason::used_permits};
+    }
+
+    if (!has_available_units(permit.base_resources())) {
+        auto reason = _resources.memory >= permit.base_resources().memory ? reason::memory_resources : reason::count_resources;
+        if (_inactive_reads.empty()) {
+            return {can_admit::no, reason};
+        } else {
+            return {can_admit::maybe, reason};
+        }
+    }
+
+    return {can_admit::yes, reason::all_ok};
+}
+
+bool reader_concurrency_semaphore::should_evict_inactive_read() const noexcept {
+    if (_resources.memory < 0 || _resources.count < 0) {
+        return true;
+    }
+    if (_wait_list.empty()) {
+        return false;
+    }
+    const auto r = can_admit_read(_wait_list.front().permit).why;
+    return r == reason::memory_resources || r == reason::count_resources;
+}
+
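The three-way decision above drives everything that follows: `yes` admits, `no` queues, and `maybe` queues but also kicks off background eviction of inactive readers to free resources. A condensed model of that flow, using hypothetical simplified types rather than the real semaphore interface:

```cpp
#include <cstdio>

enum class can_admit { no, maybe, yes };

struct semaphore_model {
    int free_count;     // available admission units
    int inactive_reads; // evictable, paused readers

    can_admit try_admit() const {
        if (free_count > 0) {
            return can_admit::yes;
        }
        // Out of resources: eviction can only help if there is
        // something to evict.
        return inactive_reads > 0 ? can_admit::maybe : can_admit::no;
    }
};

int main() {
    semaphore_model sem{0, 2};
    switch (sem.try_admit()) {
    case can_admit::yes:   std::puts("admit immediately"); break;
    case can_admit::maybe: std::puts("enqueue, evict inactive reads in background"); break;
    case can_admit::no:    std::puts("enqueue and wait"); break;
    }
}
```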
future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, read_func func) {
    if (!_execution_loop_future) {
        _execution_loop_future.emplace(execution_loop());
    }
-   if (!_wait_list.empty() || !_ready_list.empty()) {
-       return enqueue_waiter(std::move(permit), std::move(func));
-   }
-
-   if (!has_available_units(permit.base_resources())) {
+   static uint64_t stats::*stats_table[] = {
+       &stats::reads_admitted_immediately,
+       &stats::reads_queued_because_ready_list,
+       &stats::reads_queued_because_used_permits,
+       &stats::reads_queued_because_memory_resources,
+       &stats::reads_queued_because_count_resources
+   };
+
+   const auto [admit, why] = can_admit_read(permit);
+   ++(_stats.*stats_table[static_cast<int>(why)]);
+   if (admit != can_admit::yes || !_wait_list.empty()) {
+       auto fut = enqueue_waiter(std::move(permit), std::move(func));
-       if (!_inactive_reads.empty()) {
+       if (admit == can_admit::yes && !_wait_list.empty()) {
+           // This is a contradiction: the semaphore could admit new reads yet it has waiters.
+           // Normally, the semaphore should admit waiters as soon as it can.
+           // So at any point in time, there should either be no waiters, or it
+           // shouldn't be able to admit new reads. Otherwise something went wrong.
+           maybe_dump_reader_permit_diagnostics(*this, _permit_list, "semaphore could admit new reads yet there are waiters");
+           maybe_admit_waiters();
+       } else if (admit == can_admit::maybe) {
+           ++_stats.reads_queued_with_eviction;
+           evict_readers_in_background();
+       }
+       return fut;
    }

-   if (!all_used_permits_are_stalled()) {
-       return enqueue_waiter(std::move(permit), std::move(func));
-   }
-
    permit.on_admission();
    ++_stats.reads_admitted;
    if (func) {
@@ -874,7 +952,8 @@ future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, r
}

void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
-   while (!_wait_list.empty() && _ready_list.empty() && has_available_units(_wait_list.front().permit.base_resources()) && all_used_permits_are_stalled()) {
+   auto admit = can_admit::no;
+   while (!_wait_list.empty() && (admit = can_admit_read(_wait_list.front().permit).decision) == can_admit::yes) {
        auto& x = _wait_list.front();
        try {
            x.permit.on_admission();
@@ -889,6 +968,10 @@ void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
        }
        _wait_list.pop_front();
    }
+   if (admit == can_admit::maybe) {
+       // Evicting readers will trigger another call to `maybe_admit_waiters()` from `signal()`.
+       evict_readers_in_background();
+   }
}

void reader_concurrency_semaphore::on_permit_created(reader_permit::impl& permit) {
@@ -965,6 +1048,13 @@ future<> reader_concurrency_semaphore::with_ready_permit(reader_permit permit, r
    return fut;
}

+void reader_concurrency_semaphore::set_resources(resources r) {
+    auto delta = r - _initial_resources;
+    _initial_resources = r;
+    _resources += delta;
+    maybe_admit_waiters();
+}
+
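The delta arithmetic in `set_resources()` in one picture: only the unused head-room moves; resources already consumed by live permits stay consumed. Hypothetical numbers, for illustration only:

```cpp
#include <cassert>

struct resources { int count; };
resources operator-(resources a, resources b) { return {a.count - b.count}; }
void operator+=(resources& a, resources b) { a.count += b.count; }

int main() {
    resources initial{100};
    resources available{40}; // 60 units are currently consumed by permits
    resources wanted{150};

    auto delta = wanted - initial; // +50
    initial = wanted;              // initial_resources() now reports 150
    available += delta;            // 90 available; the 60 stay consumed
    assert(available.count == 90);
}
```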
void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
    if (!ex) {
        ex = std::make_exception_ptr(broken_semaphore{});

@@ -74,6 +74,18 @@ public:
    uint64_t reads_admitted = 0;
    // Total number of reads enqueued to wait for admission.
    uint64_t reads_enqueued = 0;
+   // Total number of reads admitted immediately, without queueing
+   uint64_t reads_admitted_immediately = 0;
+   // Total number of reads enqueued because ready_list wasn't empty
+   uint64_t reads_queued_because_ready_list = 0;
+   // Total number of reads enqueued because there are used but unblocked permits
+   uint64_t reads_queued_because_used_permits = 0;
+   // Total number of reads enqueued because there weren't enough memory resources
+   uint64_t reads_queued_because_memory_resources = 0;
+   // Total number of reads enqueued because there weren't enough count resources
+   uint64_t reads_queued_because_count_resources = 0;
+   // Total number of reads enqueued to be maybe admitted after evicting some inactive reads
+   uint64_t reads_queued_with_eviction = 0;
    // Total number of permits created so far.
    uint64_t total_permits = 0;
    // Current number of permits.
@@ -169,7 +181,7 @@ public:
    };

private:
-   const resources _initial_resources;
+   resources _initial_resources;
    resources _resources;

    expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
@@ -181,11 +193,13 @@ private:
    stats _stats;
    permit_list_type _permit_list;
    bool _stopped = false;
+   bool _evicting = false;
    gate _close_readers_gate;
    gate _permit_gate;
    std::optional<future<>> _execution_loop_future;

private:
+   void do_detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
    [[nodiscard]] flat_mutation_reader_v2 detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
    void evict(inactive_read&, evict_reason reason) noexcept;

@@ -200,6 +214,19 @@ private:
    future<> enqueue_waiter(reader_permit permit, read_func func);
    void evict_readers_in_background();
    future<> do_wait_admission(reader_permit permit, read_func func = {});

+   // Check whether permit can be admitted or not.
+   // The wait list is not taken into consideration, this is the caller's
+   // responsibility.
+   // A return value of can_admit::maybe means admission might be possible if
+   // some of the inactive readers are evicted.
+   enum class can_admit { no, maybe, yes };
+   enum class reason { all_ok = 0, ready_list, used_permits, memory_resources, count_resources };
+   struct admit_result { can_admit decision; reason why; };
+   admit_result can_admit_read(const reader_permit& permit) const noexcept;
+
+   bool should_evict_inactive_read() const noexcept;
+
    void maybe_admit_waiters() noexcept;

    void on_permit_created(reader_permit::impl&);
@@ -301,6 +328,9 @@ public:

    /// Clear all inactive reads.
    void clear_inactive_reads();

+   /// Evict all inactive reads that belong to the table designated by the id.
+   future<> evict_inactive_reads_for_table(utils::UUID id) noexcept;
private:
    // The following two functions are extension points for
    // future inheriting classes that need to run some stop
@@ -386,6 +416,12 @@ public:
    /// optimal than just using \ref with_permit().
    future<> with_ready_permit(reader_permit permit, read_func func);

+   /// Set the total resources of the semaphore to \p r.
+   ///
+   /// After this call, \ref initial_resources() will reflect the new value.
+   /// Available resources will be adjusted by the delta.
+   void set_resources(resources r);
+
    const resources initial_resources() const {
        return _initial_resources;
    }

@@ -134,7 +134,12 @@ public:

    reader_concurrency_semaphore& semaphore();

-   future<> maybe_wait_readmission();
+   state get_state() const;
+
+   bool needs_readmission() const;
+
+   // Call only when needs_readmission() = true.
+   future<> wait_readmission();

    void consume(reader_resources res);

@@ -182,6 +187,8 @@ public:
    reader_resources resources() const { return _resources; }
};

+std::ostream& operator<<(std::ostream& os, reader_permit::state s);
+
/// Mark a permit as used.
///
/// Conceptually, a permit is considered used, when at least one reader

@@ -48,14 +48,42 @@

logging::logger rlogger("repair");

+node_ops_info::node_ops_info(utils::UUID ops_uuid_, shared_ptr<abort_source> as_, std::list<gms::inet_address>&& ignore_nodes_) noexcept
+    : ops_uuid(ops_uuid_)
+    , as(std::move(as_))
+    , ignore_nodes(std::move(ignore_nodes_))
+{}
+
void node_ops_info::check_abort() {
-   if (abort) {
+   if (as && as->abort_requested()) {
        auto msg = format("Node operation with ops_uuid={} is aborted", ops_uuid);
        rlogger.warn("{}", msg);
        throw std::runtime_error(msg);
    }
}

+future<> node_ops_info::start() {
+    if (as) {
+        co_await _sas.start();
+        _abort_subscription = as->subscribe([this] () noexcept {
+            _abort_done = _sas.invoke_on_all([] (abort_source& as) noexcept {
+                as.request_abort();
+            });
+        });
+    }
+}
+
+future<> node_ops_info::stop() noexcept {
+    if (as) {
+        co_await std::exchange(_abort_done, make_ready_future<>());
+        co_await _sas.stop();
+    }
+}
+
+abort_source* node_ops_info::local_abort_source() {
+    return as ? &_sas.local() : nullptr;
+}
+
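How per-shard repair code is expected to consume this plumbing: `start()` fans the single top-level `abort_source` out into a `sharded<abort_source>`, so each shard can poll its own local copy without a cross-shard hop. A sketch of the consuming side, with a hypothetical `repair_one_range()` standing in for the real per-range work:

```cpp
#include <seastar/core/abort_source.hh>
#include <seastar/core/future.hh>

seastar::future<> repair_one_range(seastar::abort_source* as) {
    // Cheap, shard-local check: `as` is what local_abort_source()
    // returned on this shard (nullptr when no abort_source was given).
    if (as && as->abort_requested()) {
        return seastar::make_exception_future<>(seastar::abort_requested_exception());
    }
    return seastar::make_ready_future<>(); // ... do the actual repair work
}
```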
node_ops_metrics::node_ops_metrics(tracker& tracker)
    : _tracker(tracker)
{
@@ -436,16 +464,6 @@ void tracker::abort_all_repairs() {
    rlogger.info0("Aborted {} repair job(s)", count);
}

-void tracker::abort_repair_node_ops(utils::UUID ops_uuid) {
-    for (auto& x : _repairs) {
-        auto& ri = x.second;
-        if (ri->ops_uuid() && ri->ops_uuid().value() == ops_uuid) {
-            rlogger.info0("Aborted repair jobs for ops_uuid={}", ops_uuid);
-            ri->abort();
-        }
-    }
-}
-
float tracker::report_progress(streaming::stream_reason reason) {
    uint64_t nr_ranges_finished = 0;
    uint64_t nr_ranges_total = 0;
@@ -534,7 +552,7 @@ repair_info::repair_info(repair_service& repair,
    const std::vector<sstring>& hosts_,
    const std::unordered_set<gms::inet_address>& ignore_nodes_,
    streaming::stream_reason reason_,
-   std::optional<utils::UUID> ops_uuid,
+   abort_source* as,
    bool hints_batchlog_flushed)
    : rs(repair)
    , db(repair.get_db())
@@ -556,8 +574,10 @@ repair_info::repair_info(repair_service& repair,
    , reason(reason_)
    , total_rf(db.local().find_keyspace(keyspace).get_effective_replication_map()->get_replication_factor())
    , nr_ranges_total(ranges.size())
-   , _ops_uuid(std::move(ops_uuid))
    , _hints_batchlog_flushed(std::move(hints_batchlog_flushed)) {
+   if (as != nullptr) {
+       _abort_subscription = as->subscribe([this] () noexcept { abort(); });
+   }
}

void repair_info::check_failed_ranges() {
@@ -575,7 +595,7 @@ void repair_info::check_failed_ranges() {
    }
}

-void repair_info::abort() {
+void repair_info::abort() noexcept {
    aborted = true;
}

@@ -1190,7 +1210,7 @@ int repair_service::do_repair_start(sstring keyspace, std::unordered_map<sstring
    local_repair.get_metrics().repair_total_ranges_sum += ranges.size();
    auto ri = make_lw_shared<repair_info>(local_repair,
        std::move(keyspace), std::move(ranges), std::move(table_ids),
-       id, std::move(data_centers), std::move(hosts), std::move(ignore_nodes), streaming::stream_reason::repair, id.uuid, hints_batchlog_flushed);
+       id, std::move(data_centers), std::move(hosts), std::move(ignore_nodes), streaming::stream_reason::repair, nullptr, hints_batchlog_flushed);
    return repair_ranges(ri);
});
repair_results.push_back(std::move(f));
@@ -1257,12 +1277,12 @@ future<> repair_service::sync_data_using_repair(
    dht::token_range_vector ranges,
    std::unordered_map<dht::token_range, repair_neighbors> neighbors,
    streaming::stream_reason reason,
-   std::optional<utils::UUID> ops_uuid) {
+   shared_ptr<node_ops_info> ops_info) {
    if (ranges.empty()) {
        return make_ready_future<>();
    }
-   return container().invoke_on(0, [keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_uuid] (repair_service& local_repair) mutable {
-       return local_repair.do_sync_data_using_repair(std::move(keyspace), std::move(ranges), std::move(neighbors), reason, ops_uuid);
+   return container().invoke_on(0, [keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_info] (repair_service& local_repair) mutable {
+       return local_repair.do_sync_data_using_repair(std::move(keyspace), std::move(ranges), std::move(neighbors), reason, ops_info);
    });
}

@@ -1271,12 +1291,12 @@ future<> repair_service::do_sync_data_using_repair(
    dht::token_range_vector ranges,
    std::unordered_map<dht::token_range, repair_neighbors> neighbors,
    streaming::stream_reason reason,
-   std::optional<utils::UUID> ops_uuid) {
+   shared_ptr<node_ops_info> ops_info) {
    seastar::sharded<replica::database>& db = get_db();

    repair_uniq_id id = repair_tracker().next_repair_command();
    rlogger.info("repair id {} to sync data for keyspace={}, status=started", id, keyspace);
-   return repair_tracker().run(id, [this, id, &db, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_uuid] () mutable {
+   return repair_tracker().run(id, [this, id, &db, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_info] () mutable {
        auto cfs = list_column_families(db.local(), keyspace);
        if (cfs.empty()) {
            rlogger.warn("repair id {} to sync data for keyspace={}, no table in this keyspace", id, keyspace);
@@ -1286,14 +1306,15 @@ future<> repair_service::do_sync_data_using_repair(
        std::vector<future<>> repair_results;
        repair_results.reserve(smp::count);
        for (auto shard : boost::irange(unsigned(0), smp::count)) {
-           auto f = container().invoke_on(shard, [keyspace, table_ids, id, ranges, neighbors, reason, ops_uuid] (repair_service& local_repair) mutable {
+           auto f = container().invoke_on(shard, [keyspace, table_ids, id, ranges, neighbors, reason, ops_info] (repair_service& local_repair) mutable {
                auto data_centers = std::vector<sstring>();
                auto hosts = std::vector<sstring>();
                auto ignore_nodes = std::unordered_set<gms::inet_address>();
                bool hints_batchlog_flushed = false;
+               abort_source* asp = ops_info ? ops_info->local_abort_source() : nullptr;
                auto ri = make_lw_shared<repair_info>(local_repair,
                    std::move(keyspace), std::move(ranges), std::move(table_ids),
-                   id, std::move(data_centers), std::move(hosts), std::move(ignore_nodes), reason, ops_uuid, hints_batchlog_flushed);
+                   id, std::move(data_centers), std::move(hosts), std::move(ignore_nodes), reason, asp, hints_batchlog_flushed);
                ri->neighbors = std::move(neighbors);
                return repair_ranges(ri);
            });
@@ -1494,7 +1515,7 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
        }
    }
    auto nr_ranges = desired_ranges.size();
-   sync_data_using_repair(keyspace_name, std::move(desired_ranges), std::move(range_sources), reason, {}).get();
+   sync_data_using_repair(keyspace_name, std::move(desired_ranges), std::move(range_sources), reason, nullptr).get();
    rlogger.info("bootstrap_with_repair: finished with keyspace={}, nr_ranges={}", keyspace_name, nr_ranges);
}
rlogger.info("bootstrap_with_repair: finished with keyspaces={}", keyspaces);
@@ -1690,8 +1711,7 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m
        ranges.swap(ranges_for_removenode);
    }
    auto nr_ranges_synced = ranges.size();
-   std::optional<utils::UUID> opt_uuid = ops ? std::make_optional<utils::UUID>(ops->ops_uuid) : std::nullopt;
-   sync_data_using_repair(keyspace_name, std::move(ranges), std::move(range_sources), reason, opt_uuid).get();
+   sync_data_using_repair(keyspace_name, std::move(ranges), std::move(range_sources), reason, ops).get();
    rlogger.info("{}: finished with keyspace={}, leaving_node={}, nr_ranges={}, nr_ranges_synced={}, nr_ranges_skipped={}",
        op, keyspace_name, leaving_node, nr_ranges_total, nr_ranges_synced, nr_ranges_skipped);
}
@@ -1715,12 +1735,6 @@ future<> repair_service::removenode_with_repair(locator::token_metadata_ptr tmpt
    });
}

-future<> repair_service::abort_repair_node_ops(utils::UUID ops_uuid) {
-    return container().invoke_on_all([ops_uuid] (repair_service& rs) {
-        rs.repair_tracker().abort_repair_node_ops(ops_uuid);
-    });
-}
-
future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_ptr tmptr, sstring op, sstring source_dc, streaming::stream_reason reason, std::list<gms::inet_address> ignore_nodes) {
    return seastar::async([this, tmptr = std::move(tmptr), source_dc = std::move(source_dc), op = std::move(op), reason, ignore_nodes = std::move(ignore_nodes)] () mutable {
        seastar::sharded<replica::database>& db = get_db();
@@ -1799,7 +1813,7 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
        }).get();
    }
    auto nr_ranges = ranges.size();
-   sync_data_using_repair(keyspace_name, std::move(ranges), std::move(range_sources), reason, {}).get();
+   sync_data_using_repair(keyspace_name, std::move(ranges), std::move(range_sources), reason, nullptr).get();
    rlogger.info("{}: finished with keyspace={}, source_dc={}, nr_ranges={}", op, keyspace_name, source_dc, nr_ranges);
}
rlogger.info("{}: finished with keyspaces={}, source_dc={}", op, keyspaces, source_dc);

@@ -67,11 +67,28 @@ struct repair_uniq_id {
};
std::ostream& operator<<(std::ostream& os, const repair_uniq_id& x);

-struct node_ops_info {
+class node_ops_info {
+public:
    utils::UUID ops_uuid;
-   bool abort = false;
+   shared_ptr<abort_source> as;
    std::list<gms::inet_address> ignore_nodes;

+private:
+   optimized_optional<abort_source::subscription> _abort_subscription;
+   sharded<abort_source> _sas;
+   future<> _abort_done = make_ready_future<>();
+
+public:
+   node_ops_info(utils::UUID ops_uuid_, shared_ptr<abort_source> as_, std::list<gms::inet_address>&& ignore_nodes_) noexcept;
+   node_ops_info(const node_ops_info&) = delete;
+   node_ops_info(node_ops_info&&) = delete;
+
+   future<> start();
+   future<> stop() noexcept;
+
+   void check_abort();
+
+   abort_source* local_abort_source();
};

// NOTE: repair_start() can be run on any node, but starts a node-global
@@ -167,7 +184,7 @@ public:
    int ranges_index = 0;
    repair_stats _stats;
    std::unordered_set<sstring> dropped_tables;
-   std::optional<utils::UUID> _ops_uuid;
+   optimized_optional<abort_source::subscription> _abort_subscription;
    bool _hints_batchlog_flushed = false;
public:
    repair_info(repair_service& repair,
@@ -179,10 +196,10 @@ public:
        const std::vector<sstring>& hosts_,
        const std::unordered_set<gms::inet_address>& ignore_nodes_,
        streaming::stream_reason reason_,
-       std::optional<utils::UUID> ops_uuid,
+       abort_source* as,
        bool hints_batchlog_flushed);
    void check_failed_ranges();
-   void abort();
+   void abort() noexcept;
    void check_in_abort();
    void check_in_shutdown();
    repair_neighbors get_repair_neighbors(const dht::token_range& range);
@@ -192,9 +209,6 @@ public:
    const std::vector<sstring>& table_names() {
        return cfs;
    }
-   const std::optional<utils::UUID>& ops_uuid() const {
-       return _ops_uuid;
-   };
-
    bool hints_batchlog_flushed() const {
        return _hints_batchlog_flushed;
@@ -252,7 +266,6 @@ public:
    future<> run(repair_uniq_id id, std::function<void ()> func);
    future<repair_status> repair_await_completion(int id, std::chrono::steady_clock::time_point timeout);
    float report_progress(streaming::stream_reason reason);
-   void abort_repair_node_ops(utils::UUID ops_uuid);
};

future<uint64_t> estimate_partitions(seastar::sharded<replica::database>& db, const sstring& keyspace,

@@ -347,9 +347,9 @@ private:
    // Only needed for local readers, the multishard reader takes care
    // of pinning tables on used shards.
    std::optional<utils::phased_barrier::operation> _local_read_op;
-   std::optional<evictable_reader_handle> _reader_handle;
    // Local reader or multishard reader to read the range
    flat_mutation_reader _reader;
+   std::optional<evictable_reader_handle> _reader_handle;
    // Current partition read from disk
    lw_shared_ptr<const decorated_key_with_hash> _current_dk;
    uint64_t _reads_issued = 0;

@@ -141,13 +141,13 @@ private:
        dht::token_range_vector ranges,
        std::unordered_map<dht::token_range, repair_neighbors> neighbors,
        streaming::stream_reason reason,
-       std::optional<utils::UUID> ops_uuid);
+       shared_ptr<node_ops_info> ops_info);

    future<> do_sync_data_using_repair(sstring keyspace,
        dht::token_range_vector ranges,
        std::unordered_map<dht::token_range, repair_neighbors> neighbors,
        streaming::stream_reason reason,
-       std::optional<utils::UUID> ops_uuid);
+       shared_ptr<node_ops_info> ops_info);

    future<repair_update_system_table_response> repair_update_system_table_handler(
        gms::inet_address from,
@@ -193,8 +193,6 @@ public:
    // Abort all the repairs
    future<> abort_all();

-   future<> abort_repair_node_ops(utils::UUID ops_uuid);
-
    std::unordered_map<node_repair_meta_id, repair_meta_ptr>& repair_meta_map() noexcept {
        return _repair_metas;
    }

@@ -939,7 +939,9 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
    remove(*cf);
    cf->clear_views();
    co_await cf->await_pending_ops();
-   co_await _querier_cache.evict_all_for_table(cf->schema()->id());
+   for (auto* sem : {&_read_concurrency_sem, &_streaming_concurrency_sem, &_compaction_concurrency_sem, &_system_read_concurrency_sem}) {
+       co_await sem->evict_inactive_reads_for_table(uuid);
+   }
    std::exception_ptr ex;
    try {
        co_await truncate(ks, *cf, std::move(tsf), snapshot);

@@ -279,6 +279,7 @@ using sstable_list = sstables::sstable_list;
namespace replica {

class distributed_loader;
+struct table_population_metadata;

// The CF has a "stats" structure. But we don't want all fields here,
// since some of them are fairly complex for exporting to collectd. Also,
@@ -900,6 +901,8 @@ public:
    // The future value is true iff offstrategy compaction was required.
    future<bool> perform_offstrategy_compaction();
+   future<> run_offstrategy_compaction(sstables::compaction_data& info);
+   future<> perform_cleanup_compaction(replica::database& db);

    void set_compaction_strategy(sstables::compaction_strategy_type strategy);
    const sstables::compaction_strategy& get_compaction_strategy() const {
        return _compaction_strategy;
@@ -925,7 +928,11 @@ public:
        return _config;
    }

-   compaction_manager& get_compaction_manager() const {
+   const compaction_manager& get_compaction_manager() const noexcept {
        return _compaction_manager;
    }

+   compaction_manager& get_compaction_manager() noexcept {
+       return _compaction_manager;
+   }

@@ -1080,6 +1087,7 @@ public:
    friend class ::column_family_test;

    friend class distributed_loader;
+   friend class table_population_metadata;

private:
    timer<> _off_strategy_trigger;

@@ -6,6 +6,7 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

+#include <seastar/core/coroutine.hh>
#include <seastar/util/closeable.hh>
#include "distributed_loader.hh"
#include "replica/database.hh"
@@ -361,7 +362,7 @@ distributed_loader::process_upload_dir(distributed<replica::database>& db, distr
    &error_handler_gen_for_upload_dir);
}, sstables::sstable_directory::default_sstable_filter()).get();

-const bool use_view_update_path = db::view::check_needs_view_update_path(sys_dist_ks.local(), *global_table, streaming::stream_reason::repair).get0();
+const bool use_view_update_path = db::view::check_needs_view_update_path(sys_dist_ks.local(), db.local().get_token_metadata(), *global_table, streaming::stream_reason::repair).get0();

auto datadir = upload.parent_path();
if (use_view_update_path) {
|
||||
});
|
||||
}
|
||||
|
||||
future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
|
||||
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
|
||||
return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), do_allow_offstrategy_compaction, dir_must_exist] {
|
||||
class table_population_metadata {
|
||||
distributed<replica::database>& _db;
|
||||
sstring _ks;
|
||||
sstring _cf;
|
||||
global_column_family_ptr _global_table;
|
||||
fs::path _base_path;
|
||||
std::unordered_map<sstring, lw_shared_ptr<sharded<sstables::sstable_directory>>> _sstable_directories;
|
||||
sstables::sstable_version_types _highest_version = sstables::oldest_writable_sstable_format;
|
||||
int64_t _highest_generation = 0;
|
||||
|
||||
public:
|
||||
table_population_metadata(distributed<replica::database>& db, sstring ks, sstring cf)
|
||||
: _db(db)
|
||||
, _ks(std::move(ks))
|
||||
, _cf(std::move(cf))
|
||||
, _global_table(_db, _ks, _cf)
|
||||
, _base_path(_global_table->dir())
|
||||
{}
|
||||
|
||||
~table_population_metadata() {
|
||||
// All directories must have been stopped
|
||||
// using table_population_metadata::stop()
|
||||
assert(_sstable_directories.empty());
|
||||
}
|
||||
|
||||
future<> start() {
|
||||
assert(this_shard_id() == 0);
|
||||
|
||||
if (!file_exists(sstdir).get0()) {
|
||||
if (dir_must_exist) {
|
||||
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", ks, cf, sstdir));
|
||||
}
|
||||
return;
|
||||
for (auto subdir : { "", sstables::staging_dir, sstables::quarantine_dir }) {
|
||||
co_await start_subdir(subdir);
|
||||
}
|
||||
|
||||
// First pass, cleanup temporary sstable directories and sstables pending delete.
|
||||
cleanup_column_family_temp_sst_dirs(sstdir).get();
|
||||
auto pending_delete_dir = sstdir + "/" + sstables::sstable::pending_delete_dir_basename();
|
||||
auto exists = file_exists(pending_delete_dir).get0();
|
||||
if (exists) {
|
||||
handle_sstables_pending_delete(pending_delete_dir).get();
|
||||
co_await smp::invoke_on_all([this] {
|
||||
_global_table->update_sstables_known_generation(_highest_generation);
|
||||
return _global_table->disable_auto_compaction();
|
||||
});
|
||||
}
|
||||
|
||||
future<> stop() {
|
||||
for (auto it = _sstable_directories.begin(); it != _sstable_directories.end(); it = _sstable_directories.erase(it)) {
|
||||
co_await it->second->stop();
|
||||
}
|
||||
}
|
||||
|
||||
global_column_family_ptr global_table(db, ks, cf);
|
||||
fs::path get_path(std::string_view subdir) {
|
||||
return subdir.empty() ? _base_path : _base_path / subdir;
|
||||
}
|
||||
|
||||
sharded<sstables::sstable_directory> directory;
|
||||
directory.start(fs::path(sstdir), db.local().get_config().initial_sstable_loading_concurrency(), std::ref(db.local().get_sharded_sst_dir_semaphore()),
|
||||
sstables::sstable_directory::need_mutate_level::no,
|
||||
sstables::sstable_directory::lack_of_toc_fatal::yes,
|
||||
sstables::sstable_directory::enable_dangerous_direct_import_of_cassandra_counters(db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters()),
|
||||
sstables::sstable_directory::allow_loading_materialized_view::yes,
|
||||
[&global_table] (fs::path dir, int64_t gen, sstables::sstable_version_types v, sstables::sstable_format_types f) {
|
||||
return global_table->make_sstable(dir.native(), gen, v, f);
|
||||
}).get();
|
||||
distributed<replica::database>& db() noexcept {
|
||||
return _db;
|
||||
}
|
||||
|
||||
auto stop = deferred_stop(directory);
|
||||
const sstring& ks() const noexcept {
|
||||
return _ks;
|
||||
}
|
||||
|
||||
lock_table(directory, db, ks, cf).get();
|
||||
process_sstable_dir(directory).get();
|
||||
const sstring& cf() const noexcept {
|
||||
return _cf;
|
||||
}
|
||||
|
||||
// If we are resharding system tables before we can read them, we will not
|
||||
// know which is the highest format we support: this information is itself stored
|
||||
// in the system tables. In that case we'll rely on what we find on disk: we'll
|
||||
// at least not downgrade any files. If we already know that we support a higher
|
||||
// format than the one we see then we use that.
|
||||
auto sys_format = global_table->get_sstables_manager().get_highest_supported_format();
|
||||
auto sst_version = highest_version_seen(directory, sys_format).get0();
|
||||
auto generation = highest_generation_seen(directory).get0();
|
||||
global_column_family_ptr& global_table() noexcept {
|
||||
return _global_table;
|
||||
};
|
||||
|
||||
db.invoke_on_all([&global_table, generation] (replica::database& db) {
|
||||
global_table->update_sstables_known_generation(generation);
|
||||
return global_table->disable_auto_compaction();
|
||||
}).get();
|
||||
const global_column_family_ptr& global_table() const noexcept {
|
||||
return _global_table;
|
||||
};
|
||||
|
||||
reshard(directory, db, ks, cf, [&global_table, sstdir, sst_version] (shard_id shard) mutable {
|
||||
auto gen = smp::submit_to(shard, [&global_table] () {
|
||||
return global_table->calculate_generation_for_new_table();
|
||||
}).get0();
|
||||
const std::unordered_map<sstring, lw_shared_ptr<sharded<sstables::sstable_directory>>>& sstable_directories() const noexcept {
|
||||
return _sstable_directories;
|
||||
}
|
||||
|
||||
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
|
||||
}).get();
|
||||
sstables::sstable::version_types highest_version() const noexcept {
|
||||
return _highest_version;
|
||||
}
|
||||
|
||||
// The node is offline at this point so we are very lenient with what we consider
|
||||
// offstrategy.
|
||||
// SSTables created by repair may not conform to compaction strategy layout goal
|
||||
// because data segregation is only performed by compaction
|
||||
// Instead of reshaping them on boot, let's add them to maintenance set and allow
|
||||
// off-strategy compaction to reshape them. This will allow node to become online
|
||||
// ASAP. Given that SSTables with repair origin are disjoint, they can be efficiently
|
||||
// read from.
|
||||
auto eligible_for_reshape_on_boot = [] (const sstables::shared_sstable& sst) {
|
||||
return sst->get_origin() != sstables::repair_origin;
|
||||
};
|
||||
int64_t highest_generation() const noexcept {
|
||||
return _highest_generation;
|
||||
}
|
||||
|
||||
reshape(directory, db, sstables::reshape_mode::relaxed, ks, cf, [global_table, sstdir, sst_version] (shard_id shard) {
|
||||
auto gen = global_table->calculate_generation_for_new_table();
|
||||
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
|
||||
}, eligible_for_reshape_on_boot).get();
|
||||
private:
|
||||
future<> start_subdir(sstring subdir);
|
||||
};
|
||||
|
||||
directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) {
|
||||
return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
|
||||
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
|
||||
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
|
||||
}).then([&global_table, do_allow_offstrategy_compaction] {
|
||||
if (do_allow_offstrategy_compaction) {
|
||||
global_table->trigger_offstrategy_compaction();
|
||||
}
|
||||
});
|
||||
}).get();
|
||||
future<> table_population_metadata::start_subdir(sstring subdir) {
|
||||
sstring sstdir = get_path(subdir).native();
|
||||
if (!co_await file_exists(sstdir)) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
// First pass, cleanup temporary sstable directories and sstables pending delete.
|
||||
co_await distributed_loader::cleanup_column_family_temp_sst_dirs(sstdir);
|
||||
auto pending_delete_dir = sstdir + "/" + sstables::sstable::pending_delete_dir_basename();
|
||||
auto exists = co_await file_exists(pending_delete_dir);
|
||||
if (exists) {
|
||||
co_await distributed_loader::handle_sstables_pending_delete(pending_delete_dir);
|
||||
}
|
||||
|
||||
auto dptr = make_lw_shared<sharded<sstables::sstable_directory>>();
|
||||
auto& directory = *dptr;
|
||||
auto& global_table = _global_table;
|
||||
auto& db = _db;
|
||||
co_await directory.start(fs::path(sstdir),
|
||||
db.local().get_config().initial_sstable_loading_concurrency(), std::ref(db.local().get_sharded_sst_dir_semaphore()),
|
||||
sstables::sstable_directory::need_mutate_level::no,
|
||||
sstables::sstable_directory::lack_of_toc_fatal::yes,
|
||||
sstables::sstable_directory::enable_dangerous_direct_import_of_cassandra_counters(db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters()),
|
||||
sstables::sstable_directory::allow_loading_materialized_view::yes,
|
||||
[&global_table] (fs::path dir, int64_t gen, sstables::sstable_version_types v, sstables::sstable_format_types f) {
|
||||
return global_table->make_sstable(dir.native(), gen, v, f);
|
||||
});
|
||||
|
||||
// directory must be stopped using table_population_metadata::stop below
|
||||
_sstable_directories[subdir] = dptr;
|
||||
|
||||
co_await distributed_loader::lock_table(directory, _db, _ks, _cf);
|
||||
co_await distributed_loader::process_sstable_dir(directory);
|
||||
|
||||
// If we are resharding system tables before we can read them, we will not
|
||||
// know which is the highest format we support: this information is itself stored
|
||||
// in the system tables. In that case we'll rely on what we find on disk: we'll
|
||||
// at least not downgrade any files. If we already know that we support a higher
|
||||
// format than the one we see then we use that.
|
||||
auto sys_format = global_table->get_sstables_manager().get_highest_supported_format();
|
||||
auto sst_version = co_await highest_version_seen(directory, sys_format);
|
||||
auto generation = co_await highest_generation_seen(directory);
|
||||
|
||||
_highest_version = std::max(sst_version, _highest_version);
|
||||
_highest_generation = std::max(generation, _highest_generation);
|
||||
}
|
||||
|
||||
future<> distributed_loader::populate_column_family(table_population_metadata& metadata, sstring subdir, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
|
||||
auto& db = metadata.db();
|
||||
const auto& ks = metadata.ks();
|
||||
const auto& cf = metadata.cf();
|
||||
auto sstdir = metadata.get_path(subdir).native();
|
||||
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
|
||||
|
||||
assert(this_shard_id() == 0);
|
||||
|
||||
if (!co_await file_exists(sstdir)) {
|
||||
if (dir_must_exist) {
|
||||
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", metadata.ks(), metadata.cf(), sstdir));
|
||||
}
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto& global_table = metadata.global_table();
|
||||
if (!metadata.sstable_directories().contains(subdir)) {
|
||||
dblog.error("Could not find sstables directory {}.{}/{}", ks, cf, subdir);
|
||||
}
|
||||
auto& directory = *metadata.sstable_directories().at(subdir);
|
||||
auto sst_version = metadata.highest_version();
|
||||
|
||||
co_await reshard(directory, db, ks, cf, [&global_table, sstdir, sst_version] (shard_id shard) mutable {
|
||||
auto gen = smp::submit_to(shard, [&global_table] () {
|
||||
return global_table->calculate_generation_for_new_table();
|
||||
}).get0();
|
||||
|
||||
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
|
||||
});
|
||||
|
||||
// The node is offline at this point so we are very lenient with what we consider
|
||||
// offstrategy.
|
||||
// SSTables created by repair may not conform to compaction strategy layout goal
|
||||
// because data segregation is only performed by compaction
|
||||
// Instead of reshaping them on boot, let's add them to maintenance set and allow
|
||||
// off-strategy compaction to reshape them. This will allow node to become online
|
||||
// ASAP. Given that SSTables with repair origin are disjoint, they can be efficiently
|
||||
// read from.
|
||||
auto eligible_for_reshape_on_boot = [] (const sstables::shared_sstable& sst) {
|
||||
return sst->get_origin() != sstables::repair_origin;
|
||||
};
|
||||
|
||||
co_await reshape(directory, db, sstables::reshape_mode::relaxed, ks, cf, [global_table, sstdir, sst_version] (shard_id shard) {
|
||||
auto gen = global_table->calculate_generation_for_new_table();
|
||||
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
|
||||
}, eligible_for_reshape_on_boot);
|
||||
|
||||
co_await directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) -> future<> {
|
||||
co_await dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
|
||||
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
|
||||
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
|
||||
});
|
||||
if (do_allow_offstrategy_compaction) {
|
||||
global_table->trigger_offstrategy_compaction();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
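Note: the class above keeps each per-subdirectory sharded<sstables::sstable_directory> alive in a map and asserts the map is empty on destruction. A minimal sketch of the seastar rule it enforces (illustrative names, not part of the patch): a sharded service must be stopped on every shard before its owner goes away.

    #include <seastar/core/future.hh>
    #include <seastar/core/sharded.hh>

    struct dir_service {
        seastar::future<> stop() { return seastar::make_ready_future<>(); } // required by sharded<>
    };

    seastar::future<> use_dirs() {
        seastar::sharded<dir_service> dirs;
        co_await dirs.start();   // construct an instance on every shard
        // ... dirs.invoke_on_all(...) to do per-shard work ...
        co_await dirs.stop();    // must complete before 'dirs' is destroyed
    }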
@@ -549,41 +650,51 @@ future<> distributed_loader::populate_keyspace(distributed<replica::database>& d
     auto i = keyspaces.find(ks_name);
     if (i == keyspaces.end()) {
         dblog.warn("Skipping undefined keyspace: {}", ks_name);
-        return make_ready_future<>();
-    } else {
-        dblog.info("Populating Keyspace {}", ks_name);
-        auto& ks = i->second;
-        auto& column_families = db.local().get_column_families();
-
-        return parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values,
-            [ks_name, ksdir, &ks, &column_families, &db] (schema_ptr s) {
-                utils::UUID uuid = s->id();
-                lw_shared_ptr<replica::column_family> cf = column_families[uuid];
-                sstring cfname = cf->schema()->cf_name();
-                auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
-                dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
-                return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
-                    return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname, allow_offstrategy_compaction::no);
-                }).then([&db, sstdir, ks_name, cfname] {
-                    return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, allow_offstrategy_compaction::no, must_exist::no);
-                }).then([&db, sstdir, uuid, ks_name, cfname] {
-                    return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname, allow_offstrategy_compaction::yes);
-                }).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
-                    std::string msg =
-                        format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
-                            ks_name, cfname, sstdir, eptr);
-                    dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
-                        ks_name, cfname, sstdir, eptr);
-                    try {
-                        std::rethrow_exception(eptr);
-                    } catch (sstables::compaction_stopped_exception& e) {
-                        // swallow compaction stopped exception, to allow clean shutdown.
-                    } catch (...) {
-                        throw std::runtime_error(msg.c_str());
-                    }
-                });
-        });
+        co_return;
     }
+
+    dblog.info("Populating Keyspace {}", ks_name);
+    auto& ks = i->second;
+    auto& column_families = db.local().get_column_families();
+
+    co_await parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values, [&] (schema_ptr s) -> future<> {
+        utils::UUID uuid = s->id();
+        lw_shared_ptr<replica::column_family> cf = column_families[uuid];
+        sstring cfname = cf->schema()->cf_name();
+        auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
+        dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
+
+        auto metadata = table_population_metadata(db, ks_name, cfname);
+        std::exception_ptr ex;
+
+        try {
+            co_await ks.make_directory_for_column_family(cfname, uuid);
+
+            co_await metadata.start();
+            co_await distributed_loader::populate_column_family(metadata, sstables::staging_dir, allow_offstrategy_compaction::no);
+            co_await distributed_loader::populate_column_family(metadata, sstables::quarantine_dir, allow_offstrategy_compaction::no, must_exist::no);
+            co_await distributed_loader::populate_column_family(metadata, "", allow_offstrategy_compaction::yes);
+        } catch (...) {
+            std::exception_ptr eptr = std::current_exception();
+            std::string msg =
+                format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
+                    ks_name, cfname, sstdir, eptr);
+            dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
+                ks_name, cfname, sstdir, eptr);
+            try {
+                std::rethrow_exception(eptr);
+            } catch (sstables::compaction_stopped_exception& e) {
+                // swallow compaction stopped exception, to allow clean shutdown.
+            } catch (...) {
+                ex = std::make_exception_ptr(std::runtime_error(msg.c_str()));
+            }
+        }
+
+        co_await metadata.stop();
+        if (ex) {
+            std::rethrow_exception(std::move(ex));
+        }
+    });
 }

 future<> distributed_loader::init_system_keyspace(distributed<replica::database>& db, distributed<service::storage_service>& ss, sharded<gms::gossiper>& g, db::config& cfg) {
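Note: the rewritten populate_keyspace relies on a pattern worth naming: a coroutine cannot run co_await from a destructor-based finally, so the exception is captured, cleanup is awaited unconditionally, and the failure is rethrown afterwards. A minimal sketch (illustrative, assuming seastar futures):

    #include <exception>
    #include <seastar/core/future.hh>

    seastar::future<> run_with_cleanup(auto work, auto cleanup) {
        std::exception_ptr ex;
        try {
            co_await work();
        } catch (...) {
            ex = std::current_exception();   // remember the failure
        }
        co_await cleanup();                  // always runs, like metadata.stop() above
        if (ex) {
            std::rethrow_exception(std::move(ex));
        }
    }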
@@ -57,8 +57,11 @@ class distributed_loader_for_tests;

 namespace replica {

+class table_population_metadata;
+
 class distributed_loader {
     friend class ::distributed_loader_for_tests;
+    friend class table_population_metadata;

     static future<> reshape(sharded<sstables::sstable_directory>& dir, sharded<replica::database>& db, sstables::reshape_mode mode,
             sstring ks_name, sstring table_name, sstables::compaction_sstable_creator_fn creator, std::function<bool (const sstables::shared_sstable&)> filter);
@@ -70,7 +73,7 @@ class distributed_loader {
             std::filesystem::path datadir, sstring ks, sstring cf);
     using allow_offstrategy_compaction = bool_class<struct allow_offstrategy_compaction_tag>;
     using must_exist = bool_class<struct must_exist_tag>;
-    static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction, must_exist = must_exist::yes);
+    static future<> populate_column_family(table_population_metadata& metadata, sstring subdir, allow_offstrategy_compaction, must_exist = must_exist::yes);
     static future<> populate_keyspace(distributed<replica::database>& db, sstring datadir, sstring ks_name);
     static future<> cleanup_column_family_temp_sst_dirs(sstring sstdir);
     static future<> handle_sstables_pending_delete(sstring pending_deletes_dir);
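Note: allow_offstrategy_compaction and must_exist are seastar bool_class strong typedefs; the point of the idiom (sketched below, illustrative) is that the two flags cannot be swapped at a call site the way two raw bools could:

    #include <seastar/util/bool_class.hh>

    using allow_offstrategy_compaction = seastar::bool_class<struct allow_offstrategy_compaction_tag>;
    using must_exist = seastar::bool_class<struct must_exist_tag>;

    void populate(allow_offstrategy_compaction a, must_exist m);

    // populate(must_exist::yes, allow_offstrategy_compaction::no); // would not compile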
@@ -803,16 +803,15 @@ void table::set_metrics() {
 }

 void table::rebuild_statistics() {
     // zeroing live_disk_space_used and live_sstable_count because the
     // sstable list was re-created
     _stats.live_disk_space_used = 0;
     _stats.live_sstable_count = 0;
-    _stats.total_disk_space_used = 0;

     _sstables->for_each_sstable([this] (const sstables::shared_sstable& tab) {
         update_stats_for_new_sstable(tab->bytes_on_disk());
     });
     for (auto& tab : _sstables_compacted_but_not_deleted) {
         update_stats_for_new_sstable(tab->bytes_on_disk());
         _stats.total_disk_space_used += tab->bytes_on_disk();
     }
 }
@@ -1137,6 +1136,11 @@ future<> table::run_offstrategy_compaction(sstables::compaction_data& info) {
     tlogger.info("Done with off-strategy compaction for {}.{}", _schema->ks_name(), _schema->cf_name());
 }

+future<> table::perform_cleanup_compaction(replica::database& db) {
+    co_await flush();
+    co_await get_compaction_manager().perform_cleanup(db, this);
+}
+
 void table::set_compaction_strategy(sstables::compaction_strategy_type strategy) {
     tlogger.debug("Setting compaction strategy of {}.{} to {}", _schema->ks_name(), _schema->cf_name(), sstables::compaction_strategy::name(strategy));
     auto new_cs = make_compaction_strategy(strategy, _schema->compaction_strategy_options());
@@ -1772,29 +1776,30 @@ future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
         tracing::trace_state_ptr tr_state,
         gc_clock::time_point now) const {
     auto base_token = m.token();
+    auto m_schema = m.schema();
     db::view::view_update_builder builder = co_await db::view::make_view_update_builder(
             base,
             std::move(views),
-            make_flat_mutation_reader_from_mutations(m.schema(), std::move(permit), {std::move(m)}),
+            make_flat_mutation_reader_from_mutations(std::move(m_schema), std::move(permit), {std::move(m)}),
             std::move(existings),
             now);

     std::exception_ptr err = nullptr;
     while (true) {
-        utils::chunked_vector<frozen_mutation_and_schema> updates;
+        std::optional<utils::chunked_vector<frozen_mutation_and_schema>> updates;
         try {
             updates = co_await builder.build_some();
         } catch (...) {
             err = std::current_exception();
             break;
         }
-        if (updates.empty()) {
+        if (!updates) {
             break;
         }
-        tracing::trace(tr_state, "Generated {} view update mutations", updates.size());
-        auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates));
+        tracing::trace(tr_state, "Generated {} view update mutations", updates->size());
+        auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(*updates));
         try {
-            co_await db::view::mutate_MV(base_token, std::move(updates), _view_stats, *_config.cf_stats, tr_state,
+            co_await db::view::mutate_MV(base_token, std::move(*updates), _view_stats, *_config.cf_stats, tr_state,
                     std::move(units), service::allow_hints::yes, db::view::wait_for_all_updates::no);
         } catch (...) {
             // Ignore exceptions: any individual failure to propagate a view update will be reported
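Note: the signature change above is the core of this hunk: build_some() now returns std::optional, so "stream exhausted" (a disengaged optional) is no longer conflated with "this batch happens to be empty". A generic sketch of the protocol (illustrative types):

    #include <optional>
    #include <vector>

    using batch = std::vector<int>;
    std::optional<batch> build_some();   // assumed producer
    void consume(const batch&);

    void drain() {
        while (auto updates = build_some()) {  // loop ends only on a disengaged optional
            consume(*updates);                 // *updates may legitimately be empty
        }
    }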
@@ -1918,14 +1923,14 @@ future<> table::populate_views(
     while (true) {
         try {
             auto updates = co_await builder.build_some();
-            if (updates.empty()) {
+            if (!updates) {
                 break;
             }
-            size_t update_size = memory_usage_of(updates);
+            size_t update_size = memory_usage_of(*updates);
             size_t units_to_wait_for = std::min(_config.view_update_concurrency_semaphore_limit, update_size);
             auto units = co_await seastar::get_units(*_config.view_update_concurrency_semaphore, units_to_wait_for);
             units.adopt(seastar::consume_units(*_config.view_update_concurrency_semaphore, update_size - units_to_wait_for));
-            co_await db::view::mutate_MV(base_token, std::move(updates), _view_stats, *_config.cf_stats,
+            co_await db::view::mutate_MV(base_token, std::move(*updates), _view_stats, *_config.cf_stats,
                     tracing::trace_state_ptr(), std::move(units), service::allow_hints::no, db::view::wait_for_all_updates::yes);
         } catch (...) {
             if (!err) {
@@ -950,6 +950,11 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
         _prev_snapshot = {};
     });
+    utils::coroutine update; // Destroy before cleanup to release snapshots before invalidating.
+    auto destroy_update = defer([&] {
+        with_allocator(_tracker.allocator(), [&] {
+            update = {};
+        });
+    });
     partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
     while (!m.partitions.empty()) {
         with_allocator(_tracker.allocator(), [&] () {
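Note: the added destroy_update relies on deferred actions running in reverse declaration order, so update is destroyed (releasing snapshots) before the earlier-declared cleanup invalidates them. A tiny illustration (not from the patch):

    #include <seastar/util/defer.hh>

    void ordering_demo() {
        auto cleanup = seastar::defer([] { /* runs second */ });
        auto destroy_update = seastar::defer([] { /* runs first: declared last */ });
    }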
@@ -1222,6 +1227,10 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
         // That dummy is linked in the LRU, because there may be partitions
         // with no regular rows, and we need to track them.
         unlink_from_lru();
+
+        // We still need to break continuity in order to preserve the "older versions are evicted first"
+        // invariant.
+        it->set_continuous(false);
     } else {
         // When evicting a dummy with both sides continuous we don't need to break continuity.
         //
@@ -9,6 +9,7 @@
 #pragma once

 #include "mutation_fragment.hh"
+#include "mutation_fragment_v2.hh"
 #include "converting_mutation_partition_applier.hh"

 // A StreamedMutationTransformer which transforms the stream to a different schema
@@ -63,4 +63,15 @@ MemoryLimit=$MEMORY_LIMIT
 EOS
 fi

+if [ -e /etc/systemd/system/systemd-coredump@.service.d/timeout.conf ]; then
+    COREDUMP_RUNTIME_MAX=$(grep RuntimeMaxSec /etc/systemd/system/systemd-coredump@.service.d/timeout.conf)
+    if [ -z "$COREDUMP_RUNTIME_MAX" ]; then
+        cat << EOS > /etc/systemd/system/systemd-coredump@.service.d/timeout.conf
+[Service]
+RuntimeMaxSec=infinity
+TimeoutSec=infinity
+EOS
+    fi
+fi
+
 systemctl --system daemon-reload >/dev/null || true
Submodule seastar updated: 9a7ba6d57e...62fd873d09
@@ -8,6 +8,7 @@
 #pragma once

+#include <optional>
 #include <seastar/core/sharded.hh>

 #include "timestamp.hh"
@@ -78,7 +78,7 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
                     prv, tr_state, timeout);
             });
         });
-        return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest] (auto t) {
+        return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest, schema] (auto t) mutable {
             if (utils::get_local_injector().enter("paxos_error_after_save_promise")) {
                 return make_exception_future<prepare_response>(utils::injected_error("injected_error_after_save_promise"));
             }
@@ -103,8 +103,25 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
                 auto ex = f2.get_exception();
                 logger.debug("Failed to get data or digest: {}. Ignored.", std::move(ex));
             }
-            return make_ready_future<prepare_response>(prepare_response(promise(std::move(state._accepted_proposal),
-                    std::move(state._most_recent_commit), std::move(data_or_digest))));
+            auto upgrade_if_needed = [schema = std::move(schema)] (std::optional<proposal> p) {
+                if (!p || p->update.schema_version() == schema->version()) {
+                    return make_ready_future<std::optional<proposal>>(std::move(p));
+                }
+                // In case current schema is not the same as the schema in the proposal
+                // try to look it up first in the local schema_registry cache and upgrade
+                // the mutation using schema from the cache.
+                //
+                // If there's no schema in the cache, then retrieve persisted column mapping
+                // for that version and upgrade the mutation with it.
+                logger.debug("Stored mutation references outdated schema version. "
+                             "Trying to upgrade the accepted proposal mutation to the most recent schema version.");
+                return service::get_column_mapping(p->update.column_family_id(), p->update.schema_version()).then([schema, p = std::move(p)] (const column_mapping& cm) {
+                    return make_ready_future<std::optional<proposal>>(proposal(p->ballot, freeze(p->update.unfreeze_upgrading(schema, cm))));
+                });
+            };
+            return when_all_succeed(upgrade_if_needed(std::move(state._accepted_proposal)), upgrade_if_needed(std::move(state._most_recent_commit))).then([data_or_digest = std::move(data_or_digest)] (auto&& u) mutable {
+                return prepare_response(promise(std::move(std::get<0>(u)), std::move(std::get<1>(u)), std::move(data_or_digest)));
+            });
         });
     } else {
         logger.debug("Promise rejected; {} is not sufficiently newer than {}", ballot, state._promised_ballot);
@@ -200,15 +217,9 @@ future<> paxos_state::learn(storage_proxy& sp, schema_ptr schema, proposal decis
         // If there's no schema in the cache, then retrieve persisted column mapping
         // for that version and upgrade the mutation with it.
         if (decision.update.schema_version() != schema->version()) {
-            logger.debug("Stored mutation references outdated schema version. "
-                         "Trying to upgrade the accepted proposal mutation to the most recent schema version.");
-            return service::get_column_mapping(decision.update.column_family_id(), decision.update.schema_version())
-                    .then([&sp, schema, tr_state, timeout, &decision] (const column_mapping& cm) {
-                return do_with(decision.update.unfreeze_upgrading(schema, cm), [&sp, tr_state, timeout] (const mutation& upgraded) {
-                    return sp.mutate_locally(upgraded, tr_state, db::commitlog::force_sync::yes, timeout);
-                });
-            });
+            on_internal_error(logger, format("schema version in learn does not match current schema"));
         }

         return sp.mutate_locally(schema, decision.update, tr_state, db::commitlog::force_sync::yes, timeout);
     });
 } else {
@@ -161,6 +161,11 @@ raft_group0::discover_group0(raft::server_address my_addr) {
     }
 }

+future<> raft_group0::abort() {
+    return _shutdown_gate.close();
+}
+
 future<> raft_group0::join_group0() {
     assert(this_shard_id() == 0);
     if (!_raft_gr.is_enabled()) {
@@ -47,13 +47,7 @@ public:
             cql3::query_processor& qp,
             migration_manager& mm);

-    future<> abort() {
-        if (!_abort_source.abort_requested()) {
-            _abort_source.request_abort();
-        }
-        return _shutdown_gate.close();
-    }
+    future<> abort();

     // Join this node to the cluster-wide Raft group
     // Called during bootstrap. Is idempotent - it
@@ -1227,19 +1227,15 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d

     auto cdc = _proxy->get_cdc_service();
     if (cdc && cdc->needs_cdc_augmentation(update_mut_vec)) {
-        f_cdc = cdc->augment_mutation_call(_timeout, std::move(update_mut_vec), tr_state, _cl_for_learn)
-                .then([this, base_tbl_id, cdc = cdc->shared_from_this()] (std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>&& t) {
-            auto mutations = std::move(std::get<0>(t));
-            auto tracker = std::move(std::get<1>(t));
-            // Pick only the CDC ("augmenting") mutations
-            std::erase_if(mutations, [base_tbl_id = std::move(base_tbl_id)] (const mutation& v) {
-                return v.schema()->id() == base_tbl_id;
-            });
-            if (mutations.empty()) {
-                return make_ready_future<>();
-            }
-            return _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker));
+        auto cdc_shared = cdc->shared_from_this(); // keep CDC service alive
+        auto [mutations, tracker] = co_await cdc->augment_mutation_call(_timeout, std::move(update_mut_vec), tr_state, _cl_for_learn);
+        // Pick only the CDC ("augmenting") mutations
+        std::erase_if(mutations, [base_tbl_id = std::move(base_tbl_id)] (const mutation& v) {
+            return v.schema()->id() == base_tbl_id;
+        });
+        if (!mutations.empty()) {
+            f_cdc = _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker));
+        }
     }
 }

@@ -1247,7 +1243,7 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
     std::array<std::tuple<lw_shared_ptr<paxos::proposal>, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, shared_from_this(), _key.token())};
     future<> f_lwt = _proxy->mutate_internal(std::move(m), _cl_for_learn, false, tr_state, _permit, _timeout);

-    return when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
+    co_await when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
 }

 void paxos_response_handler::prune(utils::UUID ballot) {
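Note: the learn_decision change is a mechanical continuation-to-coroutine conversion; its shape, reduced to a toy (hypothetical names, not from the patch):

    #include <seastar/core/future.hh>
    #include <tuple>

    seastar::future<std::tuple<int, int>> produce();  // assumed

    seastar::future<int> old_style() {
        return produce().then([] (std::tuple<int, int>&& t) {
            return std::get<0>(t) + std::get<1>(t);
        });
    }

    seastar::future<int> new_style() {
        auto [a, b] = co_await produce();  // structured binding replaces std::get<>
        co_return a + b;
    }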
@@ -2282,6 +2282,8 @@ future<> storage_service::removenode(sstring host_id_string, std::list<gms::inet
             ss._group0->leave_group0(endpoint).get();
             slogger.info("removenode[{}]: Finished removenode operation, removing node={}, sync_nodes={}, ignore_nodes={}", uuid, endpoint, nodes, ignore_nodes);
         } catch (...) {
+            slogger.warn("removenode[{}]: removing node={}, sync_nodes={}, ignore_nodes={} failed, error {}",
+                    uuid, endpoint, nodes, ignore_nodes, std::current_exception());
             // we need to revert the effect of prepare verb the removenode ops is failed
             req.cmd = node_ops_cmd::removenode_abort;
             parallel_for_each(nodes, [&ss, &req, &nodes_unknown_verb, &nodes_down, uuid] (const gms::inet_address& node) {
@@ -2369,8 +2371,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
             }
             return update_pending_ranges(tmptr, format("removenode {}", req.leaving_nodes));
         }).get();
-        auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
-        auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [this, coordinator, req = std::move(req)] () mutable {
+        auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable {
             return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
                 for (auto& node : req.leaving_nodes) {
                     slogger.info("removenode[{}]: Removed node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator);
@@ -2380,6 +2381,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
             });
         },
         [this, ops_uuid] () mutable { node_ops_singal_abort(ops_uuid); });
+        meta.start().get();
         _node_ops.emplace(ops_uuid, std::move(meta));
     } else if (req.cmd == node_ops_cmd::removenode_heartbeat) {
         slogger.debug("removenode[{}]: Updated heartbeat from coordinator={}", req.ops_uuid, coordinator);
@@ -2418,8 +2420,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
             }
             return update_pending_ranges(tmptr, format("decommission {}", req.leaving_nodes));
         }).get();
-        auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
-        auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [this, coordinator, req = std::move(req)] () mutable {
+        auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable {
             return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
                 for (auto& node : req.leaving_nodes) {
                     slogger.info("decommission[{}]: Removed node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator);
@@ -2429,6 +2430,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
             });
         },
         [this, ops_uuid] () mutable { node_ops_singal_abort(ops_uuid); });
+        meta.start().get();
         _node_ops.emplace(ops_uuid, std::move(meta));
     } else if (req.cmd == node_ops_cmd::decommission_heartbeat) {
         slogger.debug("decommission[{}]: Updated heartbeat from coordinator={}", req.ops_uuid, coordinator);
@@ -2460,8 +2462,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
             }
             return make_ready_future<>();
         }).get();
-        auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
-        auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [this, coordinator, req = std::move(req)] () mutable {
+        auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable {
             return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
                 for (auto& x: req.replace_nodes) {
                     auto existing_node = x.first;
@@ -2473,6 +2474,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
             });
         },
         [this, ops_uuid ] { node_ops_singal_abort(ops_uuid); });
+        meta.start().get();
         _node_ops.emplace(ops_uuid, std::move(meta));
     } else if (req.cmd == node_ops_cmd::replace_prepare_mark_alive) {
         // Wait for local node has marked replacing node as alive
@@ -2514,8 +2516,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
             }
             return update_pending_ranges(tmptr, format("bootstrap {}", req.bootstrap_nodes));
         }).get();
-        auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
-        auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [this, coordinator, req = std::move(req)] () mutable {
+        auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable {
             return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
                 for (auto& x: req.bootstrap_nodes) {
                     auto& endpoint = x.first;
@@ -2527,6 +2528,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
             });
         },
         [this, ops_uuid ] { node_ops_singal_abort(ops_uuid); });
+        meta.start().get();
         _node_ops.emplace(ops_uuid, std::move(meta));
     } else if (req.cmd == node_ops_cmd::bootstrap_heartbeat) {
         slogger.debug("bootstrap[{}]: Updated heartbeat from coordinator={}", req.ops_uuid, coordinator);
@@ -2789,7 +2791,7 @@ future<> storage_service::removenode_with_stream(gms::inet_address leaving_node,
 future<> storage_service::restore_replica_count(inet_address endpoint, inet_address notify_endpoint) {
     if (is_repair_based_node_ops_enabled(streaming::stream_reason::removenode)) {
         auto ops_uuid = utils::make_random_uuid();
-        auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::list<gms::inet_address>()});
+        auto ops = seastar::make_shared<node_ops_info>(ops_uuid, nullptr, std::list<gms::inet_address>());
         return _repair.local().removenode_with_repair(get_token_metadata_ptr(), endpoint, ops).finally([this, notify_endpoint] () {
             return send_replication_notification(notify_endpoint);
         });
@@ -3550,7 +3552,7 @@ bool storage_service::is_repair_based_node_ops_enabled(streaming::stream_reason
 node_ops_meta_data::node_ops_meta_data(
     utils::UUID ops_uuid,
     gms::inet_address coordinator,
-    shared_ptr<node_ops_info> ops,
+    std::list<gms::inet_address> ignore_nodes,
    std::function<future<> ()> abort_func,
    std::function<void ()> signal_func)
    : _ops_uuid(std::move(ops_uuid))
@@ -3558,24 +3560,28 @@ node_ops_meta_data::node_ops_meta_data(
     , _abort(std::move(abort_func))
     , _abort_source(seastar::make_shared<abort_source>())
     , _signal(std::move(signal_func))
-    , _ops(std::move(ops))
+    , _ops(seastar::make_shared<node_ops_info>(_ops_uuid, _abort_source, std::move(ignore_nodes)))
     , _watchdog([sig = _signal] { sig(); }) {
     _watchdog.arm(_watchdog_interval);
 }

+future<> node_ops_meta_data::start() {
+    return _ops ? _ops->start() : make_ready_future<>();
+}
+
+future<> node_ops_meta_data::stop() noexcept {
+    return _ops ? _ops->stop() : make_ready_future<>();
+}
+
 future<> node_ops_meta_data::abort() {
     slogger.debug("node_ops_meta_data: ops_uuid={} abort", _ops_uuid);
-    _aborted = true;
-    if (_ops) {
-        _ops->abort = true;
-    }
     _watchdog.cancel();
     return _abort();
 }

 void node_ops_meta_data::update_watchdog() {
     slogger.debug("node_ops_meta_data: ops_uuid={} update_watchdog", _ops_uuid);
-    if (_aborted) {
+    if (_abort_source->abort_requested()) {
         return;
     }
     _watchdog.cancel();
@@ -3612,6 +3618,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {
     if (it != _node_ops.end()) {
         node_ops_meta_data& meta = it->second;
         meta.cancel_watchdog();
+        meta.stop().get();
         _node_ops.erase(it);
     }
 }
@@ -3619,6 +3626,24 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {
 void storage_service::node_ops_abort(utils::UUID ops_uuid) {
     slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
     auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();

+    if (!ops_uuid) {
+        for (auto& [uuid, meta] : _node_ops) {
+            meta.abort().get();
+            auto as = meta.get_abort_source();
+            if (as && !as->abort_requested()) {
+                as->request_abort();
+            }
+        }
+
+        for (auto it = _node_ops.begin(); it != _node_ops.end(); it = _node_ops.erase(it)) {
+            node_ops_meta_data& meta = it->second;
+            meta.stop().get();
+        }
+
+        return;
+    }
+
     auto it = _node_ops.find(ops_uuid);
     if (it != _node_ops.end()) {
         node_ops_meta_data& meta = it->second;
@@ -3627,7 +3652,7 @@ void storage_service::node_ops_abort(utils::UUID ops_uuid) {
         auto as = meta.get_abort_source();
         if (as && !as->abort_requested()) {
             as->request_abort();
         }
         _repair.local().abort_repair_node_ops(ops_uuid).get();
+        meta.stop().get();
         _node_ops.erase(it);
     }
 }
@@ -3647,17 +3672,18 @@ future<> storage_service::node_ops_abort_thread() {
             while (!_node_ops_abort_queue.empty()) {
                 auto uuid_opt = _node_ops_abort_queue.front();
                 _node_ops_abort_queue.pop_front();
-                if (!uuid_opt) {
-                    return;
-                }
                 try {
-                    storage_service::node_ops_abort(*uuid_opt);
+                    storage_service::node_ops_abort(uuid_opt.value_or(utils::null_uuid()));
                 } catch (...) {
                     slogger.warn("Failed to abort node operation ops_uuid={}: {}", *uuid_opt, std::current_exception());
                 }
+                if (!uuid_opt) {
+                    slogger.info("Stopped node_ops_abort_thread");
+                    return;
+                }
             }
-        slogger.info("Stopped node_ops_abort_thread");
+        __builtin_unreachable();
     });
 }

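Note: in node_ops_abort_thread above, a disengaged optional on the queue now doubles as the shutdown sentinel: it aborts everything (null UUID) and then terminates the loop after being processed rather than before. The shape of the protocol, reduced (illustrative):

    #include <deque>
    #include <optional>

    std::deque<std::optional<int>> q;  // std::nullopt = "abort all, then stop"
    void handle(int id);               // assumed; -1 plays the role of null_uuid()

    void drain() {
        while (!q.empty()) {
            auto item = q.front();
            q.pop_front();
            handle(item.value_or(-1)); // sentinel is processed, not skipped
            if (!item) {
                return;                // then the thread exits
            }
        }
    }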
@@ -104,14 +104,15 @@ class node_ops_meta_data {
     shared_ptr<node_ops_info> _ops;
     seastar::timer<lowres_clock> _watchdog;
     std::chrono::seconds _watchdog_interval{30};
-    bool _aborted = false;
 public:
     explicit node_ops_meta_data(
         utils::UUID ops_uuid,
         gms::inet_address coordinator,
-        shared_ptr<node_ops_info> ops,
+        std::list<gms::inet_address> ignore_nodes,
         std::function<future<> ()> abort_func,
         std::function<void ()> signal_func);
+    future<> start();
+    future<> stop() noexcept;
     shared_ptr<node_ops_info> get_ops_info();
     shared_ptr<abort_source> get_abort_source();
     future<> abort();
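Note: replacing 'bool _aborted' with a shared seastar::abort_source means cancellation is both recorded and observable by the operation it guards. A minimal sketch of the idiom (illustrative wrapper, not the scylla class):

    #include <seastar/core/abort_source.hh>
    #include <seastar/core/shared_ptr.hh>

    struct op_state {
        seastar::shared_ptr<seastar::abort_source> as =
                seastar::make_shared<seastar::abort_source>();

        void abort() {
            if (!as->abort_requested()) {  // idempotent, as in node_ops_abort above
                as->request_abort();
            }
        }
        bool aborted() const { return as->abort_requested(); }
    };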
@@ -21,6 +21,7 @@
 #include "unimplemented.hh"
 #include "segmented_compress_params.hh"
 #include "utils/class_registrator.hh"
+#include "reader_permit.hh"

 namespace sstables {
@@ -338,16 +339,18 @@ class compressed_file_data_source_impl : public data_source_impl {
     sstables::compression* _compression_metadata;
     sstables::compression::segmented_offsets::accessor _offsets;
     sstables::local_compression _compression;
+    reader_permit _permit;
     uint64_t _underlying_pos;
     uint64_t _pos;
     uint64_t _beg_pos;
     uint64_t _end_pos;
 public:
     compressed_file_data_source_impl(file f, sstables::compression* cm,
-                uint64_t pos, size_t len, file_input_stream_options options)
+                uint64_t pos, size_t len, file_input_stream_options options, reader_permit permit)
             : _compression_metadata(cm)
             , _offsets(_compression_metadata->offsets.get_accessor())
             , _compression(*cm)
+            , _permit(std::move(permit))
     {
         _beg_pos = pos;
         if (pos > _compression_metadata->uncompressed_file_length()) {
@@ -412,7 +415,7 @@ public:
             _pos += out.size();
             _underlying_pos += addr.chunk_len;

-            return out;
+            return make_tracked_temporary_buffer(std::move(out), _permit);
         });
     }
@@ -444,9 +447,9 @@ requires ChecksumUtils<ChecksumType>
 class compressed_file_data_source : public data_source {
 public:
     compressed_file_data_source(file f, sstables::compression* cm,
-            uint64_t offset, size_t len, file_input_stream_options options)
+            uint64_t offset, size_t len, file_input_stream_options options, reader_permit permit)
         : data_source(std::make_unique<compressed_file_data_source_impl<ChecksumType>>(
-            std::move(f), cm, offset, len, std::move(options)))
+            std::move(f), cm, offset, len, std::move(options), std::move(permit)))
     {}
 };
@@ -454,10 +457,10 @@ template <typename ChecksumType>
 requires ChecksumUtils<ChecksumType>
 inline input_stream<char> make_compressed_file_input_stream(
         file f, sstables::compression *cm, uint64_t offset, size_t len,
-        file_input_stream_options options)
+        file_input_stream_options options, reader_permit permit)
 {
     return input_stream<char>(compressed_file_data_source<ChecksumType>(
-        std::move(f), cm, offset, len, std::move(options)));
+        std::move(f), cm, offset, len, std::move(options), std::move(permit)));
 }

 // For SSTables 2.x (formats 'ka' and 'la'), the full checksum is a combination of checksums of compressed chunks.
@@ -569,15 +572,15 @@ inline output_stream<char> make_compressed_file_output_stream(output_stream<char

 input_stream<char> sstables::make_compressed_file_k_l_format_input_stream(file f,
         sstables::compression* cm, uint64_t offset, size_t len,
-        class file_input_stream_options options)
+        class file_input_stream_options options, reader_permit permit)
 {
-    return make_compressed_file_input_stream<adler32_utils>(std::move(f), cm, offset, len, std::move(options));
+    return make_compressed_file_input_stream<adler32_utils>(std::move(f), cm, offset, len, std::move(options), std::move(permit));
 }

 input_stream<char> sstables::make_compressed_file_m_format_input_stream(file f,
         sstables::compression *cm, uint64_t offset, size_t len,
-        class file_input_stream_options options) {
-    return make_compressed_file_input_stream<crc32_utils>(std::move(f), cm, offset, len, std::move(options));
+        class file_input_stream_options options, reader_permit permit) {
+    return make_compressed_file_input_stream<crc32_utils>(std::move(f), cm, offset, len, std::move(options), std::move(permit));
 }

 output_stream<char> sstables::make_compressed_file_m_format_output_stream(output_stream<char> out,
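Note: the permit threaded through these constructors exists so that buffers produced by the compressed data source stay charged against a memory-accounting unit for as long as they live. A simplified sketch of the wrapping trick (hypothetical 'tracker' standing in for reader_permit):

    #include <seastar/core/deleter.hh>
    #include <seastar/core/temporary_buffer.hh>

    struct tracker { size_t outstanding = 0; };

    seastar::temporary_buffer<char>
    make_tracked(seastar::temporary_buffer<char> buf, tracker& t) {
        auto n = buf.size();
        auto* p = buf.get_write();
        t.outstanding += n;                       // charge now
        auto d = seastar::make_deleter(buf.release(),
                [&t, n] { t.outstanding -= n; }); // uncharge when the storage dies
        return seastar::temporary_buffer<char>(p, n, std::move(d));
    }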
@@ -47,6 +47,8 @@
 #include "checksum_utils.hh"
 #include "../compress.hh"

+class reader_permit;
+
 class compression_parameters;
 class compressor;
 using compressor_ptr = shared_ptr<compressor>;
@@ -371,11 +373,11 @@ compressor_ptr get_sstable_compressor(const compression&);
 // sstable alive, and the compression metadata is only a part of it.
 input_stream<char> make_compressed_file_k_l_format_input_stream(file f,
         sstables::compression* cm, uint64_t offset, size_t len,
-        class file_input_stream_options options);
+        class file_input_stream_options options, reader_permit permit);

 input_stream<char> make_compressed_file_m_format_input_stream(file f,
         sstables::compression* cm, uint64_t offset, size_t len,
-        class file_input_stream_options options);
+        class file_input_stream_options options, reader_permit permit);

 output_stream<char> make_compressed_file_m_format_output_stream(output_stream<char> out,
         sstables::compression* cm,
@@ -16,6 +16,7 @@
 #include <list>
 #include <map>
 #include <vector>
+#include <array>
 #include <algorithm>
 #include <iterator>
 #include <cassert>
@@ -1070,10 +1070,9 @@ public:

     future<> close() noexcept {
         // index_bound::close must not fail
-        return close(_lower_bound).then([this] {
-            if (_upper_bound) {
-                return close(*_upper_bound);
-            }
+        auto close_lb = close(_lower_bound);
+        auto close_ub = _upper_bound ? close(*_upper_bound) : make_ready_future<>();
+        return when_all(std::move(close_lb), std::move(close_ub)).discard_result().finally([this] {
             if (_local_index_cache) {
                 return _local_index_cache->evict_gently();
             }
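Note: the close() rewrite above starts both closes eagerly and joins them, instead of chaining, so the upper bound is closed even if closing the lower bound fails. Generic shape (illustrative):

    #include <seastar/core/future.hh>
    #include <seastar/core/when_all.hh>

    seastar::future<> close_both(auto& a, auto& b) {
        auto fa = a.close();
        auto fb = b.close();  // started even if fa is going to fail
        co_await seastar::when_all(std::move(fa), std::move(fb)).discard_result();
    }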
@@ -1142,7 +1142,7 @@ private:
     }
     index_reader& get_index_reader() {
         if (!_index_reader) {
-            auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
+            auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
             _index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
                                                            _consumer.trace_state(), caching);
         }
@@ -1319,7 +1319,7 @@ private:
     }
     index_reader& get_index_reader() {
         if (!_index_reader) {
-            auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
+            auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
             _index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
                                                            _consumer.trace_state(), caching);
         }
@@ -1754,9 +1754,7 @@ public:
         _monitor.on_read_started(_context->reader_position());
     }
 public:
-    void on_out_of_clustering_range() override {
-        push_mutation_fragment(mutation_fragment_v2(*_schema, _permit, partition_end()));
-    }
+    void on_out_of_clustering_range() override { }
     virtual future<> fast_forward_to(const dht::partition_range& pr) override {
         on_internal_error(sstlog, "mx_crawling_sstable_mutation_reader: doesn't support fast_forward_to(const dht::partition_range&)");
     }
@@ -14,6 +14,7 @@
 #include "m_format_read_helpers.hh"
 #include "sstables/mx/parsers.hh"
+#include "sstables/index_entry.hh"
 #include <seastar/core/circular_buffer.hh>

 namespace sstables {

@@ -77,6 +77,18 @@ thread_local disk_error_signal_type sstable_write_error;

 namespace sstables {

+// The below flag governs the mode of index file page caching used by the index
+// reader.
+//
+// If set to true, the reader will read and/or populate a common global cache,
+// which shares its capacity with the row cache. If false, the reader will use
+// BYPASS CACHE semantics for index caching.
+//
+// This flag is intended to be a temporary hack. The goal is to eventually
+// solve index caching problems via a smart cache replacement policy.
+//
+thread_local utils::updateable_value<bool> global_cache_index_pages(false);
+
 logging::logger sstlog("sstable");

 // Because this is a noop and won't hold any state, it is better to use a global than a
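Note: utils::updateable_value<bool> gives a cheap thread-local read of a config knob that can be flipped at runtime. A minimal stand-in showing the idea (hypothetical class, not the scylla utility):

    #include <atomic>

    template <typename T>
    class live_value {
        std::atomic<T> _v;
    public:
        explicit live_value(T v) : _v(v) {}
        operator T() const { return _v.load(std::memory_order_relaxed); }  // usable in 'if (flag && ...)'
        void set(T v) { _v.store(v, std::memory_order_relaxed); }
    };

    inline live_value<bool> cache_index_pages(false);  // flipped by a config observer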
@@ -2275,7 +2287,7 @@ input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_prior
     options.read_ahead = 4;
     options.dynamic_adjustments = std::move(history);

-    file f = make_tracked_file(_data_file, std::move(permit));
+    file f = make_tracked_file(_data_file, permit);
     if (trace_state) {
         f = tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", get_filename()));
     }
@@ -2284,10 +2296,10 @@ input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_prior
     if (_components->compression) {
         if (_version >= sstable_version_types::mc) {
             return make_compressed_file_m_format_input_stream(f, &_components->compression,
-                pos, len, std::move(options));
+                pos, len, std::move(options), permit);
         } else {
             return make_compressed_file_k_l_format_input_stream(f, &_components->compression,
-                pos, len, std::move(options));
+                pos, len, std::move(options), permit);
         }
     }
Some files were not shown because too many files have changed in this diff.