Mirror of https://github.com/scylladb/scylladb.git, synced 2026-05-13 03:12:13 +00:00.

Compare commits: scylla-5.0 ... next-5.0 (91 commits)
Commits (SHA1):
f6c2624c86, f7d9afd209, b011cc2e78, fb466dd7b7, 697e090659, 2c518f3131, e941a5ac34, 3a7ce5e8aa,
efa4f312f5, fb4b71ea02, 7387922a29, cb78c3bf2c, aeac63a3ee, e7b50fb8d3, 6b21f2a351, 0db8e627a5,
f1121d2149, a0ca8abe42, 8bceac1713, 6bcc7c6ed5, 67f85875cc, 8b874cd4e4, b08c582134, 41556b5f63,
23e7e594c0, e6ac13314d, 382d815459, a867b2c0e5, 846edf78c6, 0ccc07322b, 0b170192a1, fd4b2a3319,
416929fb2a, 9d8d7048eb, bae4155ab2, d6e2a326cf, 15645ff40b, a808fc7172, dd260bfa82, c46935ed5c,
985d6bc4c2, 7673ff4ae3, c441eebf46, bf4fa80dd7, 2010231fe9, 0a51eb55e3, d9c6c6283b, 90a5344261,
68da667288, 9adb1a8fdd, 7623fe01b7, 3b0a0c4876, 019d5cde1b, a2e255833a, f4aa5cacb1, 8ea9a16f9e,
1aa5283a38, 2e7b1858ad, 2542b57ddc, 01a9871fc3, 6bb7fac8d8, 5dff7489b1, 2775b1d136, 2ae5675c0f,
d507ad9424, 413af945c0, 9a71680dc7, 94b8baa797, e372a5fe0a, 692e5ed175, 5a299f65ff, f4ae2fa5f9,
07c20bdfea, 8a36c4be54, bf92c2b44c, 0e388d2140, 288eb9d231, 9219a59802, f9cea4dc51, 081b2b76cc,
dfb229a18a, 60da855c2d, 1718861e94, e03e9b1abe, 26c51025c1, 5c39a4524a, 9823e8d9c5, b48c9cae95,
14077d2def, 25508705a8, 347da028e9
@@ -60,7 +60,7 @@ fi
 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.0.8
+VERSION=5.0.13

 if test -f version
 then
@@ -143,19 +143,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
     auto table = find_table(_proxy, request);
     auto db = _proxy.data_dictionary();
     auto cfs = db.get_tables();
-    auto i = cfs.begin();
-    auto e = cfs.end();

     if (limit < 1) {
         throw api_error::validation("Limit must be 1 or more");
     }

-    // TODO: the unordered_map here is not really well suited for partial
-    // querying - we're sorting on local hash order, and creating a table
-    // between queries may or may not miss info. But that should be rare,
-    // and we can probably expect this to be a single call.
+    // # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
+    // generate duplicates in a paged listing here. Can obviously miss things if they
+    // are added between paged calls and end up with a "smaller" UUID/ARN, but that
+    // is to be expected.
+    std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
+        return t1.schema()->id() < t2.schema()->id();
+    });
+
+    auto i = cfs.begin();
+    auto e = cfs.end();

     if (streams_start) {
-        i = std::find_if(i, e, [&](data_dictionary::table t) {
+        i = std::find_if(i, e, [&](const data_dictionary::table& t) {
             return t.schema()->id() == streams_start
                 && cdc::get_base_table(db.real_database(), *t.schema())
                 && is_alternator_keyspace(t.schema()->ks_name())
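Stepping back from the diff: the fix makes paging stable by sorting the listing on a key that does not change between calls (here the table's schema ID). A standalone sketch of the same pattern, using a hypothetical `table_info` record in place of Scylla's `data_dictionary::table`:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Hypothetical stand-in for a table with a stable, unique ID (like a schema UUID).
struct table_info {
    uint64_t id;
    std::string name;
};

// Return one page of at most `limit` tables, resuming after `start_id` if given.
// Sorting on the stable ID guarantees a paged listing never yields duplicates,
// though entries created between calls with a smaller ID can still be missed.
std::vector<table_info> list_page(std::vector<table_info> tables,
                                  std::optional<uint64_t> start_id,
                                  size_t limit) {
    std::sort(tables.begin(), tables.end(),
              [](const table_info& a, const table_info& b) { return a.id < b.id; });
    auto i = tables.begin();
    if (start_id) {
        i = std::find_if(i, tables.end(),
                         [&](const table_info& t) { return t.id == *start_id; });
        if (i != tables.end()) {
            ++i; // resume after the last entry of the previous page
        }
    }
    std::vector<table_info> page;
    for (; i != tables.end() && page.size() < limit; ++i) {
        page.push_back(*i);
    }
    return page;
}

int main() {
    std::vector<table_info> tables{{42, "b"}, {7, "a"}, {99, "c"}};
    for (auto& t : list_page(tables, std::nullopt, 2)) {
        std::cout << t.id << " " << t.name << "\n"; // 7 a, then 42 b
    }
}
```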
@@ -593,6 +593,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         if (column_families.empty()) {
             column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
         }
+        apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, column_families);
         return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
             auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
                 return db.find_uuid(keyspace, cf_name);

@@ -617,6 +618,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         if (column_families.empty()) {
             column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
         }
+        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, column_families);
         return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
                 column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
             if (!is_cleanup_allowed) {

@@ -635,7 +637,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
             // as a table can be dropped during loop below, let's find it before issuing the cleanup request.
             for (auto& id : table_ids) {
                 replica::table& t = db.find_column_family(id);
-                co_await cm.perform_cleanup(db, &t);
+                co_await t.perform_cleanup_compaction(db);
             }
             co_return;
         }).then([]{

@@ -645,6 +647,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     });

     ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
+        apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, tables);
         co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
             bool needed = false;
             for (const auto& table : tables) {

@@ -658,6 +661,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
         bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

+        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, column_families, exclude_current_version);
         return ctx.db.invoke_on_all([=] (replica::database& db) {
             return do_for_each(column_families, [=, &db](sstring cfname) {
                 auto& cm = db.get_compaction_manager();

@@ -672,6 +676,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
         auto keyspace = validate_keyspace(ctx, req->param);
         auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
+        apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
         auto &db = ctx.db.local();
         if (column_families.empty()) {
             co_await db.flush_on_all(keyspace);

@@ -683,6 +688,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_


     ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("decommission");
         return ss.local().decommission().then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });

@@ -698,6 +704,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
         auto host_id = req->get_query_param("host_id");
         std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
+        apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
         auto ignore_nodes = std::list<gms::inet_address>();
         for (std::string n : ignore_nodes_strs) {
             try {

@@ -770,6 +777,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     });

     ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("drain");
         return ss.local().drain().then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });

@@ -802,12 +810,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
     });

     ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("stop_gossiping");
         return ss.local().stop_gossiping().then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });
     });

     ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("start_gossiping");
         return ss.local().start_gossiping().then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });

@@ -904,6 +914,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

     ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
         auto source_dc = req->get_query_param("source_dc");
+        apilog.info("rebuild: source_dc={}", source_dc);
         return ss.local().rebuild(std::move(source_dc)).then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });

@@ -940,6 +951,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         // FIXME: We should truncate schema tables if more than one node in the cluster.
         auto& sp = service::get_storage_proxy();
         auto& fs = sp.local().features();
+        apilog.info("reset_local_schema");
         return db::schema_tables::recalculate_schema_version(sp, fs).then([] {
             return make_ready_future<json::json_return_type>(json_void());
         });

@@ -947,6 +959,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

     ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
         auto probability = req->get_query_param("probability");
+        apilog.info("set_trace_probability: probability={}", probability);
         return futurize_invoke([probability] {
             double real_prob = std::stod(probability.c_str());
             return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {

@@ -984,6 +997,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         auto ttl = req->get_query_param("ttl");
         auto threshold = req->get_query_param("threshold");
         auto fast = req->get_query_param("fast");
+        apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
         try {
             return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
                 if (threshold != "") {

@@ -1010,6 +1024,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         auto keyspace = validate_keyspace(ctx, req->param);
         auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

+        apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
         return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, true);
     });

@@ -1017,6 +1032,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
         auto keyspace = validate_keyspace(ctx, req->param);
         auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

+        apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
         return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, false);
     });
@@ -2008,7 +2008,8 @@ with open(buildfile_tmp, 'w') as f:
             f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
             if cc.endswith('Parser.cpp'):
                 # Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
-                flags = '-O1'
+                flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''

                 if has_sanitize_address_use_after_scope:
                     flags += ' -fno-sanitize-address-use-after-scope'
                 f.write(' obj_cxxflags = %s\n' % flags)
@@ -1293,7 +1293,7 @@ expression search_and_replace(const expression& e,
                 };
             },
             [&] (const binary_operator& oper) -> expression {
-                return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
+                return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
             },
             [&] (const column_mutation_attribute& cma) -> expression {
                 return column_mutation_attribute{cma.kind, recurse(cma.column)};
@@ -83,7 +83,7 @@ public:

     virtual sstring assignment_testable_source_context() const override {
         auto&& name = _type->field_name(_field);
-        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
+        auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
         return format("{}.{}", _selected, sname);
     }
@@ -254,6 +254,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
     if (options.getSerialConsistency() == null)
         throw new InvalidRequestException("Invalid empty serial consistency level");
 #endif
+    for (size_t i = 0; i < _statements.size(); ++i) {
+        _statements[i].statement->validate_primary_key_restrictions(options.for_statement(i));
+    }
+
     if (_has_conditions) {
         ++_stats.cas_batches;
         _stats.statements_in_cas_batches += _statements.size();
@@ -121,6 +121,9 @@ std::optional<mutation> cas_request::apply(foreign_ptr<lw_shared_ptr<query::resu

 const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas_row_update& op) const {
     static const clustering_key empty_ckey = clustering_key::make_empty();
+    if (_key.empty()) {
+        throw exceptions::invalid_request_exception("partition key ranges empty - probably caused by an unset value");
+    }
     const partition_key& pkey = _key.front().start()->value().key().value();
     // If a statement has only static columns conditions, we must ignore its clustering columns
     // restriction when choosing a row to check the conditions, i.e. choose any partition row,

@@ -134,6 +137,9 @@ const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas
     // Another case when we pass an empty clustering key prefix is apparently when the table
     // doesn't have any clustering key columns and the clustering key range is empty (open
     // ended on both sides).
+    if (op.ranges.empty()) {
+        throw exceptions::invalid_request_exception("clustering key ranges empty - probably caused by an unset value");
+    }
     const clustering_key& ckey = !op.statement.has_only_static_column_conditions() && op.ranges.front().start() ?
         op.ranges.front().start()->value() : empty_ckey;
     return _rows.find_row(pkey, ckey);
@@ -242,6 +242,12 @@ modification_statement::execute(query_processor& qp, service::query_state& qs, c
     return modify_stage(this, seastar::ref(qp), seastar::ref(qs), seastar::cref(options));
 }

+void modification_statement::validate_primary_key_restrictions(const query_options& options) const {
+    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
+        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
+    }
+}
+
 future<::shared_ptr<cql_transport::messages::result_message>>
 modification_statement::do_execute(query_processor& qp, service::query_state& qs, const query_options& options) const {
     if (has_conditions() && options.get_protocol_version() == 1) {

@@ -252,9 +258,7 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs

     inc_cql_stats(qs.get_client_state().is_internal());

-    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
-        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
-    }
+    validate_primary_key_restrictions(options);

     if (has_conditions()) {
         return execute_with_condition(qp, qs, options);
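The two hunks above, together with the batch_statement change earlier, follow a common refactoring: pull an inline check out into a named member function so a composite statement can invoke it per child before executing anything. A simplified, self-contained sketch of that shape, with stand-in types rather than Scylla's actual classes:

```cpp
#include <stdexcept>
#include <vector>

struct query_options {
    bool key_is_null = false; // stand-in for a null-bound key restriction
};

// Stand-in for modification_statement: validation is a named, reusable hook
// rather than a check buried inside do_execute().
class statement {
public:
    void validate_primary_key_restrictions(const query_options& opts) const {
        if (opts.key_is_null) { // mirrors _restrictions->range_or_slice_eq_null()
            throw std::invalid_argument("Invalid null value in condition for a key column");
        }
    }
    void do_execute(const query_options& opts) const {
        validate_primary_key_restrictions(opts); // single-statement path
        // ... apply the mutation ...
    }
};

// Stand-in for batch_statement: validate every child up front, so a bad
// statement is rejected before any statement in the batch executes.
class batch {
    std::vector<statement> _statements;
public:
    explicit batch(std::vector<statement> s) : _statements(std::move(s)) {}
    void do_execute(const query_options& opts) const {
        for (const auto& s : _statements) {
            s.validate_primary_key_restrictions(opts);
        }
        for (const auto& s : _statements) {
            s.do_execute(opts);
        }
    }
};

int main() {
    batch b({statement{}, statement{}});
    b.do_execute(query_options{}); // validates both children, then executes
}
```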
@@ -231,6 +231,8 @@ public:
     // True if this statement needs to read only static column values to check if it can be applied.
     bool has_only_static_column_conditions() const { return !_has_regular_column_conditions && _has_static_column_conditions; }

+    void validate_primary_key_restrictions(const query_options& options) const;
+
     virtual future<::shared_ptr<cql_transport::messages::result_message>>
     execute(query_processor& qp, service::query_state& qs, const query_options& options) const override;
@@ -218,7 +218,7 @@ struct from_json_object_visitor {
             throw marshal_exception("bytes_type must be represented as string");
         }
         std::string_view string_v = rjson::to_string_view(value);
-        if (string_v.size() < 2 && string_v[0] != '0' && string_v[1] != 'x') {
+        if (string_v.size() < 2 || string_v[0] != '0' || string_v[1] != 'x') {
            throw marshal_exception("Blob JSON strings must start with 0x");
         }
         string_v.remove_prefix(2);
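The one-character change above fixes a De Morgan slip: "must start with 0x" is violated when the string is too short *or* either prefix byte differs, so the rejection test needs ||, not &&. The && form also reads `string_v[1]` out of bounds for some one-byte inputs. A tiny standalone illustration:

```cpp
#include <cassert>
#include <string_view>

// Buggy: with &&, "ff" is accepted (size() < 2 is false, so the whole
// conjunction is false), and for a one-byte string like "x" the check
// reaches s[1], which is an out-of-bounds read.
bool rejected_buggy(std::string_view s) {
    return s.size() < 2 && s[0] != '0' && s[1] != 'x';
}

// Correct: reject when the string is too short OR either prefix byte is
// wrong; || short-circuits before s[1] can be read out of bounds.
bool rejected_fixed(std::string_view s) {
    return s.size() < 2 || s[0] != '0' || s[1] != 'x';
}

int main() {
    assert(rejected_buggy("ff") == false); // bug: accepted despite missing 0x
    assert(rejected_fixed("ff") == true);  // fixed: properly rejected
    assert(rejected_fixed("0xde") == false);
}
```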
@@ -3072,11 +3072,11 @@ mutation system_keyspace::make_group0_history_state_id_mutation(
     using namespace std::chrono;
     assert(*gc_older_than >= gc_clock::duration{0});

-    auto ts_millis = duration_cast<milliseconds>(microseconds{ts});
-    auto gc_older_than_millis = duration_cast<milliseconds>(*gc_older_than);
-    assert(gc_older_than_millis < ts_millis);
+    auto ts_micros = microseconds{ts};
+    auto gc_older_than_micros = duration_cast<microseconds>(*gc_older_than);
+    assert(gc_older_than_micros < ts_micros);

-    auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_millis - gc_older_than_millis);
+    auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_micros - gc_older_than_micros);
     // We want to delete all entries with IDs smaller than `tomb_upper_bound`
     // but the deleted range is of the form (x, +inf) since the schema is reversed.
     auto range = query::clustering_range::make_starting_with({
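The switch from milliseconds to microseconds above avoids truncation: `duration_cast` to a coarser unit rounds toward zero, so casting the operands before subtracting can produce a different bound than subtracting first. A small, self-contained chrono example of the effect (the numbers are illustrative, not Scylla's):

```cpp
#include <cassert>
#include <chrono>

int main() {
    using namespace std::chrono;

    microseconds ts{1000};           // timestamp: 1000 us
    microseconds gc_older_than{600}; // retention window: 600 us

    // Casting to milliseconds first truncates 600 us down to 0 ms, so the
    // computed upper bound is too large by almost a full millisecond.
    auto coarse = duration_cast<milliseconds>(ts) - duration_cast<milliseconds>(gc_older_than);
    assert(duration_cast<microseconds>(coarse) == microseconds{1000});

    // Subtracting in microseconds preserves the intended bound.
    auto precise = ts - gc_older_than;
    assert(precise == microseconds{400});
}
```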
@@ -74,33 +74,29 @@ row_locker::lock_pk(const dht::decorated_key& pk, bool exclusive, db::timeout_cl
 future<row_locker::lock_holder>
 row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& cpk, bool exclusive, db::timeout_clock::time_point timeout, stats& stats) {
     mylog.debug("taking shared lock on partition {}, and {} lock on row {} in it", pk, (exclusive ? "exclusive" : "shared"), cpk);
+    auto ck = cpk;
     // Create a two-level lock entry for the partition if it doesn't exist already.
     auto i = _two_level_locks.try_emplace(pk, this).first;
     // The two-level lock entry we've just created is guaranteed to be kept alive as long as it's locked.
     // Initiating read locking in the background below ensures that even if the two-level lock is currently
     // write-locked, releasing the write-lock will synchronously engage any waiting
     // locks and will keep the entry alive.
     future<lock_type::holder> lock_partition = i->second._partition_lock.hold_read_lock(timeout);
-    auto j = i->second._row_locks.find(cpk);
-    if (j == i->second._row_locks.end()) {
-        // Not yet locked, need to create the lock. This makes a copy of cpk.
-        try {
-            j = i->second._row_locks.emplace(cpk, lock_type()).first;
-        } catch(...) {
-            // If this emplace() failed, e.g., out of memory, we fail. We
-            // could do nothing - the partition lock we already started
-            // taking will be unlocked automatically after being locked.
-            // But it's better form to wait for the work we started, and it
-            // will also allow us to remove the hash-table row we added.
-            return lock_partition.then([ex = std::current_exception()] (auto lock) {
-                // The lock is automatically released when "lock" goes out of scope.
-                // TODO: unlock (lock = {}) now, search for the partition in the
-                // hash table (we know it's still there, because we held the lock until
-                // now) and remove the unused lock from the hash table if still unused.
-                return make_exception_future<row_locker::lock_holder>(std::current_exception());
-            });
-        }
-    }
     single_lock_stats &single_lock_stats = exclusive ? stats.exclusive_row : stats.shared_row;
     single_lock_stats.operations_currently_waiting_for_lock++;
     utils::latency_counter waiting_latency;
     waiting_latency.start();
-    return lock_partition.then([this, pk = &i->first, cpk = &j->first, &row_lock = j->second, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
+    return lock_partition.then([this, pk = &i->first, row_locks = &i->second._row_locks, ck = std::move(ck), exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
+        auto j = row_locks->find(ck);
+        if (j == row_locks->end()) {
+            // Not yet locked, need to create the lock.
+            j = row_locks->emplace(std::move(ck), lock_type()).first;
+        }
+        auto* cpk = &j->first;
+        auto& row_lock = j->second;
+        // Like to the two-level lock entry above, the row_lock entry we've just created
+        // is guaranteed to be kept alive as long as it's locked.
+        // Initiating read/write locking in the background below ensures that.
         auto lock_row = exclusive ? row_lock.hold_write_lock(timeout) : row_lock.hold_read_lock(timeout);
         return lock_row.then([this, pk, cpk, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), lock1 = std::move(lock1)] (auto lock2) mutable {
             // FIXME: indentation
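The restructuring above delays the row-lock lookup until the continuation that runs with the partition lock already resolved, so no iterator into the row-lock table is held across the wait. The same find-or-create-under-the-outer-lock ordering can be sketched with standard-library types; this thread-based stand-in only illustrates the ordering, not Scylla's future-based row_locker:

```cpp
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>

// Simplified two-level locker: one shared_mutex per partition, plus a map of
// per-row mutexes inside it.
class two_level_locker {
    struct partition_entry {
        std::shared_mutex partition_lock;
        std::map<std::string, std::shared_mutex> row_locks;
        std::mutex row_map_lock; // protects row_locks itself
    };
    std::map<std::string, partition_entry> _partitions;
    std::mutex _map_lock;

public:
    void with_row_lock(const std::string& pk, std::string ck, bool exclusive) {
        partition_entry* p;
        {
            std::lock_guard g(_map_lock);
            p = &_partitions[pk]; // create the partition entry if missing
        }
        // Mirrors the patch: copy the row key up front, acquire the partition
        // lock, and only *then* find-or-create the row-lock entry. Looking it
        // up before waiting would hold a reference across the wait, where the
        // entry could be removed or the container rehashed.
        std::shared_lock partition_guard(p->partition_lock);
        std::shared_mutex* row_lock;
        {
            std::lock_guard g(p->row_map_lock);
            row_lock = &p->row_locks[std::move(ck)];
        }
        if (exclusive) {
            std::unique_lock row_guard(*row_lock);
            // ... perform the row operation ...
        } else {
            std::shared_lock row_guard(*row_lock);
            // ... perform the row operation ...
        }
    }
};

int main() {
    two_level_locker locker;
    locker.with_row_lock("pk1", "ck1", true);  // exclusive row lock
    locker.with_row_lock("pk1", "ck2", false); // shared row lock, same partition
}
```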
@@ -947,8 +947,12 @@ future<stop_iteration> view_update_builder::stop() const {
     return make_ready_future<stop_iteration>(stop_iteration::yes);
 }

-future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::build_some() {
+future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> view_update_builder::build_some() {
     return advance_all().then([this] (stop_iteration ignored) {
+        if (!_update && !_existing) {
+            // Tell the caller there is no more data to build.
+            return make_ready_future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>>(std::nullopt);
+        }
         bool do_advance_updates = false;
         bool do_advance_existings = false;
         if (_update && _update->is_partition_start()) {

@@ -960,22 +964,23 @@ future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::b
             _existing_tombstone_tracker.set_partition_tombstone(_existing->as_partition_start().partition_tombstone());
             do_advance_existings = true;
         }
+        future<stop_iteration> f = make_ready_future<stop_iteration>(stop_iteration::no);
         if (do_advance_updates) {
-            return do_advance_existings ? advance_all() : advance_updates();
+            f = do_advance_existings ? advance_all() : advance_updates();
         } else if (do_advance_existings) {
-            return advance_existings();
+            f = advance_existings();
         }
-        return make_ready_future<stop_iteration>(stop_iteration::no);
-    }).then([this] (stop_iteration ignored) {
-        return repeat([this] {
-            return this->on_results();
+        return std::move(f).then([this] (stop_iteration ignored) {
+            return repeat([this] {
+                return this->on_results();
+            });
+        }).then([this] {
+            utils::chunked_vector<frozen_mutation_and_schema> mutations;
+            for (auto& update : _view_updates) {
+                update.move_to(mutations);
+            }
+            return std::make_optional(mutations);
         });
-    }).then([this] {
-        utils::chunked_vector<frozen_mutation_and_schema> mutations;
-        for (auto& update : _view_updates) {
-            update.move_to(mutations);
-        }
-        return mutations;
-    });
 }
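The new signature distinguishes "an empty batch of updates" from "no more batches" by wrapping the result in std::optional, precisely because a batch can legitimately shrink to zero rows. A synchronous sketch of how a caller consumes such an API (hypothetical `producer` type; the real build_some() returns a future):

```cpp
#include <iostream>
#include <optional>
#include <vector>

// Hypothetical batch producer: build_some() returns std::nullopt when the
// input is exhausted. An *empty* vector is a valid intermediate result,
// e.g. when every row in a batch was skipped, and must not end the loop.
class producer {
    int _batches_left = 3;
public:
    std::optional<std::vector<int>> build_some() {
        if (_batches_left == 0) {
            return std::nullopt; // disengaged: iteration is done
        }
        --_batches_left;
        if (_batches_left == 1) {
            return std::vector<int>{}; // empty but engaged: keep going
        }
        return std::vector<int>{1, 2, 3};
    }
};

int main() {
    producer p;
    while (auto batch = p.build_some()) { // stops only on nullopt
        std::cout << "got batch of " << batch->size() << " updates\n";
    }
}
```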
@@ -2145,24 +2150,28 @@ update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog bac
     return std::max(backlog, _max.load(std::memory_order_relaxed));
 }

-future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name) {
-    return sys_dist_ks.view_status(ks_name, cf_name).then([] (std::unordered_map<utils::UUID, sstring>&& view_statuses) {
-        return boost::algorithm::any_of(view_statuses | boost::adaptors::map_values, [] (const sstring& view_status) {
-            return view_status == "STARTED";
+future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const sstring& ks_name,
+        const sstring& cf_name) {
+    using view_statuses_type = std::unordered_map<utils::UUID, sstring>;
+    return sys_dist_ks.view_status(ks_name, cf_name).then([&tm] (view_statuses_type&& view_statuses) {
+        return boost::algorithm::any_of(view_statuses, [&tm] (const view_statuses_type::value_type& view_status) {
+            // Only consider status of known hosts.
+            return view_status.second == "STARTED" && tm.get_endpoint_for_host_id(view_status.first);
         });
     });
 }

-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason) {
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+        streaming::stream_reason reason) {
     if (is_internal_keyspace(t.schema()->ks_name())) {
         return make_ready_future<bool>(false);
     }
     if (reason == streaming::stream_reason::repair && !t.views().empty()) {
         return make_ready_future<bool>(true);
     }
-    return do_with(t.views(), [&sys_dist_ks] (auto& views) {
+    return do_with(t.views(), [&sys_dist_ks, &tm] (auto& views) {
         return map_reduce(views,
-                [&sys_dist_ks] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, view->ks_name(), view->cf_name()); },
+                [&sys_dist_ks, &tm] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, tm, view->ks_name(), view->cf_name()); },
                 false,
                 std::logical_or<bool>());
     });
@@ -185,7 +185,15 @@ public:
     }
     view_update_builder(view_update_builder&& other) noexcept = default;

-    future<utils::chunked_vector<frozen_mutation_and_schema>> build_some();
+    // build_some() works on batches of 100 (max_rows_for_view_updates)
+    // updated rows, but can_skip_view_updates() can decide that some of
+    // these rows do not effect the view, and as a result build_some() can
+    // fewer than 100 rows - in extreme cases even zero (see issue #12297).
+    // So we can't use an empty returned vector to signify that the view
+    // update building is done - and we wrap the return value in an
+    // std::optional, which is disengaged when the iteration is done.
+    future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> build_some();

     future<> close() noexcept;
@@ -22,9 +22,13 @@ class system_distributed_keyspace;

 }

+namespace locator {
+class token_metadata;
+}
+
 namespace db::view {

-future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name);
-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason);
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+        streaming::stream_reason reason);

 }
@@ -83,10 +83,10 @@ future<> view_update_generator::start() {
                         service::get_local_streaming_priority(),
                         nullptr,
                         ::mutation_reader::forwarding::no);
+                auto close_sr = deferred_close(staging_sstable_reader);

                 inject_failure("view_update_generator_consume_staging_sstable");
                 auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle));
-                staging_sstable_reader.close().get();
                 if (result == stop_iteration::yes) {
                     break;
                 }
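Replacing the manual `close().get()` with `deferred_close` makes the cleanup exception-safe: the reader is closed even if `consume_in_thread()` throws. A minimal stand-in for that guard pattern (Seastar's real deferred_close works with objects whose close() returns a future; this sketch uses a plain synchronous close):

```cpp
#include <iostream>
#include <stdexcept>

// Minimal scope guard: calls obj.close() when the scope exits, whether
// normally or via an exception.
template <typename T>
class deferred_close_guard {
    T& _obj;
public:
    explicit deferred_close_guard(T& obj) : _obj(obj) {}
    ~deferred_close_guard() { _obj.close(); }
    deferred_close_guard(const deferred_close_guard&) = delete;
    deferred_close_guard& operator=(const deferred_close_guard&) = delete;
};

struct reader {
    void consume() { throw std::runtime_error("failure mid-consume"); }
    void close() { std::cout << "reader closed\n"; }
};

int main() {
    try {
        reader r;
        deferred_close_guard guard(r); // replaces a manual r.close() at the end
        r.consume();
    } catch (const std::exception& e) {
        std::cout << "caught: " << e.what() << "\n"; // "reader closed" prints first
    }
}
```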
@@ -15,11 +15,18 @@
 namespace dht {

 // Note: Cassandra has a special case where for an empty key it returns
 // minimum_token() instead of 0 (the naturally-calculated hash function for
 // an empty string). Their thinking was that empty partition keys are not
 // allowed anyway. However, they *are* allowed in materialized views, so the
 // empty-key partition should get a real token, not an invalid token, so
 // we dropped this special case. Since we don't support migrating sstables of
 // materialized-views from Cassandra, this Cassandra-Scylla incompatiblity
 // will not cause problems in practice.
 // Note that get_token(const schema& s, partition_key_view key) below must
 // use exactly the same algorithm as this function.
 token
 murmur3_partitioner::get_token(bytes_view key) const {
     if (key.empty()) {
         return minimum_token();
     }
     std::array<uint64_t, 2> hash;
     utils::murmur_hash::hash3_x64_128(key, 0, hash);
     return get_token(hash[0]);
dist/common/scripts/scylla_coredump_setup (vendored): 3 changed lines
@@ -42,7 +42,8 @@ if __name__ == '__main__':
     if systemd_unit.available('systemd-coredump@.service'):
         dropin = '''
 [Service]
 TimeoutStartSec=infinity
+RuntimeMaxSec=infinity
 TimeoutSec=infinity
 '''[1:-1]
         os.makedirs('/etc/systemd/system/systemd-coredump@.service.d', exist_ok=True)
         with open('/etc/systemd/system/systemd-coredump@.service.d/timeout.conf', 'w') as f:
dist/common/scripts/scylla_raid_setup (vendored): 23 changed lines
@@ -16,7 +16,7 @@ import stat
 import distro
 from pathlib import Path
 from scylla_util import *
-from subprocess import run
+from subprocess import run, SubprocessError

 if __name__ == '__main__':
     if os.getuid() > 0:

@@ -137,7 +137,9 @@ if __name__ == '__main__':
     # stalling. The minimum block size for crc enabled filesystems is 1024,
     # and it also cannot be smaller than the sector size.
     block_size = max(1024, sector_size)
+    run('udevadm settle', shell=True, check=True)
     run(f'mkfs.xfs -b size={block_size} {fsdev} -f -K', shell=True, check=True)
+    run('udevadm settle', shell=True, check=True)

     if is_debian_variant():
         confpath = '/etc/mdadm/mdadm.conf'

@@ -153,6 +155,11 @@ if __name__ == '__main__':
     os.makedirs(mount_at, exist_ok=True)

     uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
+    if not uuid:
+        raise Exception(f'Failed to get UUID of {fsdev}')
+
+    uuidpath = f'/dev/disk/by-uuid/{uuid}'
+
     after = 'local-fs.target'
     wants = ''
     if raid and args.raid_level != '0':

@@ -169,7 +176,7 @@ After={after}{wants}
 DefaultDependencies=no

 [Mount]
-What=/dev/disk/by-uuid/{uuid}
+What={uuidpath}
 Where={mount_at}
 Type=xfs
 Options=noatime{opt_discard}

@@ -191,8 +198,16 @@ WantedBy=multi-user.target
     systemd_unit.reload()
     if args.raid_level != '0':
         md_service.start()
-    mount = systemd_unit(mntunit_bn)
-    mount.start()
+    try:
+        mount = systemd_unit(mntunit_bn)
+        mount.start()
+    except SubprocessError as e:
+        if not os.path.exists(uuidpath):
+            print(f'\nERROR: {uuidpath} is not found\n')
+        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
+            print(f'\nERROR: {uuidpath} is not block device\n')
+        raise e

     if args.enable_on_nextboot:
         mount.enable()
     uid = pwd.getpwnam('scylla').pw_uid
dist/common/scripts/scylla_setup (vendored): 4 changed lines
@@ -214,7 +214,7 @@ if __name__ == '__main__':
                         help='skip raid setup')
    parser.add_argument('--raid-level-5', action='store_true', default=False,
                         help='use RAID5 for RAID volume')
-    parser.add_argument('--online-discard', default=True,
+    parser.add_argument('--online-discard', default=1, choices=[0, 1], type=int,
                         help='Configure XFS to discard unused blocks as soon as files are deleted')
     parser.add_argument('--nic',
                         help='specify NIC')

@@ -458,7 +458,7 @@ if __name__ == '__main__':
         args.no_raid_setup = not raid_setup
         if raid_setup:
             level = '5' if raid_level_5 else '0'
-            run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={int(online_discard)}')
+            run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={online_discard}')

         coredump_setup = interactive_ask_service('Do you want to enable coredumps?', 'Yes - sets up coredump to allow a post-mortem analysis of the Scylla state just prior to a crash. No - skips this step.', coredump_setup)
         args.no_coredump_setup = not coredump_setup
dist/docker/scyllasetup.py (vendored): 7 changed lines
@@ -68,7 +68,12 @@ class ScyllaSetup:

     def cqlshrc(self):
         home = os.environ['HOME']
-        hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
+        if self._rpcAddress:
+            hostname = self._rpcAddress
+        elif self._listenAddress:
+            hostname = self._listenAddress
+        else:
+            hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
         with open("%s/.cqlshrc" % home, "w") as cqlshrc:
             cqlshrc.write("[connection]\nhostname = %s\n" % hostname)
dist/redhat/scylla.spec (vendored): 4 changed lines
@@ -7,7 +7,7 @@ Group: Applications/Databases
 License: AGPLv3
 URL: http://www.scylladb.com/
 Source0: %{reloc_pkg}
-Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
+Requires: %{product}-server = %{version}-%{release} %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release} %{product}-kernel-conf = %{version}-%{release} %{product}-jmx = %{version}-%{release} %{product}-tools = %{version}-%{release} %{product}-tools-core = %{version}-%{release} %{product}-node-exporter = %{version}-%{release}
 Obsoletes: scylla-server < 1.1

 %global _debugsource_template %{nil}

@@ -54,7 +54,7 @@ Group: Applications/Databases
 Summary: The Scylla database server
 License: AGPLv3
 URL: http://www.scylladb.com/
-Requires: %{product}-conf = %{version} %{product}-python3 = %{version}
+Requires: %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release}
 Conflicts: abrt
 AutoReqProv: no
@@ -774,11 +774,14 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
     std::optional<mutation_consume_cookie> _cookie;

 private:
-    void flush_tombstones(position_in_partition_view pos) {
+    void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
         _rt_gen.flush(pos, [&] (range_tombstone_change rt) {
             _current_rt = rt.tombstone();
             push_mutation_fragment(*_schema, _permit, std::move(rt));
         });
+        if (emit_end && _current_rt) {
+            push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
+        }
     }
     void maybe_emit_partition_start() {
         if (_dk) {

@@ -815,10 +818,7 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
             return stop_iteration::yes;
         }
         maybe_emit_partition_start();
-        flush_tombstones(position_in_partition::after_all_clustered_rows());
-        if (_current_rt) {
-            push_mutation_fragment(*_schema, _permit, range_tombstone_change(position_in_partition::after_all_clustered_rows(), {}));
-        }
+        flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
         push_mutation_fragment(*_schema, _permit, partition_end{});
         return stop_iteration::no;
     }

@@ -1986,11 +1986,14 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
     tombstone _current_rt;
     std::optional<position_range> _pr;
 public:
-    void flush_tombstones(position_in_partition_view pos) {
+    void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
         _rt_gen.flush(pos, [&] (range_tombstone_change rt) {
             _current_rt = rt.tombstone();
             push_mutation_fragment(*_schema, _permit, std::move(rt));
         });
+        if (emit_end && _current_rt) {
+            push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
+        }
     }
     void consume(static_row mf) {
         push_mutation_fragment(*_schema, _permit, std::move(mf));

@@ -2015,11 +2018,9 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
         push_mutation_fragment(*_schema, _permit, std::move(mf));
     }
     void consume(partition_end mf) {
-        flush_tombstones(position_in_partition::after_all_clustered_rows());
+        flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
         if (_current_rt) {
             assert(!_pr);
-            push_mutation_fragment(*_schema, _permit, range_tombstone_change(
-                position_in_partition::after_all_clustered_rows(), {}));
         }
         push_mutation_fragment(*_schema, _permit, std::move(mf));
     }

@@ -2042,10 +2043,7 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
         if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) {
             if (_pr) {
                 // If !_pr we should flush on partition_end
-                flush_tombstones(_pr->end());
-                if (_current_rt) {
-                    push_mutation_fragment(*_schema, _permit, range_tombstone_change(_pr->end(), {}));
-                }
+                flush_tombstones(_pr->end(), true);
             }
             _end_of_stream = true;
         }
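All five hunks repeat one move: fold the trailing "if a range tombstone is still open, emit a closing range_tombstone_change" into `flush_tombstones()` behind an `emit_end` flag, so call sites cannot forget the close. A condensed sketch of the before/after shape with simplified types:

```cpp
#include <iostream>
#include <optional>
#include <string>

struct tombstone_state {
    std::optional<std::string> current; // open range tombstone, if any

    // After the change: the closing emission lives inside flush itself,
    // guarded by a flag, so every end-of-partition call site stays a
    // one-liner.
    void flush(const std::string& pos, bool emit_end = false) {
        // ... emit accumulated range_tombstone_changes up to `pos` ...
        if (emit_end && current) {
            std::cout << "close open tombstone at " << pos << "\n";
            current.reset();
        }
    }
};

int main() {
    tombstone_state st;
    st.current = "t1";
    // Before the patch, each end-of-partition call site repeated:
    //   flush(end); if (current) { emit close; }
    // After, that collapses to:
    st.flush("after_all_clustered_rows", true);
}
```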
@@ -1012,10 +1012,10 @@ std::set<inet_address> gossiper::get_live_members() {

 std::set<inet_address> gossiper::get_live_token_owners() {
     std::set<inet_address> token_owners;
-    for (auto& member : get_live_members()) {
-        auto es = get_endpoint_state_for_endpoint_ptr(member);
-        if (es && !is_dead_state(*es) && get_token_metadata_ptr()->is_member(member)) {
-            token_owners.insert(member);
+    auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
+    for (auto& node: normal_token_owners) {
+        if (is_alive(node)) {
+            token_owners.insert(node);
         }
     }
     return token_owners;

@@ -1023,10 +1023,10 @@ std::set<inet_address> gossiper::get_live_token_owners() {

 std::set<inet_address> gossiper::get_unreachable_token_owners() {
     std::set<inet_address> token_owners;
-    for (auto&& x : _unreachable_endpoints) {
-        auto& endpoint = x.first;
-        if (get_token_metadata_ptr()->is_member(endpoint)) {
-            token_owners.insert(endpoint);
+    auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
+    for (auto& node: normal_token_owners) {
+        if (!is_alive(node)) {
+            token_owners.insert(node);
         }
     }
     return token_owners;
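Both gossiper rewrites invert the iteration: start from the token metadata's set of normal token owners (the membership source of truth) and filter by liveness, instead of scanning liveness-derived state and asking `is_member()`. Schematically, with plain containers standing in for the gossiper state:

```cpp
#include <iostream>
#include <set>
#include <string>

using inet_address = std::string; // stand-in

std::set<inet_address> live_token_owners(const std::set<inet_address>& normal_token_owners,
                                         const std::set<inet_address>& live) {
    std::set<inet_address> out;
    // Iterate the token owners and keep the live ones, rather than iterating
    // live nodes and checking membership; owners missing from the liveness
    // scan can no longer be silently dropped.
    for (const auto& node : normal_token_owners) {
        if (live.count(node)) {
            out.insert(node);
        }
    }
    return out;
}

std::set<inet_address> unreachable_token_owners(const std::set<inet_address>& normal_token_owners,
                                                const std::set<inet_address>& live) {
    std::set<inet_address> out;
    for (const auto& node : normal_token_owners) {
        if (!live.count(node)) { // same scan, inverted liveness test
            out.insert(node);
        }
    }
    return out;
}

int main() {
    std::set<inet_address> owners{"10.0.0.1", "10.0.0.2", "10.0.0.3"};
    std::set<inet_address> live{"10.0.0.1", "10.0.0.3"};
    std::cout << live_token_owners(owners, live).size() << "\n";        // 2
    std::cout << unreachable_token_owners(owners, live).size() << "\n"; // 1
}
```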
@@ -215,22 +215,6 @@ effective_replication_map::get_primary_ranges_within_dc(inet_address ep) const {
     });
 }

-future<std::unordered_multimap<inet_address, dht::token_range>>
-abstract_replication_strategy::get_address_ranges(const token_metadata& tm) const {
-    std::unordered_multimap<inet_address, dht::token_range> ret;
-    for (auto& t : tm.sorted_tokens()) {
-        dht::token_range_vector r = tm.get_primary_ranges_for(t);
-        auto eps = co_await calculate_natural_endpoints(t, tm);
-        rslogger.debug("token={}, primary_range={}, address={}", t, r, eps);
-        for (auto ep : eps) {
-            for (auto&& rng : r) {
-                ret.emplace(ep, rng);
-            }
-        }
-    }
-    co_return ret;
-}
-
 future<std::unordered_multimap<inet_address, dht::token_range>>
 abstract_replication_strategy::get_address_ranges(const token_metadata& tm, inet_address endpoint) const {
     std::unordered_multimap<inet_address, dht::token_range> ret;
@@ -112,7 +112,6 @@ public:
     future<dht::token_range_vector> get_ranges(inet_address ep, token_metadata_ptr tmptr) const;

 public:
-    future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm) const;
     future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm, inet_address endpoint) const;

     // Caller must ensure that token_metadata will not change throughout the call.
@@ -15,6 +15,7 @@
 #include <seastar/core/coroutine.hh>
 #include <seastar/core/seastar.hh>
 #include <seastar/http/response_parser.hh>
+#include <seastar/http/reply.hh>
 #include <seastar/net/api.hh>
 #include <seastar/net/dns.hh>

@@ -47,7 +48,8 @@ future<> azure_snitch::load_config() {

     logger().info("AzureSnitch using region: {}, zone: {}.", azure_region, azure_zone);

-    _my_rack = azure_zone;
+    // Zoneless regions return empty zone
+    _my_rack = (azure_zone != "" ? azure_zone : azure_region);
     _my_dc = azure_region;

     co_return co_await _my_distributed->invoke_on_all([this] (snitch_ptr& local_s) {

@@ -90,6 +92,10 @@ future<sstring> azure_snitch::azure_api_call(sstring path) {

     // Read HTTP response header first
     auto rsp = parser.get_parsed_response();
+    if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+        throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
+    }

     auto it = rsp->_headers.find("Content-Length");
     if (it == rsp->_headers.end()) {
         throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");
@@ -2,6 +2,7 @@
 #include <seastar/core/seastar.hh>
 #include <seastar/core/sleep.hh>
 #include <seastar/core/do_with.hh>
+#include <seastar/http/reply.hh>

 #include <boost/algorithm/string/classification.hpp>
 #include <boost/algorithm/string/split.hpp>

@@ -114,6 +115,9 @@ future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstri

     // Read HTTP response header first
     auto _rsp = _parser.get_parsed_response();
+    if (_rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+        return make_exception_future<sstring>(std::runtime_error(format("Error: HTTP response status {}", _rsp->_status_code)));
+    }
     auto it = _rsp->_headers.find("Content-Length");
     if (it == _rsp->_headers.end()) {
         return make_exception_future<sstring>("Error: HTTP response does not contain: Content-Length\n");
@@ -14,6 +14,7 @@
 #include <seastar/net/dns.hh>
 #include <seastar/core/seastar.hh>
 #include "locator/gce_snitch.hh"
+#include <seastar/http/reply.hh>

 #include <boost/algorithm/string/split.hpp>
 #include <boost/algorithm/string/classification.hpp>

@@ -106,6 +107,10 @@ future<sstring> gce_snitch::gce_api_call(sstring addr, sstring cmd) {

     // Read HTTP response header first
     auto rsp = parser.get_parsed_response();
+    if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+        throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
+    }

     auto it = rsp->_headers.find("Content-Length");
     if (it == rsp->_headers.end()) {
         throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");
@@ -786,13 +786,12 @@ void token_metadata_impl::calculate_pending_ranges_for_leaving(
         const abstract_replication_strategy& strategy,
         std::unordered_multimap<range<token>, inet_address>& new_pending_ranges,
         mutable_token_metadata_ptr all_left_metadata) const {
-    std::unordered_multimap<inet_address, dht::token_range> address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
     // get all ranges that will be affected by leaving nodes
     std::unordered_set<range<token>> affected_ranges;
     for (auto endpoint : _leaving_endpoints) {
-        auto r = address_ranges.equal_range(endpoint);
-        for (auto x = r.first; x != r.second; x++) {
-            affected_ranges.emplace(x->second);
+        auto r = strategy.get_address_ranges(unpimplified_this, endpoint).get0();
+        for (const auto& x : r) {
+            affected_ranges.emplace(x.second);
         }
     }
     // for each of those ranges, find what new nodes will be responsible for the range when

@@ -826,16 +825,14 @@ void token_metadata_impl::calculate_pending_ranges_for_replacing(
     if (_replacing_endpoints.empty()) {
         return;
     }
-    auto address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
     for (const auto& node : _replacing_endpoints) {
         auto existing_node = node.first;
         auto replacing_node = node.second;
+        auto address_ranges = strategy.get_address_ranges(unpimplified_this, existing_node).get0();
         for (const auto& x : address_ranges) {
             seastar::thread::maybe_yield();
-            if (x.first == existing_node) {
-                tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
-                new_pending_ranges.emplace(x.second, replacing_node);
-            }
+            tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
+            new_pending_ranges.emplace(x.second, replacing_node);
         }
     }
 }
main.cc: 55 changed lines
@@ -383,6 +383,8 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
                 break;
             }
         }
+    } catch (const storage_io_error& e) {
+        do_abort = false;
     } catch (...) {
     }
     auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);

@@ -425,6 +427,39 @@ static int scylla_main(int ac, char** av) {
         exit(1);
     }

+    // Even on the environment which causes error during initalize Scylla,
+    // "scylla --version" should be able to run without error.
+    // To do so, we need to parse and execute these options before
+    // initializing Scylla/Seastar classes.
+    bpo::options_description preinit_description("Scylla options");
+    bpo::variables_map preinit_vm;
+    preinit_description.add_options()
+        ("version", bpo::bool_switch(), "print version number and exit")
+        ("build-id", bpo::bool_switch(), "print build-id and exit")
+        ("build-mode", bpo::bool_switch(), "print build mode and exit")
+        ("list-tools", bpo::bool_switch(), "list included tools and exit");
+    auto preinit_parsed_opts = bpo::command_line_parser(ac, av).options(preinit_description).allow_unregistered().run();
+    bpo::store(preinit_parsed_opts, preinit_vm);
+    if (preinit_vm["version"].as<bool>()) {
+        fmt::print("{}\n", scylla_version());
+        return 0;
+    }
+    if (preinit_vm["build-id"].as<bool>()) {
+        fmt::print("{}\n", get_build_id());
+        return 0;
+    }
+    if (preinit_vm["build-mode"].as<bool>()) {
+        fmt::print("{}\n", scylla_build_mode());
+        return 0;
+    }
+    if (preinit_vm["list-tools"].as<bool>()) {
+        fmt::print(
+            "types - a command-line tool to examine values belonging to scylla types\n"
+            "sstable - a multifunctional command-line tool to examine the content of sstables\n"
+        );
+        return 0;
+    }
+
     try {
         runtime::init_uptime();
         std::setvbuf(stdout, nullptr, _IOLBF, 1000);

@@ -479,26 +514,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
     bpo::variables_map vm;
     auto parsed_opts = bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run();
     bpo::store(parsed_opts, vm);
-    if (vm["version"].as<bool>()) {
-        fmt::print("{}\n", scylla_version());
-        return 0;
-    }
-    if (vm["build-id"].as<bool>()) {
-        fmt::print("{}\n", get_build_id());
-        return 0;
-    }
-    if (vm["build-mode"].as<bool>()) {
-        fmt::print("{}\n", scylla_build_mode());
-        return 0;
-    }
-    if (vm["list-tools"].as<bool>()) {
-        fmt::print(
-            "types - a command-line tool to examine values belonging to scylla types\n"
-            "sstable - a multifunctional command-line tool to examine the content of sstables\n"
-        );
-        return 0;
-    }

     print_starting_message(ac, av, parsed_opts);

     sharded<locator::shared_token_metadata> token_metadata;
@@ -494,8 +494,16 @@ public:
         _partition_limit -= _rows_in_current_partition > 0;
         auto stop = consumer.consume_end_of_partition();
         if (!sstable_compaction()) {
-            return _row_limit && _partition_limit && stop != stop_iteration::yes
+            stop = _row_limit && _partition_limit && stop != stop_iteration::yes
                 ? stop_iteration::no : stop_iteration::yes;
+            // If we decided to stop earlier but decide to continue now, we
+            // are in effect skipping the partition. Do not leave `_stop` at
+            // `stop_iteration::yes` in this case, reset it back to
+            // `stop_iteration::no` as if we exhausted the partition.
+            if (_stop && !stop) {
+                _stop = stop_iteration::no;
+            }
+            return stop;
         }
         return stop_iteration::no;

@@ -540,6 +548,7 @@ public:
         _current_partition_limit = std::min(_row_limit, _partition_row_limit);
         _query_time = query_time;
         _stats = {};
+        _stop = stop_iteration::no;

         noop_compacted_fragments_consumer nc;
@@ -1240,7 +1240,10 @@ future<flat_mutation_reader> evictable_reader::resume_or_create_reader() {
     if (auto reader_opt = try_resume()) {
        co_return std::move(*reader_opt);
     }
-    co_await _permit.maybe_wait_readmission();
+    // See evictable_reader_v2::resume_or_create_reader()
+    if (_permit.needs_readmission()) {
+        co_await _permit.wait_readmission();
+    }
     co_return recreate_reader();
 }

@@ -1773,7 +1776,18 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
     if (auto reader_opt = try_resume()) {
         co_return std::move(*reader_opt);
     }
-    co_await _permit.maybe_wait_readmission();
+    // When the reader is created the first time and we are actually resuming a
+    // saved reader in `recreate_reader()`, we have two cases here:
+    // * the reader is still alive (in inactive state)
+    // * the reader was evicted
+    // We check for this below with `needs_readmission()` and it is very
+    // important to not allow for preemption between said check and
+    // `recreate_reader()`, otherwise the reader might be evicted between the
+    // check and `recreate_reader()` and the latter will recreate it without
+    // waiting for re-admission.
+    if (_permit.needs_readmission()) {
+        co_await _permit.wait_readmission();
+    }
     co_return recreate_reader();
 }

@@ -1959,7 +1973,9 @@ future<> evictable_reader_v2::fill_buffer() {
     auto* next_mf = co_await _reader->peek();

     // First make sure we've made progress w.r.t. _next_position_in_partition.
-    while (next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
+    // This loop becomes inifinite when next pos is a partition start.
+    // In that case progress is guranteed anyway, so skip this loop entirely.
+    while (!_next_position_in_partition.is_partition_start() && next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
         push_mutation_fragment(_reader->pop_mutation_fragment());
         next_mf = co_await _reader->peek();
     }
query.cc: 5 changed lines
@@ -92,14 +92,13 @@ void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& range
 }

 void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
-    if (key.is_full(s)) {
+    if (key.is_full(s) || reversed) {
         return trim_clustering_row_ranges_to(s, ranges,
             reversed ? position_in_partition_view::before_key(key) : position_in_partition_view::after_key(key), reversed);
     }
     auto full_key = key;
     clustering_key::make_full(s, full_key);
-    return trim_clustering_row_ranges_to(s, ranges,
-        reversed ? position_in_partition_view::after_key(full_key) : position_in_partition_view::before_key(full_key), reversed);
+    return trim_clustering_row_ranges_to(s, ranges, position_in_partition_view::before_key(full_key), reversed);
 }
@@ -68,22 +68,33 @@ public:
// for accumulated range tombstones.
// After this, only range_tombstones with positions >= upper_bound may be added,
// which guarantees that they won't affect the output of this flush.
//
// If upper_bound == position_in_partition::after_all_clustered_rows(),
// emits all remaining range_tombstone_changes.
// No range_tombstones may be added after this.
//
// FIXME: respect preemption
template<RangeTombstoneChangeConsumer C>
void flush(position_in_partition_view upper_bound, C consumer) {
position_in_partition::less_compare less(_schema);
std::optional<range_tombstone> prev;
void flush(const position_in_partition_view upper_bound, C consumer) {
if (_range_tombstones.empty()) {
_lower_bound = upper_bound;
return;
}

while (!_range_tombstones.empty() && less(_range_tombstones.begin()->end_position(), upper_bound)) {
position_in_partition::tri_compare cmp(_schema);
std::optional<range_tombstone> prev;
bool flush_all = cmp(upper_bound, position_in_partition::after_all_clustered_rows()) == 0;

while (!_range_tombstones.empty() && (flush_all || (cmp(_range_tombstones.begin()->end_position(), upper_bound) < 0))) {
auto rt = _range_tombstones.pop(_range_tombstones.begin());

if (prev && less(prev->end_position(), rt.position())) { // [1]
if (prev && (cmp(prev->end_position(), rt.position()) < 0)) { // [1]
// previous range tombstone not adjacent, emit gap.
consumer(range_tombstone_change(prev->end_position(), tombstone()));
}

// Check if start of rt was already emitted, emit if not.
if (!less(rt.position(), _lower_bound)) {
if (cmp(rt.position(), _lower_bound) >= 0) {
consumer(range_tombstone_change(rt.position(), rt.tomb));
}

@@ -95,15 +106,15 @@ public:
// It cannot get adjacent later because prev->end_position() < upper_bound,
// so nothing == prev->end_position() can be added after this invocation.
if (prev && (_range_tombstones.empty()
|| less(prev->end_position(), _range_tombstones.begin()->position()))) {
|| (cmp(prev->end_position(), _range_tombstones.begin()->position()) < 0))) {
consumer(range_tombstone_change(prev->end_position(), tombstone())); // [2]
}

// Emit the fragment for start bound of a range_tombstone which is overlapping with upper_bound,
// unless no such fragment or already emitted.
if (!_range_tombstones.empty()
&& less(_range_tombstones.begin()->position(), upper_bound)
&& (!less(_range_tombstones.begin()->position(), _lower_bound))) {
&& (cmp(_range_tombstones.begin()->position(), upper_bound) < 0)
&& (cmp(_range_tombstones.begin()->position(), _lower_bound) >= 0)) {
consumer(range_tombstone_change(
_range_tombstones.begin()->position(), _range_tombstones.begin()->tombstone().tomb));
}

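The hunk above replaces `position_in_partition::less_compare` with a single `position_in_partition::tri_compare`, so one comparator object serves the `<`, `>=` and `== 0` checks, including the new `flush_all` equality test against `after_all_clustered_rows()`. A minimal standalone illustration of the three-way idiom; plain ints stand in for the schema-aware position comparator, so this is a sketch, not the Scylla API:

```cpp
#include <iostream>

int main() {
    // Three-way comparator returning negative/zero/positive, like
    // position_in_partition::tri_compare in the hunk above.
    auto tri = [] (int a, int b) { return (a > b) - (a < b); };
    int lower_bound = 3, pos = 5, upper_bound = 5;
    std::cout << (tri(pos, lower_bound) >= 0) << '\n';         // 1: "not already emitted" check
    std::cout << (tri(pos, upper_bound) < 0) << '\n';          // 0: pos is not strictly before upper_bound
    std::cout << (tri(upper_bound, upper_bound) == 0) << '\n'; // 1: the flush_all-style equality test
}
```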
@@ -294,10 +294,11 @@ public:
}
}

future<> maybe_wait_readmission() {
if (_state != reader_permit::state::evicted) {
return make_ready_future<>();
}
bool needs_readmission() const {
return _state == reader_permit::state::evicted;
}

future<> wait_readmission() {
return _semaphore.do_wait_admission(shared_from_this());
}

@@ -360,8 +361,16 @@ reader_concurrency_semaphore& reader_permit::semaphore() {
return _impl->semaphore();
}

future<> reader_permit::maybe_wait_readmission() {
return _impl->maybe_wait_readmission();
reader_permit::state reader_permit::get_state() const {
return _impl->get_state();
}

bool reader_permit::needs_readmission() const {
return _impl->needs_readmission();
}

future<> reader_permit::wait_readmission() {
return _impl->wait_readmission();
}

void reader_permit::consume(reader_resources res) {

@@ -661,11 +670,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(flat_mutation_reader_v2 reader) noexcept {
auto& permit_impl = *reader.permit()._impl;
permit_impl.on_register_as_inactive();
// Implies _inactive_reads.empty(), we don't queue new readers before
// evicting all inactive reads.
// Checking the _wait_list covers the count resources only, so check memory
// separately.
if (_wait_list.empty() && _resources.memory > 0) {
if (!should_evict_inactive_read()) {
try {
auto irp = std::make_unique<inactive_read>(std::move(reader));
auto& ir = *irp;
@@ -736,10 +741,7 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read(evict_reason reas

void reader_concurrency_semaphore::clear_inactive_reads() {
while (!_inactive_reads.empty()) {
auto& ir = _inactive_reads.front();
close_reader(std::move(ir.reader));
// Destroying the read unlinks it too.
std::unique_ptr<inactive_read> _(&*_inactive_reads.begin());
evict(_inactive_reads.front(), evict_reason::manual);
}
}

@@ -751,8 +753,6 @@ future<> reader_concurrency_semaphore::evict_inactive_reads_for_table(utils::UUI
++it;
if (ir.reader.schema()->id() == id) {
do_detach_inactive_reader(ir, evict_reason::manual);
ir.ttl_timer.cancel();
ir.unlink();
evicted_readers.push_back(ir);
}
}
@@ -785,6 +785,8 @@ future<> reader_concurrency_semaphore::stop() noexcept {
}

void reader_concurrency_semaphore::do_detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
ir.unlink();
ir.ttl_timer.cancel();
ir.detach();
ir.reader.permit()._impl->on_evicted();
try {

@@ -858,35 +860,89 @@ future<> reader_concurrency_semaphore::enqueue_waiter(reader_permit permit, read
}

void reader_concurrency_semaphore::evict_readers_in_background() {
if (_evicting) {
return;
}
_evicting = true;
// Evict inactive readers in the background while wait list isn't empty
// This is safe since stop() closes _gate.
(void)with_gate(_close_readers_gate, [this] {
return do_until([this] { return _wait_list.empty() || _inactive_reads.empty(); }, [this] {
return detach_inactive_reader(_inactive_reads.front(), evict_reason::permit).close();
return repeat([this] {
if (_inactive_reads.empty() || !should_evict_inactive_read()) {
_evicting = false;
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return detach_inactive_reader(_inactive_reads.front(), evict_reason::permit).close().then([] {
return stop_iteration::no;
});
});
});
}
}

reader_concurrency_semaphore::admit_result
reader_concurrency_semaphore::can_admit_read(const reader_permit& permit) const noexcept {
if (!_ready_list.empty()) {
return {can_admit::no, reason::ready_list};
}

if (!all_used_permits_are_stalled()) {
return {can_admit::no, reason::used_permits};
}

if (!has_available_units(permit.base_resources())) {
auto reason = _resources.memory >= permit.base_resources().memory ? reason::memory_resources : reason::count_resources;
if (_inactive_reads.empty()) {
return {can_admit::no, reason};
} else {
return {can_admit::maybe, reason};
}
}

return {can_admit::yes, reason::all_ok};
}

bool reader_concurrency_semaphore::should_evict_inactive_read() const noexcept {
if (_resources.memory < 0 || _resources.count < 0) {
return true;
}
if (_wait_list.empty()) {
return false;
}
const auto r = can_admit_read(_wait_list.front().permit).why;
return r == reason::memory_resources || r == reason::count_resources;
}

future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, read_func func) {
if (!_execution_loop_future) {
_execution_loop_future.emplace(execution_loop());
}
if (!_wait_list.empty() || !_ready_list.empty()) {
return enqueue_waiter(std::move(permit), std::move(func));
}

if (!has_available_units(permit.base_resources())) {
static uint64_t stats::*stats_table[] = {
&stats::reads_admitted_immediately,
&stats::reads_queued_because_ready_list,
&stats::reads_queued_because_used_permits,
&stats::reads_queued_because_memory_resources,
&stats::reads_queued_because_count_resources
};

const auto [admit, why] = can_admit_read(permit);
++(_stats.*stats_table[static_cast<int>(why)]);
if (admit != can_admit::yes || !_wait_list.empty()) {
auto fut = enqueue_waiter(std::move(permit), std::move(func));
if (!_inactive_reads.empty()) {
if (admit == can_admit::yes && !_wait_list.empty()) {
// This is a contradiction: the semaphore could admit new reads yet it has waiters.
// Normally, the semaphore should admit waiters as soon as it can.
// So at any point in time, there should either be no waiters, or it
// shouldn't be able to admit new reads. Otherwise something went wrong.
maybe_dump_reader_permit_diagnostics(*this, _permit_list, "semaphore could admit new reads yet there are waiters");
maybe_admit_waiters();
} else if (admit == can_admit::maybe) {
++_stats.reads_queued_with_eviction;
evict_readers_in_background();
}
return fut;
}

if (!all_used_permits_are_stalled()) {
return enqueue_waiter(std::move(permit), std::move(func));
}

permit.on_admission();
++_stats.reads_admitted;
if (func) {

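The new `do_wait_admission()` records why a read was queued through a pointer-to-member lookup table indexed by the `reason` enum, so a single increment works for every counter. A self-contained sketch of that C++ idiom; the struct and field names here are illustrative, not the real `stats` members:

```cpp
#include <cstdint>
#include <iostream>

struct stats {
    uint64_t admitted_immediately = 0;
    uint64_t queued_ready_list = 0;
    uint64_t queued_memory = 0;
};

enum class reason { all_ok = 0, ready_list, memory };

int main() {
    // Array of pointers-to-member; the enum value selects which counter to bump.
    static uint64_t stats::*table[] = {
        &stats::admitted_immediately,
        &stats::queued_ready_list,
        &stats::queued_memory,
    };
    stats s;
    reason why = reason::memory;
    ++(s.*table[static_cast<int>(why)]); // one line updates the matching counter
    std::cout << s.queued_memory << '\n'; // prints 1
}
```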
@@ -896,7 +952,8 @@ future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, r
}

void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
while (!_wait_list.empty() && _ready_list.empty() && has_available_units(_wait_list.front().permit.base_resources()) && all_used_permits_are_stalled()) {
auto admit = can_admit::no;
while (!_wait_list.empty() && (admit = can_admit_read(_wait_list.front().permit).decision) == can_admit::yes) {
auto& x = _wait_list.front();
try {
x.permit.on_admission();
@@ -911,6 +968,10 @@ void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
}
_wait_list.pop_front();
}
if (admit == can_admit::maybe) {
// Evicting readers will trigger another call to `maybe_admit_waiters()` from `signal()`.
evict_readers_in_background();
}
}

void reader_concurrency_semaphore::on_permit_created(reader_permit::impl& permit) {
@@ -987,6 +1048,13 @@ future<> reader_concurrency_semaphore::with_ready_permit(reader_permit permit, r
return fut;
}

void reader_concurrency_semaphore::set_resources(resources r) {
auto delta = r - _initial_resources;
_initial_resources = r;
_resources += delta;
maybe_admit_waiters();
}

void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
if (!ex) {
ex = std::make_exception_ptr(broken_semaphore{});

@@ -74,6 +74,18 @@ public:
uint64_t reads_admitted = 0;
// Total number of reads enqueued to wait for admission.
uint64_t reads_enqueued = 0;
// Total number of reads admitted immediately, without queueing
uint64_t reads_admitted_immediately = 0;
// Total number of reads enqueued because ready_list wasn't empty
uint64_t reads_queued_because_ready_list = 0;
// Total number of reads enqueued because there are used but unblocked permits
uint64_t reads_queued_because_used_permits = 0;
// Total number of reads enqueued because there weren't enough memory resources
uint64_t reads_queued_because_memory_resources = 0;
// Total number of reads enqueued because there weren't enough count resources
uint64_t reads_queued_because_count_resources = 0;
// Total number of reads enqueued to be maybe admitted after evicting some inactive reads
uint64_t reads_queued_with_eviction = 0;
// Total number of permits created so far.
uint64_t total_permits = 0;
// Current number of permits.
@@ -169,7 +181,7 @@ public:
};

private:
const resources _initial_resources;
resources _initial_resources;
resources _resources;

expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
@@ -181,6 +193,7 @@ private:
stats _stats;
permit_list_type _permit_list;
bool _stopped = false;
bool _evicting = false;
gate _close_readers_gate;
gate _permit_gate;
std::optional<future<>> _execution_loop_future;
@@ -201,6 +214,19 @@ private:
future<> enqueue_waiter(reader_permit permit, read_func func);
void evict_readers_in_background();
future<> do_wait_admission(reader_permit permit, read_func func = {});

// Check whether permit can be admitted or not.
// The wait list is not taken into consideration, this is the caller's
// responsibility.
// A return value of can_admit::maybe means admission might be possible if
// some of the inactive readers are evicted.
enum class can_admit { no, maybe, yes };
enum class reason { all_ok = 0, ready_list, used_permits, memory_resources, count_resources };
struct admit_result { can_admit decision; reason why; };
admit_result can_admit_read(const reader_permit& permit) const noexcept;

bool should_evict_inactive_read() const noexcept;

void maybe_admit_waiters() noexcept;

void on_permit_created(reader_permit::impl&);

@@ -390,6 +416,12 @@ public:
/// optimal than just using \ref with_permit().
future<> with_ready_permit(reader_permit permit, read_func func);

/// Set the total resources of the semaphore to \p r.
///
/// After this call, \ref initial_resources() will reflect the new value.
/// Available resources will be adjusted by the delta.
void set_resources(resources r);

const resources initial_resources() const {
return _initial_resources;
}

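`set_resources()` above adjusts the available amount by the delta between the old and new totals, so whatever admitted reads have already consumed stays accounted for. A standalone sketch of that arithmetic, with plain ints standing in for the count/memory resource pair:

```cpp
#include <iostream>

int main() {
    int initial = 100, available = 40;  // 60 units currently consumed by readers
    int new_total = 150;
    int delta = new_total - initial;    // +50
    initial = new_total;
    available += delta;                 // consumption is still 60
    std::cout << available << '\n';     // prints 90, not 150
}
```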
@@ -134,7 +134,12 @@ public:

reader_concurrency_semaphore& semaphore();

future<> maybe_wait_readmission();
state get_state() const;

bool needs_readmission() const;

// Call only when needs_readmission() = true.
future<> wait_readmission();

void consume(reader_resources res);

@@ -182,6 +187,8 @@ public:
reader_resources resources() const { return _resources; }
};

std::ostream& operator<<(std::ostream& os, reader_permit::state s);

/// Mark a permit as used.
///
/// Conceptually, a permit is considered used, when at least one reader

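The hunk splits the old `maybe_wait_readmission()` into a cheap `needs_readmission()` check and a `wait_readmission()` that, per the comment, may only be called when the check returns true. A hypothetical call site, sketched with a synchronous mock so it stays self-contained; real callers co_await a `future<>`:

```cpp
#include <iostream>

// Mock of the two-step API; wait_readmission() stands in for the co_await.
struct mock_permit {
    bool evicted = true;
    bool needs_readmission() const { return evicted; }
    void wait_readmission() { evicted = false; } // only legal when the check is true
};

int main() {
    mock_permit p;
    if (p.needs_readmission()) {  // replaces the old maybe_wait_readmission()
        p.wait_readmission();
    }
    std::cout << std::boolalpha << p.needs_readmission() << '\n'; // false
}
```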
@@ -279,6 +279,7 @@ using sstable_list = sstables::sstable_list;
namespace replica {

class distributed_loader;
struct table_population_metadata;

// The CF has a "stats" structure. But we don't want all fields here,
// since some of them are fairly complex for exporting to collectd. Also,
@@ -900,6 +901,8 @@ public:
// The future value is true iff offstrategy compaction was required.
future<bool> perform_offstrategy_compaction();
future<> run_offstrategy_compaction(sstables::compaction_data& info);
future<> perform_cleanup_compaction(replica::database& db);

void set_compaction_strategy(sstables::compaction_strategy_type strategy);
const sstables::compaction_strategy& get_compaction_strategy() const {
return _compaction_strategy;
@@ -925,7 +928,11 @@ public:
return _config;
}

compaction_manager& get_compaction_manager() const {
const compaction_manager& get_compaction_manager() const noexcept {
return _compaction_manager;
}

compaction_manager& get_compaction_manager() noexcept {
return _compaction_manager;
}

@@ -1080,6 +1087,7 @@ public:
friend class ::column_family_test;

friend class distributed_loader;
friend class table_population_metadata;

private:
timer<> _off_strategy_trigger;

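The `get_compaction_manager()` change above is a const-correctness fix: the old const member function handed out a mutable reference, so const callers could mutate the manager. A standalone sketch of the resulting const/non-const overload pair, with simplified stand-in types rather than the real Scylla classes:

```cpp
#include <iostream>

struct compaction_manager { int jobs = 0; };

struct table {
    compaction_manager _cm;
    // Const callers get a const view; non-const callers keep a mutable one.
    const compaction_manager& get_compaction_manager() const noexcept { return _cm; }
    compaction_manager& get_compaction_manager() noexcept { return _cm; }
};

int main() {
    table t;
    t.get_compaction_manager().jobs = 1;                     // non-const overload
    const table& ct = t;
    std::cout << ct.get_compaction_manager().jobs << '\n';   // const overload, prints 1
}
```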
@@ -6,6 +6,7 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

#include <seastar/core/coroutine.hh>
#include <seastar/util/closeable.hh>
#include "distributed_loader.hh"
#include "replica/database.hh"
@@ -361,7 +362,7 @@ distributed_loader::process_upload_dir(distributed<replica::database>& db, distr
&error_handler_gen_for_upload_dir);
}, sstables::sstable_directory::default_sstable_filter()).get();

const bool use_view_update_path = db::view::check_needs_view_update_path(sys_dist_ks.local(), *global_table, streaming::stream_reason::repair).get0();
const bool use_view_update_path = db::view::check_needs_view_update_path(sys_dist_ks.local(), db.local().get_token_metadata(), *global_table, streaming::stream_reason::repair).get0();

auto datadir = upload.parent_path();
if (use_view_update_path) {

@@ -454,92 +455,192 @@ future<> distributed_loader::handle_sstables_pending_delete(sstring pending_dele
});
}

future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), do_allow_offstrategy_compaction, dir_must_exist] {
class table_population_metadata {
distributed<replica::database>& _db;
sstring _ks;
sstring _cf;
global_column_family_ptr _global_table;
fs::path _base_path;
std::unordered_map<sstring, lw_shared_ptr<sharded<sstables::sstable_directory>>> _sstable_directories;
sstables::sstable_version_types _highest_version = sstables::oldest_writable_sstable_format;
int64_t _highest_generation = 0;

public:
table_population_metadata(distributed<replica::database>& db, sstring ks, sstring cf)
: _db(db)
, _ks(std::move(ks))
, _cf(std::move(cf))
, _global_table(_db, _ks, _cf)
, _base_path(_global_table->dir())
{}

~table_population_metadata() {
// All directories must have been stopped
// using table_population_metadata::stop()
assert(_sstable_directories.empty());
}

future<> start() {
assert(this_shard_id() == 0);

if (!file_exists(sstdir).get0()) {
if (dir_must_exist) {
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", ks, cf, sstdir));
}
return;
for (auto subdir : { "", sstables::staging_dir, sstables::quarantine_dir }) {
co_await start_subdir(subdir);
}

// First pass, cleanup temporary sstable directories and sstables pending delete.
cleanup_column_family_temp_sst_dirs(sstdir).get();
auto pending_delete_dir = sstdir + "/" + sstables::sstable::pending_delete_dir_basename();
auto exists = file_exists(pending_delete_dir).get0();
if (exists) {
handle_sstables_pending_delete(pending_delete_dir).get();
co_await smp::invoke_on_all([this] {
_global_table->update_sstables_known_generation(_highest_generation);
return _global_table->disable_auto_compaction();
});
}

future<> stop() {
for (auto it = _sstable_directories.begin(); it != _sstable_directories.end(); it = _sstable_directories.erase(it)) {
co_await it->second->stop();
}
}

global_column_family_ptr global_table(db, ks, cf);
fs::path get_path(std::string_view subdir) {
return subdir.empty() ? _base_path : _base_path / subdir;
}

sharded<sstables::sstable_directory> directory;
directory.start(fs::path(sstdir), db.local().get_config().initial_sstable_loading_concurrency(), std::ref(db.local().get_sharded_sst_dir_semaphore()),
sstables::sstable_directory::need_mutate_level::no,
sstables::sstable_directory::lack_of_toc_fatal::yes,
sstables::sstable_directory::enable_dangerous_direct_import_of_cassandra_counters(db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters()),
sstables::sstable_directory::allow_loading_materialized_view::yes,
[&global_table] (fs::path dir, int64_t gen, sstables::sstable_version_types v, sstables::sstable_format_types f) {
return global_table->make_sstable(dir.native(), gen, v, f);
}).get();
distributed<replica::database>& db() noexcept {
return _db;
}

auto stop = deferred_stop(directory);
const sstring& ks() const noexcept {
return _ks;
}

lock_table(directory, db, ks, cf).get();
process_sstable_dir(directory).get();
const sstring& cf() const noexcept {
return _cf;
}

// If we are resharding system tables before we can read them, we will not
// know which is the highest format we support: this information is itself stored
// in the system tables. In that case we'll rely on what we find on disk: we'll
// at least not downgrade any files. If we already know that we support a higher
// format than the one we see then we use that.
auto sys_format = global_table->get_sstables_manager().get_highest_supported_format();
auto sst_version = highest_version_seen(directory, sys_format).get0();
auto generation = highest_generation_seen(directory).get0();
global_column_family_ptr& global_table() noexcept {
return _global_table;
};

db.invoke_on_all([&global_table, generation] (replica::database& db) {
global_table->update_sstables_known_generation(generation);
return global_table->disable_auto_compaction();
}).get();
const global_column_family_ptr& global_table() const noexcept {
return _global_table;
};

reshard(directory, db, ks, cf, [&global_table, sstdir, sst_version] (shard_id shard) mutable {
auto gen = smp::submit_to(shard, [&global_table] () {
return global_table->calculate_generation_for_new_table();
}).get0();
const std::unordered_map<sstring, lw_shared_ptr<sharded<sstables::sstable_directory>>>& sstable_directories() const noexcept {
return _sstable_directories;
}

return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}).get();
sstables::sstable::version_types highest_version() const noexcept {
return _highest_version;
}

// The node is offline at this point so we are very lenient with what we consider
// offstrategy.
// SSTables created by repair may not conform to compaction strategy layout goal
// because data segregation is only performed by compaction
// Instead of reshaping them on boot, let's add them to maintenance set and allow
// off-strategy compaction to reshape them. This will allow node to become online
// ASAP. Given that SSTables with repair origin are disjoint, they can be efficiently
// read from.
auto eligible_for_reshape_on_boot = [] (const sstables::shared_sstable& sst) {
return sst->get_origin() != sstables::repair_origin;
};
int64_t highest_generation() const noexcept {
return _highest_generation;
}

reshape(directory, db, sstables::reshape_mode::relaxed, ks, cf, [global_table, sstdir, sst_version] (shard_id shard) {
auto gen = global_table->calculate_generation_for_new_table();
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}, eligible_for_reshape_on_boot).get();
private:
future<> start_subdir(sstring subdir);
};

directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) {
return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
}).then([&global_table, do_allow_offstrategy_compaction] {
if (do_allow_offstrategy_compaction) {
global_table->trigger_offstrategy_compaction();
}
});
}).get();
future<> table_population_metadata::start_subdir(sstring subdir) {
sstring sstdir = get_path(subdir).native();
if (!co_await file_exists(sstdir)) {
co_return;
}

// First pass, cleanup temporary sstable directories and sstables pending delete.
co_await distributed_loader::cleanup_column_family_temp_sst_dirs(sstdir);
auto pending_delete_dir = sstdir + "/" + sstables::sstable::pending_delete_dir_basename();
auto exists = co_await file_exists(pending_delete_dir);
if (exists) {
co_await distributed_loader::handle_sstables_pending_delete(pending_delete_dir);
}

auto dptr = make_lw_shared<sharded<sstables::sstable_directory>>();
auto& directory = *dptr;
auto& global_table = _global_table;
auto& db = _db;
co_await directory.start(fs::path(sstdir),
db.local().get_config().initial_sstable_loading_concurrency(), std::ref(db.local().get_sharded_sst_dir_semaphore()),
sstables::sstable_directory::need_mutate_level::no,
sstables::sstable_directory::lack_of_toc_fatal::yes,
sstables::sstable_directory::enable_dangerous_direct_import_of_cassandra_counters(db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters()),
sstables::sstable_directory::allow_loading_materialized_view::yes,
[&global_table] (fs::path dir, int64_t gen, sstables::sstable_version_types v, sstables::sstable_format_types f) {
return global_table->make_sstable(dir.native(), gen, v, f);
});

// directory must be stopped using table_population_metadata::stop below
_sstable_directories[subdir] = dptr;

co_await distributed_loader::lock_table(directory, _db, _ks, _cf);
co_await distributed_loader::process_sstable_dir(directory);

// If we are resharding system tables before we can read them, we will not
// know which is the highest format we support: this information is itself stored
// in the system tables. In that case we'll rely on what we find on disk: we'll
// at least not downgrade any files. If we already know that we support a higher
// format than the one we see then we use that.
auto sys_format = global_table->get_sstables_manager().get_highest_supported_format();
auto sst_version = co_await highest_version_seen(directory, sys_format);
auto generation = co_await highest_generation_seen(directory);

_highest_version = std::max(sst_version, _highest_version);
_highest_generation = std::max(generation, _highest_generation);
}

future<> distributed_loader::populate_column_family(table_population_metadata& metadata, sstring subdir, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
auto& db = metadata.db();
const auto& ks = metadata.ks();
const auto& cf = metadata.cf();
auto sstdir = metadata.get_path(subdir).native();
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);

assert(this_shard_id() == 0);

if (!co_await file_exists(sstdir)) {
if (dir_must_exist) {
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", metadata.ks(), metadata.cf(), sstdir));
}
co_return;
}

auto& global_table = metadata.global_table();
if (!metadata.sstable_directories().contains(subdir)) {
dblog.error("Could not find sstables directory {}.{}/{}", ks, cf, subdir);
}
auto& directory = *metadata.sstable_directories().at(subdir);
auto sst_version = metadata.highest_version();

co_await reshard(directory, db, ks, cf, [&global_table, sstdir, sst_version] (shard_id shard) mutable {
auto gen = smp::submit_to(shard, [&global_table] () {
return global_table->calculate_generation_for_new_table();
}).get0();

return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
});

// The node is offline at this point so we are very lenient with what we consider
// offstrategy.
// SSTables created by repair may not conform to compaction strategy layout goal
// because data segregation is only performed by compaction
// Instead of reshaping them on boot, let's add them to maintenance set and allow
// off-strategy compaction to reshape them. This will allow node to become online
// ASAP. Given that SSTables with repair origin are disjoint, they can be efficiently
// read from.
auto eligible_for_reshape_on_boot = [] (const sstables::shared_sstable& sst) {
return sst->get_origin() != sstables::repair_origin;
};

co_await reshape(directory, db, sstables::reshape_mode::relaxed, ks, cf, [global_table, sstdir, sst_version] (shard_id shard) {
auto gen = global_table->calculate_generation_for_new_table();
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}, eligible_for_reshape_on_boot);

co_await directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) -> future<> {
co_await dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
});
if (do_allow_offstrategy_compaction) {
global_table->trigger_offstrategy_compaction();
}
});
}

@@ -549,41 +650,51 @@ future<> distributed_loader::populate_keyspace(distributed<replica::database>& d
auto i = keyspaces.find(ks_name);
if (i == keyspaces.end()) {
dblog.warn("Skipping undefined keyspace: {}", ks_name);
return make_ready_future<>();
} else {
dblog.info("Populating Keyspace {}", ks_name);
auto& ks = i->second;
auto& column_families = db.local().get_column_families();

return parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values,
[ks_name, ksdir, &ks, &column_families, &db] (schema_ptr s) {
utils::UUID uuid = s->id();
lw_shared_ptr<replica::column_family> cf = column_families[uuid];
sstring cfname = cf->schema()->cf_name();
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname, allow_offstrategy_compaction::no);
}).then([&db, sstdir, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, allow_offstrategy_compaction::no, must_exist::no);
}).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname, allow_offstrategy_compaction::yes);
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
std::string msg =
format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
try {
std::rethrow_exception(eptr);
} catch (sstables::compaction_stopped_exception& e) {
// swallow compaction stopped exception, to allow clean shutdown.
} catch (...) {
throw std::runtime_error(msg.c_str());
}
});
});
co_return;
}

dblog.info("Populating Keyspace {}", ks_name);
auto& ks = i->second;
auto& column_families = db.local().get_column_families();

co_await parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values, [&] (schema_ptr s) -> future<> {
utils::UUID uuid = s->id();
lw_shared_ptr<replica::column_family> cf = column_families[uuid];
sstring cfname = cf->schema()->cf_name();
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());

auto metadata = table_population_metadata(db, ks_name, cfname);
std::exception_ptr ex;

try {
co_await ks.make_directory_for_column_family(cfname, uuid);

co_await metadata.start();
co_await distributed_loader::populate_column_family(metadata, sstables::staging_dir, allow_offstrategy_compaction::no);
co_await distributed_loader::populate_column_family(metadata, sstables::quarantine_dir, allow_offstrategy_compaction::no, must_exist::no);
co_await distributed_loader::populate_column_family(metadata, "", allow_offstrategy_compaction::yes);
} catch (...) {
std::exception_ptr eptr = std::current_exception();
std::string msg =
format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
try {
std::rethrow_exception(eptr);
} catch (sstables::compaction_stopped_exception& e) {
// swallow compaction stopped exception, to allow clean shutdown.
} catch (...) {
ex = std::make_exception_ptr(std::runtime_error(msg.c_str()));
}
}

co_await metadata.stop();
if (ex) {
std::rethrow_exception(std::move(ex));
}
});
}

future<> distributed_loader::init_system_keyspace(distributed<replica::database>& db, distributed<service::storage_service>& ss, sharded<gms::gossiper>& g, db::config& cfg) {

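The rewritten `populate_keyspace()` captures any failure into an `std::exception_ptr`, unconditionally runs `metadata.stop()` (the destructor asserts that all directories were stopped), and only then rethrows. A minimal standalone sketch of that capture-cleanup-rethrow shape:

```cpp
#include <exception>
#include <iostream>
#include <stdexcept>

int main() {
    std::exception_ptr ex;
    try {
        throw std::runtime_error("populate failed"); // stands in for populate_column_family()
    } catch (...) {
        ex = std::current_exception();               // capture instead of propagating
    }
    std::cout << "cleanup runs unconditionally\n";   // metadata.stop() analogue
    if (ex) {
        try { std::rethrow_exception(ex); }          // surface the original error afterwards
        catch (const std::exception& e) { std::cout << e.what() << '\n'; }
    }
}
```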
@@ -57,8 +57,11 @@ class distributed_loader_for_tests;

namespace replica {

class table_population_metadata;

class distributed_loader {
friend class ::distributed_loader_for_tests;
friend class table_population_metadata;

static future<> reshape(sharded<sstables::sstable_directory>& dir, sharded<replica::database>& db, sstables::reshape_mode mode,
sstring ks_name, sstring table_name, sstables::compaction_sstable_creator_fn creator, std::function<bool (const sstables::shared_sstable&)> filter);
@@ -70,7 +73,7 @@ class distributed_loader {
std::filesystem::path datadir, sstring ks, sstring cf);
using allow_offstrategy_compaction = bool_class<struct allow_offstrategy_compaction_tag>;
using must_exist = bool_class<struct must_exist_tag>;
static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction, must_exist = must_exist::yes);
static future<> populate_column_family(table_population_metadata& metadata, sstring subdir, allow_offstrategy_compaction, must_exist = must_exist::yes);
static future<> populate_keyspace(distributed<replica::database>& db, sstring datadir, sstring ks_name);
static future<> cleanup_column_family_temp_sst_dirs(sstring sstdir);
static future<> handle_sstables_pending_delete(sstring pending_deletes_dir);

@@ -803,16 +803,15 @@ void table::set_metrics() {
}

void table::rebuild_statistics() {
// zeroing live_disk_space_used and live_sstable_count because the
// sstable list was re-created
_stats.live_disk_space_used = 0;
_stats.live_sstable_count = 0;
_stats.total_disk_space_used = 0;

_sstables->for_each_sstable([this] (const sstables::shared_sstable& tab) {
update_stats_for_new_sstable(tab->bytes_on_disk());
});
for (auto& tab : _sstables_compacted_but_not_deleted) {
update_stats_for_new_sstable(tab->bytes_on_disk());
_stats.total_disk_space_used += tab->bytes_on_disk();
}
}

@@ -1137,6 +1136,11 @@ future<> table::run_offstrategy_compaction(sstables::compaction_data& info) {
tlogger.info("Done with off-strategy compaction for {}.{}", _schema->ks_name(), _schema->cf_name());
}

future<> table::perform_cleanup_compaction(replica::database& db) {
co_await flush();
co_await get_compaction_manager().perform_cleanup(db, this);
}

void table::set_compaction_strategy(sstables::compaction_strategy_type strategy) {
tlogger.debug("Setting compaction strategy of {}.{} to {}", _schema->ks_name(), _schema->cf_name(), sstables::compaction_strategy::name(strategy));
auto new_cs = make_compaction_strategy(strategy, _schema->compaction_strategy_options());

@@ -1772,29 +1776,30 @@ future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
tracing::trace_state_ptr tr_state,
gc_clock::time_point now) const {
auto base_token = m.token();
auto m_schema = m.schema();
db::view::view_update_builder builder = co_await db::view::make_view_update_builder(
base,
std::move(views),
make_flat_mutation_reader_from_mutations(m.schema(), std::move(permit), {std::move(m)}),
make_flat_mutation_reader_from_mutations(std::move(m_schema), std::move(permit), {std::move(m)}),
std::move(existings),
now);

std::exception_ptr err = nullptr;
while (true) {
utils::chunked_vector<frozen_mutation_and_schema> updates;
std::optional<utils::chunked_vector<frozen_mutation_and_schema>> updates;
try {
updates = co_await builder.build_some();
} catch (...) {
err = std::current_exception();
break;
}
if (updates.empty()) {
if (!updates) {
break;
}
tracing::trace(tr_state, "Generated {} view update mutations", updates.size());
auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates));
tracing::trace(tr_state, "Generated {} view update mutations", updates->size());
auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(*updates));
try {
co_await db::view::mutate_MV(base_token, std::move(updates), _view_stats, *_config.cf_stats, tr_state,
co_await db::view::mutate_MV(base_token, std::move(*updates), _view_stats, *_config.cf_stats, tr_state,
std::move(units), service::allow_hints::yes, db::view::wait_for_all_updates::no);
} catch (...) {
// Ignore exceptions: any individual failure to propagate a view update will be reported

@@ -1918,14 +1923,14 @@ future<> table::populate_views(
while (true) {
try {
auto updates = co_await builder.build_some();
if (updates.empty()) {
if (!updates) {
break;
}
size_t update_size = memory_usage_of(updates);
size_t update_size = memory_usage_of(*updates);
size_t units_to_wait_for = std::min(_config.view_update_concurrency_semaphore_limit, update_size);
auto units = co_await seastar::get_units(*_config.view_update_concurrency_semaphore, units_to_wait_for);
units.adopt(seastar::consume_units(*_config.view_update_concurrency_semaphore, update_size - units_to_wait_for));
co_await db::view::mutate_MV(base_token, std::move(updates), _view_stats, *_config.cf_stats,
co_await db::view::mutate_MV(base_token, std::move(*updates), _view_stats, *_config.cf_stats,
tracing::trace_state_ptr(), std::move(units), service::allow_hints::no, db::view::wait_for_all_updates::yes);
} catch (...) {
if (!err) {

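Both loops above now treat `build_some()` as returning `std::optional`, presumably so that end-of-stream (`nullopt`) is distinguishable from a legitimately empty batch, which the old `updates.empty()` test conflated with termination. A standalone sketch of consuming such an optional-returning producer (`build_some` here is a toy stand-in):

```cpp
#include <iostream>
#include <optional>
#include <vector>

// Toy producer: three batches of growing size, then nullopt for end-of-stream.
std::optional<std::vector<int>> build_some(int& calls) {
    if (calls == 3) return std::nullopt; // end of stream, distinct from an empty batch
    ++calls;
    return std::vector<int>(calls, 0);
}

int main() {
    int calls = 0;
    while (auto updates = build_some(calls)) { // loop ends only on nullopt
        std::cout << "batch of " << updates->size() << '\n';
    }
}
```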
@@ -950,6 +950,11 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
_prev_snapshot = {};
});
utils::coroutine update; // Destroy before cleanup to release snapshots before invalidating.
auto destroy_update = defer([&] {
with_allocator(_tracker.allocator(), [&] {
update = {};
});
});
partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
while (!m.partitions.empty()) {
with_allocator(_tracker.allocator(), [&] () {
@@ -1222,6 +1227,10 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
// That dummy is linked in the LRU, because there may be partitions
// with no regular rows, and we need to track them.
unlink_from_lru();

// We still need to break continuity in order to preserve the "older versions are evicted first"
// invariant.
it->set_continuous(false);
} else {
// When evicting a dummy with both sides continuous we don't need to break continuity.
//

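The `do_update()` hunk relies on deferred actions running in reverse declaration order: `destroy_update` is declared after the snapshot-invalidating cleanup, so it fires first, matching the "destroy before cleanup" comment. A standalone sketch with a tiny scope guard standing in for `seastar::defer`:

```cpp
#include <functional>
#include <iostream>

// Minimal scope guard: runs its action on destruction.
struct guard {
    std::function<void()> f;
    ~guard() { f(); }
};

int main() {
    guard cleanup{[] { std::cout << "2: invalidate snapshots\n"; }};
    guard destroy_update{[] { std::cout << "1: destroy update state\n"; }};
    // Destructors run bottom-up: destroy_update fires before cleanup.
}
```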
@@ -63,4 +63,15 @@ MemoryLimit=$MEMORY_LIMIT
EOS
fi

if [ -e /etc/systemd/system/systemd-coredump@.service.d/timeout.conf ]; then
COREDUMP_RUNTIME_MAX=$(grep RuntimeMaxSec /etc/systemd/system/systemd-coredump@.service.d/timeout.conf)
if [ -z "$COREDUMP_RUNTIME_MAX" ]; then
cat << EOS > /etc/systemd/system/systemd-coredump@.service.d/timeout.conf
[Service]
RuntimeMaxSec=infinity
TimeoutSec=infinity
EOS
fi
fi

systemctl --system daemon-reload >/dev/null || true

Submodule seastar updated: 9a7ba6d57e...62fd873d09
@@ -78,7 +78,7 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
prv, tr_state, timeout);
});
});
return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest] (auto t) {
return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest, schema] (auto t) mutable {
if (utils::get_local_injector().enter("paxos_error_after_save_promise")) {
return make_exception_future<prepare_response>(utils::injected_error("injected_error_after_save_promise"));
}
@@ -103,8 +103,25 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
auto ex = f2.get_exception();
logger.debug("Failed to get data or digest: {}. Ignored.", std::move(ex));
}
return make_ready_future<prepare_response>(prepare_response(promise(std::move(state._accepted_proposal),
std::move(state._most_recent_commit), std::move(data_or_digest))));
auto upgrade_if_needed = [schema = std::move(schema)] (std::optional<proposal> p) {
if (!p || p->update.schema_version() == schema->version()) {
return make_ready_future<std::optional<proposal>>(std::move(p));
}
// In case current schema is not the same as the schema in the proposal
// try to look it up first in the local schema_registry cache and upgrade
// the mutation using schema from the cache.
//
// If there's no schema in the cache, then retrieve persisted column mapping
// for that version and upgrade the mutation with it.
logger.debug("Stored mutation references outdated schema version. "
"Trying to upgrade the accepted proposal mutation to the most recent schema version.");
return service::get_column_mapping(p->update.column_family_id(), p->update.schema_version()).then([schema, p = std::move(p)] (const column_mapping& cm) {
return make_ready_future<std::optional<proposal>>(proposal(p->ballot, freeze(p->update.unfreeze_upgrading(schema, cm))));
});
};
return when_all_succeed(upgrade_if_needed(std::move(state._accepted_proposal)), upgrade_if_needed(std::move(state._most_recent_commit))).then([data_or_digest = std::move(data_or_digest)] (auto&& u) mutable {
return prepare_response(promise(std::move(std::get<0>(u)), std::move(std::get<1>(u)), std::move(data_or_digest)));
});
});
} else {
logger.debug("Promise rejected; {} is not sufficiently newer than {}", ballot, state._promised_ballot);
@@ -200,15 +217,9 @@ future<> paxos_state::learn(storage_proxy& sp, schema_ptr schema, proposal decis
// If there's no schema in the cache, then retrieve persisted column mapping
// for that version and upgrade the mutation with it.
if (decision.update.schema_version() != schema->version()) {
logger.debug("Stored mutation references outdated schema version. "
"Trying to upgrade the accepted proposal mutation to the most recent schema version.");
return service::get_column_mapping(decision.update.column_family_id(), decision.update.schema_version())
.then([&sp, schema, tr_state, timeout, &decision] (const column_mapping& cm) {
return do_with(decision.update.unfreeze_upgrading(schema, cm), [&sp, tr_state, timeout] (const mutation& upgraded) {
return sp.mutate_locally(upgraded, tr_state, db::commitlog::force_sync::yes, timeout);
});
});
on_internal_error(logger, format("schema version in learn does not match current schema"));
}

return sp.mutate_locally(schema, decision.update, tr_state, db::commitlog::force_sync::yes, timeout);
});
} else {

@@ -1227,19 +1227,15 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d

auto cdc = _proxy->get_cdc_service();
if (cdc && cdc->needs_cdc_augmentation(update_mut_vec)) {
f_cdc = cdc->augment_mutation_call(_timeout, std::move(update_mut_vec), tr_state, _cl_for_learn)
.then([this, base_tbl_id, cdc = cdc->shared_from_this()] (std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>&& t) {
auto mutations = std::move(std::get<0>(t));
auto tracker = std::move(std::get<1>(t));
// Pick only the CDC ("augmenting") mutations
std::erase_if(mutations, [base_tbl_id = std::move(base_tbl_id)] (const mutation& v) {
return v.schema()->id() == base_tbl_id;
});
if (mutations.empty()) {
return make_ready_future<>();
}
return _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker));
auto cdc_shared = cdc->shared_from_this(); // keep CDC service alive
auto [mutations, tracker] = co_await cdc->augment_mutation_call(_timeout, std::move(update_mut_vec), tr_state, _cl_for_learn);
// Pick only the CDC ("augmenting") mutations
std::erase_if(mutations, [base_tbl_id = std::move(base_tbl_id)] (const mutation& v) {
return v.schema()->id() == base_tbl_id;
});
if (!mutations.empty()) {
f_cdc = _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker));
}
}
}

@@ -1247,7 +1243,7 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
std::array<std::tuple<lw_shared_ptr<paxos::proposal>, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, shared_from_this(), _key.token())};
future<> f_lwt = _proxy->mutate_internal(std::move(m), _cl_for_learn, false, tr_state, _permit, _timeout);

return when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
co_await when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
}

void paxos_response_handler::prune(utils::UUID ballot) {

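The coroutine rewrite of `learn_decision()` keeps the CDC service alive across suspension by copying `shared_from_this()` into a local before the first `co_await`. A standalone sketch of that keep-alive idiom, with a plain function standing in for the coroutine so it stays self-contained:

```cpp
#include <iostream>
#include <memory>

struct service : std::enable_shared_from_this<service> {
    int work() const { return 42; }
};

int do_async_work(service* raw) {
    // Pin the object's lifetime for the whole body, as cdc_shared does above;
    // in real coroutine code the suspension point would sit after this line.
    auto keep_alive = raw->shared_from_this();
    return keep_alive->work();
}

int main() {
    auto s = std::make_shared<service>();
    std::cout << do_async_work(s.get()) << '\n'; // prints 42
}
```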
@@ -2282,6 +2282,8 @@ future<> storage_service::removenode(sstring host_id_string, std::list<gms::inet
ss._group0->leave_group0(endpoint).get();
slogger.info("removenode[{}]: Finished removenode operation, removing node={}, sync_nodes={}, ignore_nodes={}", uuid, endpoint, nodes, ignore_nodes);
} catch (...) {
slogger.warn("removenode[{}]: removing node={}, sync_nodes={}, ignore_nodes={} failed, error {}",
uuid, endpoint, nodes, ignore_nodes, std::current_exception());
// we need to revert the effect of the prepare verb if the removenode op failed
req.cmd = node_ops_cmd::removenode_abort;
parallel_for_each(nodes, [&ss, &req, &nodes_unknown_verb, &nodes_down, uuid] (const gms::inet_address& node) {

@@ -21,6 +21,7 @@
#include "unimplemented.hh"
#include "segmented_compress_params.hh"
#include "utils/class_registrator.hh"
#include "reader_permit.hh"

namespace sstables {

@@ -338,16 +339,18 @@ class compressed_file_data_source_impl : public data_source_impl {
sstables::compression* _compression_metadata;
sstables::compression::segmented_offsets::accessor _offsets;
sstables::local_compression _compression;
reader_permit _permit;
uint64_t _underlying_pos;
uint64_t _pos;
uint64_t _beg_pos;
uint64_t _end_pos;
public:
compressed_file_data_source_impl(file f, sstables::compression* cm,
uint64_t pos, size_t len, file_input_stream_options options)
uint64_t pos, size_t len, file_input_stream_options options, reader_permit permit)
: _compression_metadata(cm)
, _offsets(_compression_metadata->offsets.get_accessor())
, _compression(*cm)
, _permit(std::move(permit))
{
_beg_pos = pos;
if (pos > _compression_metadata->uncompressed_file_length()) {
@@ -412,7 +415,7 @@ public:
_pos += out.size();
_underlying_pos += addr.chunk_len;

return out;
return make_tracked_temporary_buffer(std::move(out), _permit);
});
}

@@ -444,9 +447,9 @@ requires ChecksumUtils<ChecksumType>
class compressed_file_data_source : public data_source {
public:
compressed_file_data_source(file f, sstables::compression* cm,
uint64_t offset, size_t len, file_input_stream_options options)
uint64_t offset, size_t len, file_input_stream_options options, reader_permit permit)
: data_source(std::make_unique<compressed_file_data_source_impl<ChecksumType>>(
std::move(f), cm, offset, len, std::move(options)))
std::move(f), cm, offset, len, std::move(options), std::move(permit)))
{}
};

@@ -454,10 +457,10 @@ template <typename ChecksumType>
requires ChecksumUtils<ChecksumType>
inline input_stream<char> make_compressed_file_input_stream(
file f, sstables::compression *cm, uint64_t offset, size_t len,
file_input_stream_options options)
file_input_stream_options options, reader_permit permit)
{
return input_stream<char>(compressed_file_data_source<ChecksumType>(
std::move(f), cm, offset, len, std::move(options)));
std::move(f), cm, offset, len, std::move(options), std::move(permit)));
}

// For SSTables 2.x (formats 'ka' and 'la'), the full checksum is a combination of checksums of compressed chunks.
@@ -569,15 +572,15 @@ inline output_stream<char> make_compressed_file_output_stream(output_stream<char

input_stream<char> sstables::make_compressed_file_k_l_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
class file_input_stream_options options)
class file_input_stream_options options, reader_permit permit)
{
return make_compressed_file_input_stream<adler32_utils>(std::move(f), cm, offset, len, std::move(options));
return make_compressed_file_input_stream<adler32_utils>(std::move(f), cm, offset, len, std::move(options), std::move(permit));
}

input_stream<char> sstables::make_compressed_file_m_format_input_stream(file f,
sstables::compression *cm, uint64_t offset, size_t len,
class file_input_stream_options options) {
return make_compressed_file_input_stream<crc32_utils>(std::move(f), cm, offset, len, std::move(options));
class file_input_stream_options options, reader_permit permit) {
return make_compressed_file_input_stream<crc32_utils>(std::move(f), cm, offset, len, std::move(options), std::move(permit));
}

output_stream<char> sstables::make_compressed_file_m_format_output_stream(output_stream<char> out,

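Threading a `reader_permit` into the compressed stream lets `make_tracked_temporary_buffer()` charge each decompressed buffer against the permit, so the reader concurrency semaphore can account for memory held in stream buffers. A speculative, simplified sketch of that accounting idea; the types are stand-ins, not the Seastar/Scylla API:

```cpp
#include <iostream>
#include <string>

// Toy permit: tracks how much memory is currently charged against it.
struct permit {
    long consumed = 0;
    void consume(long n) { consumed += n; }
    void release(long n) { consumed -= n; }
};

// Toy tracked buffer: charges its size on construction, releases on destruction.
struct tracked_buffer {
    std::string data;
    permit* p;
    tracked_buffer(std::string d, permit& pp) : data(std::move(d)), p(&pp) { p->consume(data.size()); }
    ~tracked_buffer() { p->release(data.size()); }
};

int main() {
    permit p;
    {
        tracked_buffer buf("decompressed chunk", p);
        std::cout << p.consumed << '\n'; // 18 while the buffer is alive
    }
    std::cout << p.consumed << '\n';     // 0 after it is destroyed
}
```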
@@ -47,6 +47,8 @@
#include "checksum_utils.hh"
#include "../compress.hh"

class reader_permit;

class compression_parameters;
class compressor;
using compressor_ptr = shared_ptr<compressor>;
@@ -371,11 +373,11 @@ compressor_ptr get_sstable_compressor(const compression&);
// sstable alive, and the compression metadata is only a part of it.
input_stream<char> make_compressed_file_k_l_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
class file_input_stream_options options);
class file_input_stream_options options, reader_permit permit);

input_stream<char> make_compressed_file_m_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
class file_input_stream_options options);
class file_input_stream_options options, reader_permit permit);

output_stream<char> make_compressed_file_m_format_output_stream(output_stream<char> out,
sstables::compression* cm,

@@ -2287,7 +2287,7 @@ input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_prior
options.read_ahead = 4;
options.dynamic_adjustments = std::move(history);

file f = make_tracked_file(_data_file, std::move(permit));
file f = make_tracked_file(_data_file, permit);
if (trace_state) {
f = tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", get_filename()));
}
@@ -2296,10 +2296,10 @@ input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_prior
if (_components->compression) {
if (_version >= sstable_version_types::mc) {
return make_compressed_file_m_format_input_stream(f, &_components->compression,
pos, len, std::move(options));
pos, len, std::move(options), permit);
} else {
return make_compressed_file_k_l_format_input_stream(f, &_components->compression,
pos, len, std::move(options));
pos, len, std::move(options), permit);
}
}

@@ -29,7 +29,7 @@ std::function<future<> (flat_mutation_reader)> make_streaming_consumer(sstring o
std::exception_ptr ex;
try {
auto cf = db.local().find_column_family(reader.schema()).shared_from_this();
auto use_view_update_path = co_await db::view::check_needs_view_update_path(sys_dist_ks.local(), *cf, reason);
auto use_view_update_path = co_await db::view::check_needs_view_update_path(sys_dist_ks.local(), db.local().get_token_metadata(), *cf, reason);
//FIXME: for better estimations this should be transmitted from remote
auto metadata = mutation_source_metadata{};
auto& cs = cf->get_compaction_strategy();

@@ -296,6 +296,17 @@ SEASTAR_TEST_CASE(test_insert_json_types) {
}
});

BOOST_REQUIRE_THROW(e.execute_cql(R"(
INSERT INTO all_types JSON '{
"a": "abc", "c": "6"
}'
)").get(), marshal_exception);
BOOST_REQUIRE_THROW(e.execute_cql(R"(
INSERT INTO all_types JSON '{
"a": "abc", "c": "0392fa"
}'
)").get(), marshal_exception);

e.execute_cql("CREATE TABLE multi_column_pk_table (p1 int, p2 int, p3 int, c1 int, c2 int, v int, PRIMARY KEY((p1, p2, p3), c1, c2));").get();
e.require_table_exists("ks", "multi_column_pk_table").get();

@@ -1607,13 +1607,13 @@ SEASTAR_TEST_CASE(test_trim_clustering_row_ranges_to) {
check_reversed(
{ {excl{9, 39}, incl{10}} },
{10},
{ {excl{9, 39}, incl{10, null{}}} });
{ {excl{9, 39}, excl{10}} });

// (13)
check_reversed(
{ {incl{9, 10}, incl{10, 30}} },
{10},
{ {incl{9, 10}, incl{10, null{}}} });
{ {incl{9, 10}, excl{10}} });

// (14)
check_reversed(

@@ -3865,6 +3865,50 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_clear_tombstone_in_discontinued_p
    check(empty_buffer, "end of stream");
}

SEASTAR_THREAD_TEST_CASE(test_evictable_reader_next_pos_is_partition_start) {
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 1, 0);
    auto stop_sem = deferred_stop(semaphore);
    simple_schema s;
    auto schema = s.schema();
    auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name(), db::no_timeout);

    auto pk = s.make_pkey();
    const auto prange = dht::partition_range::make_open_ended_both_sides();

    std::deque<mutation_fragment_v2> frags;
    frags.emplace_back(*schema, permit, partition_start(pk, {}));
    for (size_t ck = 0; ck < 1000; ++ck) {
        frags.emplace_back(*schema, permit, range_tombstone_change(position_in_partition::before_key(s.make_ckey(ck)), tombstone(s.new_timestamp(), {})));
    }
    frags.emplace_back(*schema, permit, range_tombstone_change(position_in_partition::before_key(s.make_ckey(1001)), tombstone()));
    frags.emplace_back(*schema, permit, partition_end{});

    const auto max_buf_size = frags[0].memory_usage() + frags[1].memory_usage() + frags[2].memory_usage();

    auto ms = mutation_source([&frags, max_buf_size] (
            schema_ptr schema,
            reader_permit permit,
            const dht::partition_range& pr,
            const query::partition_slice& ps,
            const io_priority_class& pc,
            tracing::trace_state_ptr trace_state,
            streamed_mutation::forwarding fwd_sm,
            mutation_reader::forwarding fwd_mr) {
        auto rd = make_flat_mutation_reader_from_fragments(std::move(schema), std::move(permit), std::move(frags), pr, ps);
        rd.set_max_buffer_size(max_buf_size);
        return rd;
    });

    auto [rd, handle] = make_manually_paused_evictable_reader_v2(ms, schema, permit, prange, schema->full_slice(), default_priority_class(), {},
            mutation_reader::forwarding::no);
    auto stop_rd = deferred_close(rd);
    rd.set_max_buffer_size(max_buf_size);

    rd.fill_buffer().get();
    auto buf1 = rd.detach_buffer();
    BOOST_REQUIRE_EQUAL(buf1.size(), 3);
}

struct mutation_bounds {
    std::optional<mutation> m;
    position_in_partition lower;

@@ -3461,3 +3461,100 @@ SEASTAR_THREAD_TEST_CASE(test_compactor_range_tombstone_spanning_many_pages) {
        BOOST_REQUIRE_EQUAL(res_mut, ref_mut);
    }
}

SEASTAR_THREAD_TEST_CASE(test_compactor_detach_state) {
    simple_schema ss;
    auto pk = ss.make_pkey();
    auto s = ss.schema();

    tests::reader_concurrency_semaphore_wrapper semaphore;

    auto permit = semaphore.make_permit();

    const auto expiry_point = gc_clock::now() + std::chrono::days(10);

    const auto marker_ts = ss.new_timestamp();
    const auto tomb_ts = ss.new_timestamp();
    const auto row_ts = ss.new_timestamp();

    const auto query_time = gc_clock::now();
    const auto max_rows = std::numeric_limits<uint64_t>::max();
    const auto max_partitions = std::numeric_limits<uint32_t>::max();

    auto make_frags = [&] {
        std::deque<mutation_fragment_v2> frags;

        frags.emplace_back(*s, permit, partition_start(pk, {}));

        frags.emplace_back(*s, permit, ss.make_static_row_v2(permit, "static_row"));

        const auto& v_def = *s->get_column_definition(to_bytes("v"));

        frags.emplace_back(*s, permit, range_tombstone_change(position_in_partition::before_key(ss.make_ckey(0)), tombstone{tomb_ts, expiry_point}));

        for (uint32_t ck = 0; ck < 1; ++ck) {
            auto row = clustering_row(ss.make_ckey(ck));
            row.cells().apply(v_def, atomic_cell::make_live(*v_def.type, row_ts, serialized("v")));
            row.marker() = row_marker(marker_ts);
            frags.emplace_back(mutation_fragment_v2(*s, permit, std::move(row)));
        }

        frags.emplace_back(*s, permit, range_tombstone_change(position_in_partition::after_key(ss.make_ckey(10)), tombstone{}));

        frags.emplace_back(*s, permit, partition_end{});

        return frags;
    };

    struct consumer {
        uint64_t frags = 0;
        const uint64_t frag_limit;
        const bool final_stop;

        consumer(uint64_t stop_at, bool final_stop) : frag_limit(stop_at + 1), final_stop(final_stop) { }
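        // frag_limit is stop_at + 1: stop_iteration::yes is first returned on
        // the (stop_at + 1)-th consumed fragment, i.e. after stop_at fragments
        // have passed through untouched.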
        void consume_new_partition(const dht::decorated_key& dk) { }
        void consume(const tombstone& t) { }
        stop_iteration consume(static_row&& sr, tombstone, bool) {
            const auto ret = ++frags >= frag_limit;
            testlog.trace("consume(static_row) ret={}", ret);
            return stop_iteration(ret);
        }
        stop_iteration consume(clustering_row&& cr, row_tombstone t, bool is_alive) {
            const auto ret = ++frags >= frag_limit;
            testlog.trace("consume(clustering_row) ret={}", ret);
            return stop_iteration(ret);
        }
        stop_iteration consume(range_tombstone&& rt) {
            const auto ret = ++frags >= frag_limit;
            testlog.trace("consume(range_tombstone) ret={}", ret);
            return stop_iteration(ret);
        }
        stop_iteration consume_end_of_partition() {
            testlog.trace("consume_end_of_partition()");
            return stop_iteration(final_stop);
        }
        void consume_end_of_stream() { }
    };

    // deduct 2 for partition start and end respectively
    const auto inter_partition_frag_count = make_frags().size() - 2;

    auto check = [&] (uint64_t stop_at, bool final_stop) {
        testlog.debug("stop_at={}, final_stop={}", stop_at, final_stop);
        auto compaction_state = make_lw_shared<compact_mutation_state<emit_only_live_rows::no, compact_for_sstables::no>>(*s, query_time, s->full_slice(), max_rows, max_partitions);
        auto reader = make_flat_mutation_reader_from_fragments(s, permit, make_frags());
        auto close_reader = deferred_close(reader);
        reader.consume(compact_for_query<emit_only_live_rows::no, consumer>(compaction_state, consumer(stop_at, final_stop))).get();
        const auto has_detached_state = bool(std::move(*compaction_state).detach_state());
        if (stop_at < inter_partition_frag_count) {
            BOOST_CHECK_EQUAL(has_detached_state, final_stop);
        } else {
            BOOST_CHECK(!has_detached_state);
        }
    };

    for (unsigned stop_at = 0; stop_at < inter_partition_frag_count; ++stop_at) {
        check(stop_at, true);
        check(stop_at, false);
    }
};

@@ -19,22 +19,30 @@

SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_clear_inactive_reads) {
    simple_schema s;
    std::vector<reader_permit> permits;
    std::vector<reader_concurrency_semaphore::inactive_read_handle> handles;

    {
        reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
        auto stop_sem = deferred_stop(semaphore);
        auto clear_permits = defer([&permits] { permits.clear(); });

        for (int i = 0; i < 10; ++i) {
            handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader_v2(s.schema(), semaphore.make_tracking_only_permit(s.schema().get(), get_name(), db::no_timeout))));
            permits.emplace_back(semaphore.make_tracking_only_permit(s.schema().get(), get_name(), db::no_timeout));
            handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader_v2(s.schema(), permits.back())));
        }

        BOOST_REQUIRE(std::all_of(handles.begin(), handles.end(), [] (const reader_concurrency_semaphore::inactive_read_handle& handle) { return bool(handle); }));
        BOOST_REQUIRE(std::all_of(permits.begin(), permits.end(), [] (const reader_permit& permit) { return permit.get_state() == reader_permit::state::inactive; }));

        semaphore.clear_inactive_reads();

        BOOST_REQUIRE(std::all_of(handles.begin(), handles.end(), [] (const reader_concurrency_semaphore::inactive_read_handle& handle) { return !bool(handle); }));
        BOOST_REQUIRE(std::all_of(permits.begin(), permits.end(), [] (const reader_permit& permit) { return permit.get_state() == reader_permit::state::evicted; }));

        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);

        permits.clear();
        handles.clear();

        for (int i = 0; i < 10; ++i) {
@@ -134,13 +142,18 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves
        const auto consumed_resources = semaphore.available_resources();
        semaphore.consume(consumed_resources);

        auto fut = permit->maybe_wait_readmission();
        auto fut = make_ready_future<>();
        if (permit->needs_readmission()) {
            fut = permit->wait_readmission();
        }
        BOOST_REQUIRE(!fut.available());

        semaphore.signal(consumed_resources);
        fut.get();
    } else {
        permit->maybe_wait_readmission().get();
        if (permit->needs_readmission()) {
            permit->wait_readmission().get();
        }
    }

    BOOST_REQUIRE_EQUAL(permit->consumed_resources(), residue_units->resources() + base_resources);
@@ -223,7 +236,9 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
        if (auto reader = _permit->semaphore().unregister_inactive_read(std::move(handle)); reader) {
            _reader = downgrade_to_v1(std::move(*reader));
        } else {
            co_await _permit->maybe_wait_readmission();
            if (_permit->needs_readmission()) {
                co_await _permit->wait_readmission();
            }
            make_reader();
        }
        co_await tick(std::get<flat_mutation_reader>(_reader));
@@ -690,7 +705,10 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_admission) {

    const auto stats_before = semaphore.get_stats();

    auto wait_fut = permit.maybe_wait_readmission();
    auto wait_fut = make_ready_future<>();
    if (permit.needs_readmission()) {
        wait_fut = permit.wait_readmission();
    }
    wait_fut.wait();
    BOOST_REQUIRE(!wait_fut.failed());
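
// A minimal sketch (not part of this diff) of the calling convention the
// hunks above migrate to: the removed maybe_wait_readmission() is assumed to
// be split into a needs_readmission() check plus wait_readmission(). The
// resume_read() wrapper below is hypothetical, for illustration only.
future<> resume_read(reader_permit& permit) {
    if (permit.needs_readmission()) {
        // Only an evicted permit has to wait for the semaphore to readmit it.
        co_await permit.wait_readmission();
    }
    co_return;
}
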
@@ -946,3 +964,392 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_evict_inactive_reads_
        handles.clear();
    }
}

// Reproduces https://github.com/scylladb/scylladb/issues/11770
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_evict_inactive_reads_when_all_is_blocked) {
    simple_schema ss;
    const auto& s = *ss.schema();

    const auto initial_resources = reader_concurrency_semaphore::resources{2, 32 * 1024};
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
    auto stop_sem = deferred_stop(semaphore);

    class read {
        reader_permit _permit;
        promise<> _read_started_pr;
        future<> _read_started_fut;
        promise<> _read_done_pr;
        reader_permit::used_guard _ug;
        std::optional<reader_permit::blocked_guard> _bg;

    public:
        explicit read(reader_permit p) : _permit(std::move(p)), _read_started_fut(_read_started_pr.get_future()), _ug(_permit) { }
        future<> wait_read_started() { return std::move(_read_started_fut); }
        void set_read_done() { _read_done_pr.set_value(); }
        void mark_as_blocked() { _bg.emplace(_permit); }
        void mark_as_unblocked() { _bg.reset(); }
        reader_concurrency_semaphore::read_func get_read_func() {
            return [this] (reader_permit permit) -> future<> {
                _read_started_pr.set_value();
                co_await _read_done_pr.get_future();
            };
        }
    };

    auto p1 = semaphore.obtain_permit(&s, get_name(), 1024, db::no_timeout).get();
    auto irh1 = semaphore.register_inactive_read(make_empty_flat_reader_v2(ss.schema(), p1));

    auto p2 = semaphore.obtain_permit(&s, get_name(), 1024, db::no_timeout).get();
    read rd2(p2);
    auto fut2 = semaphore.with_ready_permit(p2, rd2.get_read_func());

    // At this point we expect to have:
    // * 1 inactive read (not evicted)
    // * 1 used (but not blocked) read on the ready list
    // * 1 waiter
    // * no more count resources left
    auto p3_fut = semaphore.obtain_permit(&s, get_name(), 1024, db::no_timeout);
    BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().used_permits, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().blocked_permits, 0);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
    BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
    BOOST_REQUIRE(irh1);

    // Start the read, emptying the ready list; this should not be enough to admit p3.
    rd2.wait_read_started().get();
    BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().used_permits, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().blocked_permits, 0);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
    BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
    BOOST_REQUIRE(irh1);

    // Marking p2 as blocked should now allow p3 to be admitted by evicting p1
    rd2.mark_as_blocked();
    BOOST_REQUIRE_EQUAL(semaphore.waiters(), 0);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().used_permits, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().blocked_permits, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 1);
    BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
    BOOST_REQUIRE(!irh1);

    p3_fut.get();
    rd2.mark_as_unblocked();
    rd2.set_read_done();
    fut2.get();
}

// Check that `stop()` correctly evicts all inactive reads.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_with_inactive_reads) {
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());

    simple_schema ss;
    auto s = ss.schema();

    auto permit = reader_permit_opt(semaphore.obtain_permit(s.get(), get_name(), 1024, db::no_timeout).get());

    auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, *permit));

    BOOST_REQUIRE(handle);
    BOOST_REQUIRE_EQUAL(permit->get_state(), reader_permit::state::inactive);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

    // Using BOOST_CHECK_* because an exception thrown here causes a segfault,
    // due to the stop future not being waited for.
    auto stop_f = semaphore.stop();
    BOOST_CHECK(!stop_f.available());
    BOOST_CHECK(eventually_true([&] { return !semaphore.get_stats().inactive_reads; }));
    BOOST_CHECK(!handle);
    BOOST_CHECK_EQUAL(permit->get_state(), reader_permit::state::evicted);

    // Stop waits on all permits, so we need to destroy the permit before we can
    // wait on the stop future.
    permit = {};
    stop_f.get();
}

SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_set_resources) {
    const auto initial_resources = reader_concurrency_semaphore::resources{4, 4 * 1024};
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
    auto stop_sem = deferred_stop(semaphore);

    auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get0();
    auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get0();
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(2, 2 * 1024));
    BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(4, 4 * 1024));

    semaphore.set_resources({8, 8 * 1024});
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(6, 6 * 1024));
    BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(8, 8 * 1024));

    semaphore.set_resources({2, 2 * 1024});
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(0, 0));
    BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(2, 2 * 1024));

    semaphore.set_resources({3, 128});
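    // Note: with the two permits already holding 2 count / 2 * 1024 memory,
    // shrinking the limit to {3, 128} leaves 3 - 2 = 1 count and
    // 128 - 2 * 1024 (negative) memory: available resources can go negative.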
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(1, 128 - 2 * 1024));
    BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(3, 128));

    semaphore.set_resources({1, 3 * 1024});
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(-1, 1024));
    BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(1, 3 * 1024));

    auto permit3_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued, 1);
    BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);

    semaphore.set_resources({4, 4 * 1024});
    BOOST_REQUIRE_EQUAL(semaphore.waiters(), 0);
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(1, 1024));
    BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(4, 4 * 1024));
    permit3_fut.get();
}

// Check that inactive reads are not needlessly evicted when admission is not
// blocked on resources.
// This test covers all the cases where eviction should **not** happen.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_no_unnecessary_evicting) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100);
    auto stop_sem = deferred_stop(semaphore);

    simple_schema ss;
    auto s = ss.schema();

    auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();

    // There are available resources
    {
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 3 * 1024);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

        semaphore.set_resources(initial_resources);
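        // Re-applying the same limits exercises the resource-update admission
        // path; with resources still available the inactive read must survive.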
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

        BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
    }

    // Count resources are on the limit but no one wants more
    {
        auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();

        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

        semaphore.set_resources(initial_resources);
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

        BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
    }

    // Memory resources are on the limit but no one wants more
    {
        auto units = permit1.consume_memory(3 * 1024);

        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
        BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
    }

    // Up the resource count; we need more permits to check the rest of the scenarios.
    semaphore.set_resources({4, 4 * 1024});

    // There are waiters but they are not blocked on resources
    {
        auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
        auto permit3 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();

        std::optional<reader_permit::used_guard> ug1{permit1};
        std::optional<reader_permit::used_guard> ug2{permit2};

        auto permit4_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
        BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_queued_because_used_permits, 1);

        // First check the register path.
        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit3));

        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
        BOOST_REQUIRE_EQUAL(permit3.get_state(), reader_permit::state::inactive);

        // Now check the callback admission path (admission check on resources being freed).
        ug2.reset();
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
        BOOST_REQUIRE_EQUAL(permit3.get_state(), reader_permit::state::inactive);
    }
}

// Check that inactive reads are evicted when they are blocking admission
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_necessary_evicting) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100);
    auto stop_sem = deferred_stop(semaphore);

    simple_schema ss;
    auto s = ss.schema();

    uint64_t evicted_reads = 0;

    auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();

    // No count resources - obtaining new permit
    {
        auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();

        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

        auto new_permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
        BOOST_REQUIRE(!handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
    }

    BOOST_REQUIRE(permit1.needs_readmission());
    permit1.wait_readmission().get();

    // No count resources - waiter
    {
        auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();

        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);

        auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
        BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(!handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);

        new_permit_fut.get();
    }

    BOOST_REQUIRE(permit1.needs_readmission());
    permit1.wait_readmission().get();

    // No memory resources
    {
        auto units = permit1.consume_memory(3 * 1024);

        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

        auto new_permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
        BOOST_REQUIRE(!handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
    }

    BOOST_REQUIRE(permit1.needs_readmission());
    permit1.wait_readmission().get();

    // No memory resources - waiter
    {
        auto units = permit1.consume_memory(3 * 1024);

        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);

        auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
        BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(!handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);

        new_permit_fut.get();
    }

    BOOST_REQUIRE(permit1.needs_readmission());
    permit1.wait_readmission().get();

    // No count resources - waiter blocked on something else too
    {
        auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();

        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);

        std::optional<reader_permit::used_guard> ug{permit2};

        auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
        BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

        ug.reset();
        BOOST_REQUIRE(!handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);

        new_permit_fut.get();
    }

    BOOST_REQUIRE(permit1.needs_readmission());
    permit1.wait_readmission().get();

    // No memory resources - waiter blocked on something else too
    {
        semaphore.set_resources({initial_resources.count + 1, initial_resources.memory});
        auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
        auto units = permit1.consume_memory(2 * 1024);

        BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);

        std::optional<reader_permit::used_guard> ug{permit2};

        auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
        BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);

        auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
        BOOST_REQUIRE(handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

        ug.reset();
        thread::yield(); // allow debug builds to schedule the fiber evicting the reads again
        BOOST_REQUIRE(!handle);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);

        new_permit_fut.get();

        semaphore.set_resources(initial_resources);
    }
}

@@ -3944,6 +3944,92 @@ SEASTAR_TEST_CASE(test_scans_erase_dummies) {
    });
}

// Tests the following scenario:
//
// Initial state:
//
// v2: ==== <7> [entry2] ==== <9> === <13> ==== <last dummy>
// v1: ======================================== <last dummy> [entry1]
//
// After two eviction events which evict entry1 and entry2, we should end up with:
//
// v2: ---------------------- <9> === <13> ==== <last dummy>
// v1: ---------------------------------------- <last dummy>
//
// Last dummy entries are treated in a special way in rows_entry::on_evicted(), and there
// was a bug which didn't clear the continuity on the last dummy when it was selected for
// eviction. As a result, the view was this:
//
// v2: ---------------------- <9> === <13> ==== <last dummy>
// v1: ======================================== <last dummy>
//
// This would violate the "older versions are evicted first" rule, which implies
// that when entry2 is evicted in v2, the range in which entry2 falls must be
// discontinuous in all older versions. This won't hold if we don't clear continuity
// on the last dummy in v1.
// As a result, the range in which entry2 falls would, from the perspective of the
// v2 snapshot, appear as continuous and <7> would be missing from the read result,
// because the continuity of a snapshot is a union of the continuous ranges in all
// versions.
//
// Reproduces https://github.com/scylladb/scylladb/issues/12451
SEASTAR_TEST_CASE(test_version_merging_with_range_tombstones_over_rowless_version) {
    return seastar::async([] {
        simple_schema s;
        tests::reader_concurrency_semaphore_wrapper semaphore;

        auto pkey = s.make_pkey("pk");
        auto pr = dht::partition_range::make_singular(pkey);

        memtable_snapshot_source underlying(s.schema());

        mutation m1(s.schema(), pkey);
        m1.partition().apply(s.new_tombstone());
        underlying.apply(m1);

        cache_tracker tracker;
        row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker);

        // Populate cache
        assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
            .produces(m1);

        mutation m2(s.schema(), pkey);
        s.delete_range(m2, s.make_ckey_range(7, 13));
        s.add_row(m2, s.make_ckey(7), "v");
        s.delete_range(m2, s.make_ckey_range(9, 17));
        s.add_row(m2, s.make_ckey(9), "v");
        s.add_row(m2, s.make_ckey(17), "v");

        {
            auto rd1 = cache.make_reader(s.schema(), semaphore.make_permit(), pr);
            auto close_rd1 = deferred_close(rd1);
            rd1.set_max_buffer_size(1); // To hold the snapshot
            rd1.fill_buffer().get();

            apply(cache, underlying, m2);

            evict_one_row(tracker); // hits last dummy in oldest version.

            assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
                .produces(m1 + m2);

            evict_one_row(tracker); // hits entry in the latest version, row (v1) or rtc (v2)

            assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
                .produces(m1 + m2);

            evict_one_row(tracker); // hits entry in the latest version, row (both v1 and v2)

            assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
                .produces(m1 + m2);
        }

        tracker.cleaner().drain().get();

        assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
            .produces(m1 + m2);
    });
}

SEASTAR_TEST_CASE(test_eviction_of_upper_bound_of_population_range) {
    return seastar::async([] {
        simple_schema s;

@@ -18,6 +18,7 @@
#include "compaction/compaction_manager.hh"
#include "sstables/key.hh"
#include "test/lib/sstable_utils.hh"
#include "test/lib/reader_concurrency_semaphore.hh"
#include <seastar/testing/test_case.hh>
#include "schema.hh"
#include "compress.hh"
@@ -752,6 +753,8 @@ SEASTAR_TEST_CASE(sub_partitions_read) {

SEASTAR_TEST_CASE(test_skipping_in_compressed_stream) {
    return seastar::async([] {
        tests::reader_concurrency_semaphore_wrapper semaphore;

        tmpdir tmp;
        auto file_path = (tmp.path() / "test").string();
        file f = open_file_dma(file_path, open_flags::create | open_flags::wo).get0();
@@ -787,7 +790,7 @@ SEASTAR_TEST_CASE(test_skipping_in_compressed_stream) {

        auto make_is = [&] {
            f = open_file_dma(file_path, open_flags::ro).get0();
            return make_compressed_file_m_format_input_stream(f, &c, 0, uncompressed_size, opts);
            return make_compressed_file_m_format_input_stream(f, &c, 0, uncompressed_size, opts, semaphore.make_permit());
        };

        auto expect = [] (input_stream<char>& in, const temporary_buffer<char>& buf) {

@@ -89,6 +89,24 @@ SEASTAR_THREAD_TEST_CASE(test_clear_gently_non_trivial_unique_ptr) {
    utils::clear_gently(p).get();
    BOOST_CHECK(p);
    BOOST_REQUIRE_EQUAL(cleared_gently, 1);

    cleared_gently = 0;
    p.reset();
    utils::clear_gently(p).get();
    BOOST_CHECK(!p);
    BOOST_REQUIRE_EQUAL(cleared_gently, 0);
}

SEASTAR_THREAD_TEST_CASE(test_clear_gently_vector_of_unique_ptrs) {
    int cleared_gently = 0;
    std::vector<std::unique_ptr<clear_gently_tracker<int>>> v;
    v.emplace_back(std::make_unique<clear_gently_tracker<int>>(0, [&cleared_gently] (int) {
        cleared_gently++;
    }));
    v.emplace_back(nullptr);

    utils::clear_gently(v).get();
    BOOST_REQUIRE_EQUAL(cleared_gently, 1);
}

SEASTAR_THREAD_TEST_CASE(test_clear_gently_foreign_ptr) {

@@ -683,6 +683,13 @@ BOOST_AUTO_TEST_CASE(test_parse_valid_frozen_set) {
    BOOST_REQUIRE(type->as_cql3_type().to_string() == "frozen<set<int>>");
}

BOOST_AUTO_TEST_CASE(test_empty_frozen_set_compare) {
    auto parser = db::marshal::type_parser("org.apache.cassandra.db.marshal.FrozenType(org.apache.cassandra.db.marshal.SetType(org.apache.cassandra.db.marshal.Int32Type))");
    auto type = parser.parse();
    std::unordered_set<int> set = {1, 2, 3};
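    // The first argument below is an empty (zero-length) serialized value;
    // compare() must handle it against a populated frozen set without error.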
    type->compare(bytes_view(), bytes_view(*data_value(set).serialize()));
}

BOOST_AUTO_TEST_CASE(test_parse_valid_set_frozen_set) {
    sstring frozen = "org.apache.cassandra.db.marshal.FrozenType(org.apache.cassandra.db.marshal.SetType(org.apache.cassandra.db.marshal.Int32Type))";
    auto parser = db::marshal::type_parser("org.apache.cassandra.db.marshal.SetType(" + frozen + ")");

@@ -244,7 +244,14 @@ def get_cql_cluster(ip, ssl_context=None):
    auth_provider = cassandra.auth.PlainTextAuthProvider(username='cassandra', password='cassandra')
    return cassandra.cluster.Cluster([ip],
        auth_provider=auth_provider,
        ssl_context=ssl_context)
        ssl_context=ssl_context,
        # The default timeout for new connections is 5 seconds, and for
        # requests made by the control connection is 2 seconds. These should
        # have been more than enough, but in some extreme cases with a very
        # slow debug build running on a very busy machine, they may not be,
        # so let's increase them to 60 seconds. See issue #13239.
        connect_timeout = 60,
        control_connection_timeout = 60)

## Test that CQL is serving, for wait_for_services() below.
def check_cql(ip, ssl_context=None):

@@ -233,3 +233,106 @@ def test_view_update_and_alter_base(cql, test_keyspace, scylla_only):
        # Try to modify an item. This failed in #11542.
        cql.execute(f'UPDATE {table} SET v=-1 WHERE p=1')
        assert len(list(cql.execute(f"SELECT v from {mv}"))) == 0

# Reproducer for issue #12297, reproducing a specific way in which a view
# table could be made inconsistent with the base table:
# The test writes 500 rows to one partition in a base table, and then uses
# USING TIMESTAMP with the right value to cause a base partition deletion
# which deletes not the entire partition but just its last 50 rows. As the
# 50 rows of the base partition get deleted, we expect 50 rows from the
# view table to also get deleted - but bug #12297 was that this wasn't
# happening - rather, all rows remained in the view.
# The bug cannot be reproduced with 100 rows (and deleting the last 10)
# but 113 rows (and 101 rows after deleting the last 12) does reproduce
# it. Reproducing the bug also required a setup where USING TIMESTAMP
# deleted the *last* rows - using it to delete the *first* rows did not
# have a bug (the view rows were deleted fine).
@pytest.mark.parametrize("size", [100, 113, 500])
def test_long_skipped_view_update_delete_with_timestamp(cql, test_keyspace, size):
    with new_test_table(cql, test_keyspace, 'p int, c int, x int, y int, primary key (p,c)') as table:
        with new_materialized_view(cql, table, '*', 'p, x, c', 'p is not null and x is not null and c is not null') as mv:
            # Write size rows with c=0..(size-1). Because the iteration is in
            # reverse order, the first row in clustering order (c=0) will
            # have the latest write timestamp.
            for i in reversed(range(size)):
                cql.execute(f'INSERT INTO {table} (p,c,x,y) VALUES (1,{i},{i},{i})')
            assert list(cql.execute(f"SELECT c FROM {table} WHERE p = 1")) == list(cql.execute(f"SELECT c FROM {mv} WHERE p = 1"))
            # Get the timestamp of the size*0.9th item. Because we wrote items
            # in reverse, items 0.9-1.0*size all have earlier timestamp than
            # that.
            t = list(cql.execute(f"SELECT writetime(y) FROM {table} WHERE p = 1 and c = {int(size*0.9)}"))[0].writetime_y
            cql.execute(f'DELETE FROM {table} USING TIMESTAMP {t} WHERE p=1')
            # After the deletion we expect to see size*0.9 rows remaining
            # (timestamp ties cannot happen for separate writes; if they
            # did we could have a bit less), but most importantly, the view
            # should have exactly the same rows.
            assert list(cql.execute(f"SELECT c FROM {table} WHERE p = 1")) == list(cql.execute(f"SELECT c FROM {mv} WHERE p = 1"))

# Same test as above, just that in this version the view partition key is
# different from the base's, so we can be sure that Scylla needs to go
# through the loop of deleting many view rows and cannot delete an entire
# view partition in one fell swoop. In the above test, Scylla *may* contain
# such an optimization (currently it doesn't), so it may reach a different
# code path.
def test_long_skipped_view_update_delete_with_timestamp2(cql, test_keyspace):
    size = 200
    with new_test_table(cql, test_keyspace, 'p int, c int, x int, y int, primary key (p,c)') as table:
        with new_materialized_view(cql, table, '*', 'x, p, c', 'p is not null and x is not null and c is not null') as mv:
            for i in reversed(range(size)):
                cql.execute(f'INSERT INTO {table} (p,c,x,y) VALUES (1,{i},{i},{i})')
            assert list(cql.execute(f"SELECT c FROM {table}")) == sorted(list(cql.execute(f"SELECT c FROM {mv}")))
            t = list(cql.execute(f"SELECT writetime(y) FROM {table} WHERE p = 1 and c = {int(size*0.9)}"))[0].writetime_y
            cql.execute(f'DELETE FROM {table} USING TIMESTAMP {t} WHERE p=1')
            assert list(cql.execute(f"SELECT c FROM {table}")) == sorted(list(cql.execute(f"SELECT c FROM {mv}")))

# Another, more fundamental, reproducer for issue #12297 where a certain
# modification to a base partition modifying more than 100 rows was not
# applied to the view beyond the 100th row.
# The test above, test_long_skipped_view_update_delete_with_timestamp, was one
# such specific case, which involved a partition tombstone and a specific
# choice of timestamp which causes the first 100 rows to NOT be changed.
# In this test we show that the bug is not just about do-nothing tombstones:
# In any base modification which involves more than 100 rows, if the first
# 100 rows don't change the view (as decided by the can_skip_view_updates()
# function), the other rows are wrongly skipped as well and not applied to
# the view!
# The specific case we use here is an update that sets some irrelevant
# (not-selected-by-the-view) column y on 200 rows, and additionally writes
# a new row as the 201st row. With bug #12297, that 201st row will be
# missing in the view.
def test_long_skipped_view_update_irrelevant_column(cql, test_keyspace):
    size = 200
    with new_test_table(cql, test_keyspace, 'p int, c int, x int, y int, primary key (p,c)') as table:
        # Note that column "y" is not selected by the materialized view
        with new_materialized_view(cql, table, 'p, x, c', 'p, x, c', 'p is not null and x is not null and c is not null') as mv:
            for i in range(size):
                cql.execute(f'INSERT INTO {table} (p,c,x,y) VALUES (1,{i},{i},{i})')
            # In a single batch (a single mutation), update "y" column in all
            # 'size' existing rows, plus add one new row in the last position
            # (the partition is sorted by the "c" column). The first 'size'
            # UPDATEs can be skipped in the view (because y isn't selected),
            # but the last INSERT can't be skipped - it really adds a new row.
            cmd = 'BEGIN BATCH '
            for i in range(size):
                cmd += f'UPDATE {table} SET y=7 where p=1 and c={i}; '
            cmd += f'INSERT INTO {table} (p,c,x,y) VALUES (1,{size+1},{size+1},{size+1}); '
            cmd += 'APPLY BATCH;'
            cql.execute(cmd)
            # We should now have the same size+1 rows in both base and view
            assert list(cql.execute(f"SELECT c FROM {table} WHERE p = 1")) == list(cql.execute(f"SELECT c FROM {mv} WHERE p = 1"))

# After the previous tests checked elaborate conditions where modifying a
# base-table partition resulted in many skipped view updates, let's also
# check the more basic situation where the base-table partition modification
# (in this case, a deletion) results in many view-table updates, and all
# of them should happen even if the code needs to do it internally in
# several batches of 100 (for example).
def test_mv_long_delete(cql, test_keyspace):
    size = 300
    with new_test_table(cql, test_keyspace, 'p int, c int, x int, y int, primary key (p,c)') as table:
        with new_materialized_view(cql, table, '*', 'p, x, c', 'p is not null and x is not null and c is not null') as mv:
            for i in range(size):
                cql.execute(f'INSERT INTO {table} (p,c,x,y) VALUES (1,{i},{i},{i})')
            cql.execute(f'DELETE FROM {table} WHERE p=1')
            assert list(cql.execute(f"SELECT c FROM {table} WHERE p = 1")) == []
            assert list(cql.execute(f"SELECT c FROM {mv} WHERE p = 1")) == []

@@ -65,6 +65,31 @@ def test_insert_null_key_lwt(cql, table1):
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(stmt, [None, s])

# Contains the same checks as test_insert_null_key() and test_insert_null_key_lwt() above, just inside a batch
def test_insert_null_key_in_batch(cql, table1):
    s = unique_key_string()
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES ('{s}', null);APPLY BATCH;")
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES ('{s}', null) IF NOT EXISTS;APPLY BATCH;")
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES (null, '{s}');APPLY BATCH;")
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES (null, '{s}') IF NOT EXISTS;APPLY BATCH;")

    # Try the same thing with prepared statement.
    stmt = cql.prepare(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES (?, ?);APPLY BATCH;")
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(stmt, [s, None])
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(stmt, [None, s])

    stmt = cql.prepare(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES (?, ?) IF NOT EXISTS;APPLY BATCH;")
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(stmt, [s, None])
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(stmt, [None, s])

# Tests handling of "key_column in ?" where ? is bound to null.
# Reproduces issue #8265.
def test_primary_key_in_null(cql, table1):

test/cql-pytest/test_paging.py (new file, 37 lines)
@@ -0,0 +1,37 @@
# Copyright 2020-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later

from util import new_test_table
from cassandra.query import SimpleStatement
import pytest
import nodetool

# Test that the _stop flag set in the compactor at the end of a page is not
# sticky and doesn't remain set on the following page. If it does, it can cause
# the next page (and consequently the entire query) to be terminated prematurely.
# This can happen if the code path on the very first consumed fragment doesn't
# reset this flag. Currently this is the case for rows completely covered by a
# higher level tombstone.
def test_sticky_stop_flag(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, 'pk int, ck int, v int, PRIMARY KEY (pk, ck)') as table:
        insert_row_id = cql.prepare(f"INSERT INTO {table} (pk, ck, v) VALUES (?, ?, ?)")

        pk = 0

        # Flush the row to disk, to prevent it being compacted away in the
        # memtable upon writing the partition tombstone.
        cql.execute(insert_row_id, (pk, 100, 0))
        nodetool.flush(cql, table)
        cql.execute(f"DELETE FROM {table} WHERE pk = {pk}")

        for ck in range(0, 200):
            if ck == 100:
                continue
            cql.execute(insert_row_id, (pk, ck, 0))

        statement = SimpleStatement(f"SELECT * FROM {table} WHERE pk = {pk}", fetch_size=100)

        res = list(cql.execute(statement))

        assert len(res) == 199
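        # 200 clustering keys were attempted but ck == 100 was skipped, and the
        # earlier (pk, 100) row is shadowed by the partition tombstone, so the
        # query returns exactly 199 rows across two 100-row pages.
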
test/cql-pytest/test_unset.py (new file, 598 lines)
@@ -0,0 +1,598 @@
# Copyright 2023-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later

import pytest
from util import new_test_table, unique_key_int
from cassandra.query import UNSET_VALUE
from cassandra.protocol import InvalidRequest

@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, a int, b int, c int, li list<int>") as table:
        yield table

@pytest.fixture(scope="module")
def table2(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int, c int, PRIMARY KEY (p, c)") as table:
        yield table

@pytest.fixture(scope="module")
def table3(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int, c int, r int, PRIMARY KEY (p, c)") as table:
        yield table

@pytest.fixture(scope="module")
def table4(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int, c int, s int, r int, PRIMARY KEY (p, c)") as table:
        yield table

# Test INSERT with UNSET_VALUE for the clustering column value
def test_insert_unset_clustering_col(cql, table4, scylla_only):
    p = unique_key_int()
    def insert(c, s, r):
        cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?)"), [c, s, r])
    def select_rows():
        return sorted(list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}")))

    # INSERT (p, UNSET, 2, 3)
    insert(c=UNSET_VALUE, s=2, r=3)
    assert select_rows() == []

    # INSERT (p, 1, 2, 3)
    insert(c=1, s=2, r=3)
    assert select_rows() == [(p, 1, 2, 3)]

    # INSERT (p, UNSET, 2, 3)
    insert(c=UNSET_VALUE, s=2, r=3)
    assert select_rows() == [(p, 1, 2, 3)]

    # INSERT (p, UNSET, 5, 6)
    insert(c=UNSET_VALUE, s=5, r=6)
    assert select_rows() == [(p, 1, 2, 3)]

# Test INSERT with UNSET_VALUE for the static column value
def test_insert_unset_static_col(cql, table4):
    p = unique_key_int()
    def insert(c, s, r):
        cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?)"), [c, s, r])
    def select_rows():
        return sorted(list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}")))

    # INSERT (p, 1, UNSET, 3)
    insert(c=1, s=UNSET_VALUE, r=3)
    assert select_rows() == [(p, 1, None, 3)]

    # INSERT (p, 1, 2, 3)
    insert(c=1, s=2, r=3)
    assert select_rows() == [(p, 1, 2, 3)]

    # INSERT (p, 1, UNSET, 3)
    insert(c=1, s=UNSET_VALUE, r=3)
    assert select_rows() == [(p, 1, 2, 3)]

    # INSERT (p, 1, UNSET, 4)
    insert(c=1, s=UNSET_VALUE, r=4)
    assert select_rows() == [(p, 1, 2, 4)]

# Test INSERT with UNSET_VALUE for the regular column value
def test_insert_unset_regular_col(cql, table4):
    p = unique_key_int()
    def insert(c, s, r):
        cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?)"), [c, s, r])
    def select_rows():
        return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))

    # INSERT (p, 1, 2, UNSET)
    insert(c=1, s=2, r=UNSET_VALUE)
    assert select_rows() == [(p, 1, 2, None)]

    # INSERT (p, 1, 2, 3)
    insert(c=1, s=2, r=3)
    assert select_rows() == [(p, 1, 2, 3)]

    # INSERT (p, 1, 2, UNSET)
    insert(c=1, s=2, r=UNSET_VALUE)
    assert select_rows() == [(p, 1, 2, 3)]

    # INSERT (p, 1, 5, UNSET)
    insert(c=1, s=5, r=UNSET_VALUE)
    assert select_rows() == [(p, 1, 5, 3)]

# Test INSERT with UNSET_VALUE for the clustering column value, using IF NOT EXISTS
def test_insert_unset_clustering_col_if_not_exists(cql, table4):
    p = unique_key_int()
    def insert(c, s, r):
        cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?) IF NOT EXISTS"), [c, s, r])
    def select_rows():
        return sorted(list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}")))

    # INSERT (p, UNSET, 2, 3)
    with pytest.raises(InvalidRequest, match='unset'):
        insert(c=UNSET_VALUE, s=2, r=3)

    # INSERT (p, 1, 2, 3)
    insert(c=1, s=2, r=3)
    assert select_rows() == [(p, 1, 2, 3)]

    # INSERT (p, UNSET, 2, 3)
    with pytest.raises(InvalidRequest, match='unset'):
        insert(c=UNSET_VALUE, s=2, r=3)

    # INSERT (p, UNSET, 5, 6)
    with pytest.raises(InvalidRequest, match='unset'):
        insert(c=UNSET_VALUE, s=5, r=6)

    assert select_rows() == [(p, 1, 2, 3)]

# Test INSERT with UNSET_VALUE for the static column value, using IF NOT EXISTS
def test_insert_unset_static_col_if_not_exists(cql, table4):
    p = unique_key_int()
    def insert(c, s, r):
        cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?) IF NOT EXISTS"), [c, s, r])
    def select_rows():
        return sorted(list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}")))

    # INSERT (p, 1, UNSET, 3)
    insert(c=1, s=UNSET_VALUE, r=3)
    assert select_rows() == [(p, 1, None, 3)]

    # INSERT (p, 1, 2, 3)
    insert(c=1, s=2, r=3)
    assert select_rows() == [(p, 1, None, 3)]

    # INSERT (p, 1, UNSET, 3)
    insert(c=1, s=UNSET_VALUE, r=3)
    assert select_rows() == [(p, 1, None, 3)]

    # INSERT (p, 1, UNSET, 4)
    insert(c=1, s=UNSET_VALUE, r=4)
    assert select_rows() == [(p, 1, None, 3)]

# Test INSERT with UNSET_VALUE for the regular column value, using IF NOT EXISTS
def test_insert_unset_regular_col_if_not_exists(cql, table4):
    p = unique_key_int()
    def insert(c, s, r):
        cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?) IF NOT EXISTS"), [c, s, r])
    def select_rows():
        return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))

    # INSERT (p, 1, 2, UNSET)
    insert(c=1, s=2, r=UNSET_VALUE)
    assert select_rows() == [(p, 1, 2, None)]

    # INSERT (p, 1, 2, 3)
    insert(c=1, s=2, r=3)
    assert select_rows() == [(p, 1, 2, None)]

    # INSERT (p, 1, 2, UNSET)
    insert(c=1, s=2, r=UNSET_VALUE)
    assert select_rows() == [(p, 1, 2, None)]

    # INSERT (p, 1, 5, UNSET)
    insert(c=1, s=5, r=UNSET_VALUE)
    assert select_rows() == [(p, 1, 2, None)]

# Try doing UPDATE table4 SET c=UNSET_VALUE
|
||||
# Should fail, clustering columns can't be updated
|
||||
def test_update_unset_clustering_column(cql, table4):
|
||||
with pytest.raises(InvalidRequest):
|
||||
cql.prepare(f"UPDATE {table4} SET r=123, c = ? WHERE p = 0 AND c = ?")
|
||||
# Prepare fails, no point in executing it.
|
||||
|
||||
# Test doing UPDATE table4 SET s=UNSET_VALUE
def test_update_unset_static_column(cql, table4, scylla_only):
    p = unique_key_int()
    def select_rows():
        return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))

    # UPDATE SET s=UNSET_VALUE
    update1 = cql.prepare(f"UPDATE {table4} SET s=? WHERE p = {p} AND c = ?")
    cql.execute(update1, [UNSET_VALUE, 1])
    assert select_rows() == []

    # Try the same with c = UNSET_VALUE
    cql.execute(update1, [UNSET_VALUE, UNSET_VALUE])
    assert select_rows() == []

    # UPDATE SET s=UNSET_VALUE, r=123
    update2 = cql.prepare(f"UPDATE {table4} SET r=123, s=? WHERE p = {p} AND c = ?")
    cql.execute(update2, [UNSET_VALUE, 1])
    assert select_rows() == [(p, 1, None, 123)]

    # Try the same with c = UNSET_VALUE
    cql.execute(update2, [UNSET_VALUE, UNSET_VALUE])
    assert select_rows() == [(p, 1, None, 123)]

    # UPDATE SET s=4321
    update3 = cql.prepare(f"UPDATE {table4} SET s=4321 WHERE p = {p} AND c = ?")
    cql.execute(update3, [1])
    assert select_rows() == [(p, 1, 4321, 123)]

    # Try the same with c = UNSET_VALUE
    cql.execute(update3, [UNSET_VALUE])
    assert select_rows() == [(p, 1, 4321, 123)]

    # UPDATE SET r=567, s=UNSET_VALUE
    update4 = cql.prepare(f"UPDATE {table4} SET r=567, s=? WHERE p = {p} AND c = ?")
    cql.execute(update4, [UNSET_VALUE, 1])
    assert select_rows() == [(p, 1, 4321, 567)]

    # Try the same with c = UNSET_VALUE
    cql.execute(update4, [UNSET_VALUE, UNSET_VALUE])
    assert select_rows() == [(p, 1, 4321, 567)]

# Test doing UPDATE table4 SET r=UNSET_VALUE
def test_update_unset_regular_column(cql, table4, scylla_only):
    p = unique_key_int()
    def select_rows():
        return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))

    # UPDATE SET r = UNSET_VALUE
    update1 = cql.prepare(f"UPDATE {table4} SET r = ? WHERE p = {p} AND c = ?")
    cql.execute(update1, [UNSET_VALUE, 1])
    assert select_rows() == []

    # Try the same with c = UNSET_VALUE
    cql.execute(update1, [UNSET_VALUE, UNSET_VALUE])
    assert select_rows() == []

    # UPDATE SET r = UNSET_VALUE, s = 100
    update2 = cql.prepare(f"UPDATE {table4} SET r = ?, s = 100 WHERE p = {p} AND c = ?")
    cql.execute(update2, [UNSET_VALUE, 1])
    assert select_rows() == [(p, 1, 100, None)]

    # Try the same with c = UNSET_VALUE
    cql.execute(update2, [UNSET_VALUE, UNSET_VALUE])
    assert select_rows() == [(p, 1, 100, None)]

    # UPDATE SET r = 200
    update3 = cql.prepare(f"UPDATE {table4} SET r = 200 WHERE p = {p} AND c = ?")
    cql.execute(update3, [1])
    assert select_rows() == [(p, 1, 100, 200)]

    # Try the same with c = UNSET_VALUE
    cql.execute(update3, [UNSET_VALUE])
    assert select_rows() == [(p, 1, 100, 200)]

    # UPDATE SET r = UNSET_VALUE, s = 300
    update4 = cql.prepare(f"UPDATE {table4} SET r = ?, s = 300 WHERE p = {p} AND c = ?")
    cql.execute(update4, [UNSET_VALUE, 1])
    assert select_rows() == [(p, 1, 300, 200)]

    # Try the same with c = UNSET_VALUE
    cql.execute(update4, [UNSET_VALUE, UNSET_VALUE])
    assert select_rows() == [(p, 1, 300, 200)]

# Test doing UPDATE table4 SET s=UNSET_VALUE IF EXISTS
def test_update_unset_static_column_if_exists(cql, table4):
    p = unique_key_int()
    def select_rows():
        return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))

    # UPDATE SET s=UNSET_VALUE
    update1 = cql.prepare(f"UPDATE {table4} SET s=? WHERE p = {p} AND c = ? IF EXISTS")
    cql.execute(update1, [UNSET_VALUE, 1])
    assert select_rows() == []

    # Try the same with c = UNSET_VALUE
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [UNSET_VALUE, UNSET_VALUE])

    # Insert something into the table so that the updates actually change something
    cql.execute(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, 1, 2, 3)")

    # UPDATE SET s=UNSET_VALUE, r=123
    update2 = cql.prepare(f"UPDATE {table4} SET r=123, s=? WHERE p = {p} AND c = ? IF EXISTS")
    cql.execute(update2, [UNSET_VALUE, 1])
    assert select_rows() == [(p, 1, 2, 123)]

    # Try the same with c = UNSET_VALUE
    with pytest.raises(InvalidRequest):
        cql.execute(update2, [UNSET_VALUE, UNSET_VALUE])

    # UPDATE SET s=4321
    update3 = cql.prepare(f"UPDATE {table4} SET s=4321 WHERE p = {p} AND c = ? IF EXISTS")
    cql.execute(update3, [1])
    assert select_rows() == [(p, 1, 4321, 123)]

    # Try the same with c = UNSET_VALUE
    with pytest.raises(InvalidRequest):
        cql.execute(update3, [UNSET_VALUE])

    # UPDATE SET r=567, s=UNSET_VALUE
    update4 = cql.prepare(f"UPDATE {table4} SET r=567, s=? WHERE p = {p} AND c = ? IF EXISTS")
    cql.execute(update4, [UNSET_VALUE, 1])
    assert select_rows() == [(p, 1, 4321, 567)]

    # Try the same with c = UNSET_VALUE
    with pytest.raises(InvalidRequest):
        cql.execute(update4, [UNSET_VALUE, UNSET_VALUE])

# Test doing UPDATE table4 SET r=UNSET_VALUE IF EXISTS
def test_update_unset_regular_column_if_exists(cql, table4):
    p = unique_key_int()
    def select_rows():
        return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))

    # UPDATE SET r = UNSET_VALUE
    update1 = cql.prepare(f"UPDATE {table4} SET r = ? WHERE p = {p} AND c = ? IF EXISTS")
    cql.execute(update1, [UNSET_VALUE, 1])
    assert select_rows() == []

    # Try the same with c = UNSET_VALUE
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [UNSET_VALUE, UNSET_VALUE])

    # Insert something into the table so that the updates actually change something
    cql.execute(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, 1, 2, 3)")

    # UPDATE SET r = UNSET_VALUE, s = 100
    update2 = cql.prepare(f"UPDATE {table4} SET r = ?, s = 100 WHERE p = {p} AND c = ? IF EXISTS")
    cql.execute(update2, [UNSET_VALUE, 1])
    assert select_rows() == [(p, 1, 100, 3)]

    # Try the same with c = UNSET_VALUE
    with pytest.raises(InvalidRequest):
        cql.execute(update2, [UNSET_VALUE, UNSET_VALUE])

    # UPDATE SET r = 200
    update3 = cql.prepare(f"UPDATE {table4} SET r = 200 WHERE p = {p} AND c = ? IF EXISTS")
    cql.execute(update3, [1])
    assert select_rows() == [(p, 1, 100, 200)]

    # Try the same with c = UNSET_VALUE
    with pytest.raises(InvalidRequest):
        cql.execute(update3, [UNSET_VALUE])

    # UPDATE SET r = UNSET_VALUE, s = 300
    update4 = cql.prepare(f"UPDATE {table4} SET r = ?, s = 300 WHERE p = {p} AND c = ? IF EXISTS")
    cql.execute(update4, [UNSET_VALUE, 1])
    assert select_rows() == [(p, 1, 300, 200)]

    # Try the same with c = UNSET_VALUE
    with pytest.raises(InvalidRequest):
        cql.execute(update4, [UNSET_VALUE, UNSET_VALUE])

# Test doing UPDATE table4 SET s=UNSET_VALUE IF <lwt condition>; unset values
# in the LWT condition are also tested.
def test_update_unset_static_column_with_lwt(cql, table4):
    p = unique_key_int()
    def select_rows():
        return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))

    update1 = cql.prepare(f"UPDATE {table4} SET s = ? WHERE p = {p} AND c = ? IF s = ? AND r = ?")

    # UPDATE SET s = 123 WHERE c = 123 AND s = 123 AND r = 123
    cql.execute(update1, [123, 123, 123, 123])
    assert select_rows() == []

    # UPDATE SET s = UNSET_VALUE WHERE c = 123 AND s = 123 AND r = 123
    cql.execute(update1, [UNSET_VALUE, 123, 123, 123])
    assert select_rows() == []

    # Insert something into the table so that the updates actually change something
    cql.execute(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, 123, 123, 123)")

    # UPDATE SET s = UNSET_VALUE WHERE c = 123 AND s = 123 AND r = 123
    cql.execute(update1, [UNSET_VALUE, 123, 123, 123])
    assert select_rows() == [(p, 123, 123, 123)]

    # UPDATE table4 SET s = 321 WHERE c = 123 AND s = 123 AND r = 123
    cql.execute(update1, [321, 123, 123, 123])
    assert select_rows() == [(p, 123, 321, 123)]

    # Setting c (clustering column) to UNSET_VALUE should generate an error
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [123, UNSET_VALUE, 123, 123])

    # Doing IF s = UNSET generates an error
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [9000000, 123, UNSET_VALUE, 123])

    # Doing IF r = UNSET silently skips the update
    cql.execute(update1, [9000000, 123, 123, UNSET_VALUE])
    assert select_rows() == [(p, 123, 321, 123)]

    # Doing IF s = UNSET AND r = UNSET generates an error
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [9000000, 123, UNSET_VALUE, UNSET_VALUE])

    # Setting everything to UNSET generates an error
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [UNSET_VALUE, UNSET_VALUE, UNSET_VALUE, UNSET_VALUE])

# Test doing UPDATE table4 SET r=UNSET_VALUE IF <lwt condition>; unset values
# in the LWT condition are also tested.
def test_update_unset_regular_column_with_lwt(cql, table4):
    p = unique_key_int()
    def select_rows():
        return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))

    update1 = cql.prepare(f"UPDATE {table4} SET r = ? WHERE p = {p} AND c = ? IF s = ? AND r = ?")

    # UPDATE SET r = 123 WHERE c = 123 AND s = 123 AND r = 123
    cql.execute(update1, [123, 123, 123, 123])
    assert select_rows() == []

    # UPDATE SET r = UNSET_VALUE WHERE c = 123 AND s = 123 AND r = 123
    cql.execute(update1, [UNSET_VALUE, 123, 123, 123])
    assert select_rows() == []

    # Insert something into the table so that the updates actually change something
    cql.execute(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, 123, 123, 123)")

    # UPDATE SET r = UNSET_VALUE WHERE c = 123 AND s = 123 AND r = 123
    cql.execute(update1, [UNSET_VALUE, 123, 123, 123])
    assert select_rows() == [(p, 123, 123, 123)]

    # UPDATE table4 SET r = 321 WHERE c = 123 AND s = 123 AND r = 123
    cql.execute(update1, [321, 123, 123, 123])
    assert select_rows() == [(p, 123, 123, 321)]

    # Setting c (clustering column) to UNSET_VALUE should generate an error
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [123, UNSET_VALUE, 123, 123])

    # Doing IF s = UNSET generates an error
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [9000000, 123, UNSET_VALUE, 123])

    # Doing IF r = UNSET generates an error
    # (unlike in the static-column test above, where it silently skipped the update)
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [9000000, 123, 123, UNSET_VALUE])

    # Doing IF s = UNSET AND r = UNSET generates an error
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [9000000, 123, UNSET_VALUE, UNSET_VALUE])

    # Setting everything to UNSET generates an error
    with pytest.raises(InvalidRequest):
        cql.execute(update1, [UNSET_VALUE, UNSET_VALUE, UNSET_VALUE, UNSET_VALUE])


# A basic test that in a prepared statement with three assignments, one
# bound by an UNSET_VALUE is simply not done, but the other ones are.
# Try all 2^3 combinations of a 3-column update, with each column set to
# either a real value or an UNSET_VALUE.
def test_update_unset_value_basic(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f'UPDATE {table1} SET a=?, b=?, c=? WHERE p={p}')
    a = 1
    b = 2
    c = 3
    cql.execute(stmt, [a, b, c])
    assert [(a, b, c)] == list(cql.execute(f'SELECT a,b,c FROM {table1} WHERE p = {p}'))
    i = 4
    for unset_a in [False, True]:
        for unset_b in [False, True]:
            for unset_c in [False, True]:
                if unset_a:
                    newa = UNSET_VALUE
                else:
                    newa = i
                    a = i
                    i += 1
                if unset_b:
                    newb = UNSET_VALUE
                else:
                    newb = i
                    b = i
                    i += 1
                if unset_c:
                    newc = UNSET_VALUE
                else:
                    newc = i
                    c = i
                    i += 1
                cql.execute(stmt, [newa, newb, newc])
                assert [(a, b, c)] == list(cql.execute(f'SELECT a,b,c FROM {table1} WHERE p = {p}'))

# The expression "SET a=?" is skipped if the bound value is UNSET_VALUE.
# But what if it is part of a more complex expression like "SET a=(int)?+1"
# (arithmetic expression on the bind variable)? Does the SET also get
# skipped? Cassandra, and Scylla, decided that the answer will be no:
# We refuse to evaluate expressions involving an UNSET_VALUE, and in
# such a case the whole write request will fail instead of parts of it
# being skipped. See discussion in pull request #12517.

@pytest.mark.xfail(reason="issue #2693 - Scylla doesn't yet support arithmetic expressions")
def test_update_unset_value_expr_arithmetic(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f'UPDATE {table1} SET a=(int)?+1 WHERE p={p}')
    cql.execute(stmt, [7])
    assert [(8,)] == list(cql.execute(f'SELECT a FROM {table1} WHERE p = {p}'))
    with pytest.raises(InvalidRequest):
        cql.execute(stmt, [UNSET_VALUE])

# Despite the decision that expressions will not allow UNSET_VALUE, Cassandra
# decided that (quoting its NEWS.txt) "an unset bind counter operation does
# not change the counter value.". So "c = c + ?" for a counter, when given
# an UNSET_VALUE, will cause the write to be skipped, without error.
# The rationale is that "c = c + ?" is not an expression - it doesn't actually
# calculate c + ?, but rather it is a primitive increment operation, and
# passing ?=UNSET_VALUE should be able to skip this primitive operation.
def test_unset_counter_increment(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, c counter") as table:
        p = unique_key_int()
        stmt = cql.prepare(f'UPDATE {table} SET c=c+? WHERE p={p}')
        cql.execute(stmt, [3])
        assert [(3,)] == list(cql.execute(f'SELECT c FROM {table} WHERE p = {p}'))
        cql.execute(stmt, [UNSET_VALUE])
        assert [(3,)] == list(cql.execute(f'SELECT c FROM {table} WHERE p = {p}'))

# Like the counter increment, a list append operation (li=li+?) is a primitive
# operation and not an expression, so we believe UNSET_VALUE should be able
# to skip it, and Scylla indeed does, as this test shows. Cassandra fails
# this test - it produces an internal error on a bad cast, and we consider
# this a Cassandra bug, hence the cassandra_bug tag.
def test_unset_list_append(cql, table1, cassandra_bug):
    p = unique_key_int()
    stmt = cql.prepare(f'UPDATE {table1} SET li=li+? WHERE p={p}')
    cql.execute(stmt, [[7]])
    assert [([7],)] == list(cql.execute(f'SELECT li FROM {table1} WHERE p = {p}'))
    cql.execute(stmt, [UNSET_VALUE])
    assert [([7],)] == list(cql.execute(f'SELECT li FROM {table1} WHERE p = {p}'))

# According to Cassandra's NEWS.txt, "an unset bind ttl is treated as
# 'unlimited'". It shouldn't skip the write.
# Note that the NEWS.txt is not accurate: An unset ttl isn't really treated
# as unlimited, but rather as the default ttl set on the table. The default
# ttl is usually unlimited, but not always. We test that case in
# test_ttl.py::test_default_ttl_unset()
def test_unset_ttl(cql, table1):
    p = unique_key_int()
    # First write using a normal TTL:
    stmt = cql.prepare(f'UPDATE {table1} USING TTL ? SET a=? WHERE p={p}')
    cql.execute(stmt, [20000, 3])
    res = list(cql.execute(f'SELECT a, ttl(a) FROM {table1} WHERE p = {p}'))
    assert res[0].a == 3
    assert res[0].ttl_a > 10000
    # Check that an UNSET_VALUE ttl didn't skip the write but reset the TTL
    # to unlimited (None)
    cql.execute(stmt, [UNSET_VALUE, 4])
    assert [(4, None)] == list(cql.execute(f'SELECT a, ttl(a) FROM {table1} WHERE p = {p}'))

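# A minimal sketch of the default-ttl case mentioned above (the real test is
# test_ttl.py::test_default_ttl_unset()). The test name, the 3600-second
# default ttl, and the use of new_test_table()'s extra-schema-options
# argument are all choices made just for this illustration:
def test_unset_ttl_default_ttl_sketch(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, a int",
                        "WITH default_time_to_live = 3600") as table:
        p = unique_key_int()
        stmt = cql.prepare(f'UPDATE {table} USING TTL ? SET a=? WHERE p={p}')
        # An UNSET_VALUE ttl falls back to the table's default (3600 here),
        # so the written cell should carry a finite, positive ttl(a):
        cql.execute(stmt, [UNSET_VALUE, 4])
        res = list(cql.execute(f'SELECT a, ttl(a) FROM {table} WHERE p = {p}'))
        assert res[0].a == 4
        assert res[0].ttl_a is not None and 0 < res[0].ttl_a <= 3600
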
# According to Cassandra's NEWS.txt, "an unset bind timestamp is treated
# as 'now'". It shouldn't skip the write.
def test_unset_timestamp(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f'UPDATE {table1} USING TIMESTAMP ? SET a=? WHERE p={p}')
    cql.execute(stmt, [UNSET_VALUE, 3])
    assert [(3,)] == list(cql.execute(f'SELECT a FROM {table1} WHERE p = {p}'))

# According to Cassandra's NEWS.txt, "In a QUERY request an unset limit
|
||||
# is treated as 'unlimited'.". It mustn't cause the query to fail (let alone
|
||||
# be skipped somehow).
|
||||
def test_unset_limit(cql, table2):
|
||||
p = unique_key_int()
|
||||
cql.execute(f'INSERT INTO {table2} (p, c) VALUES ({p}, 1)')
|
||||
cql.execute(f'INSERT INTO {table2} (p, c) VALUES ({p}, 2)')
|
||||
cql.execute(f'INSERT INTO {table2} (p, c) VALUES ({p}, 3)')
|
||||
cql.execute(f'INSERT INTO {table2} (p, c) VALUES ({p}, 4)')
|
||||
stmt = cql.prepare(f'SELECT c FROM {table2} WHERE p={p} limit ?')
|
||||
assert [(1,),(2,)] == list(cql.execute(stmt, [2]))
|
||||
assert [(1,),(2,),(3,),(4,)] == list(cql.execute(stmt, [UNSET_VALUE]))
|
||||
|
||||
# TODO: check that (according to NEWS.txt documentation): "Unset tuple field,
# UDT field and map key are not allowed.".

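# A hedged sketch of the map-key part of that TODO, not a verified test: it
# assumes, per the NEWS.txt quote above, that an unset map key is rejected
# with an error rather than silently skipped. The map column is created here
# just for this illustration:
def test_unset_map_key_sketch(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, m map<int, int>") as table:
        p = unique_key_int()
        stmt = cql.prepare(f'UPDATE {table} SET m[?] = 7 WHERE p={p}')
        # Binding UNSET_VALUE as the map key should be refused:
        with pytest.raises(InvalidRequest):
            cql.execute(stmt, [UNSET_VALUE])
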
# Similar to test_unset_insert_where() above, just use an LWT write ("IF
# NOT EXISTS"). Test that using an UNSET_VALUE in an LWT condition causes
# a clear error, not a silent skip and not a crash as in issue #13001.
def test_unset_insert_where_lwt(cql, table2):
    p = unique_key_int()
    stmt = cql.prepare(f'INSERT INTO {table2} (p, c) VALUES ({p}, ?) IF NOT EXISTS')
    with pytest.raises(InvalidRequest, match="unset"):
        cql.execute(stmt, [UNSET_VALUE])

# Like test_unset_insert_where_lwt, but using UPDATE.
# The Python driver doesn't allow sending an UNSET_VALUE for the partition
# key, so only the clustering key is tested.
def test_unset_update_where_lwt(cql, table3):
    stmt = cql.prepare(f"UPDATE {table3} SET r = 42 WHERE p = 0 AND c = ? IF r = ?")

    with pytest.raises(InvalidRequest, match="unset"):
        cql.execute(stmt, [UNSET_VALUE, 2])

    with pytest.raises(InvalidRequest, match="unset"):
        cql.execute(stmt, [1, UNSET_VALUE])
Submodule tools/python3 updated: f725ec765b...c888f39bea
types.cc
@@ -757,6 +757,7 @@ bool abstract_type::is_collection() const {
 bool abstract_type::is_tuple() const {
     struct visitor {
         bool operator()(const abstract_type&) { return false; }
+        bool operator()(const reversed_type_impl& t) { return t.underlying_type()->is_tuple(); }
         bool operator()(const tuple_type_impl&) { return true; }
     };
     return visit(*this, visitor{});
@@ -1997,6 +1998,10 @@ data_value deserialize_aux(const tuple_type_impl& t, View v) {
 
 template<FragmentedView View>
 utils::multiprecision_int deserialize_value(const varint_type_impl&, View v) {
+    if (v.empty()) {
+        throw marshal_exception("cannot deserialize multiprecision int - empty buffer");
+    }
+    skip_empty_fragments(v);
     bool negative = v.current_fragment().front() < 0;
     utils::multiprecision_int num;
     while (v.size_bytes()) {
@@ -2093,6 +2098,7 @@ bool deserialize_value(const boolean_type_impl&, View v) {
     if (v.size_bytes() != 1) {
         throw marshal_exception(format("cannot deserialize boolean, size mismatch ({:d})", v.size_bytes()));
     }
+    skip_empty_fragments(v);
     return v.current_fragment().front() != 0;
 }
 
@@ -2264,9 +2270,11 @@ struct compare_visitor {
     std::strong_ordering operator()(const listlike_collection_type_impl& l) {
         using llpdi = listlike_partial_deserializing_iterator;
         auto sf = cql_serialization_format::internal();
-        return lexicographical_tri_compare(llpdi::begin(v1, sf), llpdi::end(v1, sf), llpdi::begin(v2, sf),
-                llpdi::end(v2, sf),
-                [&] (const managed_bytes_view& o1, const managed_bytes_view& o2) { return l.get_elements_type()->compare(o1, o2); });
+        return with_empty_checks([&] {
+            return lexicographical_tri_compare(llpdi::begin(v1, sf), llpdi::end(v1, sf), llpdi::begin(v2, sf),
+                    llpdi::end(v2, sf),
+                    [&] (const managed_bytes_view& o1, const managed_bytes_view& o2) { return l.get_elements_type()->compare(o1, o2); });
+        });
     }
     std::strong_ordering operator()(const map_type_impl& m) {
         return map_type_impl::compare_maps(m.get_keys_type(), m.get_values_type(), v1, v2);
 
@@ -289,7 +289,7 @@ public:
      * <b>Warning:</b> this method should only be used for querying as this
      * doesn't at all guarantee the uniqueness of the resulting UUID.
      */
-    static UUID min_time_UUID(milliseconds timestamp = milliseconds{0})
+    static UUID min_time_UUID(decimicroseconds timestamp = decimicroseconds{0})
     {
         auto uuid = UUID(create_time(from_unix_timestamp(timestamp)), MIN_CLOCK_SEQ_AND_NODE);
         assert(uuid.is_timestamp());
 
@@ -199,7 +199,13 @@ utils::config_type::to_json(const void* value) const {
 
 bool
 utils::config_file::config_src::matches(std::string_view name) const {
-    if (_name == name) {
+    // The below line provides support for option names in the "long_name,short_name" format,
+    // such as "workdir,W". We only want the long name ("workdir") to be used in the YAML.
+    // But since at some point (due to a bug) the YAML config parser expected the silly
+    // double form ("workdir,W") instead, we support both for backward compatibility.
+    std::string_view long_name = _name.substr(0, _name.find_first_of(','));
+
+    if (_name == name || long_name == name) {
         return true;
     }
     if (!_alias.empty() && _alias == name) {
 
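The backward-compatibility rule above is easy to model. Here is a toy Python sketch for illustration only; the function name and stored-name format are paraphrased from the C++ comment above, and this is not the real config API:

def matches(stored_name, query):
    # "workdir,W" stores the long name plus a short option letter; a YAML key
    # matches on the long name alone or, for backward compatibility, on the
    # full "long,short" form. Aliases are checked separately in the real code.
    long_name = stored_name.split(',')[0]
    return query == stored_name or query == long_name

assert matches("workdir,W", "workdir")      # long name matches
assert matches("workdir,W", "workdir,W")    # legacy double form matches
assert not matches("workdir,W", "W")        # short letter alone does not
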
@@ -60,6 +60,7 @@ concept HasClearGentlyMethod = requires (T x) {
 
 template <typename T>
 concept SmartPointer = requires (T x) {
+    { x.get() } -> std::same_as<typename T::element_type*>;
     { *x } -> std::same_as<typename T::element_type&>;
 };
 
@@ -177,7 +178,11 @@ future<> clear_gently(T& o) noexcept {
 
 template <SmartPointer T>
 future<> clear_gently(T& o) noexcept {
-    return internal::clear_gently(*o);
+    if (auto p = o.get()) {
+        return internal::clear_gently(*p);
+    } else {
+        return make_ready_future<>();
+    }
 }
 
 template <typename T, std::size_t N>