Files
scylladb/mutation/mutation.cc
Botond Dénes 3c34598d88 treewide: move away from tombstone_gc_state(nullptr) ctor
It is ambiguous; use the appropriate no-gc or gc-all factory instead.
A special note for mutation::compacted(): according to the comment above
it, it doesn't drop expired tombstones but as it is currently, it
actually does. Change the tombstone gc param for the underlying call to
compact_for_compaction() to uphold the comment. This is used in tests
mostly, so no fallout expected.

Tests are handled in the next commit, to reduce noise.

Two tests in mutation_test.cc have to be updated:

* test_compactor_range_tombstone_spanning_many_pages
  has to be updated in this commit, as it uses
  mutation_partition::compact_for_query() as well as
  compact_for_query(). The test passes default constructed
  tombstone_gc() to the latter while the former now uses no-gc
  creating a mismatch in tombstone gc behaviour, resulting in test
  failure. Update the test to also pass no-gc to compact_for_query().

* test_query_digest similarly uses mutation_partition::query_mutation()
  and another compaction method, having to match the no-gc now used in
  query_mutation().
2026-03-03 14:09:28 +02:00

492 lines
17 KiB
C++

/*
* Copyright (C) 2014-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <seastar/util/closeable.hh>
#include "mutation.hh"
#include "query/query-result-writer.hh"
#include "mutation_rebuilder.hh"
#include "mutation/json.hh"
#include "types/collection.hh"
#include "types/tuple.hh"
#include "dht/i_partitioner.hh"
#include "reader_concurrency_semaphore.hh"
#include "readers/from_mutations.hh"
logging::logger mlog("mutation");
// Construct mutation data from an already-decorated key, taking ownership
// of both the key and the schema; the partition starts out empty.
mutation::data::data(dht::decorated_key&& key, schema_ptr&& schema)
: _schema(std::move(schema))
, _dk(std::move(key))
, _p(*_schema)
{ }
// Construct mutation data from a raw partition key; the key is decorated
// (token computed) with the given schema. The partition starts out empty.
// Note: _schema is initialized (moved into) before _dk, so dereferencing
// it in the _dk initializer is safe.
mutation::data::data(partition_key&& key_, schema_ptr&& schema)
: _schema(std::move(schema))
, _dk(dht::decorate_key(*_schema, std::move(key_)))
, _p(*_schema)
{ }
// Construct mutation data with a deep copy of an existing partition.
// Takes ownership of the schema and the decorated key.
mutation::data::data(schema_ptr&& schema, dht::decorated_key&& key, const mutation_partition& mp)
    : _schema(std::move(schema)) // move (not copy) the rvalue schema: avoids a needless shared_ptr refcount bump, matching the sibling ctors
    , _dk(std::move(key))
    , _p(*_schema, mp) // _schema is already initialized at this point (declared first), so dereferencing it is safe
{ }
// Construct mutation data by taking ownership of an existing partition,
// key and schema — no copies are made.
mutation::data::data(schema_ptr&& schema, dht::decorated_key&& key, mutation_partition&& mp)
: _schema(std::move(schema))
, _dk(std::move(key))
, _p(std::move(mp))
{ }
// Store a cell into the static row, merging per row::apply() semantics.
void mutation::set_static_cell(const column_definition& def, atomic_cell_or_collection&& value) {
    auto& static_cells = partition().static_row();
    static_cells.apply(def, std::move(value));
}
void mutation::set_static_cell(const bytes& name, const data_value& value, api::timestamp_type timestamp, ttl_opt ttl) {
auto column_def = schema()->get_column_definition(name);
if (!column_def) {
throw std::runtime_error(format("no column definition found for '{}'", name));
}
if (!column_def->is_static()) {
throw std::runtime_error(format("column '{}' is not static", name));
}
partition().static_row().apply(*column_def, atomic_cell::make_live(*column_def->type, timestamp, column_def->type->decompose(value), ttl));
}
void mutation::set_clustered_cell(const clustering_key& key, const bytes& name, const data_value& value,
api::timestamp_type timestamp, ttl_opt ttl) {
auto column_def = schema()->get_column_definition(name);
if (!column_def) {
throw std::runtime_error(format("no column definition found for '{}'", name));
}
return set_clustered_cell(key, *column_def, atomic_cell::make_live(*column_def->type, timestamp, column_def->type->decompose(value), ttl));
}
// Store a cell into the clustered row identified by `key`, creating the
// row if it does not exist yet.
void mutation::set_clustered_cell(const clustering_key& key, const column_definition& def, atomic_cell_or_collection&& value) {
    auto& cells = partition().clustered_row(*schema(), key).cells();
    cells.apply(def, std::move(value));
}
void mutation::set_cell(const clustering_key_prefix& prefix, const bytes& name, const data_value& value,
api::timestamp_type timestamp, ttl_opt ttl) {
auto column_def = schema()->get_column_definition(name);
if (!column_def) {
throw std::runtime_error(format("no column definition found for '{}'", name));
}
return set_cell(prefix, *column_def, atomic_cell::make_live(*column_def->type, timestamp, column_def->type->decompose(value), ttl));
}
// Dispatch a cell write to the static or clustered row depending on the
// column's kind. Key columns cannot hold cells.
// Throws std::runtime_error when `def` is a key column.
void mutation::set_cell(const clustering_key_prefix& prefix, const column_definition& def, atomic_cell_or_collection&& value) {
    if (def.is_regular()) {
        set_clustered_cell(prefix, def, std::move(value));
        return;
    }
    if (def.is_static()) {
        set_static_cell(def, std::move(value));
        return;
    }
    throw std::runtime_error("attempting to store into a key cell");
}
// Two mutations are equal when they address the same partition key and
// their partitions compare equal under the respective schemas.
bool mutation::operator==(const mutation& m) const {
    if (!decorated_key().equal(*schema(), m.decorated_key())) {
        return false;
    }
    return partition().equal(*schema(), m.partition(), *m.schema());
}
// Count the rows of this mutation that are live at `query_time`.
uint64_t
mutation::live_row_count(gc_clock::time_point query_time) const {
    const auto& s = *schema();
    return partition().live_row_count(s, query_time);
}
// Order mutations by their decorated (token-first) partition key.
bool
mutation_decorated_key_less_comparator::operator()(const mutation& m1, const mutation& m2) const {
    const auto& s = *m1.schema();
    return m1.decorated_key().less_compare(s, m2.decorated_key());
}
// Return the subrange of `partitions` (assumed sorted by decorated key)
// that intersects the partition range `r`. Unbounded ends of `r` map to
// the corresponding end of the vector.
std::ranges::subrange<utils::chunked_vector<mutation>::const_iterator>
slice(const utils::chunked_vector<mutation>& partitions, const dht::partition_range& r) {
    // Heterogeneous comparator between a ring position and a mutation,
    // usable from both lower_bound and upper_bound.
    struct cmp {
        bool operator()(const dht::ring_position& pos, const mutation& m) const {
            return m.decorated_key().tri_compare(*m.schema(), pos) > 0;
        };
        bool operator()(const mutation& m, const dht::ring_position& pos) const {
            return m.decorated_key().tri_compare(*m.schema(), pos) < 0;
        };
    };
    auto first = partitions.cbegin();
    if (r.start()) {
        // Inclusive start keeps equal keys -> lower_bound; exclusive skips them.
        first = r.start()->is_inclusive()
                ? std::lower_bound(partitions.begin(), partitions.end(), r.start()->value(), cmp())
                : std::upper_bound(partitions.begin(), partitions.end(), r.start()->value(), cmp());
    }
    auto last = partitions.cend();
    if (r.end()) {
        // Inclusive end keeps equal keys -> upper_bound; exclusive drops them.
        last = r.end()->is_inclusive()
                ? std::upper_bound(partitions.begin(), partitions.end(), r.end()->value(), cmp())
                : std::lower_bound(partitions.begin(), partitions.end(), r.end()->value(), cmp());
    }
    return std::ranges::subrange(first, last);
}
// Convert this mutation in place to `new_schema`. No-op when the schema
// is already the requested one.
void
mutation::upgrade(const schema_ptr& new_schema) {
    if (_ptr->_schema == new_schema) {
        return;
    }
    // Keep a local reference so the schema stays alive; upgrade the
    // partition first and only then publish the new schema pointer.
    schema_ptr s = new_schema;
    partition().upgrade(*schema(), *new_schema);
    _ptr->_schema = std::move(s);
}
// Merge `m` into this mutation, consuming its partition.
// Application stats are collected locally and discarded.
void mutation::apply(mutation&& m) {
    mutation_application_stats stats;
    partition().apply(*schema(), std::move(m.partition()), *m.schema(), stats);
}
void mutation::apply(const mutation& m) {
mutation_application_stats app_stats;
partition().apply(*schema(), m.partition(), *m.schema(), app_stats);
}
// Apply a single mutation fragment to this mutation's partition.
void mutation::apply(const mutation_fragment& mf) {
    auto& p = partition();
    p.apply(*schema(), mf);
}
// Copy assignment via the copy-and-move idiom: build the copy first so a
// throwing copy leaves *this untouched.
mutation& mutation::operator=(const mutation& m) {
    mutation copy(m);
    *this = std::move(copy);
    return *this;
}
// Return the merge of this mutation and `other`, leaving both operands
// unchanged.
mutation mutation::operator+(const mutation& other) const {
    mutation result(*this);
    result.apply(other);
    return result;
}
// Merge a copy of `other` into this mutation.
mutation& mutation::operator+=(const mutation& other) {
    this->apply(other);
    return *this;
}
// Merge `other` into this mutation, consuming it.
mutation& mutation::operator+=(mutation&& other) {
    this->apply(std::move(other));
    return *this;
}
// Return a copy of this mutation restricted to the given clustering ranges.
mutation mutation::sliced(const query::clustering_row_ranges& ranges) const {
    auto restricted = partition().sliced(*schema(), ranges);
    return mutation(schema(), decorated_key(), std::move(restricted));
}
// Return a compacted copy of this mutation.
// Compaction runs with query time gc_clock::time_point::min() and a no-gc
// tombstone_gc_state, so expired tombstones are deliberately NOT dropped;
// `always_gc` is passed as the can-gc predicate for everything else.
// NOTE(review): used mostly by tests — keep the no-expiry behaviour intact
// (see the commit message about compact_for_compaction()'s parameters).
mutation mutation::compacted() const {
auto m = *this;
m.partition().compact_for_compaction(*schema(), always_gc, m.decorated_key(), gc_clock::time_point::min(), tombstone_gc_state::no_gc());
return m;
}
// Estimate the total memory footprint of this mutation: the handle itself
// plus, when engaged, the shared data block and the external memory of the
// key and partition.
size_t mutation::memory_usage(const ::schema& s) const {
    size_t total = sizeof(*this);
    if (!_ptr) {
        return total; // moved-from/empty mutation: just the handle
    }
    total += sizeof(data);
    total += _ptr->_dk.external_memory_usage();
    total += _ptr->_p.external_memory_usage(s);
    return total;
}
// Produce the given mutation re-expressed under the reversed schema
// (clustering order flipped), by streaming it in reverse into a rebuilder.
mutation reverse(mutation mut) {
    auto rev_schema = mut.schema()->make_reversed();
    mutation_rebuilder_v2 rebuilder(std::move(rev_schema));
    auto result = std::move(mut).consume(rebuilder, consume_in_reverse::yes).result;
    return *std::move(result);
}
namespace {
// Mutation-fragment consumer that splits a partition's fragment stream into
// multiple mutations, each kept below a soft size limit, invoking a callback
// for every produced piece. Driven by mutation_reader::consume() in
// for_each_split_mutation() below.
class mutation_by_size_splitter {
// Per-partition accumulation state; recreated on every new partition.
struct partition_state {
builder; // rebuilds the current piece from consumed fragments -- see field below
mutation_rebuilder_v2 builder;
// Memory footprint of an empty partition (key + overhead), charged
// against the limit on every piece.
size_t empty_partition_size;
// Bytes of fragments accumulated in `builder` since the last flush.
size_t size = 0;
explicit partition_state(schema_ptr schema)
: builder(std::move(schema))
{
}
};
const schema_ptr _schema;
const size_t _max_size; // soft per-piece size limit, in bytes
std::function<void(mutation)> _process_mutation; // receives each finished piece
std::optional<partition_state> _state; // engaged between partition start/end
// Charge `fragment` against the budget and feed it to the builder,
// flushing the accumulated piece first if adding the fragment would
// exceed _max_size. Never stops the stream.
template <typename T>
stop_iteration consume_fragment(T&& fragment) {
const auto fragment_size = fragment.memory_usage(*_schema);
if (_state->size && _state->size + _state->empty_partition_size + fragment_size > _max_size) {
// We could end up with an empty mutation if we consumed a range_tombstone_change
// and the next fragment exceeds the limit. The tombstone range may not have been
// closed yet and range_tombstone will not be created.
// This should be a rare case though, so just pop such mutation.
auto m = _state->builder.flush();
if (!m.partition().empty()) {
_process_mutation(std::move(m));
}
_state->size = 0;
}
_state->size += fragment_size;
_state->builder.consume(std::move(fragment));
return stop_iteration::no;
}
public:
mutation_by_size_splitter(schema_ptr schema, size_t max_size, std::function<void(mutation)> process_mutation)
: _schema(std::move(schema))
, _max_size(max_size)
, _process_mutation(process_mutation)
{
}
void consume_new_partition(const dht::decorated_key& dk) {
// Reset state and record how much an empty partition already costs.
_state.emplace(_schema);
_state->empty_partition_size = _state->builder.consume_new_partition(dk).memory_usage(*_schema);
}
void consume(tombstone t) {
// Partition tombstone: forwarded to the builder, not size-accounted.
_state->builder.consume(t);
}
stop_iteration consume(static_row&& sr) {
return consume_fragment(std::move(sr));
}
stop_iteration consume(clustering_row&& cr) {
return consume_fragment(std::move(cr));
}
stop_iteration consume(range_tombstone_change&& rtc) {
return consume_fragment(std::move(rtc));
}
stop_iteration consume_end_of_partition() {
_state->builder.consume_end_of_partition();
if (auto mut_opt = _state->builder.consume_end_of_stream(); mut_opt) {
// This final mutation could be empty if the last consumed fragment was a range_tombstone_change
// with no timestamp (i.e. a closing rtc), but a range_tombstone ending at this position
// was already emitted in the previous mutation (because the previous mutation was flushed
// after consuming a clustering_row at that position).
if (!mut_opt->partition().empty()) {
_process_mutation(std::move(*mut_opt));
}
} else {
on_internal_error(mlog, "consume_end_of_stream didn't return a mutation");
}
_state.reset();
return stop_iteration::no;
}
stop_iteration consume_end_of_stream() {
return stop_iteration::no;
}
};
}
// Split `source` into pieces of at most ~max_size bytes and invoke
// process_mutation for each piece, by streaming it through
// mutation_by_size_splitter.
// The inner scope is deliberate: the reader (and the permit it holds) must
// be destroyed before the semaphore is stopped.
future<> for_each_split_mutation(mutation source, size_t max_size, std::function<void(mutation)> process_mutation) {
reader_concurrency_semaphore sem(reader_concurrency_semaphore::no_limits{}, "split_mutation",
reader_concurrency_semaphore::register_metrics::no);
{
auto s = source.schema();
auto reader = make_mutation_reader_from_mutations(s,
sem.make_tracking_only_permit(s, "split_mutation", db::no_timeout, {}),
std::move(source));
// with_closeable guarantees the reader is closed even if consume() throws.
co_await with_closeable(std::move(reader), [&] (mutation_reader& reader) {
return reader.consume(mutation_by_size_splitter(s, max_size, std::move(process_mutation)));
});
}
co_await sem.stop();
}
// Split `source` into pieces of at most ~max_size bytes and append them to
// `target`, which must stay alive until the returned future resolves.
future<> split_mutation(mutation source, utils::chunked_vector<mutation>& target, size_t max_size) {
    auto appender = [&target] (mutation m) {
        target.emplace_back(std::move(m));
    };
    return for_each_split_mutation(std::move(source), max_size, std::move(appender));
}
// Format a mutation for logging/debugging as:
//   {table: 'ks.cf', key: {'col': value, ..., token: T}, <partition>\n}
// The exact text is relied upon by log output; do not change it casually.
auto fmt::formatter<mutation>::format(const mutation& m, fmt::format_context& ctx) const
-> decltype(ctx.out()) {
const ::schema& s = *m.schema();
const auto& dk = m.decorated_key();
auto out = ctx.out();
out = fmt::format_to(out, "{{table: '{}.{}', key: {{", s.ks_name(), s.cf_name());
// Walk the partition-key components in lockstep with their types and
// column definitions so each component can be rendered as text.
auto type_iterator = dk._key.get_compound_type(s)->types().begin();
auto column_iterator = s.partition_key_columns().begin();
for (auto&& e : dk._key.components(s)) {
fmt::format_to(out, "'{}': {}, ", column_iterator->name_as_text(), (*type_iterator)->to_string(to_bytes(e)));
++type_iterator;
++column_iterator;
}
return fmt::format_to(out, "token: {}}}, {}\n}}", dk._token, mutation_partition::printer(s, m.partition()));
}
namespace mutation_json {
// Emit the cells of a collection or tuple mutation as a JSON array of
// {key, value} objects, delegating each value to `func`.
// Collections render the serialized key via the key comparator; tuples have
// no meaningful key, so an empty string is written and the element type is
// selected by position. For any other type, JSON null is emitted.
void mutation_partition_json_writer::write_each_collection_cell(const collection_mutation_view_description& mv, data_type type,
std::function<void(atomic_cell_view, data_type)> func) {
std::function<void(size_t, bytes_view)> write_key;
std::function<void(size_t, atomic_cell_view)> write_value;
if (auto t = dynamic_cast<const collection_type_impl*>(type.get())) {
// Captured comparators are shared_ptr-like data_types, so they keep
// the type alive for the duration of the loop below.
write_key = [this, t = t->name_comparator()] (size_t, bytes_view k) { _writer.String(t->to_string(k)); };
write_value = [t = t->value_comparator(), &func] (size_t, atomic_cell_view v) { func(v, t); };
} else if (auto t = dynamic_cast<const tuple_type_impl*>(type.get())) {
// NOTE: `t` is a raw pointer into `type`, valid for this call only.
write_key = [this] (size_t i, bytes_view) { _writer.String(""); };
write_value = [t, &func] (size_t i, atomic_cell_view v) { func(v, t->type(i)); };
}
if (write_key && write_value) {
_writer.StartArray();
for (size_t i = 0; i < mv.cells.size(); ++i) {
_writer.StartObject();
_writer.Key("key");
write_key(i, mv.cells[i].first);
_writer.Key("value");
write_value(i, mv.cells[i].second);
_writer.EndObject();
}
_writer.EndArray();
} else {
// Neither a collection nor a tuple: nothing sensible to render.
_writer.Null();
}
}
// Render a gc_clock time point as a UTC "YYYY-MM-DD HH:MM:SSz" string.
sstring mutation_partition_json_writer::to_string(gc_clock::time_point tp) {
    const auto secs = gc_clock::to_time_t(tp);
    return fmt::format("{:%F %T}z", fmt::gmtime(secs));
}
// Emit the value of an atomic cell: plain cells as the type's string
// rendering, counter updates as the delta, counter shards via write().
void mutation_partition_json_writer::write_atomic_cell_value(const atomic_cell_view& cell, data_type type) {
    if (!type->is_counter()) {
        _writer.String(type->to_string(cell.value().linearize()));
        return;
    }
    if (cell.is_counter_update()) {
        _writer.Int64(cell.counter_update_value());
    } else {
        write(counter_cell_view(cell));
    }
}
void mutation_partition_json_writer::write_collection_value(const collection_mutation_view_description& mv, data_type type) {
write_each_collection_cell(mv, type, [&] (atomic_cell_view v, data_type t) {
if (v.is_live()) {
write_atomic_cell_value(v, t);
} else {
writer().Null();
}
});
}
// Emit the "ttl"/"expiry" member pair shared by expiring cells and row
// markers. Must be called between StartObject()/EndObject() of the owner.
void mutation_partition_json_writer::write(gc_clock::duration ttl, gc_clock::time_point expiry) {
_writer.Key("ttl");
_writer.AsString(ttl);
_writer.Key("expiry");
_writer.String(to_string(expiry));
}
// Emit a tombstone as a JSON object; an absent (falsy) tombstone renders
// as an empty object {}.
void mutation_partition_json_writer::write(const tombstone& t) {
_writer.StartObject();
if (t) {
_writer.Key("timestamp");
_writer.Int64(t.timestamp);
_writer.Key("deletion_time");
_writer.String(to_string(t.deletion_time));
}
_writer.EndObject();
}
// Emit a row marker: always the timestamp, plus ttl/expiry when the marker
// is live and expiring.
void mutation_partition_json_writer::write(const row_marker& m) {
_writer.StartObject();
_writer.Key("timestamp");
_writer.Int64(m.timestamp());
if (m.is_live() && m.is_expiring()) {
write(m.ttl(), m.expiry());
}
_writer.EndObject();
}
// Emit a counter cell as an array of its shards, each with the shard id,
// accumulated value and logical clock.
void mutation_partition_json_writer::write(counter_cell_view cv) {
_writer.StartArray();
for (const auto& shard : cv.shards()) {
_writer.StartObject();
_writer.Key("id");
_writer.AsString(shard.id());
_writer.Key("value");
_writer.Int64(shard.value());
_writer.Key("clock");
_writer.Int64(shard.logical_clock());
_writer.EndObject();
}
_writer.EndArray();
}
// Emit an atomic cell as a JSON object with liveness, a type tag
// ("counter-update"/"counter-shards"/"frozen-collection"/"regular"),
// timestamp, ttl/expiry or deletion_time where applicable, and — when
// requested and meaningful — the value itself.
// Member order is part of the output format; keep it stable.
void mutation_partition_json_writer::write(const atomic_cell_view& cell, data_type type, bool include_value) {
_writer.StartObject();
_writer.Key("is_live");
_writer.Bool(cell.is_live());
_writer.Key("type");
if (type->is_counter()) {
if (cell.is_counter_update()) {
_writer.String("counter-update");
} else {
_writer.String("counter-shards");
}
} else if (type->is_collection()) {
_writer.String("frozen-collection");
} else {
_writer.String("regular");
}
_writer.Key("timestamp");
_writer.Int64(cell.timestamp());
// Counters carry no ttl/deletion info of their own.
if (!type->is_counter()) {
if (cell.is_live_and_has_ttl()) {
write(cell.ttl(), cell.expiry());
}
if (!cell.is_live()) {
_writer.Key("deletion_time");
_writer.String(to_string(cell.deletion_time()));
}
}
// Dead non-counter cells have no value to show.
if (include_value && (type->is_counter() || cell.is_live())) {
_writer.Key("value");
write_atomic_cell_value(cell, type);
}
_writer.EndObject();
}
void mutation_partition_json_writer::write(const collection_mutation_view_description& mv, data_type type, bool include_value) {
_writer.StartObject();
if (mv.tomb) {
_writer.Key("tombstone");
write(mv.tomb);
}
_writer.Key("cells");
write_each_collection_cell(mv, type, [&] (atomic_cell_view v, data_type t) { write(v, t, include_value); });
_writer.EndObject();
}
// Emit a cell according to its column's kind: atomic cells directly,
// collection/UDT cells after deserialization, anything else as null.
void mutation_partition_json_writer::write(const atomic_cell_or_collection& cell, const column_definition& cdef, bool include_value) {
    if (cdef.is_atomic()) {
        write(cell.as_atomic_cell(cdef), cdef.type, include_value);
        return;
    }
    if (!cdef.type->is_collection() && !cdef.type->is_user_type()) {
        _writer.Null();
        return;
    }
    cell.as_collection_mutation().with_deserialized(*cdef.type, [&, this] (collection_mutation_view_description mv) {
        write(mv, cdef.type, include_value);
    });
}
// Emit a row as a JSON object: one member per cell, keyed by column name.
void mutation_partition_json_writer::write(const row& r, column_kind kind, bool include_value) {
    _writer.StartObject();
    r.for_each_cell([this, kind, include_value] (column_id id, const atomic_cell_or_collection& cell) {
        // Bind by const reference: plain `auto` copied a whole
        // column_definition for every cell, which is wasteful.
        const auto& cdef = _schema.column_at(kind, id);
        _writer.Key(cdef.name_as_text());
        write(cell, cdef, include_value);
    });
    _writer.EndObject();
}
} // namespace mutation_json