/* * Copyright (C) 2015-present ScyllaDB */ /* * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 */ #pragma once #include #include #include "sstables/types.hh" #include "sstables/component_type.hh" #include "checksum_utils.hh" #include "vint-serialization.hh" #include #include "version.hh" #include "counters.hh" #include "dht/i_partitioner_fwd.hh" #include namespace sstables { class index_sampling_state; class compression; class metadata_collector; class sizing_data_sink : public data_sink_impl { uint64_t& _size; public: explicit sizing_data_sink(uint64_t& dest) : _size(dest) { _size = 0; } virtual temporary_buffer allocate_buffer(size_t size) override { return temporary_buffer(size); } virtual future<> put(net::packet data) override { _size += data.len(); return make_ready_future<>(); } virtual future<> put(std::vector> data) override { _size += std::ranges::fold_left(data | std::views::transform(std::mem_fn(&temporary_buffer::size)), 0, std::plus{}); return make_ready_future<>(); } virtual future<> put(temporary_buffer buf) override { _size += buf.size(); return make_ready_future<>(); } virtual future<> flush() override { return make_ready_future<>(); } virtual future<> close() override { return make_ready_future<>(); } }; inline output_stream make_sizing_output_stream(uint64_t& dest) { return output_stream(data_sink(std::make_unique(std::ref(dest))), 4096); } // Must be called from a thread template uint64_t serialized_size(sstable_version_types v, const T& object) { uint64_t size = 0; auto writer = file_writer(make_sizing_output_stream(size)); write(v, writer, object); writer.close(); return size; } template requires ChecksumUtils class checksummed_file_data_sink_impl : public data_sink_impl { data_sink _out; struct checksum& _c; uint32_t& _full_checksum; public: checksummed_file_data_sink_impl(data_sink out, struct checksum& c, uint32_t& full_file_checksum) : _out(std::move(out)) , _c(c) , _full_checksum(full_file_checksum) {} virtual temporary_buffer allocate_buffer(size_t size) override { return _out.allocate_buffer(size); // preserve alignment requirements } virtual future<> put(net::packet data) override { abort(); } virtual future<> put(temporary_buffer buf) override { // bufs will usually be a multiple of chunk size, but this won't be the case for // the last buffer being flushed. for (size_t offset = 0; offset < buf.size(); offset += _c.chunk_size) { size_t size = std::min(size_t(_c.chunk_size), buf.size() - offset); uint32_t per_chunk_checksum = ChecksumType::init_checksum(); per_chunk_checksum = ChecksumType::checksum(per_chunk_checksum, buf.begin() + offset, size); _full_checksum = checksum_combine_or_feed(_full_checksum, per_chunk_checksum, buf.begin() + offset, size); _c.checksums.push_back(per_chunk_checksum); } return _out.put(std::move(buf)); } virtual future<> flush() override { return _out.flush(); } virtual future<> close() override { // Nothing to do, because close at the file_stream level will call flush on us. return _out.close(); } virtual size_t buffer_size() const noexcept override { return _out.buffer_size(); } }; template requires ChecksumUtils class checksummed_file_data_sink : public data_sink { public: checksummed_file_data_sink(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum) : data_sink(std::make_unique>(std::move(out), cinfo, full_file_checksum)) {} }; template requires ChecksumUtils inline output_stream make_checksummed_file_output_stream(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum) { return output_stream(checksummed_file_data_sink(std::move(out), cinfo, full_file_checksum)); } template requires ChecksumUtils class checksummed_file_writer : public file_writer { checksum _c; uint32_t _full_checksum; public: checksummed_file_writer(data_sink out, size_t buffer_size, component_name c) : file_writer(make_checksummed_file_output_stream(std::move(out), _c, _full_checksum), std::move(c)) , _c(uint32_t(std::min(size_t(DEFAULT_CHUNK_SIZE), buffer_size)), {}) , _full_checksum(ChecksumType::init_checksum()) {} // Since we are exposing a reference to _full_checksum, we delete the move // constructor. If it is moved, the reference will refer to the old // location. checksummed_file_writer(checksummed_file_writer&&) = delete; checksummed_file_writer(const checksummed_file_writer&) = default; checksum& finalize_checksum() { return _c; } uint32_t full_checksum() { return _full_checksum; } }; using adler32_checksummed_file_writer = checksummed_file_writer; using crc32_checksummed_file_writer = checksummed_file_writer; template requires Writer inline void write_vint_impl(W& out, T value) { using vint_type = std::conditional_t, unsigned_vint, signed_vint>; std::array encoding_buffer; const auto size = vint_type::serialize(value, encoding_buffer.begin()); out.write(reinterpret_cast(encoding_buffer.data()), size); } template requires Writer void write_unsigned_vint(W& out, uint64_t value) { return write_vint_impl(out, value); } template requires Writer void write_signed_vint(W& out, int64_t value) { return write_vint_impl(out, value); } template inline void write_vint(W& out, T value) { return std::is_unsigned_v ? write_unsigned_vint(out, value) : write_signed_vint(out, value); } template void write(sstable_version_types v, W& out, T i) { i = net::hton(i); out.write(reinterpret_cast(&i), sizeof(T)); } template requires Writer && std::is_enum_v inline void write(sstable_version_types v, W& out, T i) { write(v, out, static_cast::type>(i)); } template requires Writer inline void write(sstable_version_types v, W& out, bool i) { write(v, out, static_cast(i)); } inline void write(sstable_version_types v, file_writer& out, double d) { unsigned long tmp = net::hton(std::bit_cast(d)); out.write(reinterpret_cast(&tmp), sizeof(unsigned long)); } inline void write(sstable_version_types v, file_writer& out, const utils::UUID& uuid) { out.write(uuid.serialize()); } template inline void write(sstable_version_types v, file_writer& out, const utils::tagged_uuid& id) { write(v, out, id.uuid()); } template requires Writer inline void write(sstable_version_types v, W& out, const bytes& s) { out.write(s); } template requires Writer inline void write(sstable_version_types v, W& out, bytes_view s) { out.write(reinterpret_cast(s.data()), s.size()); } template requires Writer inline void write(sstable_version_types v, W& out, managed_bytes_view s) { for (bytes_view fragment : fragment_range(s)) { write(v, out, fragment); } } inline void write(sstable_version_types v, file_writer& out, const bytes_ostream& s) { for (bytes_view fragment : s) { write(v, out, fragment); } } template requires Writer inline void write(sstable_version_types v, W& out, const First& first, const Second& second, Rest&&... rest) { write(v, out, first); write(v, out, second, std::forward(rest)...); } template requires Writer inline void write(sstable_version_types v, W& out, const vint& t) { write_vint(out, t.value); } template void write(sstable_version_types v, W& out, const T& t) { // describe_type() is not const correct, so cheat here: const_cast(t).describe_type(v, [v, &out] (auto&&... what) -> void { write(v, out, std::forward(what)...); }); } template void check_truncate_and_assign(T& to, const U from) { to = from; if (to != from) { throw std::overflow_error("assigning U to T caused an overflow"); } } template void write(sstable_version_types v, file_writer& out, const disk_string& s) { Size len = 0; check_truncate_and_assign(len, s.value.size()); write(v, out, len); write(v, out, s.value); } inline void write(sstable_version_types v, file_writer& out, const disk_string_vint_size& s) { uint64_t len = 0; check_truncate_and_assign(len, s.value.size()); write_vint(out, len); write(v, out, s.value); } template inline void write(sstable_version_types v, file_writer& out, const disk_string_view& s) { Size len; check_truncate_and_assign(len, s.value.size()); write(v, out, len, s.value); } template inline void write(sstable_version_types ver, file_writer& out, const disk_data_value_view& v) { SizeType length; check_truncate_and_assign(length, v.value.size()); write(ver, out, length); for (bytes_view frag : fragment_range(v.value)) { write(ver, out, frag); } } template inline void write(sstable_version_types v, file_writer& out, const utils::chunked_vector& arr) { for (auto& a : arr) { write(v, out, a); } } template inline void write(sstable_version_types v, file_writer& out, const utils::chunked_vector& arr) { std::vector tmp; size_t per_loop = 100000 / sizeof(Members); tmp.resize(per_loop); size_t idx = 0; while (idx != arr.size()) { auto now = std::min(arr.size() - idx, per_loop); // copy arr into tmp converting each entry into big-endian representation. auto nr = arr.begin() + idx; for (size_t i = 0; i < now; i++) { tmp[i] = net::hton(nr[i]); } auto p = reinterpret_cast(tmp.data()); auto bytes = now * sizeof(Members); out.write(p, bytes); idx += now; } } template inline void write(sstable_version_types v, file_writer& out, const std::optional& opt) { write(v, out, bool(opt)); if (bool(opt)) { write(v, out, *opt); } } template inline void write(sstable_version_types v, file_writer& out, const disk_array& arr) { Size len = 0; check_truncate_and_assign(len, arr.elements.size()); write(v, out, len); write(v, out, arr.elements); } template inline void write(sstable_version_types v, file_writer& out, const disk_array_vint_size& arr) { uint64_t len = 0; check_truncate_and_assign(len, arr.elements.size()); write_vint(out, len); write(v, out, arr.elements); } template inline void write(sstable_version_types v, file_writer& out, const disk_array_ref& arr) { Size len = 0; check_truncate_and_assign(len, arr.elements.size()); write(v, out, len); write(v, out, arr.elements); } template inline void write(sstable_version_types v, file_writer& out, const std::unordered_map& map) { for (auto& val: map) { write(v, out, val.first, val.second); }; } template inline void write(sstable_version_types v, file_writer& out, const std::pair& val) { write(v, out, val.first, val.second); } template inline void write(sstable_version_types v, file_writer& out, const disk_hash& h) { Size len = 0; check_truncate_and_assign(len, h.map.size()); write(v, out, len); write(v, out, h.map); } class bytes_writer_for_column_name { bytes _buf; bytes::iterator _pos; public: void prepare(size_t size) { _buf = bytes(bytes::initialized_later(), size); _pos = _buf.begin(); } template void write(Args&&... args) { auto write_one = [this] (bytes_view data) { _pos = std::copy(data.begin(), data.end(), _pos); }; auto ignore = { (write_one(bytes_view(args)), 0)... }; (void)ignore; } bytes&& release() && { return std::move(_buf); } }; class file_writer_for_column_name { sstable_version_types _v; file_writer& _fw; public: file_writer_for_column_name(sstable_version_types v, file_writer& fw) : _v(v), _fw(fw) { } void prepare(uint16_t size) { sstables::write(_v, _fw, size); } template void write(Args&&... args) { sstables::write(_v, _fw, std::forward(args)...); } }; template void write_compound_non_dense_column_name(sstable_version_types v, Writer& out, const composite& clustering_key, const std::vector& column_names, composite::eoc marker = composite::eoc::none) { // was defined in the schema, for example. auto c = composite::from_exploded(column_names, true, marker); auto ck_bview = bytes_view(clustering_key); // The marker is not a component, so if the last component is empty (IOW, // only serializes to the marker), then we just replace the key's last byte // with the marker. If the component however it is not empty, then the // marker should be in the end of it, and we just join them together as we // do for any normal component if (c.size() == 1) { if (ck_bview.empty()) { throw std::runtime_error("Open-ended range tombstones are not allowed in LA/KA SSTables."); } ck_bview.remove_suffix(1); } size_t sz = ck_bview.size() + c.size(); if (sz > std::numeric_limits::max()) { throw std::runtime_error(format("Column name too large ({:d} > {:d})", sz, std::numeric_limits::max())); } out.prepare(uint16_t(sz)); out.write(ck_bview, c); } inline void write_compound_non_dense_column_name(sstable_version_types v, file_writer& out, const composite& clustering_key, const std::vector& column_names, composite::eoc marker = composite::eoc::none) { auto w = file_writer_for_column_name(v, out); write_compound_non_dense_column_name(v, w, clustering_key, column_names, marker); } template void write_column_name(sstable_version_types v, Writer& out, bytes_view column_names) { size_t sz = column_names.size(); if (sz > std::numeric_limits::max()) { throw std::runtime_error(format("Column name too large ({:d} > {:d})", sz, std::numeric_limits::max())); } out.prepare(uint16_t(sz)); out.write(column_names); } inline void write_column_name(sstable_version_types v, file_writer& out, bytes_view column_names) { auto w = file_writer_for_column_name(v, out); write_column_name(v, w, column_names); } template void write_column_name(sstable_version_types v, Writer& out, const schema& s, const composite& clustering_element, const std::vector& column_names, composite::eoc marker = composite::eoc::none) { if (s.is_dense()) { write_column_name(v, out, bytes_view(clustering_element)); } else if (s.is_compound()) { write_compound_non_dense_column_name(v, out, clustering_element, column_names, marker); } else { write_column_name(v, out, column_names[0]); } } template requires Writer void write_cell_value(sstable_version_types v, W& out, const abstract_type& type, managed_bytes_view value) { if (!value.empty()) { if (type.value_length_if_fixed()) { write(v, out, value); } else { write_vint(out, value.size()); write(v, out, value); } } } template requires Writer void write_cell_value(sstable_version_types v, W& out, const abstract_type& type, bytes_view value) { if (!value.empty()) { if (type.value_length_if_fixed()) { write(v, out, value); } else { write_vint(out, value.size()); write(v, out, value); } } } template requires Writer void write_counter_value(counter_cell_view ccv, W& out, sstable_version_types v, WriteLengthFunc&& write_len_func) { auto shard_count = ccv.shard_count(); static constexpr auto header_entry_size = sizeof(int16_t); static constexpr auto counter_shard_size = 32u; // counter_id: 16 + clock: 8 + value: 8 auto total_size = sizeof(int16_t) + shard_count * (header_entry_size + counter_shard_size); write_len_func(out, uint32_t(total_size)); write(v, out, int16_t(shard_count)); for (auto i = 0u; i < shard_count; i++) { write(v, out, std::numeric_limits::min() + i); } auto write_shard = [&] (auto&& s) { auto uuid = s.id().uuid(); write(v, out, int64_t(uuid.get_most_significant_bits()), int64_t(uuid.get_least_significant_bits()), int64_t(s.logical_clock()), int64_t(s.value())); }; for (auto&& s : ccv.shards()) { write_shard(s); } } void maybe_add_summary_entry(summary&, const dht::token&, bytes_view key, uint64_t data_offset, uint64_t index_offset, index_sampling_state&); void prepare_summary(summary& s, uint64_t expected_partition_count, uint32_t min_index_interval); future<> seal_summary(summary& s, std::optional&& first_key, std::optional&& last_key, const index_sampling_state& state); void seal_statistics(sstable_version_types, statistics&, metadata_collector&, const sstring partitioner, double bloom_filter_fp_chance, schema_ptr, const dht::decorated_key& first_key, const dht::decorated_key& last_key, const encoding_stats& enc_stats = {}, const std::set& compaction_ancestors = {}); void write(sstable_version_types, file_writer&, const utils::estimated_histogram&); void write(sstable_version_types, file_writer&, const utils::streaming_histogram&); void write(sstable_version_types, file_writer&, const commitlog_interval&); void write(sstable_version_types, file_writer&, const compression&); }