/* * Copyright (C) 2015 ScyllaDB */ /* * This file is part of Scylla. * * Scylla is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Scylla is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Scylla. If not, see . */ #pragma once #include "disk_types.hh" #include "core/enum.hh" #include "bytes.hh" #include "gc_clock.hh" #include "tombstone.hh" #include "utils/streaming_histogram.hh" #include "utils/estimated_histogram.hh" #include "column_name_helper.hh" #include "sstables/key.hh" #include "db/commitlog/replay_position.hh" #include "version.hh" #include #include #include #include "version.hh" #include "encoding_stats.hh" // While the sstable code works with char, bytes_view works with int8_t // (signed char). Rather than change all the code, let's do a cast. 
static inline bytes_view to_bytes_view(const temporary_buffer& b) { using byte = bytes_view::value_type; return bytes_view(reinterpret_cast(b.get()), b.size()); } namespace sstables { struct commitlog_interval { db::replay_position start; db::replay_position end; }; struct deletion_time { int32_t local_deletion_time; int64_t marked_for_delete_at; template auto describe_type(sstable_version_types v, Describer f) { return f(local_deletion_time, marked_for_delete_at); } bool live() const { return (local_deletion_time == std::numeric_limits::max()) && (marked_for_delete_at == std::numeric_limits::min()); } bool operator==(const deletion_time& d) { return local_deletion_time == d.local_deletion_time && marked_for_delete_at == d.marked_for_delete_at; } bool operator!=(const deletion_time& d) { return !(*this == d); } explicit operator tombstone() { return !live() ? tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time))) : tombstone(); } }; struct option { disk_string key; disk_string value; template auto describe_type(sstable_version_types v, Describer f) { return f(key, value); } }; struct filter { uint32_t hashes; disk_array buckets; template auto describe_type(sstable_version_types v, Describer f) { return f(hashes, buckets); } // Create an always positive filter if nothing else is specified. filter() : hashes(0), buckets({}) {} explicit filter(int hashes, utils::chunked_vector buckets) : hashes(hashes), buckets({std::move(buckets)}) {} }; // Do this so we don't have to copy on write time. We can just keep a reference. 
struct filter_ref {
    uint32_t hashes;
    disk_array_ref buckets;

    // Hands (hashes, buckets) to the describer `f`; `v` is not consulted.
    template auto describe_type(sstable_version_types v, Describer f) { return f(hashes, buckets); }

    // `buckets` is taken by const reference and passed on to the member.
    // NOTE(review): presumably the caller's vector must outlive this object - confirm.
    explicit filter_ref(int hashes, const utils::chunked_vector& buckets) : hashes(hashes), buckets(buckets) {}
};

// Kinds of element that can be indexed.
enum class indexable_element { partition, cell };

// Streams the enumerator after a static_cast (i.e. as a number, not a name).
inline std::ostream& operator<<(std::ostream& o, indexable_element e) {
    o << static_cast>(e);
    return o;
}

// One summary entry: a token view, a key view and a position.
class summary_entry {
public:
    dht::token_view token;
    bytes_view key;
    uint64_t position;

    // Wraps `key` in a key_view.
    key_view get_key() const {
        return key_view{key};
    }

    // Combines `token` and get_key() into a decorated_key_view.
    decorated_key_view get_decorated_key() const {
        return decorated_key_view(token, get_key());
    }

    // Compares position and key only; `token` is not part of the comparison.
    bool operator==(const summary_entry& x) const {
        return position == x.position && key == x.key;
    }
};

// Note: Sampling level is present in versions ka and higher. We ATM only support ka,
// so it's always there. But we need to make this conditional if we ever want to support
// other formats.
struct summary_ka {
    struct header {
        // The minimum possible amount of indexes per group (sampling level)
        uint32_t min_index_interval;
        // The number of entries in the Summary File
        uint32_t size;
        // The memory to be consumed to map the whole Summary into memory.
        uint64_t memory_size;
        // The actual sampling level.
        uint32_t sampling_level;
        // The number of entries the Summary *would* have if the sampling
        // level would be equal to min_index_interval.
        uint32_t size_at_full_sampling;
    } header;
    // The position in the Summary file for each of the indexes.
    // NOTE1: its actual size is determined by the "size" parameter, not
    // by its preceding size_at_full_sampling
    // NOTE2: They are laid out in *MEMORY* order, not BE.
    // NOTE3: The sizes in this array represent positions in the memory stream,
    // not the file. The memory stream effectively begins after the header,
    // so sizeof(header) has to be added to every position here.
    utils::chunked_vector positions;
    // can be large, so use a chunked_vector instead of a plain vector
    utils::chunked_vector entries;

    disk_string first_key;
    disk_string last_key;

    // NOTE4: There is a structure written by Cassandra at the end of the Summary
    // file, after the field last_key, that we do not understand yet, but we know
    // that its content isn't related to the summary itself.
    // The structure is basically as follows:
    // struct { disk_string; uint32_t; uint64_t; disk_string; }
    // Another interesting fact about this structure is that it is apparently always
    // filled with the same data. It's too early to judge that the data is useless.
    // However, it was tested that Cassandra successfully loads a Summary file with
    // this structure removed from it. Anyway, let's pay attention to it.

    /*
     * Returns total amount of memory used by the summary
     * Similar to origin off heap size
     *
     * The total is: sizeof(summary_entry) per entry, sizeof(uint32_t) per
     * position, sizeof(*this), the sizes of first_key and last_key values,
     * and the size of every buffer held in _summary_data.
     */
    uint64_t memory_footprint() const {
        auto sz = sizeof(summary_entry) * entries.size() + sizeof(uint32_t) * positions.size() + sizeof(*this);
        sz += first_key.value.size() + last_key.value.size();
        for (auto& sd : _summary_data) {
            sz += sd.size();
        }
        return sz;
    }

    // True when there is at least one entry.
    explicit operator bool() const {
        return entries.size();
    }

    // Copies `data` into the summary's own buffers and returns a view of the
    // stored copy. A new buffer is started when there is none yet or when
    // `data` would not fit in the space left in the current one.
    bytes_view add_summary_data(bytes_view data) {
        if (_summary_data.empty() || (_summary_index_pos + data.size() > _buffer_size)) {
            // Double the buffer size for each new buffer, capped at 128u << 10.
            _buffer_size = std::min(_buffer_size << 1, 128u << 10);
            // Keys are 64kB max, so it might be one key may not fit in a buffer
            _buffer_size = std::max(_buffer_size, unsigned(data.size()));
            _summary_data.emplace_back(_buffer_size);
            _summary_index_pos = 0;
        }

        auto ret = _summary_data.back().store_at(_summary_index_pos, data);
        _summary_index_pos += data.size();
        return ret;
    }
private:
    // A fixed-size buffer allocated once at construction.
    class summary_data_memory {
        unsigned _size;
        std::unique_ptr _data;
    public:
        summary_data_memory(unsigned size) : _size(size), _data(std::make_unique(size)) {}
        // Copies `src` to offset `pos` and returns a view of the copy.
        // No bounds check is done here; the caller keeps pos + src.size() within _size.
        bytes_view store_at(unsigned pos, bytes_view src) {
            auto addr = _data.get() + pos;
            std::copy_n(src.data(), src.size(), addr);
            return bytes_view(addr, src.size());
        }
        // Size the buffer was created with (not the number of bytes stored).
        unsigned size() const {
            return _size;
        }
    };
    // Size used for the most recently created buffer; starts at 1 << 10.
    unsigned _buffer_size = 1 << 10;
    std::vector _summary_data = {};
    // Write offset into the last buffer of _summary_data.
    unsigned _summary_index_pos = 0;
};
using summary = summary_ka;

class file_writer;

// Interface of a metadata component: it can report its serialized size and
// write itself to a file_writer, both for a given sstable version.
struct metadata {
    virtual ~metadata() {}
    virtual uint64_t serialized_size(sstable_version_types v) const = 0;
    virtual void write(sstable_version_types v, file_writer& write) const = 0;
};

// Declarations of the free functions that metadata_base forwards to.
template uint64_t serialized_size(sstable_version_types v, const T& object);

template typename std::enable_if_t::value && !std::is_enum::value, void>
write(sstable_version_types v, file_writer& out, const T& t);

// serialized_size() implementation for metadata class
// Both overrides forward to the free functions sstables::serialized_size()
// and sstables::write(), passing *this after a static_cast.
template class metadata_base : public metadata {
public:
    virtual uint64_t serialized_size(sstable_version_types v) const override {
        return sstables::serialized_size(v, static_cast(*this));
    }
    virtual void write(sstable_version_types v, file_writer& writer) const override {
        return sstables::write(v, writer, static_cast(*this));
    }
};

// Validation component: partitioner name and filter chance.
struct validation_metadata : public metadata_base {
    disk_string partitioner;
    double filter_chance;

    // Hands (partitioner, filter_chance) to the describer `f`; `v` is not consulted.
    template auto describe_type(sstable_version_types v, Describer f) { return f(partitioner, filter_chance); }
};

// Compaction component. Which fields are described depends on the version.
struct compaction_metadata : public metadata_base {
    disk_array ancestors;
    disk_array cardinality;

    // mc describes only `cardinality`; ka and la describe `ancestors` first.
    template auto describe_type(sstable_version_types v, Describer f) {
        switch (v) {
        case sstable_version_types::mc:
            return f(
                cardinality
            );
        case sstable_version_types::ka:
        case sstable_version_types::la:
            return f(
                ancestors,
                cardinality
            );
        }
        // Should never reach here - compiler will complain if switch above does not cover all sstable versions
        abort();
    }
};

// Statistics component. Fields marked "3_x only" are described only for the
// mc version (see describe_type below).
struct stats_metadata : public metadata_base {
    utils::estimated_histogram estimated_row_size;
    utils::estimated_histogram estimated_cells_count;
    db::replay_position position;
    int64_t min_timestamp;
    int64_t max_timestamp;
    int32_t min_local_deletion_time; // 3_x only
    int32_t max_local_deletion_time;
    int32_t min_ttl; // 3_x only
    int32_t max_ttl; // 3_x only
    double compression_ratio;
    utils::streaming_histogram estimated_tombstone_drop_time;
    uint32_t sstable_level;
    uint64_t repaired_at;
    disk_array> min_column_names;
    disk_array> max_column_names;
    bool has_legacy_counter_shards;
    int64_t columns_count; // 3_x only
    int64_t rows_count; // 3_x only
    db::replay_position commitlog_lower_bound; // 3_x only
    disk_array commitlog_intervals; // 3_x only

    // The argument order below is the order in which fields are handed to
    // the describer, so it must not be rearranged.
    template auto describe_type(sstable_version_types v, Describer f) {
        switch (v) {
        case sstable_version_types::mc:
            return f(
                estimated_row_size,
                estimated_cells_count,
                position,
                min_timestamp,
                max_timestamp,
                min_local_deletion_time,
                max_local_deletion_time,
                min_ttl,
                max_ttl,
                compression_ratio,
                estimated_tombstone_drop_time,
                sstable_level,
                repaired_at,
                min_column_names,
                max_column_names,
                has_legacy_counter_shards,
                columns_count,
                rows_count,
                commitlog_lower_bound,
                commitlog_intervals
            );
        case sstable_version_types::ka:
        case sstable_version_types::la:
            return f(
                estimated_row_size,
                estimated_cells_count,
                position,
                min_timestamp,
                max_timestamp,
                max_local_deletion_time,
                compression_ratio,
                estimated_tombstone_drop_time,
                sstable_level,
                repaired_at,
                min_column_names,
                max_column_names,
                has_legacy_counter_shards
            );
        }
        // Should never reach here - compiler will complain if switch above does not cover all sstable versions
        abort();
    }
};

using bytes_array_vint_size = disk_string_vint_size;

// Serialization header component. It is described for the mc version only;
// for ka and la describe_type() throws.
struct serialization_header : public metadata_base {
    // The three *_base fields are stored relative to the encoding_stats
    // epochs; the get_min_*() accessors below add the epoch back.
    vint min_timestamp_base;
    vint min_local_deletion_time_base;
    vint min_ttl_base;
    bytes_array_vint_size pk_type_name;
    disk_array_vint_size clustering_key_types_names;
    // Name and type name of one column.
    struct column_desc {
        bytes_array_vint_size name;
        bytes_array_vint_size type_name;
        template auto describe_type(sstable_version_types v, Describer f) {
            return f(
                name,
                type_name
            );
        }
    };
    disk_array_vint_size static_columns;
    disk_array_vint_size regular_columns;

    // Throws std::runtime_error for ka and la.
    template auto describe_type(sstable_version_types v, Describer f) {
        switch (v) {
        case sstable_version_types::mc:
            return f(
                min_timestamp_base,
                min_local_deletion_time_base,
                min_ttl_base,
                pk_type_name,
                clustering_key_types_names,
                static_columns,
                regular_columns
            );
        case sstable_version_types::ka:
        case sstable_version_types::la:
            throw std::runtime_error(
                "Statistics is malformed: SSTable is in 2.x format but contains serialization header.");
        }
        // Should never reach here - compiler will complain if switch above does not cover all sstable versions
        abort();
    }

    // min_timestamp_base plus encoding_stats::timestamp_epoch.
    uint64_t get_min_timestamp() const {
        return min_timestamp_base.value + encoding_stats::timestamp_epoch;
    }

    // min_ttl_base plus encoding_stats::ttl_epoch.
    uint32_t get_min_ttl() const {
        return min_ttl_base.value + encoding_stats::ttl_epoch;
    }

    // min_local_deletion_time_base plus encoding_stats::deletion_time_epoch.
    uint32_t get_min_local_deletion_time() const {
        return min_local_deletion_time_base.value + encoding_stats::deletion_time_epoch;
    }
};

// One end of a token range: the token and whether the bound is exclusive.
struct disk_token_bound {
    uint8_t exclusive; // really a boolean
    disk_string token;

    // Hands (exclusive, token) to the describer `f`; `v` is not consulted.
    template auto describe_type(sstable_version_types v, Describer f) { return f(exclusive, token); }
};

// A token range given by its left and right bounds.
struct disk_token_range {
    disk_token_bound left;
    disk_token_bound right;

    // Hands (left, right) to the describer `f`; `v` is not consulted.
    template auto describe_type(sstable_version_types v, Describer f) { return f(left, right); }
};

// Scylla-specific sharding information. This is a set of token
// ranges that are spanned by this sstable. When loading the
// sstable, we can see which shards own data in the sstable by
// checking each such range.
struct sharding_metadata {
    disk_array token_ranges;

    // Hands token_ranges to the describer `f`; `v` is not consulted.
    template auto describe_type(sstable_version_types v, Describer f) { return f(token_ranges); }
};

// Scylla-specific list of features an sstable supports.
// Each value is used as a bit index into sstable_enabled_features::enabled_features.
enum sstable_feature : uint8_t {
    NonCompoundPIEntries = 0,       // See #2993
    NonCompoundRangeTombstones = 1, // See #2986
    ShadowableTombstones = 2, // See #3885
    End = 4,
};

// Scylla-specific features enabled for a particular sstable.
struct sstable_enabled_features { uint64_t enabled_features; bool is_enabled(sstable_feature f) const { return enabled_features & (1 << f); } void disable(sstable_feature f) { enabled_features &= ~(1<< f); } template auto describe_type(sstable_version_types v, Describer f) { return f(enabled_features); } }; // Numbers are found on disk, so they do matter. Also, setting their sizes of // that of an uint32_t is a bit wasteful, but it simplifies the code a lot // since we can now still use a strongly typed enum without introducing a // notion of "disk-size" vs "memory-size". enum class metadata_type : uint32_t { Validation = 0, Compaction = 1, Stats = 2, Serialization = 3, }; enum class scylla_metadata_type : uint32_t { Sharding = 1, Features = 2, ExtensionAttributes = 3, }; struct scylla_metadata { using extension_attributes = disk_hash, disk_string>; disk_set_of_tagged_union, disk_tagged_union_member, disk_tagged_union_member > data; bool has_feature(sstable_feature f) const { auto features = data.get(); return features && features->is_enabled(f); } const extension_attributes* get_extension_attributes() const { return data.get(); } extension_attributes& get_or_create_extension_attributes() { auto* ext = data.get(); if (ext == nullptr) { data.set(extension_attributes{}); ext = data.get(); } return *ext; } template auto describe_type(sstable_version_types v, Describer f) { return f(data); } }; static constexpr int DEFAULT_CHUNK_SIZE = 65536; // checksums are generated using adler32 algorithm. 
// A chunk size together with the list of per-chunk checksums.
struct checksum {
    uint32_t chunk_size;
    utils::chunked_vector checksums;

    // Hands (chunk_size, checksums) to the describer `f`; `v` is not consulted.
    template auto describe_type(sstable_version_types v, Describer f) { return f(chunk_size, checksums); }
};

}

namespace std {

// std::hash specialization implemented via enum_hash.
template <>
struct hash : enum_hash {};

}

namespace sstables {

// Special value to represent expired (i.e., 'dead') liveness info
constexpr static uint32_t expired_liveness_ttl = std::numeric_limits::max();

// True when `ttl` equals the expired_liveness_ttl sentinel.
inline bool is_expired_liveness_ttl(uint32_t ttl) {
    return ttl == expired_liveness_ttl;
}

// The statistics component: a disk hash plus a map of metadata contents.
struct statistics {
    disk_hash hash;
    std::unordered_map> contents;
};

// Bit flags describing a column; values can be combined with the
// operator& and operator| defined below.
enum class column_mask : uint8_t {
    none = 0x0,
    deletion = 0x01,
    expiration = 0x02,
    counter = 0x04,
    counter_update = 0x08,
    range_tombstone = 0x10,
    shadowable = 0x40
};

// Bitwise AND of two masks.
inline column_mask operator&(column_mask m1, column_mask m2) {
    return column_mask(static_cast(m1) & static_cast(m2));
}

// Bitwise OR of two masks.
inline column_mask operator|(column_mask m1, column_mask m2) {
    return column_mask(static_cast(m1) | static_cast(m2));
}

// Read-only wrapper around a flags byte; each accessor tests one bit.
class unfiltered_flags_m final {
    static const uint8_t END_OF_PARTITION = 0x01u;
    static const uint8_t IS_MARKER = 0x02u;
    static const uint8_t HAS_TIMESTAMP = 0x04u;
    static const uint8_t HAS_TTL = 0x08u;
    static const uint8_t HAS_DELETION = 0x10u;
    static const uint8_t HAS_ALL_COLUMNS = 0x20u;
    static const uint8_t HAS_COMPLEX_DELETION = 0x40u;
    static const uint8_t HAS_EXTENDED_FLAGS = 0x80u;
    uint8_t _flags;
    // True when any bit of `flag` is set in _flags.
    bool check_flag(const uint8_t flag) const {
        return (_flags & flag) != 0u;
    }
public:
    explicit unfiltered_flags_m(uint8_t flags) : _flags(flags) { }
    bool is_end_of_partition() const {
        return check_flag(END_OF_PARTITION);
    }
    // Tests the IS_MARKER bit.
    bool is_range_tombstone() const {
        return check_flag(IS_MARKER);
    }
    bool has_extended_flags() const {
        return check_flag(HAS_EXTENDED_FLAGS);
    }
    bool has_timestamp() const {
        return check_flag(HAS_TIMESTAMP);
    }
    bool has_ttl() const {
        return check_flag(HAS_TTL);
    }
    bool has_deletion() const {
        return check_flag(HAS_DELETION);
    }
    bool has_all_columns() const {
        return check_flag(HAS_ALL_COLUMNS);
    }
    bool has_complex_deletion() const {
        return check_flag(HAS_COMPLEX_DELETION);
    }
};

// Read-only wrapper around an extended flags byte; each accessor tests one bit.
class unfiltered_extended_flags_m final {
    static const uint8_t IS_STATIC = 0x01u;
    // This flag is used by Cassandra but not supported by Scylla because
    // Scylla's representation of shadowable tombstones is different.
    // We only check it on reading and error out if set but never set ourselves.
    static const uint8_t HAS_CASSANDRA_SHADOWABLE_DELETION = 0x02u;
    // This flag is Scylla-specific and used for writing shadowable tombstones.
    static const uint8_t HAS_SCYLLA_SHADOWABLE_DELETION = 0x80u;
    uint8_t _flags;
    // True when any bit of `flag` is set in _flags.
    bool check_flag(const uint8_t flag) const {
        return (_flags & flag) != 0u;
    }
public:
    explicit unfiltered_extended_flags_m(uint8_t flags) : _flags(flags) { }
    bool is_static() const {
        return check_flag(IS_STATIC);
    }
    bool has_cassandra_shadowable_deletion() const {
        return check_flag(HAS_CASSANDRA_SHADOWABLE_DELETION);
    }
    bool has_scylla_shadowable_deletion() const {
        return check_flag(HAS_SCYLLA_SHADOWABLE_DELETION);
    }
};

// Read-only wrapper around a column flags byte; each accessor tests one bit.
class column_flags_m final {
    static const uint8_t IS_DELETED = 0x01u;
    static const uint8_t IS_EXPIRING = 0x02u;
    static const uint8_t HAS_EMPTY_VALUE = 0x04u;
    static const uint8_t USE_ROW_TIMESTAMP = 0x08u;
    static const uint8_t USE_ROW_TTL = 0x10u;
    uint8_t _flags;
    // True when any bit of `flag` is set in _flags.
    bool check_flag(const uint8_t flag) const {
        return (_flags & flag) != 0u;
    }
public:
    explicit column_flags_m(uint8_t flags) : _flags(flags) { }
    bool use_row_timestamp() const {
        return check_flag(USE_ROW_TIMESTAMP);
    }
    bool use_row_ttl() const {
        return check_flag(USE_ROW_TTL);
    }
    bool is_deleted() const {
        return check_flag(IS_DELETED);
    }
    bool is_expiring() const {
        return check_flag(IS_EXPIRING);
    }
    // Inverse of the HAS_EMPTY_VALUE bit: true when that bit is clear.
    bool has_value() const {
        return !check_flag(HAS_EMPTY_VALUE);
    }
};
}