/* * Copyright (C) 2015 ScyllaDB */ /* * This file is part of Scylla. * * Scylla is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Scylla is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Scylla. If not, see . */ #pragma once #include "disk_types.hh" #include "core/enum.hh" #include "bytes.hh" #include "gc_clock.hh" #include "tombstone.hh" #include "streaming_histogram.hh" #include "utils/estimated_histogram.hh" #include "column_name_helper.hh" #include "sstables/key.hh" #include "db/commitlog/replay_position.hh" #include #include #include // While the sstable code works with char, bytes_view works with int8_t // (signed char). Rather than change all the code, let's do a cast. static inline bytes_view to_bytes_view(const temporary_buffer& b) { using byte = bytes_view::value_type; return bytes_view(reinterpret_cast(b.get()), b.size()); } namespace sstables { struct deletion_time { int32_t local_deletion_time; int64_t marked_for_delete_at; template auto describe_type(Describer f) { return f(local_deletion_time, marked_for_delete_at); } bool live() const { return (local_deletion_time == std::numeric_limits::max()) && (marked_for_delete_at == std::numeric_limits::min()); } bool operator==(const deletion_time& d) { return local_deletion_time == d.local_deletion_time && marked_for_delete_at == d.marked_for_delete_at; } bool operator!=(const deletion_time& d) { return !(*this == d); } explicit operator tombstone() { return !live() ? tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time))) : tombstone(); } }; struct option { disk_string key; disk_string value; template auto describe_type(Describer f) { return f(key, value); } }; struct filter { uint32_t hashes; disk_array buckets; template auto describe_type(Describer f) { return f(hashes, buckets); } // Create an always positive filter if nothing else is specified. filter() : hashes(0), buckets({}) {} explicit filter(int hashes, std::deque buckets) : hashes(hashes), buckets({std::move(buckets)}) {} }; enum class indexable_element { partition, cell }; // Exploded view of promoted index. // Contains pointers into external buffer, so that buffer must be kept alive // as long as this is used. struct promoted_index { struct entry { composite_view start; composite_view end; uint64_t offset; uint64_t width; }; deletion_time del_time; std::deque entries; }; class index_entry { temporary_buffer _key; uint64_t _position; temporary_buffer _promoted_index_bytes; stdx::optional _promoted_index; private: void parse_promoted_index(const schema&); public: bytes_view get_key_bytes() const { return to_bytes_view(_key); } key_view get_key() const { return key_view{get_key_bytes()}; } uint64_t position() const { return _position; } bytes_view get_promoted_index_bytes() const { return to_bytes_view(_promoted_index_bytes); } index_entry(temporary_buffer&& key, uint64_t position, temporary_buffer&& promoted_index) : _key(std::move(key)), _position(position), _promoted_index_bytes(std::move(promoted_index)) {} index_entry(const index_entry& o) : _key(o._key.get(), o._key.size()) , _position(o._position) , _promoted_index_bytes(o._promoted_index_bytes.get(), o._promoted_index_bytes.size()) { } promoted_index* get_promoted_index(const schema& s) { if (!_promoted_index) { parse_promoted_index(s); } return _promoted_index ? &*_promoted_index : nullptr; } }; struct summary_entry { bytes key; uint64_t position; key_view get_key() const { return key_view{key}; } bool operator==(const summary_entry& x) const { return position == x.position && key == x.key; } }; // Note: Sampling level is present in versions ka and higher. We ATM only support ka, // so it's always there. But we need to make this conditional if we ever want to support // other formats. struct summary_ka { struct header { // The minimum possible amount of indexes per group (sampling level) uint32_t min_index_interval; // The number of entries in the Summary File uint32_t size; // The memory to be consumed to map the whole Summary into memory. uint64_t memory_size; // The actual sampling level. uint32_t sampling_level; // The number of entries the Summary *would* have if the sampling // level would be equal to min_index_interval. uint32_t size_at_full_sampling; } header; // The position in the Summary file for each of the indexes. // NOTE1 that its actual size is determined by the "size" parameter, not // by its preceding size_at_full_sampling // NOTE2: They are laid out in *MEMORY* order, not BE. // NOTE3: The sizes in this array represent positions in the memory stream, // not the file. The memory stream effectively begins after the header, // so every position here has to be added of sizeof(header). std::deque positions; // can be large, so use a deque instead of a vector std::deque entries; disk_string first_key; disk_string last_key; // Used to determine when a summary entry should be added based on min_index_interval. // NOTE: keys_written isn't part of on-disk format of summary. size_t keys_written; // NOTE4: There is a structure written by Cassandra into the end of the Summary // file, after the field last_key, that we haven't understand yet, but we know // that its content isn't related to the summary itself. // The structure is basically as follow: // struct { disk_string; uint32_t; uint64_t; disk_string; } // Another interesting fact about this structure is that it is apparently always // filled with the same data. It's too early to judge that the data is useless. // However, it was tested that Cassandra loads successfully a Summary file with // this structure removed from it. Anyway, let's pay attention to it. /* * Returns total amount of memory used by the summary * Similar to origin off heap size */ uint64_t memory_footprint() const { auto sz = sizeof(summary_entry) * entries.size() + sizeof(uint32_t) * positions.size() + sizeof(*this); sz += first_key.value.size() + last_key.value.size(); for (auto& e : entries) { sz += e.key.size(); } return sz; } explicit operator bool() const { return entries.size(); } }; using summary = summary_ka; class file_writer; struct metadata { virtual ~metadata() {} virtual uint64_t serialized_size() const = 0; virtual void write(file_writer& write) const = 0; }; template uint64_t serialized_size(const T& object); template typename std::enable_if_t::value && !std::is_enum::value, void> write(file_writer& out, const T& t); // serialized_size() implementation for metadata class template class metadata_base : public metadata { public: virtual uint64_t serialized_size() const override { return sstables::serialized_size(static_cast(*this)); } virtual void write(file_writer& writer) const override { return sstables::write(writer, static_cast(*this)); } }; struct validation_metadata : public metadata_base { disk_string partitioner; double filter_chance; template auto describe_type(Describer f) { return f(partitioner, filter_chance); } }; struct compaction_metadata : public metadata_base { disk_array ancestors; disk_array cardinality; template auto describe_type(Describer f) { return f(ancestors, cardinality); } }; struct ka_stats_metadata : public metadata_base { utils::estimated_histogram estimated_row_size; utils::estimated_histogram estimated_column_count; db::replay_position position; int64_t min_timestamp; int64_t max_timestamp; int32_t max_local_deletion_time; double compression_ratio; streaming_histogram estimated_tombstone_drop_time; uint32_t sstable_level; uint64_t repaired_at; disk_array> min_column_names; disk_array> max_column_names; bool has_legacy_counter_shards; template auto describe_type(Describer f) { return f( estimated_row_size, estimated_column_count, position, min_timestamp, max_timestamp, max_local_deletion_time, compression_ratio, estimated_tombstone_drop_time, sstable_level, repaired_at, min_column_names, max_column_names, has_legacy_counter_shards ); } }; using stats_metadata = ka_stats_metadata; struct disk_token_bound { uint8_t exclusive; // really a boolean disk_string token; template auto describe_type(Describer f) { return f(exclusive, token); } }; struct disk_token_range { disk_token_bound left; disk_token_bound right; template auto describe_type(Describer f) { return f(left, right); } }; // Scylla-specific sharding information. This is a set of token // ranges that are spanned by this sstable. When loading the // sstable, we can see which shards own data in the sstable by // checking each such range. struct sharding_metadata { disk_array token_ranges; template auto describe_type(Describer f) { return f(token_ranges); } }; // Numbers are found on disk, so they do matter. Also, setting their sizes of // that of an uint32_t is a bit wasteful, but it simplifies the code a lot // since we can now still use a strongly typed enum without introducing a // notion of "disk-size" vs "memory-size". enum class metadata_type : uint32_t { Validation = 0, Compaction = 1, Stats = 2, }; enum class scylla_metadata_type : uint32_t { Sharding = 1, }; struct scylla_metadata { disk_set_of_tagged_union > data; template auto describe_type(Describer f) { return f(data); } }; static constexpr int DEFAULT_CHUNK_SIZE = 65536; // checksums are generated using adler32 algorithm. struct checksum { uint32_t chunk_size; std::deque checksums; template auto describe_type(Describer f) { return f(chunk_size, checksums); } }; } namespace std { template <> struct hash : enum_hash {}; } namespace sstables { struct statistics { disk_hash hash; std::unordered_map> contents; }; enum class column_mask : uint8_t { none = 0x0, deletion = 0x01, expiration = 0x02, counter = 0x04, counter_update = 0x08, range_tombstone = 0x10, }; inline column_mask operator&(column_mask m1, column_mask m2) { return column_mask(static_cast(m1) & static_cast(m2)); } inline column_mask operator|(column_mask m1, column_mask m2) { return column_mask(static_cast(m1) | static_cast(m2)); } }