/* * Copyright 2015 Cloudius Systems */ #pragma once #include "disk_types.hh" #include "core/enum.hh" #include "bytes.hh" #include "gc_clock.hh" #include "tombstone.hh" #include "streaming_histogram.hh" #include "estimated_histogram.hh" #include "column_name_helper.hh" #include "sstables/key.hh" #include #include #include namespace sstables { struct option { disk_string key; disk_string value; template auto describe_type(Describer f) { return f(key, value); } }; struct filter { uint32_t hashes; disk_array buckets; template auto describe_type(Describer f) { return f(hashes, buckets); } // Create an always positive filter if nothing else is specified. filter() : hashes(0), buckets({}) {} explicit filter(int hashes, std::vector buckets) : hashes(hashes), buckets({std::move(buckets)}) {} }; struct index_entry { disk_string key; uint64_t position; disk_string promoted_index; key_view get_key() const { return { bytes_view(key) }; } }; struct summary_entry { bytes key; uint64_t position; key_view get_key() const { return { key }; } bool operator==(const summary_entry& x) const { return position == x.position && key == x.key; } }; // Note: Sampling level is present in versions ka and higher. We ATM only support ka, // so it's always there. But we need to make this conditional if we ever want to support // other formats. struct summary_ka { struct header { // The minimum possible amount of indexes per group (sampling level) uint32_t min_index_interval; // The number of entries in the Summary File uint32_t size; // The memory to be consumed to map the whole Summary into memory. uint64_t memory_size; // The actual sampling level. uint32_t sampling_level; // The number of entries the Summary *would* have if the sampling // level would be equal to min_index_interval. uint32_t size_at_full_sampling; } header; // The position in the Summary file for each of the indexes. // NOTE1 that its actual size is determined by the "size" parameter, not // by its preceding size_at_full_sampling // NOTE2: They are laid out in *MEMORY* order, not BE. // NOTE3: The sizes in this array represent positions in the memory stream, // not the file. The memory stream effectively begins after the header, // so every position here has to be added of sizeof(header). std::vector positions; std::vector entries; disk_string first_key; disk_string last_key; // Used to determine when a summary entry should be added based on min_index_interval. // NOTE: keys_written isn't part of on-disk format of summary. size_t keys_written; // NOTE4: There is a structure written by Cassandra into the end of the Summary // file, after the field last_key, that we haven't understand yet, but we know // that its content isn't related to the summary itself. // The structure is basically as follow: // struct { disk_string; uint32_t; uint64_t; disk_string; } // Another interesting fact about this structure is that it is apparently always // filled with the same data. It's too early to judge that the data is useless. // However, it was tested that Cassandra loads successfully a Summary file with // this structure removed from it. Anyway, let's pay attention to it. }; using summary = summary_ka; struct replay_position { uint64_t segment; uint32_t position; replay_position() {} replay_position(uint64_t seg, uint32_t pos) { segment = seg; position = pos; } template auto describe_type(Describer f) { return f(segment, position); } }; struct metadata { virtual ~metadata() {} }; struct validation_metadata : public metadata { disk_string partitioner; double filter_chance; size_t serialized_size() { return sizeof(uint16_t) + partitioner.value.size() + sizeof(filter_chance); } template auto describe_type(Describer f) { return f(partitioner, filter_chance); } }; struct compaction_metadata : public metadata { disk_array ancestors; disk_array cardinality; size_t serialized_size() { return sizeof(uint32_t) + (ancestors.elements.size() * sizeof(uint32_t)) + sizeof(uint32_t) + (cardinality.elements.size() * sizeof(uint8_t)); } template auto describe_type(Describer f) { return f(ancestors, cardinality); } }; struct ka_stats_metadata : public metadata { estimated_histogram estimated_row_size; estimated_histogram estimated_column_count; replay_position position; uint64_t min_timestamp; uint64_t max_timestamp; uint32_t max_local_deletion_time; double compression_ratio; streaming_histogram estimated_tombstone_drop_time; uint32_t sstable_level; uint64_t repaired_at; disk_array> min_column_names; disk_array> max_column_names; bool has_legacy_counter_shards; template auto describe_type(Describer f) { return f( estimated_row_size, estimated_column_count, position, min_timestamp, max_timestamp, max_local_deletion_time, compression_ratio, estimated_tombstone_drop_time, sstable_level, repaired_at, min_column_names, max_column_names, has_legacy_counter_shards ); } }; using stats_metadata = ka_stats_metadata; // Numbers are found on disk, so they do matter. Also, setting their sizes of // that of an uint32_t is a bit wasteful, but it simplifies the code a lot // since we can now still use a strongly typed enum without introducing a // notion of "disk-size" vs "memory-size". enum class metadata_type : uint32_t { Validation = 0, Compaction = 1, Stats = 2, }; static constexpr int DEFAULT_CHUNK_SIZE = 65536; // checksums are generated using adler32 algorithm. struct checksum { uint32_t chunk_size; std::vector checksums; template auto describe_type(Describer f) { return f(chunk_size, checksums); } }; } namespace std { template <> struct hash : enum_hash {}; } namespace sstables { struct statistics { disk_hash hash; std::unordered_map> contents; }; struct deletion_time { int32_t local_deletion_time; int64_t marked_for_delete_at; template auto describe_type(Describer f) { return f(local_deletion_time, marked_for_delete_at); } bool live() const { return (local_deletion_time == std::numeric_limits::max()) && (marked_for_delete_at == std::numeric_limits::min()); } explicit operator tombstone() { return tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time))); } }; enum class column_mask : uint8_t { none = 0x0, deletion = 0x01, expiration = 0x02, counter = 0x04, counter_update = 0x08, range_tombstone = 0x10, }; inline column_mask operator&(column_mask m1, column_mask m2) { return column_mask(static_cast(m1) & static_cast(m2)); } inline column_mask operator|(column_mask m1, column_mask m2) { return column_mask(static_cast(m1) | static_cast(m2)); } }