Files
scylladb/sstables/types.hh
Nadav Har'El c647d917e0 sstables: move to_bytes_view to header file
Move the to_bytes_view(temporary_buffer<char>) function from source file
to header file where is can be used in more places.

This saves one use of reinterpret_cast (which we are no re-evaluating),
and moreover, we want to use this function also in the promoted index
code (to return a bytes_view from the promoted index which was saved as a
temporary_buffer).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1468761437-27046-1-git-send-email-nyh@scylladb.com>
2016-07-17 16:29:26 +03:00

306 lines
9.5 KiB
C++

/*
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "disk_types.hh"
#include "core/enum.hh"
#include "bytes.hh"
#include "gc_clock.hh"
#include "tombstone.hh"
#include "streaming_histogram.hh"
#include "estimated_histogram.hh"
#include "column_name_helper.hh"
#include "sstables/key.hh"
#include "db/commitlog/replay_position.hh"
#include <vector>
#include <unordered_map>
#include <type_traits>
// While the sstable code works with char, bytes_view works with int8_t
// (signed char). Rather than change all the code, let's do a cast.
static inline bytes_view to_bytes_view(const temporary_buffer<char>& b) {
using byte = bytes_view::value_type;
return bytes_view(reinterpret_cast<const byte*>(b.get()), b.size());
}
namespace sstables {
struct option {
disk_string<uint16_t> key;
disk_string<uint16_t> value;
template <typename Describer>
auto describe_type(Describer f) { return f(key, value); }
};
struct filter {
uint32_t hashes;
disk_array<uint32_t, uint64_t> buckets;
template <typename Describer>
auto describe_type(Describer f) { return f(hashes, buckets); }
// Create an always positive filter if nothing else is specified.
filter() : hashes(0), buckets({}) {}
explicit filter(int hashes, std::deque<uint64_t> buckets) : hashes(hashes), buckets({std::move(buckets)}) {}
};
class index_entry {
temporary_buffer<char> _key;
uint64_t _position;
temporary_buffer<char> _promoted_index;
public:
bytes_view get_key_bytes() const {
return to_bytes_view(_key);
}
key_view get_key() const {
return { get_key_bytes() };
}
uint64_t position() const {
return _position;
}
index_entry(temporary_buffer<char>&& key, uint64_t position, temporary_buffer<char>&& promoted_index)
: _key(std::move(key)), _position(position), _promoted_index(std::move(promoted_index)) {}
};
struct summary_entry {
bytes key;
uint64_t position;
key_view get_key() const {
return { key };
}
bool operator==(const summary_entry& x) const {
return position == x.position && key == x.key;
}
};
// Note: Sampling level is present in versions ka and higher. We ATM only support ka,
// so it's always there. But we need to make this conditional if we ever want to support
// other formats.
struct summary_ka {
struct header {
// The minimum possible amount of indexes per group (sampling level)
uint32_t min_index_interval;
// The number of entries in the Summary File
uint32_t size;
// The memory to be consumed to map the whole Summary into memory.
uint64_t memory_size;
// The actual sampling level.
uint32_t sampling_level;
// The number of entries the Summary *would* have if the sampling
// level would be equal to min_index_interval.
uint32_t size_at_full_sampling;
} header;
// The position in the Summary file for each of the indexes.
// NOTE1 that its actual size is determined by the "size" parameter, not
// by its preceding size_at_full_sampling
// NOTE2: They are laid out in *MEMORY* order, not BE.
// NOTE3: The sizes in this array represent positions in the memory stream,
// not the file. The memory stream effectively begins after the header,
// so every position here has to be added of sizeof(header).
std::deque<uint32_t> positions; // can be large, so use a deque instead of a vector
std::deque<summary_entry> entries;
disk_string<uint32_t> first_key;
disk_string<uint32_t> last_key;
// Used to determine when a summary entry should be added based on min_index_interval.
// NOTE: keys_written isn't part of on-disk format of summary.
size_t keys_written;
// NOTE4: There is a structure written by Cassandra into the end of the Summary
// file, after the field last_key, that we haven't understand yet, but we know
// that its content isn't related to the summary itself.
// The structure is basically as follow:
// struct { disk_string<uint16_t>; uint32_t; uint64_t; disk_string<uint16_t>; }
// Another interesting fact about this structure is that it is apparently always
// filled with the same data. It's too early to judge that the data is useless.
// However, it was tested that Cassandra loads successfully a Summary file with
// this structure removed from it. Anyway, let's pay attention to it.
/*
* Returns total amount of memory used by the summary
* Similar to origin off heap size
*/
uint64_t memory_footprint() const {
return sizeof(summary_entry) * entries.size() + sizeof(uint32_t) * positions.size() + sizeof(*this);
}
explicit operator bool() const {
return entries.size();
}
};
using summary = summary_ka;
struct metadata {
virtual ~metadata() {}
};
struct validation_metadata : public metadata {
disk_string<uint16_t> partitioner;
double filter_chance;
size_t serialized_size() {
return sizeof(uint16_t) + partitioner.value.size() + sizeof(filter_chance);
}
template <typename Describer>
auto describe_type(Describer f) { return f(partitioner, filter_chance); }
};
struct compaction_metadata : public metadata {
disk_array<uint32_t, uint32_t> ancestors;
disk_array<uint32_t, uint8_t> cardinality;
size_t serialized_size() {
return sizeof(uint32_t) + (ancestors.elements.size() * sizeof(uint32_t)) +
sizeof(uint32_t) + (cardinality.elements.size() * sizeof(uint8_t));
}
template <typename Describer>
auto describe_type(Describer f) { return f(ancestors, cardinality); }
};
struct ka_stats_metadata : public metadata {
estimated_histogram estimated_row_size;
estimated_histogram estimated_column_count;
db::replay_position position;
int64_t min_timestamp;
int64_t max_timestamp;
int32_t max_local_deletion_time;
double compression_ratio;
streaming_histogram estimated_tombstone_drop_time;
uint32_t sstable_level;
uint64_t repaired_at;
disk_array<uint32_t, disk_string<uint16_t>> min_column_names;
disk_array<uint32_t, disk_string<uint16_t>> max_column_names;
bool has_legacy_counter_shards;
template <typename Describer>
auto describe_type(Describer f) {
return f(
estimated_row_size,
estimated_column_count,
position,
min_timestamp,
max_timestamp,
max_local_deletion_time,
compression_ratio,
estimated_tombstone_drop_time,
sstable_level,
repaired_at,
min_column_names,
max_column_names,
has_legacy_counter_shards
);
}
};
using stats_metadata = ka_stats_metadata;
// Numbers are found on disk, so they do matter. Also, setting their sizes of
// that of an uint32_t is a bit wasteful, but it simplifies the code a lot
// since we can now still use a strongly typed enum without introducing a
// notion of "disk-size" vs "memory-size".
enum class metadata_type : uint32_t {
Validation = 0,
Compaction = 1,
Stats = 2,
};
static constexpr int DEFAULT_CHUNK_SIZE = 65536;
// checksums are generated using adler32 algorithm.
struct checksum {
uint32_t chunk_size;
std::deque<uint32_t> checksums;
template <typename Describer>
auto describe_type(Describer f) { return f(chunk_size, checksums); }
};
}
namespace std {
template <>
struct hash<sstables::metadata_type> : enum_hash<sstables::metadata_type> {};
}
namespace sstables {
struct statistics {
disk_hash<uint32_t, metadata_type, uint32_t> hash;
std::unordered_map<metadata_type, std::unique_ptr<metadata>> contents;
};
struct deletion_time {
int32_t local_deletion_time;
int64_t marked_for_delete_at;
template <typename Describer>
auto describe_type(Describer f) { return f(local_deletion_time, marked_for_delete_at); }
bool live() const {
return (local_deletion_time == std::numeric_limits<int32_t>::max()) &&
(marked_for_delete_at == std::numeric_limits<int64_t>::min());
}
bool operator==(const deletion_time& d) {
return local_deletion_time == d.local_deletion_time &&
marked_for_delete_at == d.marked_for_delete_at;
}
bool operator!=(const deletion_time& d) {
return !(*this == d);
}
explicit operator tombstone() {
return !live() ? tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time))) : tombstone();
}
};
enum class column_mask : uint8_t {
none = 0x0,
deletion = 0x01,
expiration = 0x02,
counter = 0x04,
counter_update = 0x08,
range_tombstone = 0x10,
};
inline column_mask operator&(column_mask m1, column_mask m2) {
return column_mask(static_cast<uint8_t>(m1) & static_cast<uint8_t>(m2));
}
inline column_mask operator|(column_mask m1, column_mask m2) {
return column_mask(static_cast<uint8_t>(m1) | static_cast<uint8_t>(m2));
}
}