mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-23 10:00:35 +00:00
This series mainly fixes issues with the serialization of promoted
index entries for non-compound schemas and with the serialization of
range tombstones, also for non-compound schemas.
We lift the correct cell name writing code into its own function,
and direct all users to it. We also ensure backward compatibility with
incorrectly generated promoted indexes and range tombstones.
Fixes #2995
Fixes #2986
Fixes #2979
Fixes #2992
Fixes #2993
* git@github.com:duarten/scylla.git promoted-index-serialization/v3:
sstables/sstables: Unify column name writers
sstables/sstables: Don't write index entry for a missing row maker
sstables/sstables: Reuse write_range_tombstone() for row tombstones
sstables/sstables: Lift index writing for row tombstones
sstables/sstables: Leverage index code upon range tombstone consume
sstables/sstables: Move out tombstone check in write_range_tombstone()
sstables/sstables: A schema with static columns is always compound
sstables/sstables: Lift column name writing logic
sstables/sstables: Use schema-aware write_column_name() for
collections
sstables/sstables: Use schema-aware write_column_name() for row marker
sstables/sstables: Use schema-aware write_column_name() for static row
sstables/sstables: Writing promoted index entry leverages
column_name_writer
sstables/sstables: Add supported feature list to sstables
sstables/sstables: Don't use incorrectly serialized promoted index
cql3/single_column_primary_key_restrictions: Implement is_inclusive()
cql3/delete_statement: Constrain range deletions for non-compound
schemas
tests/cql_query_test: Verify range deletion constraints
sstables/sstables: Correctly deserialize range tombstones
service/storage_service: Add feature for correct non-compound RTs
tests/sstable_*: Start the storage service for some cases
sstables/sstable_writer: Prepare to control range tombstone
serialization
sstables/sstables: Correctly serialize range tombstones
tests/sstable_assertions: Fix monotonicity check for promoted indexes
tests/sstable_assertions: Assert a promoted index is empty
tests/sstable_mutation_test: Verify promoted index serializes
correctly
tests/sstable_mutation_test: Verify promoted index repeats tombstones
tests/sstable_mutation_test: Ensure range tombstone serializes
correctly
tests/sstable_datafile_test: Add test for incorrect promoted index
tests/sstable_datafile_test: Verify reading of incorrect range
tombstones
sstables/sstable: Rename schema-oblivious write_column_name() function
sstables/sstables: No promoted index without clustering keys
tests/sstable_mutation_test: Verify promoted index is not generated
sstables/sstables: Optimize column name writing and indexing
compound_compat: Don't assume compoundness
(cherry picked from commit bd1efbc25c)
Also added sstables::make_sstable() to preserve source compatibility in tests.
460 lines
14 KiB
C++
460 lines
14 KiB
C++
/*
|
|
* Copyright (C) 2015 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "disk_types.hh"
|
|
#include "core/enum.hh"
|
|
#include "bytes.hh"
|
|
#include "gc_clock.hh"
|
|
#include "tombstone.hh"
|
|
#include "streaming_histogram.hh"
|
|
#include "utils/estimated_histogram.hh"
|
|
#include "column_name_helper.hh"
|
|
#include "sstables/key.hh"
|
|
#include "db/commitlog/replay_position.hh"
|
|
#include <vector>
|
|
#include <unordered_map>
|
|
#include <type_traits>
|
|
|
|
// While the sstable code works with char, bytes_view works with int8_t
|
|
// (signed char). Rather than change all the code, let's do a cast.
|
|
static inline bytes_view to_bytes_view(const temporary_buffer<char>& b) {
|
|
using byte = bytes_view::value_type;
|
|
return bytes_view(reinterpret_cast<const byte*>(b.get()), b.size());
|
|
}
|
|
|
|
namespace sstables {
|
|
|
|
struct deletion_time {
|
|
int32_t local_deletion_time;
|
|
int64_t marked_for_delete_at;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(local_deletion_time, marked_for_delete_at); }
|
|
|
|
bool live() const {
|
|
return (local_deletion_time == std::numeric_limits<int32_t>::max()) &&
|
|
(marked_for_delete_at == std::numeric_limits<int64_t>::min());
|
|
}
|
|
|
|
bool operator==(const deletion_time& d) {
|
|
return local_deletion_time == d.local_deletion_time &&
|
|
marked_for_delete_at == d.marked_for_delete_at;
|
|
}
|
|
bool operator!=(const deletion_time& d) {
|
|
return !(*this == d);
|
|
}
|
|
explicit operator tombstone() {
|
|
return !live() ? tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time))) : tombstone();
|
|
}
|
|
};
|
|
|
|
struct option {
|
|
disk_string<uint16_t> key;
|
|
disk_string<uint16_t> value;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(key, value); }
|
|
};
|
|
|
|
struct filter {
|
|
uint32_t hashes;
|
|
disk_array<uint32_t, uint64_t> buckets;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(hashes, buckets); }
|
|
|
|
// Create an always positive filter if nothing else is specified.
|
|
filter() : hashes(0), buckets({}) {}
|
|
explicit filter(int hashes, std::deque<uint64_t> buckets) : hashes(hashes), buckets({std::move(buckets)}) {}
|
|
};
|
|
|
|
enum class indexable_element {
|
|
partition,
|
|
cell
|
|
};
|
|
|
|
// Exploded view of promoted index.
|
|
// Contains pointers into external buffer, so that buffer must be kept alive
|
|
// as long as this is used.
|
|
struct promoted_index {
|
|
struct entry {
|
|
composite_view start;
|
|
composite_view end;
|
|
uint64_t offset;
|
|
uint64_t width;
|
|
};
|
|
deletion_time del_time;
|
|
std::deque<entry> entries;
|
|
};
|
|
|
|
class promoted_index_view {
|
|
bytes_view _bytes;
|
|
public:
|
|
explicit promoted_index_view(bytes_view v) : _bytes(v) {}
|
|
sstables::deletion_time get_deletion_time() const;
|
|
promoted_index parse(const schema&) const;
|
|
explicit operator bool() const { return !_bytes.empty(); }
|
|
};
|
|
|
|
class index_entry {
|
|
temporary_buffer<char> _key;
|
|
mutable stdx::optional<dht::token> _token;
|
|
uint64_t _position;
|
|
temporary_buffer<char> _promoted_index_bytes;
|
|
stdx::optional<promoted_index> _promoted_index;
|
|
public:
|
|
|
|
bytes_view get_key_bytes() const {
|
|
return to_bytes_view(_key);
|
|
}
|
|
|
|
key_view get_key() const {
|
|
return key_view{get_key_bytes()};
|
|
}
|
|
|
|
decorated_key_view get_decorated_key() const {
|
|
if (!_token) {
|
|
_token.emplace(dht::global_partitioner().get_token(get_key()));
|
|
}
|
|
return decorated_key_view(*_token, get_key());
|
|
}
|
|
|
|
uint64_t position() const {
|
|
return _position;
|
|
}
|
|
|
|
bytes_view get_promoted_index_bytes() const {
|
|
return to_bytes_view(_promoted_index_bytes);
|
|
}
|
|
|
|
promoted_index_view get_promoted_index_view() const {
|
|
return promoted_index_view(get_promoted_index_bytes());
|
|
}
|
|
|
|
index_entry(temporary_buffer<char>&& key, uint64_t position, temporary_buffer<char>&& promoted_index)
|
|
: _key(std::move(key)), _position(position), _promoted_index_bytes(std::move(promoted_index)) {}
|
|
|
|
index_entry(const index_entry& o)
|
|
: _key(o._key.get(), o._key.size())
|
|
, _position(o._position)
|
|
, _promoted_index_bytes(o._promoted_index_bytes.get(), o._promoted_index_bytes.size())
|
|
{ }
|
|
|
|
promoted_index* get_promoted_index(const schema& s) {
|
|
if (!_promoted_index) {
|
|
auto v = get_promoted_index_view();
|
|
if (v) {
|
|
_promoted_index = v.parse(s);
|
|
}
|
|
}
|
|
return _promoted_index ? &*_promoted_index : nullptr;
|
|
}
|
|
};
|
|
|
|
struct summary_entry {
|
|
dht::token token;
|
|
bytes key;
|
|
uint64_t position;
|
|
|
|
key_view get_key() const {
|
|
return key_view{key};
|
|
}
|
|
|
|
decorated_key_view get_decorated_key() const {
|
|
return decorated_key_view(token, get_key());
|
|
}
|
|
|
|
bool operator==(const summary_entry& x) const {
|
|
return position == x.position && key == x.key;
|
|
}
|
|
};
|
|
|
|
// Note: Sampling level is present in versions ka and higher. We ATM only support ka,
|
|
// so it's always there. But we need to make this conditional if we ever want to support
|
|
// other formats.
|
|
struct summary_ka {
|
|
struct header {
|
|
// The minimum possible amount of indexes per group (sampling level)
|
|
uint32_t min_index_interval;
|
|
// The number of entries in the Summary File
|
|
uint32_t size;
|
|
// The memory to be consumed to map the whole Summary into memory.
|
|
uint64_t memory_size;
|
|
// The actual sampling level.
|
|
uint32_t sampling_level;
|
|
// The number of entries the Summary *would* have if the sampling
|
|
// level would be equal to min_index_interval.
|
|
uint32_t size_at_full_sampling;
|
|
} header;
|
|
// The position in the Summary file for each of the indexes.
|
|
// NOTE1 that its actual size is determined by the "size" parameter, not
|
|
// by its preceding size_at_full_sampling
|
|
// NOTE2: They are laid out in *MEMORY* order, not BE.
|
|
// NOTE3: The sizes in this array represent positions in the memory stream,
|
|
// not the file. The memory stream effectively begins after the header,
|
|
// so every position here has to be added of sizeof(header).
|
|
std::deque<uint32_t> positions; // can be large, so use a deque instead of a vector
|
|
std::deque<summary_entry> entries;
|
|
|
|
disk_string<uint32_t> first_key;
|
|
disk_string<uint32_t> last_key;
|
|
|
|
// Used to determine when a summary entry should be added based on min_index_interval.
|
|
// NOTE: keys_written isn't part of on-disk format of summary.
|
|
size_t keys_written;
|
|
|
|
// NOTE4: There is a structure written by Cassandra into the end of the Summary
|
|
// file, after the field last_key, that we haven't understand yet, but we know
|
|
// that its content isn't related to the summary itself.
|
|
// The structure is basically as follow:
|
|
// struct { disk_string<uint16_t>; uint32_t; uint64_t; disk_string<uint16_t>; }
|
|
// Another interesting fact about this structure is that it is apparently always
|
|
// filled with the same data. It's too early to judge that the data is useless.
|
|
// However, it was tested that Cassandra loads successfully a Summary file with
|
|
// this structure removed from it. Anyway, let's pay attention to it.
|
|
|
|
/*
|
|
* Returns total amount of memory used by the summary
|
|
* Similar to origin off heap size
|
|
*/
|
|
uint64_t memory_footprint() const {
|
|
auto sz = sizeof(summary_entry) * entries.size() + sizeof(uint32_t) * positions.size() + sizeof(*this);
|
|
sz += first_key.value.size() + last_key.value.size();
|
|
for (auto& e : entries) {
|
|
sz += e.key.size();
|
|
}
|
|
return sz;
|
|
}
|
|
|
|
explicit operator bool() const {
|
|
return entries.size();
|
|
}
|
|
};
|
|
using summary = summary_ka;
|
|
|
|
class file_writer;
|
|
|
|
struct metadata {
|
|
virtual ~metadata() {}
|
|
virtual uint64_t serialized_size() const = 0;
|
|
virtual void write(file_writer& write) const = 0;
|
|
};
|
|
|
|
template <typename T>
|
|
uint64_t serialized_size(const T& object);
|
|
|
|
template <class T>
|
|
typename std::enable_if_t<!std::is_integral<T>::value && !std::is_enum<T>::value, void>
|
|
write(file_writer& out, const T& t);
|
|
|
|
// serialized_size() implementation for metadata class
|
|
template <typename Component>
|
|
class metadata_base : public metadata {
|
|
public:
|
|
virtual uint64_t serialized_size() const override {
|
|
return sstables::serialized_size(static_cast<const Component&>(*this));
|
|
}
|
|
virtual void write(file_writer& writer) const override {
|
|
return sstables::write(writer, static_cast<const Component&>(*this));
|
|
}
|
|
};
|
|
|
|
struct validation_metadata : public metadata_base<validation_metadata> {
|
|
disk_string<uint16_t> partitioner;
|
|
double filter_chance;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(partitioner, filter_chance); }
|
|
};
|
|
|
|
struct compaction_metadata : public metadata_base<compaction_metadata> {
|
|
disk_array<uint32_t, uint32_t> ancestors;
|
|
disk_array<uint32_t, uint8_t> cardinality;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(ancestors, cardinality); }
|
|
};
|
|
|
|
struct ka_stats_metadata : public metadata_base<ka_stats_metadata> {
|
|
utils::estimated_histogram estimated_row_size;
|
|
utils::estimated_histogram estimated_column_count;
|
|
db::replay_position position;
|
|
int64_t min_timestamp;
|
|
int64_t max_timestamp;
|
|
int32_t max_local_deletion_time;
|
|
double compression_ratio;
|
|
streaming_histogram estimated_tombstone_drop_time;
|
|
uint32_t sstable_level;
|
|
uint64_t repaired_at;
|
|
disk_array<uint32_t, disk_string<uint16_t>> min_column_names;
|
|
disk_array<uint32_t, disk_string<uint16_t>> max_column_names;
|
|
bool has_legacy_counter_shards;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) {
|
|
return f(
|
|
estimated_row_size,
|
|
estimated_column_count,
|
|
position,
|
|
min_timestamp,
|
|
max_timestamp,
|
|
max_local_deletion_time,
|
|
compression_ratio,
|
|
estimated_tombstone_drop_time,
|
|
sstable_level,
|
|
repaired_at,
|
|
min_column_names,
|
|
max_column_names,
|
|
has_legacy_counter_shards
|
|
);
|
|
}
|
|
};
|
|
using stats_metadata = ka_stats_metadata;
|
|
|
|
struct disk_token_bound {
|
|
uint8_t exclusive; // really a boolean
|
|
disk_string<uint16_t> token;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(exclusive, token); }
|
|
};
|
|
|
|
struct disk_token_range {
|
|
disk_token_bound left;
|
|
disk_token_bound right;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(left, right); }
|
|
};
|
|
|
|
// Scylla-specific sharding information. This is a set of token
|
|
// ranges that are spanned by this sstable. When loading the
|
|
// sstable, we can see which shards own data in the sstable by
|
|
// checking each such range.
|
|
struct sharding_metadata {
|
|
disk_array<uint32_t, disk_token_range> token_ranges;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(token_ranges); }
|
|
};
|
|
|
|
// Scylla-specific list of features an sstable supports.
|
|
enum sstable_feature : uint8_t {
|
|
NonCompoundPIEntries = 0, // See #2993
|
|
NonCompoundRangeTombstones = 1, // See #2986
|
|
End = 2
|
|
};
|
|
|
|
// Scylla-specific features enabled for a particular sstable.
|
|
struct sstable_enabled_features {
|
|
uint64_t enabled_features;
|
|
|
|
bool is_enabled(sstable_feature f) const {
|
|
return enabled_features & (1 << f);
|
|
}
|
|
|
|
void disable(sstable_feature f) {
|
|
enabled_features &= ~(1<< f);
|
|
}
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(enabled_features); }
|
|
};
|
|
|
|
// Numbers are found on disk, so they do matter. Also, setting their sizes of
|
|
// that of an uint32_t is a bit wasteful, but it simplifies the code a lot
|
|
// since we can now still use a strongly typed enum without introducing a
|
|
// notion of "disk-size" vs "memory-size".
|
|
enum class metadata_type : uint32_t {
|
|
Validation = 0,
|
|
Compaction = 1,
|
|
Stats = 2,
|
|
};
|
|
|
|
enum class scylla_metadata_type : uint32_t {
|
|
Sharding = 1,
|
|
Features = 2,
|
|
};
|
|
|
|
struct scylla_metadata {
|
|
disk_set_of_tagged_union<scylla_metadata_type,
|
|
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Sharding, sharding_metadata>,
|
|
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Features, sstable_enabled_features>
|
|
> data;
|
|
|
|
bool has_feature(sstable_feature f) const {
|
|
auto features = data.get<scylla_metadata_type::Features, sstable_enabled_features>();
|
|
return features && features->is_enabled(f);
|
|
}
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(data); }
|
|
};
|
|
|
|
static constexpr int DEFAULT_CHUNK_SIZE = 65536;
|
|
|
|
// checksums are generated using adler32 algorithm.
|
|
struct checksum {
|
|
uint32_t chunk_size;
|
|
std::deque<uint32_t> checksums;
|
|
|
|
template <typename Describer>
|
|
auto describe_type(Describer f) { return f(chunk_size, checksums); }
|
|
};
|
|
|
|
}
|
|
|
|
namespace std {
|
|
|
|
template <>
|
|
struct hash<sstables::metadata_type> : enum_hash<sstables::metadata_type> {};
|
|
|
|
}
|
|
|
|
namespace sstables {
|
|
|
|
struct statistics {
|
|
disk_hash<uint32_t, metadata_type, uint32_t> hash;
|
|
std::unordered_map<metadata_type, std::unique_ptr<metadata>> contents;
|
|
};
|
|
|
|
enum class column_mask : uint8_t {
|
|
none = 0x0,
|
|
deletion = 0x01,
|
|
expiration = 0x02,
|
|
counter = 0x04,
|
|
counter_update = 0x08,
|
|
range_tombstone = 0x10,
|
|
shadowable = 0x40
|
|
};
|
|
|
|
inline column_mask operator&(column_mask m1, column_mask m2) {
|
|
return column_mask(static_cast<uint8_t>(m1) & static_cast<uint8_t>(m2));
|
|
}
|
|
|
|
inline column_mask operator|(column_mask m1, column_mask m2) {
|
|
return column_mask(static_cast<uint8_t>(m1) | static_cast<uint8_t>(m2));
|
|
}
|
|
}
|
|
|