Files
scylladb/sstables/types.hh
Nadav Har'El 99ecda3c96 sstables: overhaul range tombstone reading
Until recently, we believed that range tombstones we read from sstables will
always be for entire rows (or more generalized clustering-key prefixes),
not for arbitrary ranges. But as we found out, because Cassandra insists
that range tombstones do not overlap, it may take two overlapping row
tombstones and convert them into three range tombstones which look like
general ranges (see the patch for a more detailed example).

Not only do we need to accept such "split" range tombstones, we also need
to convert them back to our internal representation which, in the above
example, involves two overlapping tombstones. This is what this patch does.

This patch also contains a test for this case: We created in Cassandra
an sstable with two overlapping deletions, and verify that when we read
it to Scylla, we get these two overlapping deletions - despite the
sstable file actually having contained three non-overlapping tombstones.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <b7c07466074bf0db6457323af8622bb5210bb86a.1459399004.git.glauber@scylladb.com>
2016-03-31 12:49:50 +03:00

295 lines
9.2 KiB
C++

/*
* Copyright 2015 Cloudius Systems
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "disk_types.hh"
#include "core/enum.hh"
#include "bytes.hh"
#include "gc_clock.hh"
#include "tombstone.hh"
#include "streaming_histogram.hh"
#include "estimated_histogram.hh"
#include "column_name_helper.hh"
#include "sstables/key.hh"
#include "db/commitlog/replay_position.hh"
#include <vector>
#include <unordered_map>
#include <type_traits>
namespace sstables {
struct option {
disk_string<uint16_t> key;
disk_string<uint16_t> value;
template <typename Describer>
auto describe_type(Describer f) { return f(key, value); }
};
struct filter {
uint32_t hashes;
disk_array<uint32_t, uint64_t> buckets;
template <typename Describer>
auto describe_type(Describer f) { return f(hashes, buckets); }
// Create an always positive filter if nothing else is specified.
filter() : hashes(0), buckets({}) {}
explicit filter(int hashes, std::deque<uint64_t> buckets) : hashes(hashes), buckets({std::move(buckets)}) {}
};
class index_entry {
temporary_buffer<char> _key;
uint64_t _position;
temporary_buffer<char> _promoted_index;
public:
bytes_view get_key_bytes() const {
return bytes_view(reinterpret_cast<const bytes::value_type *>(_key.get()), _key.size());
}
key_view get_key() const {
return { get_key_bytes() };
}
uint64_t position() const {
return _position;
}
index_entry(temporary_buffer<char>&& key, uint64_t position, temporary_buffer<char>&& promoted_index)
: _key(std::move(key)), _position(position), _promoted_index(std::move(promoted_index)) {}
};
struct summary_entry {
bytes key;
uint64_t position;
key_view get_key() const {
return { key };
}
bool operator==(const summary_entry& x) const {
return position == x.position && key == x.key;
}
};
// Note: Sampling level is present in versions ka and higher. We ATM only support ka,
// so it's always there. But we need to make this conditional if we ever want to support
// other formats.
struct summary_ka {
struct header {
// The minimum possible amount of indexes per group (sampling level)
uint32_t min_index_interval;
// The number of entries in the Summary File
uint32_t size;
// The memory to be consumed to map the whole Summary into memory.
uint64_t memory_size;
// The actual sampling level.
uint32_t sampling_level;
// The number of entries the Summary *would* have if the sampling
// level would be equal to min_index_interval.
uint32_t size_at_full_sampling;
} header;
// The position in the Summary file for each of the indexes.
// NOTE1 that its actual size is determined by the "size" parameter, not
// by its preceding size_at_full_sampling
// NOTE2: They are laid out in *MEMORY* order, not BE.
// NOTE3: The sizes in this array represent positions in the memory stream,
// not the file. The memory stream effectively begins after the header,
// so every position here has to be added of sizeof(header).
std::deque<uint32_t> positions; // can be large, so use a deque instead of a vector
std::deque<summary_entry> entries;
disk_string<uint32_t> first_key;
disk_string<uint32_t> last_key;
// Used to determine when a summary entry should be added based on min_index_interval.
// NOTE: keys_written isn't part of on-disk format of summary.
size_t keys_written;
// NOTE4: There is a structure written by Cassandra into the end of the Summary
// file, after the field last_key, that we haven't understand yet, but we know
// that its content isn't related to the summary itself.
// The structure is basically as follow:
// struct { disk_string<uint16_t>; uint32_t; uint64_t; disk_string<uint16_t>; }
// Another interesting fact about this structure is that it is apparently always
// filled with the same data. It's too early to judge that the data is useless.
// However, it was tested that Cassandra loads successfully a Summary file with
// this structure removed from it. Anyway, let's pay attention to it.
/*
* Returns total amount of memory used by the summary
* Similar to origin off heap size
*/
uint64_t memory_footprint() const {
return sizeof(summary_entry) * entries.size() + sizeof(uint32_t) * positions.size() + sizeof(*this);
}
};
using summary = summary_ka;
struct metadata {
virtual ~metadata() {}
};
struct validation_metadata : public metadata {
disk_string<uint16_t> partitioner;
double filter_chance;
size_t serialized_size() {
return sizeof(uint16_t) + partitioner.value.size() + sizeof(filter_chance);
}
template <typename Describer>
auto describe_type(Describer f) { return f(partitioner, filter_chance); }
};
struct compaction_metadata : public metadata {
disk_array<uint32_t, uint32_t> ancestors;
disk_array<uint32_t, uint8_t> cardinality;
size_t serialized_size() {
return sizeof(uint32_t) + (ancestors.elements.size() * sizeof(uint32_t)) +
sizeof(uint32_t) + (cardinality.elements.size() * sizeof(uint8_t));
}
template <typename Describer>
auto describe_type(Describer f) { return f(ancestors, cardinality); }
};
struct ka_stats_metadata : public metadata {
estimated_histogram estimated_row_size;
estimated_histogram estimated_column_count;
db::replay_position position;
int64_t min_timestamp;
int64_t max_timestamp;
uint32_t max_local_deletion_time;
double compression_ratio;
streaming_histogram estimated_tombstone_drop_time;
uint32_t sstable_level;
uint64_t repaired_at;
disk_array<uint32_t, disk_string<uint16_t>> min_column_names;
disk_array<uint32_t, disk_string<uint16_t>> max_column_names;
bool has_legacy_counter_shards;
template <typename Describer>
auto describe_type(Describer f) {
return f(
estimated_row_size,
estimated_column_count,
position,
min_timestamp,
max_timestamp,
max_local_deletion_time,
compression_ratio,
estimated_tombstone_drop_time,
sstable_level,
repaired_at,
min_column_names,
max_column_names,
has_legacy_counter_shards
);
}
};
using stats_metadata = ka_stats_metadata;
// Numbers are found on disk, so they do matter. Also, setting their sizes of
// that of an uint32_t is a bit wasteful, but it simplifies the code a lot
// since we can now still use a strongly typed enum without introducing a
// notion of "disk-size" vs "memory-size".
enum class metadata_type : uint32_t {
Validation = 0,
Compaction = 1,
Stats = 2,
};
static constexpr int DEFAULT_CHUNK_SIZE = 65536;
// checksums are generated using adler32 algorithm.
struct checksum {
uint32_t chunk_size;
std::deque<uint32_t> checksums;
template <typename Describer>
auto describe_type(Describer f) { return f(chunk_size, checksums); }
};
}
namespace std {
template <>
struct hash<sstables::metadata_type> : enum_hash<sstables::metadata_type> {};
}
namespace sstables {
struct statistics {
disk_hash<uint32_t, metadata_type, uint32_t> hash;
std::unordered_map<metadata_type, std::unique_ptr<metadata>> contents;
};
struct deletion_time {
int32_t local_deletion_time;
int64_t marked_for_delete_at;
template <typename Describer>
auto describe_type(Describer f) { return f(local_deletion_time, marked_for_delete_at); }
bool live() const {
return (local_deletion_time == std::numeric_limits<int32_t>::max()) &&
(marked_for_delete_at == std::numeric_limits<int64_t>::min());
}
bool operator==(const deletion_time& d) {
return local_deletion_time == d.local_deletion_time &&
marked_for_delete_at == d.marked_for_delete_at;
}
bool operator!=(const deletion_time& d) {
return !(*this == d);
}
explicit operator tombstone() {
return tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time)));
}
};
enum class column_mask : uint8_t {
none = 0x0,
deletion = 0x01,
expiration = 0x02,
counter = 0x04,
counter_update = 0x08,
range_tombstone = 0x10,
};
inline column_mask operator&(column_mask m1, column_mask m2) {
return column_mask(static_cast<uint8_t>(m1) & static_cast<uint8_t>(m2));
}
inline column_mask operator|(column_mask m1, column_mask m2) {
return column_mask(static_cast<uint8_t>(m1) | static_cast<uint8_t>(m2));
}
}