// Mirror of https://github.com/scylladb/scylladb.git
// Synced 2026-04-19 16:15:07 +00:00 (848 lines, 35 KiB, C++)
/*
 * Copyright (C) 2015 ScyllaDB
 *
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */
|
|
|
|
#pragma once
|
|
|
|
#include "core/file.hh"
|
|
#include "core/fstream.hh"
|
|
#include "core/future.hh"
|
|
#include "core/sstring.hh"
|
|
#include "core/enum.hh"
|
|
#include "core/shared_ptr.hh"
|
|
#include "core/distributed.hh"
|
|
#include <unordered_set>
|
|
#include <unordered_map>
|
|
#include "types.hh"
|
|
#include "clustering_key_filter.hh"
|
|
#include "core/enum.hh"
|
|
#include "compress.hh"
|
|
#include "row.hh"
|
|
#include "dht/i_partitioner.hh"
|
|
#include "schema.hh"
|
|
#include "mutation.hh"
|
|
#include "utils/i_filter.hh"
|
|
#include "core/stream.hh"
|
|
#include "writer.hh"
|
|
#include "metadata_collector.hh"
|
|
#include "filter.hh"
|
|
#include "exceptions.hh"
|
|
#include "mutation_reader.hh"
|
|
#include "query-request.hh"
|
|
#include "compound_compat.hh"
|
|
#include "disk-error-handler.hh"
|
|
#include "atomic_deletion.hh"
|
|
#include "sstables/shared_index_lists.hh"
|
|
|
|
namespace sstables {
|
|
|
|
extern logging::logger sstlog;
|
|
|
|
// data_consume_context is an object returned by sstable::data_consume_rows()
|
|
// which allows knowing when the consumer stops reading, and starting it again
|
|
// (e.g., when the consumer wants to stop after every sstable row).
|
|
//
|
|
// The read() method initiates reading into the consumer, and continues to
|
|
// read and feed data into the consumer until one of the consumer's callbacks
|
|
// requests to stop, or until we reach the end of the data range originally
|
|
// requested. read() returns a future which completes when reading stopped.
|
|
// If we're at the end-of-file, the read may complete without reading anything
|
|
// so it's the consumer class's task to check if anything was consumed.
|
|
// Note:
|
|
// The caller MUST ensure that between calling read() on this object,
|
|
// and the time the returned future is completed, the object lives on.
|
|
// Moreover, the sstable object used for the sstable::data_consume_rows()
|
|
// call which created this data_consume_context, must also be kept alive.
|
|
class data_consume_context {
|
|
class impl;
|
|
std::unique_ptr<impl> _pimpl;
|
|
// This object can only be constructed by sstable::data_consume_rows()
|
|
data_consume_context(std::unique_ptr<impl>);
|
|
friend class sstable;
|
|
public:
|
|
future<> read();
|
|
future<> fast_forward_to(uint64_t begin, uint64_t end);
|
|
future<> skip_to(indexable_element, uint64_t begin);
|
|
uint64_t position() const;
|
|
// Define (as defaults) the destructor and move operations in the source
|
|
// file, so here we don't need to know the incomplete impl type.
|
|
~data_consume_context();
|
|
data_consume_context(data_consume_context&&) noexcept;
|
|
data_consume_context& operator=(data_consume_context&&) noexcept;
|
|
};
|
|
|
|
// mutation_reader is an object returned by sstable::read_rows() et al. which
|
|
// allows getting each sstable row in sequence, in mutation format.
|
|
//
|
|
// The read() method reads the next mutation, returning a disengaged optional
|
|
// on EOF. As usual for future-returning functions, a caller which starts a
|
|
// read() MUST ensure that the mutation_reader object continues to live until
|
|
// the returned future is fulfilled. Moreover, the sstable whose read_rows()
|
|
// method was used to open this mutation_reader must also live between the
|
|
// time read() is called and its future ends.
|
|
// As soon as the future returned by read() completes, the object may safely
|
|
// be deleted. In other words, when the read() future is fulfilled, we can
|
|
// be sure there are no background tasks still scheduled.
|
|
class mutation_reader {
|
|
class impl;
|
|
std::unique_ptr<impl> _pimpl;
|
|
// This object can only be constructed by sstable::read_rows() et al.
|
|
mutation_reader(std::unique_ptr<impl>);
|
|
friend class sstable;
|
|
public:
|
|
future<streamed_mutation_opt> read();
|
|
future<> fast_forward_to(const dht::partition_range&);
|
|
// Define (as defaults) the destructor and move operations in the source
|
|
// file, so here we don't need to know the incomplete impl type.
|
|
~mutation_reader();
|
|
mutation_reader(mutation_reader&&);
|
|
mutation_reader& operator=(mutation_reader&&);
|
|
};
|
|
|
|
// Forward declarations of types that are only named (not defined) in this header.
class key;
class sstable_writer;
struct foreign_sstable_open_info;
struct sstable_open_info;

class index_reader;
|
|
|
|
// Options passed to sstable::write_components() / sstable::get_writer().
struct sstable_writer_config {
    // Disengaged by default.
    // NOTE(review): presumably the writer then falls back to its own default
    // promoted-index block size - confirm in the writer implementation.
    std::experimental::optional<size_t> promoted_index_block_size;
    // Defaults to the largest representable value, i.e. effectively no limit.
    uint64_t max_sstable_size = std::numeric_limits<uint64_t>::max();
    bool backup = false;
    bool leave_unsealed = false;
};
|
|
|
|
// In-memory representation of a single sstable: identifies its on-disk
// components (directory, generation, version, format) and exposes the
// operations for loading, reading, writing and deleting them.
class sstable : public enable_lw_shared_from_this<sstable> {
public:
    enum class component_type {
        Index,
        CompressionInfo,
        Data,
        TOC,
        Summary,
        Digest,
        CRC,
        Filter,
        Statistics,
        TemporaryTOC,
        TemporaryStatistics,
        Scylla,
        Unknown,
    };
    enum class version_types { ka, la };
    enum class format_types { big };
    static const size_t default_buffer_size = 128*1024;
public:
    sstable(schema_ptr schema, sstring dir, int64_t generation, version_types v, format_types f, gc_clock::time_point now = gc_clock::now(),
            io_error_handler_gen error_handler_gen = default_io_error_handler_gen(), size_t buffer_size = default_buffer_size)
        : sstable_buffer_size(buffer_size)
        , _schema(std::move(schema))
        , _dir(std::move(dir))
        , _generation(generation)
        , _version(v)
        , _format(f)
        , _now(now)
        , _read_error_handler(error_handler_gen(sstable_read_error))
        , _write_error_handler(error_handler_gen(sstable_write_error))
    { }
    sstable& operator=(const sstable&) = delete;
    sstable(const sstable&) = delete;
    sstable(sstable&&) = default;

    ~sstable();

    // Read one or few rows at the given byte range from the data file,
    // feeding them into the consumer. This function reads the entire given
    // byte range at once into memory, so it should not be used for iterating
    // over all the rows in the data file (see the next function for that).
    // The function returns a future which completes after all the data has
    // been fed into the consumer. The caller needs to ensure the "consumer"
    // object lives until then (e.g., using the do_with() idiom).
    future<> data_consume_rows_at_once(row_consumer& consumer, uint64_t pos, uint64_t end);

    // disk_read_range describes a byte range covering part of an sstable
    // row that we need to read from disk. Usually this is the whole byte
    // range covering a single sstable row, but in very large rows we might
    // want to only read a subset of the atoms which we know contains the
    // columns we are looking for. When the range to be read does NOT include
    // the entire row, the caller needs to supply the optional "row_info"
    // containing information about the entire row (key and deletion time)
    // which is normally read from the beginning of the row.
    struct disk_read_range {
        // TODO: this should become a vector of ranges
        uint64_t start;
        uint64_t end;
        // When the range above does not cover the beginning of the sstable
        // row, we need to supply information which is only available at the
        // beginning of the row - the row's key and its tombstone if any.
        struct row_info {
            key k;
            deletion_time deltime;
        };
        std::experimental::optional<row_info> ri;
        disk_read_range() : start(0), end(0) {}
        disk_read_range(uint64_t start, uint64_t end) :
            start(start), end(end) { }
        disk_read_range(uint64_t start, uint64_t end, const key& key, const deletion_time& deltime) :
            start(start), end(end), ri(row_info{key, deltime}) { }
        // True iff the byte range is non-empty.
        explicit operator bool() const {
            return start != end;
        }
        // found_row() is true if the row was found. This is not the same as
        // operator bool(): It is possible that found_row() but the promoted
        // index ruled out anything to read (in this case "ri" was set).
        bool found_row() const {
            return start != end || ri;
        }
    };

    // data_consume_rows() iterates over rows in the data file from
    // a particular range, feeding them into the consumer. The iteration is
    // done as efficiently as possible - reading only the data file (not the
    // summary or index files) and reading data in batches.
    //
    // The consumer object may request the iteration to stop before reaching
    // the end of the requested data range (e.g. stop after each sstable row).
    // A context object is returned which allows to resume this consumption:
    // This context's read() method requests that consumption begins, and
    // returns a future which will be resolved when it ends (because the
    // consumer asked to stop, or the data range ended). Only after the
    // returned future is resolved, may read() be called again to consume
    // more.
    // The caller must ensure (e.g., using do_with()) that the context object,
    // as well as the sstable, remains alive as long as a read() is in
    // progress (i.e., returned a future which hasn't completed yet).
    data_consume_context data_consume_rows(row_consumer& consumer, disk_read_range toread);

    data_consume_context data_consume_single_partition(row_consumer& consumer, disk_read_range toread);

    // Like data_consume_rows() with bounds, but iterates over whole range
    data_consume_context data_consume_rows(row_consumer& consumer);

    static component_type component_from_sstring(sstring& s);
    static version_types version_from_sstring(sstring& s);
    static format_types format_from_sstring(sstring& s);
    static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
                                  format_types format, component_type component);
    static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
                                  format_types format, sstring component);
    // WARNING: it should only be called to remove components of a sstable with
    // a temporary TOC file.
    static future<> remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation,
                                                 version_types v, format_types f);

    // load sstable using components shared by a shard
    future<> load(foreign_sstable_open_info info);
    // load all components from disk
    // this variant will be useful for testing purposes and also when loading
    // a new sstable from scratch for sharing its components.
    future<> load();
    future<> open_data();
    future<> update_info_for_opened_data();

    future<> set_generation(int64_t generation);

    int64_t generation() const {
        return _generation;
    }

    // read_row() reads the entire sstable row (partition) at a given
    // partition key k, or a subset of this row. The subset is defined by
    // a filter on the clustering keys which we want to read, which
    // additionally determines also if all the static columns will also be
    // returned in the result.
    future<streamed_mutation_opt> read_row(
        schema_ptr schema,
        const key& k,
        const query::partition_slice& slice = query::full_slice,
        const io_priority_class& pc = default_priority_class(),
        streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);

    // Returns a mutation_reader for given range of partitions
    mutation_reader read_range_rows(
        schema_ptr schema,
        const dht::partition_range& range,
        const query::partition_slice& slice = query::full_slice,
        const io_priority_class& pc = default_priority_class(),
        streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);

    // read_rows() returns each of the rows in the sstable, in sequence,
    // converted to a "mutation" data structure.
    // This function is implemented efficiently - doing buffered, sequential
    // read of the data file (no need to access the index file).
    // A "mutation_reader" object is returned with which the caller can
    // fetch mutations in sequence, and allows stop iteration any time
    // after getting each row.
    //
    // The caller must ensure (e.g., using do_with()) that the context object,
    // as well as the sstable, remains alive as long as a read() is in
    // progress (i.e., returned a future which hasn't completed yet).
    mutation_reader read_rows(schema_ptr schema,
                              const io_priority_class& pc = default_priority_class(),
                              streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);

    // Write sstable components from a memtable.
    future<> write_components(memtable& mt, bool backup = false,
                              const io_priority_class& pc = default_priority_class(), bool leave_unsealed = false);

    future<> write_components(::mutation_reader mr,
                              uint64_t estimated_partitions,
                              schema_ptr schema,
                              const sstable_writer_config&,
                              const io_priority_class& pc = default_priority_class());

    sstable_writer get_writer(const schema& s,
                              uint64_t estimated_partitions,
                              const sstable_writer_config&,
                              const io_priority_class& pc = default_priority_class());

    future<> seal_sstable(bool backup);

    // Estimate derived from the summary header:
    // (size_at_full_sampling + 1) * min_index_interval, computed in 64 bits.
    uint64_t get_estimated_key_count() const {
        return (static_cast<uint64_t>(_components->summary.header.size_at_full_sampling) + 1) *
                _components->summary.header.min_index_interval;
    }

    uint64_t estimated_keys_for_range(const dht::token_range& range);

    std::vector<dht::decorated_key> get_key_samples(const schema& s, const dht::token_range& range);

    // mark_for_deletion() specifies that a sstable isn't relevant to the
    // current shard, and thus can be deleted by the deletion manager, if
    // all shards sharing it agree. In case the sstable is unshared, it's
    // guaranteed that all of its on-disk files will be deleted as soon as
    // the in-memory object is destroyed.
    void mark_for_deletion() {
        _marked_for_deletion = true;
    }

    future<> mark_for_deletion_on_disk();

    bool marked_for_deletion() const {
        return _marked_for_deletion;
    }

    void add_ancestor(int64_t generation) {
        _collector.add_ancestor(generation);
    }

    // Returns true iff this sstable contains data which belongs to many shards.
    bool is_shared() const {
        return _shared;
    }

    void set_unshared() {
        _shared = false;
    }

    uint64_t data_size() const;
    uint64_t index_size() const {
        return _index_file_size;
    }
    uint64_t filter_size() const {
        return _filter_file_size;
    }

    db_clock::time_point data_file_write_time() const {
        return _data_file_write_time;
    }

    uint64_t filter_memory_size() const {
        return _components->filter->memory_size();
    }

    // Returns the total bytes of all components.
    uint64_t bytes_on_disk();

    const partition_key& get_first_partition_key() const;
    const partition_key& get_last_partition_key() const;

    const dht::decorated_key& get_first_decorated_key() const;
    const dht::decorated_key& get_last_decorated_key() const;

    // SSTable comparator using the first key (decorated key).
    // Return values are those of a trichotomic comparison.
    int compare_by_first_key(const sstable& other) const;

    // SSTable comparator using the max timestamp.
    // Return values are those of a trichotomic comparison.
    int compare_by_max_timestamp(const sstable& other) const;

    // Name of the Data component file of this sstable.
    const sstring get_filename() const {
        return filename(component_type::Data);
    }
    const sstring& get_dir() const {
        return _dir;
    }
    sstring toc_filename() const;

    metadata_collector& get_metadata_collector() {
        return _collector;
    }

    std::vector<std::pair<component_type, sstring>> all_components() const;

    future<> create_links(sstring dir, int64_t generation) const;

    // Convenience overload: links under this sstable's own generation.
    future<> create_links(sstring dir) const {
        return create_links(dir, _generation);
    }

    /**
     * Note. This is using the Origin definition of
     * max_data_age, which is load time. This could maybe
     * be improved upon.
     */
    gc_clock::time_point max_data_age() const {
        return _now;
    }
    std::vector<sstring> component_filenames() const;

    // Runs func(args...) through do_io_check() with this sstable's write
    // error handler.
    template<typename Func, typename... Args>
    auto sstable_write_io_check(Func&& func, Args&&... args) const {
        return do_io_check(_write_error_handler, func, std::forward<Args>(args)...);
    }

    // Immutable components that can be shared among shards.
    struct shareable_components {
        sstables::compression compression;
        utils::filter_ptr filter;
        sstables::summary summary;
        sstables::statistics statistics;
        stdx::optional<sstables::scylla_metadata> scylla_metadata;
    };
private:
    size_t sstable_buffer_size = default_buffer_size;

    static std::unordered_map<version_types, sstring, enum_hash<version_types>> _version_string;
    static std::unordered_map<format_types, sstring, enum_hash<format_types>> _format_string;
    static std::unordered_map<component_type, sstring, enum_hash<component_type>> _component_map;

    std::unordered_set<component_type, enum_hash<component_type>> _recognized_components;
    std::vector<sstring> _unrecognized_components;

    foreign_ptr<lw_shared_ptr<shareable_components>> _components = make_foreign(make_lw_shared<shareable_components>());
    shared_index_lists _index_lists;
    bool _shared = true;  // across shards; safe default
    // NOTE: _collector and _c_stats are used to generate the statistics file
    // when writing a new sstable.
    metadata_collector _collector;
    column_stats _c_stats;
    file _index_file;
    file _data_file;
    // Zero-initialized like the other size counters so that accessors such as
    // index_size() never return an indeterminate value before the files are
    // opened.
    uint64_t _data_file_size = 0;
    uint64_t _index_file_size = 0;
    uint64_t _filter_file_size = 0;
    uint64_t _bytes_on_disk = 0;
    db_clock::time_point _data_file_write_time;
    std::vector<nonwrapping_range<bytes_view>> _clustering_components_ranges;
    stdx::optional<dht::decorated_key> _first;
    stdx::optional<dht::decorated_key> _last;

    lw_shared_ptr<file_input_stream_history> _single_partition_history = make_lw_shared<file_input_stream_history>();
    lw_shared_ptr<file_input_stream_history> _partition_range_history = make_lw_shared<file_input_stream_history>();

    // _pi_write is used temporarily for building the promoted
    // index (column sample) of one partition when writing a new sstable.
    struct {
        // Unfortunately we cannot output the promoted index directly to the
        // index file because it needs to be prepended by its size.
        bytes_ostream data;
        uint32_t numblocks;
        deletion_time deltime;
        uint64_t block_start_offset;
        uint64_t block_next_start_offset;
        bytes block_first_colname;
        bytes block_last_colname;
        std::experimental::optional<range_tombstone_accumulator> tombstone_accumulator;
        const schema* schemap;
        size_t desired_block_size;
    } _pi_write;

    void maybe_flush_pi_block(file_writer& out,
                              const composite& clustering_key,
                              const std::vector<bytes_view>& column_names);

    schema_ptr _schema;
    sstring _dir;
    // NOTE(review): stored as unsigned long although the constructor,
    // generation() and set_generation() all use int64_t - consider aligning
    // the types (needs a check of the users in the source file).
    unsigned long _generation = 0;
    version_types _version;
    format_types _format;

    filter_tracker _filter_tracker;

    bool _marked_for_deletion = false;

    gc_clock::time_point _now;

    io_error_handler _read_error_handler;
    io_error_handler _write_error_handler;

    const bool has_component(component_type f) const;

    const sstring filename(component_type f) const;

    template <sstable::component_type Type, typename T>
    future<> read_simple(T& comp, const io_priority_class& pc);

    template <sstable::component_type Type, typename T>
    void write_simple(const T& comp, const io_priority_class& pc);

    void generate_toc(compressor c, double filter_fp_chance);
    void write_toc(const io_priority_class& pc);
    future<> seal_sstable();

    future<> read_compression(const io_priority_class& pc);
    void write_compression(const io_priority_class& pc);

    future<> read_scylla_metadata(const io_priority_class& pc);
    void write_scylla_metadata(const io_priority_class& pc);

    future<> read_filter(const io_priority_class& pc);

    void write_filter(const io_priority_class& pc);

    future<> read_summary(const io_priority_class& pc);

    void write_summary(const io_priority_class& pc) {
        write_simple<component_type::Summary>(_components->summary, pc);
    }

    // To be called when we try to load an SSTable that lacks a Summary. Could
    // happen if old tools are being used.
    future<> generate_summary(const io_priority_class& pc);

    future<> read_statistics(const io_priority_class& pc);
    void write_statistics(const io_priority_class& pc);
    // Rewrite statistics component by creating a temporary Statistics and
    // renaming it into place of existing one.
    void rewrite_statistics(const io_priority_class& pc);
    // Validate metadata that's used to optimize reads when user specifies
    // a clustering key range. If this specific metadata is incorrect, then
    // it should be cleared. Otherwise, it could lead to bad decisions.
    // Metadata is probably incorrect if generated by previous Scylla versions.
    void validate_min_max_metadata();

    void set_first_and_last_keys();

    // Create one range for each clustering component of this sstable.
    // Each range stores min and max value for that specific component.
    // It does nothing if schema defines no clustering key, and it's supposed
    // to be called when loading an existing sstable or after writing a new one.
    void set_clustering_components_ranges();

    future<> create_data();

    future<index_list> read_indexes(uint64_t summary_idx, const io_priority_class& pc);
    index_reader get_index_reader(const io_priority_class& pc);

    // Return an input_stream which reads exactly the specified byte range
    // from the data file (after uncompression, if the file is compressed).
    // Unlike data_read() below, this method does not read the entire byte
    // range into memory all at once. Rather, this method allows reading the
    // data incrementally as a stream. Knowing in advance the exact amount
    // of bytes to be read using this stream, we can make better choices
    // about the buffer size to read, and where exactly to stop reading
    // (even when a large buffer size is used).
    input_stream<char> data_stream(uint64_t pos, size_t len, const io_priority_class& pc,
                                   lw_shared_ptr<file_input_stream_history> history);

    // Read exactly the specific byte range from the data file (after
    // uncompression, if the file is compressed). This can be used to read
    // a specific row from the data file (its position and length can be
    // determined using the index file).
    // This function is intended (and optimized for) random access, not
    // for iteration through all the rows.
    future<temporary_buffer<char>> data_read(uint64_t pos, size_t len, const io_priority_class& pc);

    future<uint64_t> data_end_position(uint64_t summary_idx, uint64_t index_idx, const index_list& il, const io_priority_class& pc);

    // Returns data file position for an entry right after all entries mapped by given summary page.
    future<uint64_t> data_end_position(uint64_t summary_idx, const io_priority_class& pc);

    template <typename T>
    int binary_search(const T& entries, const key& sk, const dht::token& token);

    // Convenience overload: obtains the token for `sk` from the global partitioner.
    template <typename T>
    int binary_search(const T& entries, const key& sk) {
        return binary_search(entries, sk, dht::global_partitioner().get_token(key_view(sk)));
    }

    // find_disk_ranges finds the ranges of bytes we need to read from the
    // sstable to read the desired columns out of the given key. This range
    // may be the entire byte range of the given partition - as found using
    // the summary and index files - but if the index contains a "promoted
    // index" (a sample of column positions for each key) it may be a smaller
    // range. The returned range may contain columns beyond those requested
    // in slice, so it is the reader's duty to use slice again
    // when parsing the data read from the returned range.
    future<disk_read_range> find_disk_ranges(schema_ptr schema,
                                             const sstables::key& key,
                                             const query::partition_slice& slice,
                                             const io_priority_class& pc);

    future<summary_entry&> read_summary_entry(size_t i);

    // FIXME: pending on Bloom filter implementation
    bool filter_has_key(const schema& s, const dht::decorated_key& dk) { return filter_has_key(key::from_partition_key(s, dk._key)); }

    // NOTE: functions used to generate sstable components.
    void write_row_marker(file_writer& out, const row_marker& marker, const composite& clustering_key);
    void write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row);
    void write_static_row(file_writer& out, const schema& schema, const row& static_row);
    void write_cell(file_writer& out, atomic_cell_view cell, const column_definition& cdef);
    void write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none);
    void write_column_name(file_writer& out, bytes_view column_names);
    void write_range_tombstone(file_writer& out, const composite& start, bound_kind start_kind, const composite& end, bound_kind stop_kind, std::vector<bytes_view> suffix, const tombstone t);
    // Convenience overload: both bounds are inclusive.
    void write_range_tombstone(file_writer& out, const composite& start, const composite& end, std::vector<bytes_view> suffix, const tombstone t) {
        // `t` is const, so std::move(t) would only ever produce a copy; pass it directly.
        write_range_tombstone(out, start, bound_kind::incl_start, end, bound_kind::incl_end, std::move(suffix), t);
    }
    void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection);

    stdx::optional<std::pair<uint64_t, uint64_t>> get_sample_indexes_for_range(const dht::token_range& range);
public:
    future<> read_toc();

    bool filter_has_key(const key& key) {
        return _components->filter->is_present(bytes_view(key));
    }

    bool filter_has_key(utils::hashed_key key) {
        return _components->filter->is_present(key);
    }

    bool filter_has_key(const schema& s, partition_key_view key) {
        return filter_has_key(key::from_partition_key(s, key));
    }

    static utils::hashed_key make_hashed_key(const schema& s, const partition_key& key);

    uint64_t filter_get_false_positive() {
        return _filter_tracker.false_positive;
    }
    uint64_t filter_get_true_positive() {
        return _filter_tracker.true_positive;
    }
    // Returns the number of false positives counted since the previous call
    // and records the current count as the new baseline.
    uint64_t filter_get_recent_false_positive() {
        auto t = _filter_tracker.false_positive - _filter_tracker.last_false_positive;
        _filter_tracker.last_false_positive = _filter_tracker.false_positive;
        return t;
    }
    // Returns the number of true positives counted since the previous call
    // and records the current count as the new baseline.
    uint64_t filter_get_recent_true_positive() {
        auto t = _filter_tracker.true_positive - _filter_tracker.last_true_positive;
        _filter_tracker.last_true_positive = _filter_tracker.true_positive;
        return t;
    }

    // Returns the Stats entry of the statistics component.
    // Throws std::runtime_error if the entry is absent or null.
    const stats_metadata& get_stats_metadata() const {
        auto entry = _components->statistics.contents.find(metadata_type::Stats);
        if (entry == _components->statistics.contents.end()) {
            throw std::runtime_error("Stats metadata not available");
        }
        auto& p = entry->second;
        if (!p) {
            throw std::runtime_error("Statistics is malformed");
        }
        const stats_metadata& s = *static_cast<stats_metadata *>(p.get());
        return s;
    }
    // Returns the Compaction entry of the statistics component.
    // Throws std::runtime_error if the entry is absent or null.
    const compaction_metadata& get_compaction_metadata() const {
        auto entry = _components->statistics.contents.find(metadata_type::Compaction);
        if (entry == _components->statistics.contents.end()) {
            throw std::runtime_error("Compaction metadata not available");
        }
        auto& p = entry->second;
        if (!p) {
            throw std::runtime_error("Statistics is malformed");
        }
        const compaction_metadata& s = *static_cast<compaction_metadata *>(p.get());
        return s;
    }
    std::vector<unsigned> get_shards_for_this_sstable() const;

    uint32_t get_sstable_level() const {
        return get_stats_metadata().sstable_level;
    }

    // This will change sstable level only in memory.
    void set_sstable_level(uint32_t);

    double get_compression_ratio() const;

    future<> mutate_sstable_level(uint32_t);

    const summary& get_summary() const {
        return _components->summary;
    }

    // Return sstable key range as range<partition_key> reading only the summary component.
    future<range<partition_key>>
    get_sstable_key_range(const schema& s);

    future<std::vector<shard_id>> get_owning_shards_from_unloaded();

    const std::vector<nonwrapping_range<bytes_view>>& clustering_components_ranges() const;

    // returns all info needed for a sstable to be shared with other shards.
    // NOTE(review): `generation` is a plain int here while the rest of the
    // class uses int64_t; changing it requires updating the definition too.
    static future<sstable_open_info> load_shared_components(const schema_ptr& s, sstring dir, int generation, version_types v, format_types f);

    // Allow the test cases from sstable_test.cc to test private methods. We use
    // a placeholder to avoid cluttering this class too much. The sstable_test class
    // will then re-export as public every method it needs.
    friend class test;

    friend class components_writer;
    friend class sstable_writer;
    friend class index_reader;
    friend class mutation_reader::impl;
};
|
|
|
|
using shared_sstable = lw_shared_ptr<sstable>;
|
|
using sstable_list = std::unordered_set<shared_sstable>;
|
|
|
|
struct entry_descriptor {
|
|
sstring ks;
|
|
sstring cf;
|
|
sstable::version_types version;
|
|
int64_t generation;
|
|
sstable::format_types format;
|
|
sstable::component_type component;
|
|
|
|
static entry_descriptor make_descriptor(sstring fname);
|
|
|
|
entry_descriptor(sstring ks, sstring cf, sstable::version_types version,
|
|
int64_t generation, sstable::format_types format,
|
|
sstable::component_type component)
|
|
: ks(ks), cf(cf), version(version), generation(generation), format(format), component(component) {}
|
|
};
|
|
|
|
// Waits for all prior tasks started on current shard related to sstable management to finish.
|
|
//
|
|
// There may be asynchronous cleanup started from sstable destructor. Since we can't have blocking
|
|
// destructors in seastar, that cleanup is not waited for. It can be waited for using this function.
|
|
// It is also waited for when seastar exits.
|
|
future<> await_background_jobs();
|
|
|
|
// Invokes await_background_jobs() on all shards
|
|
future<> await_background_jobs_on_all_shards();
|
|
|
|
// When we compact sstables, we have to atomically instantiate the new
|
|
// sstable and delete the old ones. Otherwise, if we compact A+B into C,
|
|
// and if A contained some data that was tombstoned by B, and if B was
|
|
// deleted but A survived, then data from A will be resurrected.
|
|
//
|
|
// There are two violators of the requirement to atomically delete
|
|
// sstables: first sstable instantiation and deletion on disk is atomic
|
|
// only wrt. itself, not other sstables, and second when an sstable is
|
|
// shared among shard, so actual on-disk deletion of an sstable is deferred
|
|
// until all shards agree it can be deleted.
|
|
//
|
|
// When shutting down, we will not be able to complete some deletions.
|
|
// In that case, an atomic_deletion_cancelled exception is returned instead.
|
|
//
|
|
// This function only solves the second problem for now.
|
|
future<> delete_atomically(std::vector<shared_sstable> ssts);
|
|
future<> delete_atomically(std::vector<sstable_to_delete> ssts);
|
|
|
|
// Cancel any deletions scheduled by delete_atomically() and make their
|
|
// futures complete (with an atomic_deletion_cancelled exception).
|
|
void cancel_atomic_deletions();
|
|
|
|
class components_writer {
|
|
sstable& _sst;
|
|
const schema& _schema;
|
|
file_writer& _out;
|
|
file_writer _index;
|
|
uint64_t _max_sstable_size;
|
|
bool _tombstone_written;
|
|
// Remember first and last keys, which we need for the summary file.
|
|
stdx::optional<key> _first_key, _last_key;
|
|
stdx::optional<key> _partition_key;
|
|
private:
|
|
size_t get_offset();
|
|
file_writer index_file_writer(sstable& sst, const io_priority_class& pc);
|
|
void ensure_tombstone_is_written() {
|
|
if (!_tombstone_written) {
|
|
consume(tombstone());
|
|
}
|
|
}
|
|
public:
|
|
components_writer(sstable& sst, const schema& s, file_writer& out, uint64_t estimated_partitions, const sstable_writer_config&, const io_priority_class& pc);
|
|
|
|
void consume_new_partition(const dht::decorated_key& dk);
|
|
void consume(tombstone t);
|
|
stop_iteration consume(static_row&& sr);
|
|
stop_iteration consume(clustering_row&& cr);
|
|
stop_iteration consume(range_tombstone&& rt);
|
|
stop_iteration consume_end_of_partition();
|
|
void consume_end_of_stream();
|
|
};
|
|
|
|
class sstable_writer {
|
|
sstable& _sst;
|
|
const schema& _schema;
|
|
const io_priority_class& _pc;
|
|
bool _backup;
|
|
bool _leave_unsealed;
|
|
bool _compression_enabled;
|
|
std::unique_ptr<file_writer> _writer;
|
|
stdx::optional<components_writer> _components_writer;
|
|
private:
|
|
void prepare_file_writer();
|
|
void finish_file_writer();
|
|
public:
|
|
sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
|
|
const sstable_writer_config&, const io_priority_class& pc);
|
|
~sstable_writer();
|
|
sstable_writer(sstable_writer&& o) : _sst(o._sst), _schema(o._schema), _pc(o._pc), _backup(o._backup),
|
|
_leave_unsealed(o._leave_unsealed), _compression_enabled(o._compression_enabled), _writer(std::move(o._writer)),
|
|
_components_writer(std::move(o._components_writer)) {}
|
|
void consume_new_partition(const dht::decorated_key& dk) { return _components_writer->consume_new_partition(dk); }
|
|
void consume(tombstone t) { _components_writer->consume(t); }
|
|
stop_iteration consume(static_row&& sr) { return _components_writer->consume(std::move(sr)); }
|
|
stop_iteration consume(clustering_row&& cr) { return _components_writer->consume(std::move(cr)); }
|
|
stop_iteration consume(range_tombstone&& rt) { return _components_writer->consume(std::move(rt)); }
|
|
stop_iteration consume_end_of_partition() { return _components_writer->consume_end_of_partition(); }
|
|
void consume_end_of_stream();
|
|
};
|
|
|
|
// contains data for loading a sstable using components shared by a single shard;
|
|
// can be moved across shards
|
|
struct foreign_sstable_open_info {
|
|
foreign_ptr<lw_shared_ptr<sstable::shareable_components>> components;
|
|
std::vector<shard_id> owners;
|
|
seastar::file_handle data;
|
|
seastar::file_handle index;
|
|
};
|
|
|
|
// can only be used locally
|
|
struct sstable_open_info {
|
|
lw_shared_ptr<sstable::shareable_components> components;
|
|
std::vector<shard_id> owners;
|
|
file data;
|
|
file index;
|
|
};
|
|
|
|
// Defined out of line. NOTE(review): presumably sets up the metrics of the
// sstables module -- confirm what is registered and when this must be called.
void init_metrics();
|
|
|
|
}
|