mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-26 11:30:36 +00:00
"This series cleans the streaming_histogram and the estimated histogram that were imported from origin, and then uses them to obtain the estimated min and max row counts in the API."
418 lines · 16 KiB · C++
/*
|
|
* Copyright (C) 2015 Cloudius Systems, Ltd.
|
|
*
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "core/file.hh"
|
|
#include "core/fstream.hh"
|
|
#include "core/future.hh"
|
|
#include "core/sstring.hh"
|
|
#include "core/enum.hh"
|
|
#include "core/shared_ptr.hh"
|
|
#include "core/distributed.hh"
|
|
#include <unordered_set>
|
|
#include <unordered_map>
|
|
#include "types.hh"
|
|
#include "core/enum.hh"
|
|
#include "compress.hh"
|
|
#include "row.hh"
|
|
#include "database.hh"
|
|
#include "dht/i_partitioner.hh"
|
|
#include "schema.hh"
|
|
#include "mutation.hh"
|
|
#include "utils/i_filter.hh"
|
|
#include "core/stream.hh"
|
|
#include "writer.hh"
|
|
#include "metadata_collector.hh"
|
|
#include "filter.hh"
|
|
|
|
namespace sstables {
|
|
|
|
// data_consume_context is an object returned by sstable::data_consume_rows()
|
|
// which allows knowing when the consumer stops reading, and starting it again
|
|
// (e.g., when the consumer wants to stop after every sstable row).
|
|
//
|
|
// The read() method initiates reading into the consumer, and continues to
|
|
// read and feed data into the consumer until one of the consumer's callbacks
|
|
// requests to stop, or until we reach the end of the data range originally
|
|
// requested. read() returns a future which completes when reading stopped.
|
|
// If we're at the end-of-file, the read may complete without reading anything
|
|
// so it's the consumer class's task to check if anything was consumed.
|
|
// Note:
|
|
// The caller MUST ensure that between calling read() on this object,
|
|
// and the time the returned future is completed, the object lives on.
|
|
// Moreover, the sstable object used for the sstable::data_consume_rows()
|
|
// call which created this data_consume_context, must also be kept alive.
|
|
class data_consume_context {
|
|
class impl;
|
|
std::unique_ptr<impl> _pimpl;
|
|
// This object can only be constructed by sstable::data_consume_rows()
|
|
data_consume_context(std::unique_ptr<impl>);
|
|
friend class sstable;
|
|
public:
|
|
future<> read();
|
|
// Define (as defaults) the destructor and move operations in the source
|
|
// file, so here we don't need to know the incomplete impl type.
|
|
~data_consume_context();
|
|
data_consume_context(data_consume_context&&);
|
|
data_consume_context& operator=(data_consume_context&&);
|
|
};
|
|
|
|
// mutation_reader is an object returned by sstable::read_rows() et al. which
|
|
// allows getting each sstable row in sequence, in mutation format.
|
|
//
|
|
// The read() method reads the next mutation, returning a disengaged optional
|
|
// on EOF. As usual for future-returning functions, a caller which starts a
|
|
// read() MUST ensure that the mutation_reader object continues to live until
|
|
// the returned future is fulfilled. Moreover, the sstable whose read_rows()
|
|
// method was used to open this mutation_reader must also live between the
|
|
// time read() is called and its future ends.
|
|
// As soon as the future returned by read() completes, the object may safely
|
|
// be deleted. In other words, when the read() future is fulfilled, we can
|
|
// be sure there are no background tasks still scheduled.
|
|
class mutation_reader {
|
|
class impl;
|
|
std::unique_ptr<impl> _pimpl;
|
|
// This object can only be constructed by sstable::read_rows() et al.
|
|
mutation_reader(std::unique_ptr<impl>);
|
|
friend class sstable;
|
|
public:
|
|
future<mutation_opt> read();
|
|
// Define (as defaults) the destructor and move operations in the source
|
|
// file, so here we don't need to know the incomplete impl type.
|
|
~mutation_reader();
|
|
mutation_reader(mutation_reader&&);
|
|
mutation_reader& operator=(mutation_reader&&);
|
|
};
|
|
|
|
class key;
|
|
|
|
class malformed_sstable_exception : public std::exception {
|
|
sstring _msg;
|
|
public:
|
|
malformed_sstable_exception(sstring s) : _msg(s) {}
|
|
const char *what() const noexcept {
|
|
return _msg.c_str();
|
|
}
|
|
};
|
|
|
|
using index_list = std::vector<index_entry>;
|
|
|
|
class sstable {
|
|
public:
|
|
enum class component_type {
|
|
Index,
|
|
CompressionInfo,
|
|
Data,
|
|
TOC,
|
|
Summary,
|
|
Digest,
|
|
CRC,
|
|
Filter,
|
|
Statistics,
|
|
};
|
|
enum class version_types { ka, la };
|
|
enum class format_types { big };
|
|
public:
|
|
sstable(sstring ks, sstring cf, sstring dir, unsigned long generation, version_types v, format_types f, gc_clock::time_point now = gc_clock::now())
|
|
: _ks(std::move(ks))
|
|
, _cf(std::move(cf))
|
|
, _dir(std::move(dir))
|
|
, _generation(generation)
|
|
, _version(v)
|
|
, _format(f)
|
|
, _filter_tracker(make_lw_shared<distributed<filter_tracker>>())
|
|
, _now(now)
|
|
{ }
|
|
sstable& operator=(const sstable&) = delete;
|
|
sstable(const sstable&) = delete;
|
|
sstable(sstable&&) = default;
|
|
|
|
~sstable();
|
|
|
|
// Read one or few rows at the given byte range from the data file,
|
|
// feeding them into the consumer. This function reads the entire given
|
|
// byte range at once into memory, so it should not be used for iterating
|
|
// over all the rows in the data file (see the next function for that.
|
|
// The function returns a future which completes after all the data has
|
|
// been fed into the consumer. The caller needs to ensure the "consumer"
|
|
// object lives until then (e.g., using the do_with() idiom).
|
|
future<> data_consume_rows_at_once(row_consumer& consumer, uint64_t pos, uint64_t end);
|
|
|
|
|
|
// data_consume_rows() iterates over rows in the data file from
|
|
// a particular range, feeding them into the consumer. The iteration is
|
|
// done as efficiently as possible - reading only the data file (not the
|
|
// summary or index files) and reading data in batches.
|
|
//
|
|
// The consumer object may request the iteration to stop before reaching
|
|
// the end of the requested data range (e.g. stop after each sstable row).
|
|
// A context object is returned which allows to resume this consumption:
|
|
// This context's read() method requests that consumption begins, and
|
|
// returns a future which will be resolved when it ends (because the
|
|
// consumer asked to stop, or the data range ended). Only after the
|
|
// returned future is resolved, may read() be called again to consume
|
|
// more.
|
|
// The caller must ensure (e.g., using do_with()) that the context object,
|
|
// as well as the sstable, remains alive as long as a read() is in
|
|
// progress (i.e., returned a future which hasn't completed yet).
|
|
data_consume_context data_consume_rows(row_consumer& consumer, uint64_t start, uint64_t end);
|
|
|
|
// Like data_consume_rows() with bounds, but iterates over whole range
|
|
data_consume_context data_consume_rows(row_consumer& consumer);
|
|
|
|
static component_type component_from_sstring(sstring& s);
|
|
static version_types version_from_sstring(sstring& s);
|
|
static format_types format_from_sstring(sstring& s);
|
|
static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, unsigned long generation,
|
|
format_types format, component_type component, bool temporary = false);
|
|
|
|
future<> load();
|
|
|
|
void set_generation(unsigned long generation) {
|
|
_generation = generation;
|
|
}
|
|
unsigned long generation() const {
|
|
return _generation;
|
|
}
|
|
|
|
future<mutation_opt> read_row(schema_ptr schema, const key& k);
|
|
/**
|
|
* @param schema a schema_ptr object describing this table
|
|
* @param min the minimum token we want to search for (inclusive)
|
|
* @param max the maximum token we want to search for (inclusive)
|
|
* @return a mutation_reader object that can be used to iterate over
|
|
* mutations.
|
|
*/
|
|
mutation_reader read_range_rows(schema_ptr schema,
|
|
const dht::token& min, const dht::token& max);
|
|
|
|
// Returns a mutation_reader for given range of partitions
|
|
mutation_reader read_range_rows(schema_ptr schema, const query::partition_range& range);
|
|
|
|
// read_rows() returns each of the rows in the sstable, in sequence,
|
|
// converted to a "mutation" data structure.
|
|
// This function is implemented efficiently - doing buffered, sequential
|
|
// read of the data file (no need to access the index file).
|
|
// A "mutation_reader" object is returned with which the caller can
|
|
// fetch mutations in sequence, and allows stop iteration any time
|
|
// after getting each row.
|
|
//
|
|
// The caller must ensure (e.g., using do_with()) that the context object,
|
|
// as well as the sstable, remains alive as long as a read() is in
|
|
// progress (i.e., returned a future which hasn't completed yet).
|
|
mutation_reader read_rows(schema_ptr schema);
|
|
|
|
// Write sstable components from a memtable.
|
|
future<> write_components(const memtable& mt);
|
|
future<> write_components(::mutation_reader mr,
|
|
uint64_t estimated_partitions, schema_ptr schema);
|
|
|
|
uint64_t get_estimated_key_count() const {
|
|
return ((uint64_t)_summary.header.size_at_full_sampling + 1) *
|
|
_summary.header.min_index_interval;
|
|
}
|
|
|
|
// mark_for_deletion() specifies that the on-disk files for this sstable
|
|
// should be deleted as soon as the in-memory object is destructed.
|
|
void mark_for_deletion() {
|
|
_marked_for_deletion = true;
|
|
}
|
|
|
|
void add_ancestor(int generation) {
|
|
_collector.add_ancestor(generation);
|
|
}
|
|
|
|
// Returns true iff this sstable contains data which belongs to many shards.
|
|
bool is_shared() {
|
|
return true; // FIXME: set to false for sstables created by compaction process
|
|
}
|
|
|
|
uint64_t data_size();
|
|
uint64_t index_size() {
|
|
return _index_file_size;
|
|
}
|
|
|
|
// Returns the total bytes of all components.
|
|
future<uint64_t> bytes_on_disk();
|
|
|
|
partition_key get_first_partition_key(const schema& s) const;
|
|
partition_key get_last_partition_key(const schema& s) const;
|
|
|
|
const sstring get_filename() {
|
|
return filename(component_type::Data);
|
|
}
|
|
private:
|
|
void do_write_components(::mutation_reader mr,
|
|
uint64_t estimated_partitions, schema_ptr schema, file_writer& out);
|
|
void prepare_write_components(::mutation_reader mr,
|
|
uint64_t estimated_partitions, schema_ptr schema);
|
|
static std::unordered_map<version_types, sstring, enum_hash<version_types>> _version_string;
|
|
static std::unordered_map<format_types, sstring, enum_hash<format_types>> _format_string;
|
|
static std::unordered_map<component_type, sstring, enum_hash<component_type>> _component_map;
|
|
|
|
std::unordered_set<component_type, enum_hash<component_type>> _components;
|
|
|
|
compression _compression;
|
|
utils::filter_ptr _filter;
|
|
summary _summary;
|
|
statistics _statistics;
|
|
// NOTE: _collector and _c_stats are used to generation of statistics file
|
|
// when writing a new sstable.
|
|
metadata_collector _collector;
|
|
column_stats _c_stats;
|
|
file _index_file;
|
|
file _data_file;
|
|
uint64_t _data_file_size;
|
|
uint64_t _index_file_size;
|
|
uint64_t _bytes_on_disk = 0;
|
|
|
|
sstring _ks;
|
|
sstring _cf;
|
|
sstring _dir;
|
|
unsigned long _generation = 0;
|
|
version_types _version;
|
|
format_types _format;
|
|
|
|
lw_shared_ptr<distributed<filter_tracker>> _filter_tracker;
|
|
|
|
bool _marked_for_deletion = false;
|
|
|
|
gc_clock::time_point _now;
|
|
|
|
const bool has_component(component_type f);
|
|
|
|
const sstring filename(component_type f);
|
|
const sstring temporary_filename(component_type f);
|
|
|
|
template <sstable::component_type Type, typename T>
|
|
future<> read_simple(T& comp);
|
|
|
|
template <sstable::component_type Type, typename T>
|
|
void write_simple(T& comp);
|
|
|
|
future<> read_toc();
|
|
void write_toc();
|
|
|
|
future<> read_compression();
|
|
void write_compression();
|
|
|
|
future<> read_filter();
|
|
|
|
void write_filter();
|
|
|
|
future<> read_summary() {
|
|
return read_simple<component_type::Summary>(_summary);
|
|
}
|
|
void write_summary() {
|
|
write_simple<component_type::Summary>(_summary);
|
|
}
|
|
|
|
future<> read_statistics();
|
|
void write_statistics();
|
|
|
|
future<> open_data();
|
|
future<> create_data();
|
|
|
|
future<index_list> read_indexes(uint64_t position, uint64_t quantity);
|
|
|
|
future<index_list> read_indexes(uint64_t position) {
|
|
return read_indexes(position, _summary.header.sampling_level);
|
|
}
|
|
|
|
input_stream<char> data_stream_at(uint64_t pos);
|
|
|
|
// Read exactly the specific byte range from the data file (after
|
|
// uncompression, if the file is compressed). This can be used to read
|
|
// a specific row from the data file (its position and length can be
|
|
// determined using the index file).
|
|
// This function is intended (and optimized for) random access, not
|
|
// for iteration through all the rows.
|
|
future<temporary_buffer<char>> data_read(uint64_t pos, size_t len);
|
|
|
|
future<uint64_t> data_end_position(uint64_t summary_idx, uint64_t index_idx, const index_list& il);
|
|
|
|
// Returns data file position for an entry right after all entries mapped by given summary page.
|
|
future<uint64_t> data_end_position(uint64_t summary_idx);
|
|
|
|
template <typename T>
|
|
int binary_search(const T& entries, const key& sk, const dht::token& token);
|
|
|
|
template <typename T>
|
|
int binary_search(const T& entries, const key& sk) {
|
|
return binary_search(entries, sk, dht::global_partitioner().get_token(key_view(sk)));
|
|
}
|
|
|
|
// Returns position in the data file of the first entry which is not
|
|
// smaller than the supplied ring_position. If no such entry exists, a
|
|
// position right after all entries is returned.
|
|
//
|
|
// The ring_position doesn't have to survive deferring.
|
|
future<uint64_t> lower_bound(schema_ptr, const dht::ring_position&);
|
|
|
|
// Returns position in the data file of the first partition which is
|
|
// greater than the supplied ring_position. If no such entry exists, a
|
|
// position right after all entries is returned.
|
|
//
|
|
// The ring_position doesn't have to survive deferring.
|
|
future<uint64_t> upper_bound(schema_ptr, const dht::ring_position&);
|
|
|
|
future<summary_entry&> read_summary_entry(size_t i);
|
|
|
|
// FIXME: pending on Bloom filter implementation
|
|
bool filter_has_key(const key& key) { return _filter->is_present(bytes_view(key)); }
|
|
bool filter_has_key(const schema& s, const dht::decorated_key& dk) { return filter_has_key(key::from_partition_key(s, dk._key)); }
|
|
|
|
// NOTE: functions used to generate sstable components.
|
|
void write_row_marker(file_writer& out, const rows_entry& clustered_row, const composite& clustering_key);
|
|
void write_clustered_row(file_writer& out, const schema& schema, const rows_entry& clustered_row);
|
|
void write_static_row(file_writer& out, const schema& schema, const row& static_row);
|
|
void write_cell(file_writer& out, atomic_cell_view cell);
|
|
void write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite_marker m = composite_marker::none);
|
|
void write_column_name(file_writer& out, bytes_view column_names);
|
|
void write_range_tombstone(file_writer& out, const composite& clustering_prefix, std::vector<bytes_view> suffix, const tombstone t);
|
|
void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation::view collection);
|
|
public:
|
|
bool filter_has_key(const schema& s, const partition_key& key) {
|
|
return filter_has_key(key::from_partition_key(s, key));
|
|
}
|
|
const stats_metadata& get_stats_metadata() const {
|
|
auto entry = _statistics.contents.find(metadata_type::Stats);
|
|
if (entry == _statistics.contents.end()) {
|
|
throw std::runtime_error("Stats metadata not available");
|
|
}
|
|
auto& p = entry->second;
|
|
if (!p) {
|
|
throw std::runtime_error("Statistics is malformed");
|
|
}
|
|
const stats_metadata& s = *static_cast<stats_metadata *>(p.get());
|
|
return s;
|
|
}
|
|
|
|
// Allow the test cases from sstable_test.cc to test private methods. We use
|
|
// a placeholder to avoid cluttering this class too much. The sstable_test class
|
|
// will then re-export as public every method it needs.
|
|
friend class test;
|
|
};
|
|
|
|
using shared_sstable = lw_shared_ptr<sstable>;
|
|
|
|
struct entry_descriptor {
|
|
sstring ks;
|
|
sstring cf;
|
|
sstable::version_types version;
|
|
unsigned long generation;
|
|
sstable::format_types format;
|
|
sstable::component_type component;
|
|
|
|
static entry_descriptor make_descriptor(sstring fname);
|
|
|
|
entry_descriptor(sstring ks, sstring cf, sstable::version_types version,
|
|
unsigned long generation, sstable::format_types format,
|
|
sstable::component_type component)
|
|
: ks(ks), cf(cf), version(version), generation(generation), format(format), component(component) {}
|
|
};
|
|
}
|