Files
scylladb/sstables/sstables.hh
Avi Kivity 7a14bcd66e Merge "API: add get estimated row size histogram to column family" from Amnon
"This series cleans the streaming_histogram and the estimated histogram that
were imported from origin; it then uses them to get the estimated min and max row
size estimates in the API."
2015-08-16 17:31:23 +03:00

418 lines
16 KiB
C++

/*
* Copyright (C) 2015 Cloudius Systems, Ltd.
*
*/
#pragma once
#include "core/file.hh"
#include "core/fstream.hh"
#include "core/future.hh"
#include "core/sstring.hh"
#include "core/enum.hh"
#include "core/shared_ptr.hh"
#include "core/distributed.hh"
#include <unordered_set>
#include <unordered_map>
#include "types.hh"
#include "core/enum.hh"
#include "compress.hh"
#include "row.hh"
#include "database.hh"
#include "dht/i_partitioner.hh"
#include "schema.hh"
#include "mutation.hh"
#include "utils/i_filter.hh"
#include "core/stream.hh"
#include "writer.hh"
#include "metadata_collector.hh"
#include "filter.hh"
namespace sstables {
// A data_consume_context is the handle handed back by
// sstable::data_consume_rows(). It lets the caller find out when the consumer
// stopped reading and restart it afterwards (for example, when the consumer
// wants to pause after every sstable row).
//
// Calling read() starts feeding data into the consumer and keeps doing so
// until either one of the consumer's callbacks asks to stop, or the end of
// the originally requested data range is reached. The future returned by
// read() completes once reading has stopped. At end-of-file a read may
// complete without having consumed anything, so it is up to the consumer
// class to check whether anything was actually consumed.
//
// Lifetime requirements:
// - This object MUST stay alive from the moment read() is called until the
//   future it returned has completed.
// - The sstable on which sstable::data_consume_rows() was called to create
//   this context must be kept alive as well.
class data_consume_context {
public:
    future<> read();
    // The destructor and the move operations are only declared here and are
    // defined (as defaults) in the source file, so that this header does not
    // need the complete impl type.
    ~data_consume_context();
    data_consume_context(data_consume_context&&);
    data_consume_context& operator=(data_consume_context&&);
private:
    class impl;
    std::unique_ptr<impl> _pimpl;
    // Construction is reserved for sstable::data_consume_rows(), hence the
    // private constructor and the friend declaration below.
    data_consume_context(std::unique_ptr<impl>);
    friend class sstable;
};
// A mutation_reader is what sstable::read_rows() (and its siblings) return.
// It yields the sstable rows one after another, each in mutation format.
//
// Every call to read() produces the next mutation; a disengaged optional
// signals EOF. As with any future-returning function, whoever calls read()
// MUST keep the mutation_reader alive until the returned future is fulfilled.
// The sstable whose read_rows() method opened this reader must likewise stay
// alive from the read() call until its future ends.
//
// Once the future returned by read() has completed the object may be deleted
// safely: at that point no background tasks are still scheduled.
class mutation_reader {
public:
    future<mutation_opt> read();
    // The destructor and the move operations are only declared here and are
    // defined (as defaults) in the source file, so that this header does not
    // need the complete impl type.
    ~mutation_reader();
    mutation_reader(mutation_reader&&);
    mutation_reader& operator=(mutation_reader&&);
private:
    class impl;
    std::unique_ptr<impl> _pimpl;
    // Construction is reserved for sstable::read_rows() et al., hence the
    // private constructor and the friend declaration below.
    mutation_reader(std::unique_ptr<impl>);
    friend class sstable;
};
class key;
// Exception carrying a free-form message describing why an sstable was
// considered malformed. The message is owned by the exception object, so the
// pointer returned by what() stays valid for as long as the exception lives.
class malformed_sstable_exception : public std::exception {
    sstring _msg;
public:
    // Takes the message by value and moves it into place, so callers passing
    // a temporary do not pay for an extra copy.
    malformed_sstable_exception(sstring s) : _msg(std::move(s)) {}
    // Overrides std::exception::what(); 'override' makes the compiler verify
    // that the signature really matches the base class virtual.
    const char *what() const noexcept override {
        return _msg.c_str();
    }
};
using index_list = std::vector<index_entry>;
// In-memory representation of a single sstable: its identity (keyspace,
// column family, directory, generation, version, format), the components
// that were read from or are to be written to disk, and the operations to
// load, read and write it.
//
// The class is movable but not copyable.
class sstable {
public:
    // The kinds of component files that make up an sstable.
    enum class component_type {
        Index,
        CompressionInfo,
        Data,
        TOC,
        Summary,
        Digest,
        CRC,
        Filter,
        Statistics,
    };
    enum class version_types { ka, la };
    enum class format_types { big };
public:
    // Records the identity of the sstable and creates its filter tracker.
    // 'now' defaults to the current gc_clock time.
    sstable(sstring ks, sstring cf, sstring dir, unsigned long generation, version_types v, format_types f, gc_clock::time_point now = gc_clock::now())
        : _ks(std::move(ks))
        , _cf(std::move(cf))
        , _dir(std::move(dir))
        , _generation(generation)
        , _version(v)
        , _format(f)
        , _filter_tracker(make_lw_shared<distributed<filter_tracker>>())
        , _now(now)
    { }
    sstable& operator=(const sstable&) = delete;
    sstable(const sstable&) = delete;
    sstable(sstable&&) = default;

    ~sstable();

    // Read one or few rows at the given byte range from the data file,
    // feeding them into the consumer. This function reads the entire given
    // byte range at once into memory, so it should not be used for iterating
    // over all the rows in the data file (see the next function for that).
    // The function returns a future which completes after all the data has
    // been fed into the consumer. The caller needs to ensure the "consumer"
    // object lives until then (e.g., using the do_with() idiom).
    future<> data_consume_rows_at_once(row_consumer& consumer, uint64_t pos, uint64_t end);

    // data_consume_rows() iterates over rows in the data file from
    // a particular range, feeding them into the consumer. The iteration is
    // done as efficiently as possible - reading only the data file (not the
    // summary or index files) and reading data in batches.
    //
    // The consumer object may request the iteration to stop before reaching
    // the end of the requested data range (e.g. stop after each sstable row).
    // A context object is returned which allows to resume this consumption:
    // This context's read() method requests that consumption begins, and
    // returns a future which will be resolved when it ends (because the
    // consumer asked to stop, or the data range ended). Only after the
    // returned future is resolved, may read() be called again to consume
    // more.
    // The caller must ensure (e.g., using do_with()) that the context object,
    // as well as the sstable, remains alive as long as a read() is in
    // progress (i.e., returned a future which hasn't completed yet).
    data_consume_context data_consume_rows(row_consumer& consumer, uint64_t start, uint64_t end);

    // Like data_consume_rows() with bounds, but iterates over whole range
    data_consume_context data_consume_rows(row_consumer& consumer);

    // Conversions from the textual form of a component / version / format
    // to the corresponding enumerator.
    static component_type component_from_sstring(sstring& s);
    static version_types version_from_sstring(sstring& s);
    static format_types format_from_sstring(sstring& s);
    // Builds the file name of the given component from the sstable identity.
    static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, unsigned long generation,
                                  format_types format, component_type component, bool temporary = false);

    future<> load();

    void set_generation(unsigned long generation) {
        _generation = generation;
    }
    unsigned long generation() const {
        return _generation;
    }

    // Reads the row stored under key 'k'; the result is a mutation_opt.
    future<mutation_opt> read_row(schema_ptr schema, const key& k);
    /**
     * @param schema a schema_ptr object describing this table
     * @param min the minimum token we want to search for (inclusive)
     * @param max the maximum token we want to search for (inclusive)
     * @return a mutation_reader object that can be used to iterate over
     * mutations.
     */
    mutation_reader read_range_rows(schema_ptr schema,
            const dht::token& min, const dht::token& max);

    // Returns a mutation_reader for given range of partitions
    mutation_reader read_range_rows(schema_ptr schema, const query::partition_range& range);

    // read_rows() returns each of the rows in the sstable, in sequence,
    // converted to a "mutation" data structure.
    // This function is implemented efficiently - doing buffered, sequential
    // read of the data file (no need to access the index file).
    // A "mutation_reader" object is returned with which the caller can
    // fetch mutations in sequence, and allows stop iteration any time
    // after getting each row.
    //
    // The caller must ensure (e.g., using do_with()) that the context object,
    // as well as the sstable, remains alive as long as a read() is in
    // progress (i.e., returned a future which hasn't completed yet).
    mutation_reader read_rows(schema_ptr schema);

    // Write sstable components from a memtable.
    future<> write_components(const memtable& mt);
    // Write sstable components from the mutations produced by 'mr'.
    future<> write_components(::mutation_reader mr,
            uint64_t estimated_partitions, schema_ptr schema);

    // Estimates the number of keys from the summary header:
    // (size_at_full_sampling + 1) * min_index_interval. The first operand is
    // widened to 64 bits before the arithmetic is performed.
    uint64_t get_estimated_key_count() const {
        return (static_cast<uint64_t>(_summary.header.size_at_full_sampling) + 1) *
                _summary.header.min_index_interval;
    }

    // mark_for_deletion() specifies that the on-disk files for this sstable
    // should be deleted as soon as the in-memory object is destructed.
    void mark_for_deletion() {
        _marked_for_deletion = true;
    }

    // Forwards the ancestor generation to the metadata collector.
    void add_ancestor(int generation) {
        _collector.add_ancestor(generation);
    }

    // Returns true iff this sstable contains data which belongs to many shards.
    bool is_shared() const {
        return true; // FIXME: set to false for sstables created by compaction process
    }

    uint64_t data_size();

    // Returns the recorded size of the index file; 0 until it has been set.
    uint64_t index_size() const {
        return _index_file_size;
    }

    // Returns the total bytes of all components.
    future<uint64_t> bytes_on_disk();

    partition_key get_first_partition_key(const schema& s) const;
    partition_key get_last_partition_key(const schema& s) const;

    // Returns the file name of the Data component of this sstable.
    const sstring get_filename() {
        return filename(component_type::Data);
    }
private:
    void do_write_components(::mutation_reader mr,
            uint64_t estimated_partitions, schema_ptr schema, file_writer& out);
    void prepare_write_components(::mutation_reader mr,
            uint64_t estimated_partitions, schema_ptr schema);
    // Lookup tables mapping each enumerator to its textual form.
    static std::unordered_map<version_types, sstring, enum_hash<version_types>> _version_string;
    static std::unordered_map<format_types, sstring, enum_hash<format_types>> _format_string;
    static std::unordered_map<component_type, sstring, enum_hash<component_type>> _component_map;

    std::unordered_set<component_type, enum_hash<component_type>> _components;

    compression _compression;
    utils::filter_ptr _filter;
    summary _summary;
    statistics _statistics;
    // NOTE: _collector and _c_stats are used to generate the statistics file
    // when writing a new sstable.
    metadata_collector _collector;
    column_stats _c_stats;
    file _index_file;
    file _data_file;
    // Sizes start at 0 so that accessors such as index_size() never read an
    // indeterminate value before the sizes have been filled in.
    uint64_t _data_file_size = 0;
    uint64_t _index_file_size = 0;
    uint64_t _bytes_on_disk = 0;

    sstring _ks;
    sstring _cf;
    sstring _dir;
    unsigned long _generation = 0;
    version_types _version;
    format_types _format;

    lw_shared_ptr<distributed<filter_tracker>> _filter_tracker;

    bool _marked_for_deletion = false;

    gc_clock::time_point _now;

    // NOTE(review): the top-level const on this return type has no effect for
    // callers; it is kept so that the declaration keeps matching the
    // out-of-line definition.
    const bool has_component(component_type f);

    const sstring filename(component_type f);
    const sstring temporary_filename(component_type f);

    // Generic read/write of a component whose type is selected at compile
    // time through the 'Type' template argument.
    template <sstable::component_type Type, typename T>
    future<> read_simple(T& comp);

    template <sstable::component_type Type, typename T>
    void write_simple(T& comp);

    future<> read_toc();
    void write_toc();

    future<> read_compression();
    void write_compression();

    future<> read_filter();

    void write_filter();

    // The summary is handled through the generic read_simple()/write_simple()
    // helpers.
    future<> read_summary() {
        return read_simple<component_type::Summary>(_summary);
    }
    void write_summary() {
        write_simple<component_type::Summary>(_summary);
    }

    future<> read_statistics();
    void write_statistics();

    future<> open_data();
    future<> create_data();

    future<index_list> read_indexes(uint64_t position, uint64_t quantity);

    // Convenience overload: uses the summary header's sampling_level as the
    // quantity.
    future<index_list> read_indexes(uint64_t position) {
        return read_indexes(position, _summary.header.sampling_level);
    }

    input_stream<char> data_stream_at(uint64_t pos);

    // Read exactly the specific byte range from the data file (after
    // uncompression, if the file is compressed). This can be used to read
    // a specific row from the data file (its position and length can be
    // determined using the index file).
    // This function is intended (and optimized for) random access, not
    // for iteration through all the rows.
    future<temporary_buffer<char>> data_read(uint64_t pos, size_t len);

    future<uint64_t> data_end_position(uint64_t summary_idx, uint64_t index_idx, const index_list& il);

    // Returns data file position for an entry right after all entries mapped by given summary page.
    future<uint64_t> data_end_position(uint64_t summary_idx);

    template <typename T>
    int binary_search(const T& entries, const key& sk, const dht::token& token);

    // Convenience overload: obtains the token for 'sk' from the global
    // partitioner and forwards to the three-argument version.
    template <typename T>
    int binary_search(const T& entries, const key& sk) {
        return binary_search(entries, sk, dht::global_partitioner().get_token(key_view(sk)));
    }

    // Returns position in the data file of the first entry which is not
    // smaller than the supplied ring_position. If no such entry exists, a
    // position right after all entries is returned.
    //
    // The ring_position doesn't have to survive deferring.
    future<uint64_t> lower_bound(schema_ptr, const dht::ring_position&);

    // Returns position in the data file of the first partition which is
    // greater than the supplied ring_position. If no such entry exists, a
    // position right after all entries is returned.
    //
    // The ring_position doesn't have to survive deferring.
    future<uint64_t> upper_bound(schema_ptr, const dht::ring_position&);

    future<summary_entry&> read_summary_entry(size_t i);

    // FIXME: pending on Bloom filter implementation
    bool filter_has_key(const key& key) { return _filter->is_present(bytes_view(key)); }

    bool filter_has_key(const schema& s, const dht::decorated_key& dk) { return filter_has_key(key::from_partition_key(s, dk._key)); }

    // NOTE: functions used to generate sstable components.
    void write_row_marker(file_writer& out, const rows_entry& clustered_row, const composite& clustering_key);
    void write_clustered_row(file_writer& out, const schema& schema, const rows_entry& clustered_row);
    void write_static_row(file_writer& out, const schema& schema, const row& static_row);
    void write_cell(file_writer& out, atomic_cell_view cell);
    void write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite_marker m = composite_marker::none);
    void write_column_name(file_writer& out, bytes_view column_names);
    void write_range_tombstone(file_writer& out, const composite& clustering_prefix, std::vector<bytes_view> suffix, const tombstone t);
    void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation::view collection);
public:
    // Converts the partition key to an sstable key and asks the filter
    // whether it is present.
    bool filter_has_key(const schema& s, const partition_key& key) {
        return filter_has_key(key::from_partition_key(s, key));
    }

    // Returns the Stats entry of the statistics component.
    // Throws std::runtime_error if there is no such entry or if the entry
    // holds a null pointer.
    const stats_metadata& get_stats_metadata() const {
        auto entry = _statistics.contents.find(metadata_type::Stats);
        if (entry == _statistics.contents.end()) {
            throw std::runtime_error("Stats metadata not available");
        }
        auto& p = entry->second;
        if (!p) {
            throw std::runtime_error("Statistics is malformed");
        }
        const stats_metadata& s = *static_cast<stats_metadata *>(p.get());
        return s;
    }

    // Allow the test cases from sstable_test.cc to test private methods. We use
    // a placeholder to avoid cluttering this class too much. The sstable_test class
    // will then re-export as public every method it needs.
    friend class test;
};
using shared_sstable = lw_shared_ptr<sstable>;
// Plain description of one sstable component file: which keyspace and column
// family it belongs to, plus its version, generation, format and component
// type.
struct entry_descriptor {
    sstring ks;
    sstring cf;
    sstable::version_types version;
    unsigned long generation;
    sstable::format_types format;
    sstable::component_type component;

    // Builds a descriptor from a file name.
    static entry_descriptor make_descriptor(sstring fname);

    // The string arguments are taken by value and moved into the members, so
    // passing temporaries does not incur an additional copy. Inside the
    // initializer list the bare names refer to the parameters, which shadow
    // the members of the same name.
    entry_descriptor(sstring ks, sstring cf, sstable::version_types version,
                     unsigned long generation, sstable::format_types format,
                     sstable::component_type component)
        : ks(std::move(ks)), cf(std::move(cf)), version(version), generation(generation), format(format), component(component) {}
};
}