/*
* Copyright (C) 2014 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "log.hh"
#include "lister.hh"
#include "database.hh"
#include "unimplemented.hh"
#include "core/future-util.hh"
#include "db/commitlog/commitlog_entry.hh"
#include "db/system_keyspace.hh"
#include "db/consistency_level.hh"
#include "db/commitlog/commitlog.hh"
#include "db/config.hh"
#include "to_string.hh"
#include "query-result-writer.hh"
#include "cql3/column_identifier.hh"
#include "core/seastar.hh"
#include
#include
#include
#include
#include
#include
#include "sstables/sstables.hh"
#include "sstables/compaction.hh"
#include "sstables/remove.hh"
#include
#include
#include "locator/simple_snitch.hh"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "frozen_mutation.hh"
#include "mutation_partition_applier.hh"
#include "core/do_with.hh"
#include "service/migration_manager.hh"
#include "service/storage_service.hh"
#include "message/messaging_service.hh"
#include "mutation_query.hh"
#include
#include
#include "utils/latency.hh"
#include "schema_registry.hh"
#include "service/priority_manager.hh"
#include "cell_locking.hh"
#include "db/view/row_locking.hh"
#include "view_info.hh"
#include "memtable-sstable.hh"
#include "db/schema_tables.hh"
#include "db/query_context.hh"
#include "sstables/compaction_manager.hh"
#include "sstables/compaction_backlog_manager.hh"
#include "sstables/progress_monitor.hh"
#include "checked-file-impl.hh"
#include "disk-error-handler.hh"
#include "db/timeout_clock.hh"
using namespace std::chrono_literals;
// Logger used by the code in this file; it is registered under the name "database".
logging::logger dblog("database");
namespace {

// Chooses the sstable version used for newly created sstables: "la" when the
// local storage service reports that the cluster supports it, "ka" otherwise.
sstables::sstable::version_types get_highest_supported_format() {
    using version = sstables::sstable::version_types;
    const bool la_supported = service::get_local_storage_service().cluster_supports_la_sstable();
    return la_supported ? version::la : version::ka;
}

} /* anonymous namespace */
// Write monitor that deals with the sstable write permit and nothing else. It is
// meant for situations where the compaction manager should not be informed about
// backlogs (i.e., tests).
class permit_monitor : public sstables::write_monitor {
    sstable_write_permit _permit;
public:
    permit_monitor(sstable_write_permit&& permit)
        : _permit(std::move(permit))
    { }

    virtual void on_write_started(const sstables::writer_offset_tracker& t) override { }

    virtual void on_data_write_completed() override {
        // Replace the held permit with an unconditional one right away, so the
        // original permit does not outlive this continuation. The next flush has
        // to be able to start before the current one finishes; otherwise there
        // would be a period without significant disk activity while the current
        // SSTable is being sealed, the caches are being updated, etc.
        _permit = sstable_write_permit::unconditional();
    }

    virtual void on_write_completed() override { }

    virtual void on_flush_completed() override { }
};
// Write monitor for regular database flushes. On top of the permit management
// inherited from permit_monitor it reports write progress of the sstable to the
// backlog tracker of the compaction strategy.
class database_sstable_write_monitor : public permit_monitor, public backlog_write_progress_manager {
    sstables::shared_sstable _sst;
    compaction_manager& _compaction_manager;
    sstables::compaction_strategy& _compaction_strategy;
    // Non-null only between on_write_started() and on_data_write_completed().
    const sstables::writer_offset_tracker* _tracker = nullptr;
    // Last offset observed through _tracker before it was dropped.
    uint64_t _progress_seen = 0;
    api::timestamp_type _maximum_timestamp;
public:
    database_sstable_write_monitor(sstable_write_permit&& permit,
                                   sstables::shared_sstable sst,
                                   compaction_manager& manager,
                                   sstables::compaction_strategy& strategy,
                                   api::timestamp_type max_timestamp)
        : permit_monitor(std::move(permit))
        , _sst(std::move(sst))
        , _compaction_manager(manager)
        , _compaction_strategy(strategy)
        , _maximum_timestamp(max_timestamp)
    { }

    // Remembers the offset tracker and registers the sstable as partially
    // written with the backlog tracker, using this object as progress source.
    virtual void on_write_started(const sstables::writer_offset_tracker& t) override {
        _tracker = &t;
        auto& backlog = _compaction_strategy.get_backlog_tracker();
        backlog.register_partially_written_sstable(_sst, *this);
    }

    // Releases the permit (base class behaviour), then snapshots the final
    // offset and stops referring to the tracker.
    virtual void on_data_write_completed() override {
        permit_monitor::on_data_write_completed();
        const auto* finished = _tracker;
        _tracker = nullptr;
        _progress_seen = finished->offset;
    }

    // Asks the backlog tracker to revert whatever was charged for this sstable.
    void write_failed() {
        auto& backlog = _compaction_strategy.get_backlog_tracker();
        backlog.revert_charges(_sst);
    }

    // Live offset while the tracker is attached, last seen offset afterwards.
    virtual uint64_t written() const override {
        if (!_tracker) {
            return _progress_seen;
        }
        return _tracker->offset;
    }

    api::timestamp_type maximum_timestamp() const override {
        return _maximum_timestamp;
    }

    unsigned level() const override {
        return 0;
    }
};
// Names of the keyspaces that is_system_keyspace() reports as system keyspaces.
static const std::unordered_set system_keyspaces = {
db::system_keyspace::NAME, db::schema_tables::NAME
};
// Returns true when `name` is one of the entries of system_keyspaces.
bool is_system_keyspace(const sstring& name) {
    return system_keyspaces.count(name) != 0;
}
// Used for tests where the CF exists without a database object. We need to pass a valid
// dirty_memory manager in that case.
// Declared thread_local, so every thread gets its own instance.
thread_local dirty_memory_manager default_dirty_memory_manager;
// Builds a memtable list that is constructed without a seal callback: only the
// schema getter, the dirty memory manager and the memory compaction scheduling
// group are passed. The table constructor uses it when disk writes are disabled.
lw_shared_ptr
table::make_memory_only_memtable_list() {
auto get_schema = [this] { return schema(); };
return make_lw_shared(std::move(get_schema), _config.dirty_memory_manager, _config.memory_compaction_scheduling_group);
}
// Builds the regular memtable list. Its seal callback forwards the flush permit
// to seal_active_memtable(); the list uses the regular dirty memory manager and
// the memory compaction scheduling group from the table config.
lw_shared_ptr
table::make_memtable_list() {
auto seal = [this] (flush_permit&& permit) {
return seal_active_memtable(std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared(std::move(seal), std::move(get_schema), _config.dirty_memory_manager, _config.memory_compaction_scheduling_group);
}
// Builds the memtable list used for streaming. Its seal callback forwards the
// flush permit to seal_active_streaming_memtable_immediate(); the list uses the
// streaming dirty memory manager and the streaming scheduling group.
lw_shared_ptr
table::make_streaming_memtable_list() {
auto seal = [this] (flush_permit&& permit) {
return seal_active_streaming_memtable_immediate(std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager, _config.streaming_scheduling_group);
}
// Builds a streaming memtable list whose seal callback calls
// seal_active_streaming_memtable_big() for `smb`.
// `smb` is captured by reference, so it has to stay alive for as long as the
// seal callback of the returned list may be invoked.
lw_shared_ptr
table::make_streaming_memtable_big_list(streaming_memtable_big& smb) {
auto seal = [this, &smb] (flush_permit&& permit) {
return seal_active_streaming_memtable_big(smb, std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager, _config.streaming_scheduling_group);
}
// Constructs a table for `schema` with the given configuration.
//
// cl                 - commitlog pointer stored in _commitlog.
// compaction_manager - stored by reference in _compaction_manager.
// cl_stats           - passed to the counter cell locker together with the schema.
// row_cache_tracker  - passed to the row cache.
//
// When _config.enable_disk_writes is false both memtable lists are memory-only
// lists and a warning is logged. Metrics are registered at the end via set_metrics().
table::table(schema_ptr schema, config config, db::commitlog* cl, compaction_manager& compaction_manager, cell_locker_stats& cl_stats, cache_tracker& row_cache_tracker)
: _schema(std::move(schema))
, _config(std::move(config))
, _view_stats(sprint("%s_%s_view_replica_update", _schema->ks_name(), _schema->cf_name()))
, _memtables(_config.enable_disk_writes ? make_memtable_list() : make_memory_only_memtable_list())
, _streaming_memtables(_config.enable_disk_writes ? make_streaming_memtable_list() : make_memory_only_memtable_list())
, _compaction_strategy(make_compaction_strategy(_schema->compaction_strategy(), _schema->compaction_strategy_options()))
, _sstables(make_lw_shared(_compaction_strategy.make_sstable_set(_schema)))
// The cache reads its underlying data through the sstable snapshot source.
, _cache(_schema, sstables_as_snapshot_source(), row_cache_tracker, is_continuous::yes)
, _commitlog(cl)
, _compaction_manager(compaction_manager)
, _index_manager(*this)
, _counter_cell_locks(std::make_unique(_schema, cl_stats))
, _row_locker(_schema)
{
if (!_config.enable_disk_writes) {
dblog.warn("Writes disabled, column family no durable.");
}
set_metrics();
}
// Returns a callable that tells, for a decorated key, whether a partition with
// that key may exist in the given sstable set.
//
// The callable keeps the sstable set and an incremental selector created from it
// alive. It also captures `this` and reads _schema when hashing the key.
partition_presence_checker
table::make_partition_presence_checker(lw_shared_ptr sstables) {
auto sel = make_lw_shared(sstables->make_incremental_selector());
return [this, sstables = std::move(sstables), sel = std::move(sel)] (const dht::decorated_key& key) {
// Only the sstables the selector returns for this key are consulted.
auto& sst = sel->select(key).sstables;
if (sst.empty()) {
return partition_presence_checker_result::definitely_doesnt_exist;
}
// Hash the key once and reuse the hash for every candidate sstable's filter.
auto hk = sstables::sstable::make_hashed_key(*_schema, key.key());
for (auto&& s : sst) {
if (s->filter_has_key(hk)) {
return partition_presence_checker_result::maybe_exists;
}
}
return partition_presence_checker_result::definitely_doesnt_exist;
};
}
// Exposes the table's sstables as a snapshot_source. Every snapshot pins the
// sstable set that is current when the snapshot is taken and serves both the
// readers and the partition presence checker from that pinned set.
snapshot_source
table::sstables_as_snapshot_source() {
    return snapshot_source([this] () {
        auto pinned_set = _sstables;

        auto reader_factory = [this, pinned_set] (schema_ptr s,
                const dht::partition_range& r,
                const query::partition_slice& slice,
                const io_priority_class& pc,
                tracing::trace_state_ptr trace_state,
                streamed_mutation::forwarding fwd,
                mutation_reader::forwarding fwd_mr) {
            return make_sstable_reader(std::move(s), pinned_set, r, slice, pc, std::move(trace_state), fwd, fwd_mr);
        };

        auto presence_checker_factory = [this, pinned_set] {
            return make_partition_presence_checker(pinned_set);
        };

        return mutation_source(std::move(reader_factory), std::move(presence_checker_factory));
    });
}
// Kept out of line in the .cc file, since sstable is only forward-declared in the .hh.
table::~table() = default;
// Returns the sum of the occupancy of the memory regions of every memtable held
// by this table: the regular memtables, the streaming memtables and the
// memtables of every entry in _streaming_memtables_big.
logalloc::occupancy_stats table::occupancy() const {
    logalloc::occupancy_stats res;
    // The loops only read through the elements, so bind them by const reference
    // instead of copying each element (and, for the map, each whole pair).
    for (const auto& m : *_memtables) {
        res += m->region().occupancy();
    }
    for (const auto& m : *_streaming_memtables) {
        res += m->region().occupancy();
    }
    for (const auto& smb : _streaming_memtables_big) {
        for (const auto& m : *smb.second->memtables) {
            res += m->region().occupancy();
        }
    }
    return res;
}
// Returns true when the token of `dk` maps to the shard this code is running on.
static
bool belongs_to_current_shard(const dht::decorated_key& dk) {
    const auto owning_shard = dht::shard_of(dk.token());
    return owning_shard == engine().cpu_id();
}
// Stores ranges for all components of the same clustering key, index 0 referring to component
// range 0, and so on.
// NOTE(review): the template argument lists of both aliases look truncated here
// (`std::vector>` / `std::vector`) - confirm against the upstream file.
using ck_filter_clustering_key_components = std::vector>;
// Stores an entry for each clustering key range specified by the filter.
using ck_filter_clustering_key_ranges = std::vector;
// Used to split a clustering key range into a range for each component.
// If a range in ck_filtering_all_ranges is composite, a range will be created
// for each component. If it's not composite, a single range is created.
// This split is needed to check for overlap in each component individually.
//
// Returns one entry per input range, in input order. A full input range yields
// an entry holding a single range that is open-ended on both sides.
// Throws std::runtime_error if a range carries more components than the schema's
// clustering key type defines.
static ck_filter_clustering_key_ranges
ranges_for_clustering_key_filter(const schema_ptr& schema, const query::clustering_row_ranges& ck_filtering_all_ranges) {
ck_filter_clustering_key_ranges ranges;
for (auto& r : ck_filtering_all_ranges) {
// this vector stores a range for each component of a key, only one if not composite.
ck_filter_clustering_key_components composite_ranges;
if (r.is_full()) {
ranges.push_back({ nonwrapping_range::make_open_ended_both_sides() });
continue;
}
// A missing bound is represented by the components of an empty prefix.
auto start = r.start() ? r.start()->value().components() : clustering_key_prefix::make_empty().components();
auto end = r.end() ? r.end()->value().components() : clustering_key_prefix::make_empty().components();
auto start_it = start.begin();
auto end_it = end.begin();
// This test is enough because equal bounds in nonwrapping_range are inclusive.
auto is_singular = [&schema] (const auto& type_it, const bytes_view& b1, const bytes_view& b2) {
if (type_it == schema->clustering_key_type()->types().end()) {
throw std::runtime_error(sprint("clustering key filter passed more components than defined in schema of %s.%s",
schema->ks_name(), schema->cf_name()));
}
return (*type_it)->compare(b1, b2) == 0;
};
auto type_it = schema->clustering_key_type()->types().begin();
composite_ranges.reserve(schema->clustering_key_size());
// the rule is to ignore any component cn if another component ck (k < n) is not of the form [v, v].
// If we have [v1, v1], [v2, v2], ... {vl3, vr3}, ....
// then we generate [v1, v1], [v2, v2], ... {vl3, vr3}. Where { = '(' or '[', etc.
while (start_it != start.end() && end_it != end.end() && is_singular(type_it++, *start_it, *end_it)) {
composite_ranges.push_back(nonwrapping_range({{ std::move(*start_it++), true }},
{{ std::move(*end_it++), true }}));
}
// handle a single non-singular tail element, if present
// The tail takes its inclusiveness from the bounds of the original range; if only
// one side has a component left, the other side of the tail is left open.
if (start_it != start.end() && end_it != end.end()) {
composite_ranges.push_back(nonwrapping_range({{ std::move(*start_it), r.start()->is_inclusive() }},
{{ std::move(*end_it), r.end()->is_inclusive() }}));
} else if (start_it != start.end()) {
composite_ranges.push_back(nonwrapping_range({{ std::move(*start_it), r.start()->is_inclusive() }}, {}));
} else if (end_it != end.end()) {
composite_ranges.push_back(nonwrapping_range({}, {{ std::move(*end_it), r.end()->is_inclusive() }}));
}
ranges.push_back(std::move(composite_ranges));
}
return ranges;
}
// Return true if this sstable possibly stores clustering row(s) specified by ranges.
//
// Always returns true when the schema has no clustering key or when the sstable
// reports no per-component clustering ranges. Otherwise an sstable matches if,
// for at least one filter entry, every compared component is either a full range
// or overlaps the sstable's range for that component.
static inline bool
contains_rows(const sstables::sstable& sst, const schema_ptr& schema, const ck_filter_clustering_key_ranges& ranges) {
auto& clustering_key_types = schema->clustering_key_type()->types();
auto& clustering_components_ranges = sst.clustering_components_ranges();
if (!schema->clustering_key_size() || clustering_components_ranges.empty()) {
return true;
}
return boost::algorithm::any_of(ranges, [&] (const ck_filter_clustering_key_components& range) {
// Only the components present on both sides are compared.
auto s = std::min(range.size(), clustering_components_ranges.size());
return boost::algorithm::all_of(boost::irange(0, s), [&] (unsigned i) {
auto& type = clustering_key_types[i];
return range[i].is_full() || range[i].overlaps(clustering_components_ranges[i], type->as_tri_comparator());
});
});
}
// Filter out sstables for reader using bloom filter and sstable metadata that keeps track
// of a range for each clustering component.
//
// sstables - candidate sstables; consumed by this call.
// cf       - column family, consulted for the compaction strategy and for statistics.
// schema   - schema used for the clustering key checks.
// key      - partition key tested against each sstable's filter.
// slice    - provides the clustering ranges of the query.
//
// Returns the sstables that still have to be read. While the workaround below is
// in place (filtering_broken == true) only the filter_has_key() check is applied.
static std::vector
filter_sstable_for_reader(std::vector&& sstables, column_family& cf, const schema_ptr& schema,
        const sstables::key& key, const query::partition_slice& slice) {
    auto sstable_has_not_key = [&] (const sstables::shared_sstable& sst) {
        return !sst->filter_has_key(key);
    };
    sstables.erase(boost::remove_if(sstables, sstable_has_not_key), sstables.end());

    // FIXME: Workaround for https://github.com/scylladb/scylla/issues/3552
    // and https://github.com/scylladb/scylla/issues/3553
    const bool filtering_broken = true;

    // no clustering filtering is applied if schema defines no clustering key or
    // compaction strategy thinks it will not benefit from such an optimization.
    if (filtering_broken || !schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter()) {
        // `sstables` is an rvalue-reference parameter: returning it by name would
        // copy the whole vector (prior to C++20), so move it out explicitly.
        return std::move(sstables);
    }
    ::cf_stats* stats = cf.cf_stats();
    stats->clustering_filter_count++;
    stats->sstables_checked_by_clustering_filter += sstables.size();

    auto ck_filtering_all_ranges = slice.get_all_ranges();
    // fast path to include all sstables if only one full range was specified.
    // For example, this happens if query only specifies a partition key.
    if (ck_filtering_all_ranges.size() == 1 && ck_filtering_all_ranges[0].is_full()) {
        stats->clustering_filter_fast_path_count++;
        stats->surviving_sstables_after_clustering_filter += sstables.size();
        return std::move(sstables);
    }
    auto ranges = ranges_for_clustering_key_filter(schema, ck_filtering_all_ranges);
    if (ranges.empty()) {
        return {};
    }

    // Smallest min_timestamp among the sstables that passed the clustering check;
    // updated as a side effect of the first partition pass below.
    int64_t min_timestamp = std::numeric_limits::max();
    auto sstable_has_clustering_key = [&min_timestamp, &schema, &ranges] (const sstables::shared_sstable& sst) {
        if (!contains_rows(*sst, schema, ranges)) {
            return false; // ordered after sstables that contain clustering rows.
        } else {
            min_timestamp = std::min(min_timestamp, sst->get_stats_metadata().min_timestamp);
            return true;
        }
    };
    auto sstable_has_relevant_tombstone = [&min_timestamp] (const sstables::shared_sstable& sst) {
        const auto& stats = sst->get_stats_metadata();
        // re-add sstable as candidate if it contains a tombstone that may cover a row in an included sstable.
        return (stats.max_timestamp > min_timestamp && stats.estimated_tombstone_drop_time.bin.size());
    };
    // First pass: sstables with matching rows go to the front. Second pass, over the
    // rest only: keep those that may hold a relevant tombstone. What remains is dropped.
    auto skipped = std::partition(sstables.begin(), sstables.end(), sstable_has_clustering_key);
    auto actually_skipped = std::partition(skipped, sstables.end(), sstable_has_relevant_tombstone);
    sstables.erase(actually_skipped, sstables.end());
    stats->surviving_sstables_after_clustering_filter += sstables.size();
    return std::move(sstables);
}
// Incremental selector implementation for combined_mutation_reader that
// selects readers on-demand as the read progresses through the token
// range.
//
// Readers are produced by the factory `fn` passed to the constructor. An sstable
// that was already turned into a reader is remembered in _read_sstables and is
// not handed to the factory again.
class incremental_reader_selector : public reader_selector {
    // Range currently being read. Not owned: the referenced range has to outlive
    // its use by this selector.
    const dht::partition_range* _pr;
    lw_shared_ptr _sstables;
    tracing::trace_state_ptr _trace_state;
    sstables::sstable_set::incremental_selector _selector;
    std::unordered_set _read_sstables;
    sstable_reader_factory_type _fn;

    // Traces the read and asks the factory for a reader over the current range.
    flat_mutation_reader create_reader(sstables::shared_sstable sst) {
        tracing::trace(_trace_state, "Reading partition range {} from sstable {}", *_pr, seastar::value_of([&sst] { return sst->get_filename(); }));
        return _fn(sst, *_pr);
    }

public:
    // The selector position starts at the start bound of `pr`, or at the minimum
    // ring position when `pr` has no start bound.
    explicit incremental_reader_selector(schema_ptr s,
            lw_shared_ptr sstables,
            const dht::partition_range& pr,
            tracing::trace_state_ptr trace_state,
            sstable_reader_factory_type fn)
        : reader_selector(s, pr.start() ? pr.start()->value() : dht::ring_position_view::min())
        , _pr(&pr)
        , _sstables(std::move(sstables))
        , _trace_state(std::move(trace_state))
        , _selector(_sstables->make_incremental_selector())
        , _fn(std::move(fn)) {
        dblog.trace("incremental_reader_selector {}: created for range: {} with {} sstables",
                this,
                *_pr,
                _sstables->all()->size());
    }

    incremental_reader_selector(const incremental_reader_selector&) = delete;
    incremental_reader_selector& operator=(const incremental_reader_selector&) = delete;

    incremental_reader_selector(incremental_reader_selector&&) = delete;
    incremental_reader_selector& operator=(incremental_reader_selector&&) = delete;

    // Advances the selector and returns readers for the sstables selected for the
    // first time. Keeps advancing while no new reader was produced, the selector
    // has not reached the maximum position and - when `pos` is given - `pos` is
    // not before the selector position.
    virtual std::vector create_new_readers(const std::optional& pos) override {
        dblog.trace("incremental_reader_selector {}: {}({})", this, __FUNCTION__, seastar::lazy_deref(pos));

        auto readers = std::vector();

        do {
            auto selection = _selector.select(_selector_position);
            _selector_position = selection.next_position;

            dblog.trace("incremental_reader_selector {}: {} sstables to consider, advancing selector to {}", this, selection.sstables.size(),
                    _selector_position);

            // emplace() reports whether the sstable is new; only new ones get a reader.
            readers = boost::copy_range>(selection.sstables
                    | boost::adaptors::filtered([this] (auto& sst) { return _read_sstables.emplace(sst).second; })
                    | boost::adaptors::transformed([this] (auto& sst) { return this->create_reader(sst); }));
        } while (!_selector_position.is_max() && readers.empty() && (!pos || dht::ring_position_tri_compare(*_s, *pos, _selector_position) >= 0));

        dblog.trace("incremental_reader_selector {}: created {} new readers", this, readers.size());

        return readers;
    }

    // Switches to the range `pr` and, if its start is at or past the current
    // selector position, selects readers for it. `timeout` is not used here.
    virtual std::vector fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
        // Remember the address of the new range (the original text of this line was
        // damaged: the entity-like sequence "&pr;" had been decoded into a symbol).
        _pr = &pr;

        auto pos = dht::ring_position_view::for_range_start(*_pr);
        if (dht::ring_position_tri_compare(*_s, pos, _selector_position) >= 0) {
            return create_new_readers(pos);
        }

        return {};
    }
};
// Creates a reader for the single partition designated by the singular range `pr`.
//
// The sstables selected for `pr` are passed through filter_sstable_for_reader();
// one read_row_flat() reader is created per surviving sstable and the readers are
// merged with make_combined_reader(). If no sstable survives, an empty reader is
// returned and the histogram is not updated.
static flat_mutation_reader
create_single_key_sstable_reader(column_family* cf,
schema_ptr schema,
lw_shared_ptr sstables,
utils::estimated_histogram& sstable_histogram,
const dht::partition_range& pr, // must be singular
const query::partition_slice& slice,
const io_priority_class& pc,
reader_resource_tracker resource_tracker,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr)
{
auto key = sstables::key::from_partition_key(*schema, *pr.start()->value().key());
auto readers = boost::copy_range>(
filter_sstable_for_reader(sstables->select(pr), *cf, schema, key, slice)
| boost::adaptors::transformed([&] (const sstables::shared_sstable& sstable) {
tracing::trace(trace_state, "Reading key {} from sstable {}", pr, seastar::value_of([&sstable] { return sstable->get_filename(); }));
return sstable->read_row_flat(schema, pr.start()->value(), slice, pc, resource_tracker, fwd);
})
);
if (readers.empty()) {
return make_empty_flat_reader(schema);
}
// Record how many sstables this read touches.
sstable_histogram.add(readers.size());
return make_combined_reader(schema, std::move(readers), fwd, fwd_mr);
}
// Creates a reader over the given sstable set for the range `pr`.
//
// A singular range whose start position has a key is served by
// create_single_key_sstable_reader(); every other range is served by
// make_local_shard_sstable_reader(). If a concurrency semaphore is configured for
// the priority class, the reader is wrapped with make_restricted_flat_reader().
flat_mutation_reader
table::make_sstable_reader(schema_ptr s,
lw_shared_ptr sstables,
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) const {
// Reads issued with the streaming read priority class use the streaming semaphore,
// everything else uses the regular read semaphore. Either pointer may be null.
auto* semaphore = service::get_local_streaming_read_priority().id() == pc.id()
? _config.streaming_read_concurrency_semaphore
: _config.read_concurrency_semaphore;
// CAVEAT: if make_sstable_reader() is called on a single partition
// we want to optimize and read exactly this partition. As a
// consequence, fast_forward_to() will *NOT* work on the result,
// regardless of what the fwd_mr parameter says.
if (pr.is_singular() && pr.start()->value().has_key()) {
const dht::ring_position& pos = pr.start()->value();
if (dht::shard_of(pos.token()) != engine().cpu_id()) {
return make_empty_flat_reader(s); // range doesn't belong to this shard
}
if (semaphore) {
// The lambda captures `this`; it reads _stats when it is invoked.
auto ms = mutation_source([semaphore, this, sstables=std::move(sstables)] (
schema_ptr s,
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) {
return create_single_key_sstable_reader(const_cast(this), std::move(s), std::move(sstables),
_stats.estimated_sstable_per_read, pr, slice, pc, tracker, std::move(trace_state), fwd, fwd_mr);
});
return make_restricted_flat_reader(*semaphore, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
} else {
// No semaphore configured: read directly, without resource tracking.
return create_single_key_sstable_reader(const_cast(this), std::move(s), std::move(sstables),
_stats.estimated_sstable_per_read, pr, slice, pc, no_resource_tracking(), std::move(trace_state), fwd, fwd_mr);
}
} else {
if (semaphore) {
auto ms = mutation_source([semaphore, sstables=std::move(sstables)] (
schema_ptr s,
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) {
return make_local_shard_sstable_reader(std::move(s), std::move(sstables), pr, slice, pc,
tracker, std::move(trace_state), fwd, fwd_mr);
});
return make_restricted_flat_reader(*semaphore, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
} else {
// No semaphore configured: read directly, without resource tracking.
return make_local_shard_sstable_reader(std::move(s), std::move(sstables), pr, slice, pc,
no_resource_tracking(), std::move(trace_state), fwd, fwd_mr);
}
}
}
// Exposed for testing, not performance critical.
//
// Reads the partition with the given decorated key through make_reader() over a
// singular range. The result is an empty pointer when the reader yields no
// mutation, otherwise a newly allocated object built from the mutation's partition.
future
table::find_partition(schema_ptr s, const dht::decorated_key& key) const {
// do_with keeps the range and the reader alive until the read completes.
return do_with(dht::partition_range::make_singular(key), [s = std::move(s), this] (auto& range) {
return do_with(this->make_reader(s, range), [s] (flat_mutation_reader& reader) {
return read_mutation_from_flat_mutation_reader(reader).then([] (mutation_opt&& mo) -> std::unique_ptr {
if (!mo) {
return {};
}
return std::make_unique(std::move(mo->partition()));
});
});
});
}
// Same as find_partition(), but takes an undecorated partition key and decorates
// it with the global partitioner first.
future
table::find_partition_slow(schema_ptr s, const partition_key& key) const {
return find_partition(s, dht::global_partitioner().decorate_key(*s, key));
}
// Looks up a single row: finds the partition via find_partition() and then the
// row with the given clustering key inside it. The resulting future carries no
// row when either the partition or the row is not found; otherwise it carries a
// newly allocated copy of the row.
future
table::find_row(schema_ptr s, const dht::decorated_key& partition_key, clustering_key clustering_key) const {
return find_partition(s, partition_key).then([clustering_key = std::move(clustering_key), s] (const_mutation_partition_ptr p) {
if (!p) {
return make_ready_future();
}
auto r = p->find_row(*s, clustering_key);
if (r) {
// FIXME: remove copy if only one data source
return make_ready_future(std::make_unique(*s, column_kind::regular_column, *r));
} else {
return make_ready_future();
}
});
}
// Creates a reader that combines every memtable of this table with either the
// row cache (when _config.enable_cache is set) or the sstables.
// If a virtual reader is installed, the request is delegated to it instead.
flat_mutation_reader
table::make_reader(schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) const {
if (_virtual_reader) {
return (*_virtual_reader).make_reader(s, range, slice, pc, trace_state, fwd, fwd_mr);
}
std::vector readers;
// One reader per memtable plus one for the cache or the sstables.
readers.reserve(_memtables->size() + 1);
// We're assuming that cache and memtables are both read atomically
// for single-key queries, so we don't need to special case memtable
// undergoing a move to cache. At any given point in time between
// deferring points the sum of data in memtable and cache is coherent. If
// single-key queries for each data source were performed across deferring
// points, it would be possible that partitions which are ahead of the
// memtable cursor would be placed behind the cache cursor, resulting in
// those partitions being missing in the combined reader.
//
// We need to handle this in range queries though, as they are always
// deferring. scanning_reader from memtable.cc is falling back to reading
// the sstable when memtable is flushed. After memtable is moved to cache,
// new readers will no longer use the old memtable, but until then
// performance may suffer. We should fix this when we add support for
// range queries in cache, so that scans can always be satisfied from
// memtable and cache only, as long as data is not evicted.
//
// https://github.com/scylladb/scylla/issues/309
// https://github.com/scylladb/scylla/issues/185
for (auto&& mt : *_memtables) {
readers.emplace_back(mt->make_flat_reader(s, range, slice, pc, trace_state, fwd, fwd_mr));
}
if (_config.enable_cache) {
readers.emplace_back(_cache.make_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
} else {
readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
}
return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
}
// Creates a new sstable object in the table's data directory, meant to be
// written by streaming. It gets a freshly calculated generation, the highest
// supported format version and the "big" format type.
sstables::shared_sstable
table::make_streaming_sstable_for_write() {
    sstring target_dir = _config.datadir;
    auto sst = sstables::make_sstable(
            _schema,
            target_dir,
            calculate_generation_for_new_table(),
            get_highest_supported_format(),
            sstables::sstable::format_types::big);
    dblog.debug("Created sstable for streaming: ks={}, cf={}, dir={}", schema()->ks_name(), schema()->cf_name(), target_dir);
    return sst;
}
// Creates a reader over several partition ranges for streaming. It uses the
// schema's full slice and the streaming read priority class, and combines the
// memtables with the sstables; the row cache is not consulted here.
// The mutation source lambda captures `this`.
flat_mutation_reader
table::make_streaming_reader(schema_ptr s,
const dht::partition_range_vector& ranges) const {
auto& slice = s->full_slice();
auto& pc = service::get_local_streaming_read_priority();
auto source = mutation_source([this] (schema_ptr s, const dht::partition_range& range, const query::partition_slice& slice,
const io_priority_class& pc, tracing::trace_state_ptr trace_state, streamed_mutation::forwarding fwd, mutation_reader::forwarding fwd_mr) {
std::vector readers;
// One reader per memtable plus one for the sstables.
readers.reserve(_memtables->size() + 1);
for (auto&& mt : *_memtables) {
readers.emplace_back(mt->make_flat_reader(s, range, slice, pc, trace_state, fwd, fwd_mr));
}
readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
});
// No trace state is passed, and forwarding between partition ranges is disabled.
return make_flat_multi_range_reader(s, std::move(source), ranges, slice, pc, nullptr, mutation_reader::forwarding::no);
}
// Locks the cells touched by mutation `m` through the counter cell locker,
// giving up at `timeout`. The mutation's schema must be the very schema object
// the cell locker was created with (asserted).
future> table::lock_counter_cells(const mutation& m, db::timeout_clock::time_point timeout) {
assert(m.schema() == _counter_cell_locks->schema());
return _counter_cell_locks->lock_cells(m.decorated_key(), partition_cells_range(m.partition()), timeout);
}
// Not performance critical. Currently used for testing only.
//
// Reads the table through make_reader() and calls `func` with the decorated key
// and the partition of every mutation read. Iteration stops when the reader is
// exhausted or when `func` returns false. The resulting future carries the last
// value returned by `func`, or true if `func` was never called.
template
future
table::for_all_partitions(schema_ptr s, Func&& func) const {
static_assert(std::is_same>::value,
"bad Func signature");
// State shared by the iterations; do_with() below keeps it alive until the loop finishes.
struct iteration_state {
flat_mutation_reader reader;
Func func;
bool ok = true;
bool empty = false;
public:
bool done() const { return !ok || empty; }
iteration_state(schema_ptr s, const column_family& cf, Func&& func)
: reader(cf.make_reader(std::move(s)))
, func(std::move(func))
{ }
};
return do_with(iteration_state(std::move(s), *this, std::move(func)), [] (iteration_state& is) {
return do_until([&is] { return is.done(); }, [&is] {
return read_mutation_from_flat_mutation_reader(is.reader).then([&is](mutation_opt&& mo) {
if (!mo) {
is.empty = true;
} else {
is.ok = is.func(mo->decorated_key(), mo->partition());
}
});
}).then([&is] {
return is.ok;
});
});
}
// Non-template entry point for for_all_partitions(): takes the callback as a
// std::function and forwards it.
future
table::for_all_partitions_slow(schema_ptr s, std::function func) const {
return for_all_partitions(std::move(s), std::move(func));
}
// Returns true when the id of the current shard appears in `shards`.
static bool belongs_to_current_shard(const std::vector& shards) {
return boost::find(shards, engine().cpu_id()) != shards.end();
}
// Returns true when `shards` is not exactly "the current shard and nothing else":
// the right-hand side is 1 if the current shard is listed and 0 otherwise, so the
// sizes differ whenever the list is longer than that. Note that the result is
// false for an empty list as well.
static bool belongs_to_other_shard(const std::vector& shards) {
return shards.size() != size_t(belongs_to_current_shard(shards));
}
// Creates an sstable object for the given directory, generation, version and
// format, and loads it from `info`.
// If the current shard is not among info.owners the sstable is ignored and the
// returned future is resolved immediately without it. If no other shard owns the
// sstable, it is marked as unshared before loading.
future
table::open_sstable(sstables::foreign_sstable_open_info info, sstring dir, int64_t generation,
sstables::sstable::version_types v, sstables::sstable::format_types f) {
auto sst = sstables::make_sstable(_schema, dir, generation, v, f);
if (!belongs_to_current_shard(info.owners)) {
dblog.debug("sstable {} not relevant for this shard, ignoring", sst->get_filename());
return make_ready_future();
}
if (!belongs_to_other_shard(info.owners)) {
sst->set_unshared();
}
return sst->load(std::move(info)).then([sst] () mutable {
return make_ready_future(std::move(sst));
});
}
// Adds an already opened sstable to this table.
//
// sst         - the sstable to add.
// reset_level - when true, the sstable's level is set to 0 before it is added.
//
// Throws std::runtime_error if the schema is a counter schema and the sstable has
// no Scylla component. Sstables that are also owned by other shards are recorded
// in _sstables_need_rewrite in addition to being added.
void table::load_sstable(sstables::shared_sstable& sst, bool reset_level) {
    if (schema()->is_counter() && !sst->has_scylla_component()) {
        throw std::runtime_error("Loading non-Scylla SSTables containing counters is not supported. Use sstableloader instead.");
    }
    // The shard list stays owned by the sstable; it is only read here.
    const auto& shards = sst->get_shards_for_this_sstable();
    if (belongs_to_other_shard(shards)) {
        // If we're here, this sstable is shared by this and other
        // shard(s). Shared sstables cannot be deleted until all
        // shards compacted them, so to reduce disk space usage we
        // want to start splitting them now.
        // However, we need to delay this compaction until we read all
        // the sstables belonging to this CF, because we need all of
        // them to know which tombstones we can drop, and what
        // generation number is free.
        _sstables_need_rewrite.emplace(sst->generation(), sst);
    }
    if (reset_level) {
        // When loading a migrated sstable, set level to 0 because
        // it may overlap with existing tables in levels > 0.
        // This step is optional, because even if we didn't do this
        // scylla would detect the overlap, and bring back some of
        // the sstables to level 0.
        sst->set_sstable_level(0);
    }
    // add_sstable() takes the shard list by const reference, so wrapping it in
    // std::move() would have no effect and would only suggest that the sstable's
    // own list is being given away.
    add_sstable(sst, shards);
}
// Accounts for a newly added sstable in the table's disk usage statistics.
// Only the shard with the smallest id among the sstable's shards updates the
// counters, so an sstable listed for several shards is counted by one of them.
// The shard list must not be empty (asserted).
void table::update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector& shards_for_the_sstable) noexcept {
assert(!shards_for_the_sstable.empty());
if (*boost::min_element(shards_for_the_sstable) == engine().cpu_id()) {
_stats.live_disk_space_used += disk_space_used_by_sstable;
_stats.total_disk_space_used += disk_space_used_by_sstable;
_stats.live_sstable_count++;
}
}
// Adds `sstable` to the table's sstable set, updates the disk usage statistics
// and tells the compaction strategy's backlog tracker about the new sstable.
// The set is replaced by a modified copy rather than changed in place.
void table::add_sstable(sstables::shared_sstable sstable, const std::vector& shards_for_the_sstable) {
// allow in-progress reads to continue using old list
auto new_sstables = make_lw_shared(*_sstables);
new_sstables->insert(sstable);
_sstables = std::move(new_sstables);
update_stats_for_new_sstable(sstable->bytes_on_disk(), shards_for_the_sstable);
_compaction_strategy.get_backlog_tracker().add_sstable(sstable);
}
// Adds `sst` to the table as part of a row cache invalidation that covers the
// range from the sstable's first to its last decorated key (both inclusive).
// The sstable is registered for the current shard only, and a compaction is
// triggered right after it is added.
future<>
table::add_sstable_and_update_cache(sstables::shared_sstable sst) {
    auto load_into_table = [this, sst] () noexcept {
        // FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
        // atomically load all opened sstables into column family.
        add_sstable(sst, {engine().cpu_id()});
        trigger_compaction();
    };
    auto covered_range = dht::partition_range::make(
            {sst->get_first_decorated_key(), true},
            {sst->get_last_decorated_key(), true});
    return get_row_cache().invalidate(std::move(load_into_table), std::move(covered_range));
}
// Makes the flushed sstable `sst` part of the table and retires memtable `m`.
//
// The `adder` callback adds the sstable for the current shard, marks the memtable
// as flushed (handing it the sstable as a mutation source) and tries to trigger a
// compaction. With the cache enabled, the callback is passed to the row cache
// update for `m`; otherwise it runs immediately and the memtable is then cleared
// with clear_gently().
future<>
table::update_cache(lw_shared_ptr m, sstables::shared_sstable sst) {
auto adder = [this, m, sst] {
auto newtab_ms = sst->as_mutation_source();
add_sstable(sst, {engine().cpu_id()});
m->mark_flushed(std::move(newtab_ms));
try_trigger_compaction();
};
if (_config.enable_cache) {
return _cache.update(adder, *m);
} else {
adder();
return m->clear_gently();
}
}
// Seals the current streaming memtable and writes it to a new sstable.
//
// If the current streaming memtable is empty, returns a ready future and does
// nothing. Otherwise a fresh streaming memtable is installed, the old one is
// removed from the streaming memtable list and written out. On success the
// new sstable is opened and added to the table (through the row cache's
// update_invalidating() when the cache is enabled, directly otherwise).
// On failure the monitor is notified, the sstable is marked for deletion,
// the error is logged and the exception is propagated to the caller.
future<>
table::seal_active_streaming_memtable_immediate(flush_permit&& permit) {
return with_scheduling_group(_config.streaming_scheduling_group, [this, permit = std::move(permit)] () mutable {
auto old = _streaming_memtables->back();
if (old->empty()) {
return make_ready_future<>();
}
_streaming_memtables->add_memtable();
_streaming_memtables->erase(old);
dblog.debug("Sealing streaming memtable of {}.{}, partitions: {}, occupancy: {}", _schema->ks_name(), _schema->cf_name(), old->partition_count(), old->occupancy());
// The phaser operation is held until the whole flush below has finished (see the finally() at the end).
auto guard = _streaming_flush_phaser.start();
return with_gate(_streaming_flush_gate, [this, old, permit = std::move(permit)] () mutable {
return with_lock(_sstables_lock.for_read(), [this, old, permit = std::move(permit)] () mutable {
auto newtab = sstables::make_sstable(_schema,
_config.datadir, calculate_generation_for_new_table(),
get_highest_supported_format(),
sstables::sstable::format_types::big);
newtab->set_unshared();
dblog.debug("Flushing to {}", newtab->get_filename());
// This is somewhat similar to the main memtable flush, but with important differences.
//
// The first difference is that we don't keep aggregate collectd statistics about this one.
// If we ever need to, we'll keep them as separate statistics, but we don't want to pollute the
// main stats about memtables with streaming memtables.
//
// Lastly, we don't have any commitlog RP to update, and we don't need to manipulate the
// memtable list, since this memtable was not available for reading up until this point.
auto fp = permit.release_sstable_write_permit();
database_sstable_write_monitor monitor(std::move(fp), newtab, _compaction_manager, _compaction_strategy, old->get_max_timestamp());
// do_with keeps the monitor alive for the whole continuation chain, which refers to it by reference.
return do_with(std::move(monitor), [this, newtab, old, permit = std::move(permit)] (auto& monitor) mutable {
auto&& priority = service::get_local_streaming_write_priority();
return write_memtable_to_sstable(*old, newtab, monitor, get_large_partition_handler(), incremental_backups_enabled(), priority, false).then([this, newtab, old] {
return newtab->open_data();
}).then([this, old, newtab] () {
return with_scheduling_group(_config.memtable_to_cache_scheduling_group, [this, newtab, old] {
// Makes the new sstable visible and asks for compaction; used either as the cache updater or called directly.
auto adder = [this, newtab] {
add_sstable(newtab, {engine().cpu_id()});
try_trigger_compaction();
dblog.debug("Flushing to {} done", newtab->get_filename());
};
if (_config.enable_cache) {
return _cache.update_invalidating(adder, *old);
} else {
adder();
return old->clear_gently();
}
});
}).handle_exception([old, permit = std::move(permit), &monitor, newtab] (auto ep) {
// The remaining permit is moved into this handler, so it is held until the chain above has resolved.
monitor.write_failed();
newtab->mark_for_deletion();
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
});
});
// We will also not have any retry logic. If we fail here, we'll fail the streaming and let
// the upper layers know. They can then apply any logic they want here.
});
}).finally([guard = std::move(guard)] { });
});
}
// Seals the current memtable of the "big" streaming context `smb` and writes
// it to a new sstable.
//
// If the current memtable is empty, returns a ready future. Otherwise a fresh
// memtable is installed in smb.memtables, the old one is removed from it and
// written out. Unlike seal_active_streaming_memtable_immediate(), the written
// sstable is not added to the table here: on success it is appended, together
// with its write monitor, to smb.sstables. On failure the monitor is notified,
// the sstable is marked for deletion, the error is logged and the exception
// is propagated.
//
// `smb` is captured by reference, so the caller must keep it alive until the
// returned future resolves.
future<> table::seal_active_streaming_memtable_big(streaming_memtable_big& smb, flush_permit&& permit) {
return with_scheduling_group(_config.streaming_scheduling_group, [this, &smb, permit = std::move(permit)] () mutable {
auto old = smb.memtables->back();
if (old->empty()) {
return make_ready_future<>();
}
smb.memtables->add_memtable();
smb.memtables->erase(old);
// The flush is tracked by both the table-wide streaming flush gate and the per-context gate.
return with_gate(_streaming_flush_gate, [this, old, &smb, permit = std::move(permit)] () mutable {
return with_gate(smb.flush_in_progress, [this, old, &smb, permit = std::move(permit)] () mutable {
return with_lock(_sstables_lock.for_read(), [this, old, &smb, permit = std::move(permit)] () mutable {
auto newtab = sstables::make_sstable(_schema,
_config.datadir, calculate_generation_for_new_table(),
get_highest_supported_format(),
sstables::sstable::format_types::big);
newtab->set_unshared();
auto fp = permit.release_sstable_write_permit();
// The monitor is heap-allocated so that it can be handed over to smb.sstables on success.
auto monitor = std::make_unique(std::move(fp), newtab, _compaction_manager, _compaction_strategy, old->get_max_timestamp());
auto&& priority = service::get_local_streaming_write_priority();
// NOTE(review): the last argument is `true` here and `false` in the other flush paths of this file; its meaning is defined by write_memtable_to_sstable().
auto fut = write_memtable_to_sstable(*old, newtab, *monitor, get_large_partition_handler(), incremental_backups_enabled(), priority, true);
return fut.then_wrapped([this, newtab, old, &smb, permit = std::move(permit), monitor = std::move(monitor)] (future<> f) mutable {
if (!f.failed()) {
smb.sstables.push_back(monitored_sstable{std::move(monitor), newtab});
return make_ready_future<>();
} else {
monitor->write_failed();
newtab->mark_for_deletion();
auto ep = f.get_exception();
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
}
});
});
});
});
});
}
// Switches to a fresh active memtable and flushes the previous one to disk.
//
// If the active memtable is empty, nothing is flushed and the returned future
// is the one produced by _flush_barrier.advance_and_await(). Otherwise the
// flush is attempted repeatedly through try_flush_memtable_to_sstable() until
// it reports stop_iteration::yes; between attempts the code sleeps for 10
// seconds and re-acquires the sstable write permit. When the flush loop ends
// the pending-flush counters are decremented, completed commitlog segments
// are discarded (if a commitlog is attached) and the returned future resolves
// after the previously started flushes have completed.
future<>
table::seal_active_memtable(flush_permit&& permit) {
auto old = _memtables->back();
dblog.debug("Sealing active memtable of {}.{}, partitions: {}, occupancy: {}", _schema->ks_name(), _schema->cf_name(), old->partition_count(), old->occupancy());
if (old->empty()) {
dblog.debug("Memtable is empty");
return _flush_barrier.advance_and_await();
}
_memtables->add_memtable();
_stats.memtable_switch_count++;
// This will set evictable occupancy of the old memtable region to zero, so that
// this region is considered last for flushing by dirty_memory_manager::flush_when_needed().
// If we don't do that, the flusher may keep picking up this memtable list for flushing after
// the permit is released even though there is not much to flush in the active memtable of this list.
old->region().ground_evictable_occupancy();
// previous_flush covers the flushes started before this one; op marks this flush as in progress on the barrier.
auto previous_flush = _flush_barrier.advance_and_await();
auto op = _flush_barrier.start();
// Remember the size now so that the same amount is subtracted from the pending-bytes counter later.
auto memtable_size = old->occupancy().total_space();
_stats.pending_flushes++;
_config.cf_stats->pending_memtables_flushes_count++;
_config.cf_stats->pending_memtables_flushes_bytes += memtable_size;
return do_with(std::move(permit), [this, old] (auto& permit) {
return repeat([this, old, &permit] () mutable {
auto sstable_write_permit = permit.release_sstable_write_permit();
return with_lock(_sstables_lock.for_read(), [this, old, sstable_write_permit = std::move(sstable_write_permit)] () mutable {
return this->try_flush_memtable_to_sstable(old, std::move(sstable_write_permit));
}).then([this, &permit] (auto should_stop) mutable {
if (should_stop) {
return make_ready_future(should_stop);
}
// The attempt failed: wait before retrying, then get a new sstable write permit for the next attempt.
return sleep(10s).then([this, &permit] () mutable {
return std::move(permit).reacquire_sstable_write_permit().then([this, &permit] (auto new_permit) mutable {
permit = std::move(new_permit);
return make_ready_future(stop_iteration::no);
});
});
});
});
}).then([this, memtable_size, old, op = std::move(op), previous_flush = std::move(previous_flush)] () mutable {
_stats.pending_flushes--;
_config.cf_stats->pending_memtables_flushes_count--;
_config.cf_stats->pending_memtables_flushes_bytes -= memtable_size;
if (_commitlog) {
_commitlog->discard_completed_segments(_schema->id(), old->rp_set());
}
// Keep this flush's barrier operation alive until the earlier flushes are done.
return previous_flush.finally([op = std::move(op)] { });
});
// FIXME: release commit log
// FIXME: provide back-pressure to upper layers
}
// Makes one attempt at writing memtable `old` to a new sstable.
//
// On success the sstable is opened, handed to update_cache() and the memtable
// is erased from the memtable list; the future resolves to
// stop_iteration::yes. On failure the write monitor is notified, the sstable
// is marked for deletion, the error is logged, the memtable's flushed-memory
// accounting is reverted and the future resolves to
// stop_iteration(_async_gate.is_closed()), i.e. the caller is asked to stop
// retrying only when the async gate has been closed.
future
table::try_flush_memtable_to_sstable(lw_shared_ptr old, sstable_write_permit&& permit) {
return with_scheduling_group(_config.memtable_scheduling_group, [this, old = std::move(old), permit = std::move(permit)] () mutable {
auto gen = calculate_generation_for_new_table();
auto newtab = sstables::make_sstable(_schema,
_config.datadir, gen,
get_highest_supported_format(),
sstables::sstable::format_types::big);
newtab->set_unshared();
dblog.debug("Flushing to {}", newtab->get_filename());
// Note that due to our sharded architecture, it is possible that
// in the face of a value change some shards will backup sstables
// while others won't.
//
// This is, in theory, possible to mitigate through a rwlock.
// However, this doesn't differ from the situation where all tables
// are coming from a single shard and the toggle happens in the
// middle of them.
//
// The code as is guarantees that we'll never partially backup a
// single sstable, so that is enough of a guarantee.
database_sstable_write_monitor monitor(std::move(permit), newtab, _compaction_manager, _compaction_strategy, old->get_max_timestamp());
// do_with keeps the monitor alive while the continuations below refer to it by reference.
return do_with(std::move(monitor), [this, old, newtab] (auto& monitor) {
auto&& priority = service::get_local_memtable_flush_priority();
auto f = write_memtable_to_sstable(*old, newtab, monitor, get_large_partition_handler(), incremental_backups_enabled(), priority, false);
// Switch back to default scheduling group for post-flush actions, to avoid them being starved by the memtable flush
// controller. Cache update does not affect the input of the memtable cpu controller, so it can be subject to
// priority inversion.
return with_scheduling_group(default_scheduling_group(), [this, &monitor, old = std::move(old), newtab = std::move(newtab), f = std::move(f)] () mutable {
return f.then([this, newtab, old, &monitor] {
return newtab->open_data().then([this, old, newtab] () {
dblog.debug("Flushing to {} done", newtab->get_filename());
return with_scheduling_group(_config.memtable_to_cache_scheduling_group, [this, old, newtab] {
return update_cache(old, newtab);
});
}).then([this, old, newtab] () noexcept {
_memtables->erase(old);
dblog.debug("Memtable for {} replaced", newtab->get_filename());
return stop_iteration::yes;
});
}).handle_exception([this, old, newtab, &monitor] (auto e) {
monitor.write_failed();
newtab->mark_for_deletion();
dblog.error("failed to write sstable {}: {}", newtab->get_filename(), e);
// If we failed this write we will try the write again and that will create a new flush reader
// that will decrease dirty memory again. So we need to reset the accounting.
old->revert_flushed_memory();
return stop_iteration(_async_gate.is_closed());
});
});
});
});
}
void
table::start() {
// FIXME: add option to disable automatic compaction.
start_compaction();
}
// Stops the table.
//
// Closes the async gate, requests a flush of both the regular and the
// streaming memtables and then, whether or not the flushes succeeded,
// removes the table from the compaction manager and closes the streaming
// flush gate followed by the sstable deletion gate.
future<>
table::stop() {
    return _async_gate.close().then([this] {
        auto flushes = when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result();
        return flushes.finally([this] {
            // Chained, instead of using when_all, so we don't lose any exceptions.
            auto close_streaming_flush_gate = [this] {
                return _streaming_flush_gate.close();
            };
            auto close_sstable_deletion_gate = [this] {
                return _sstable_deletion_gate.close();
            };
            return _compaction_manager.remove(this)
                    .then(close_streaming_flush_gate)
                    .then(close_sstable_deletion_gate);
        });
    });
}
// Returns an I/O error handler that deliberately ignores the exception it is
// given: nothing is done about sstable exceptions from the upload directory,
// the caller will just rethrow them.
static io_error_handler error_handler_for_upload_dir() {
    io_error_handler ignore_error = [] (std::exception_ptr) {
        // intentionally empty
    };
    return ignore_error;
}
// This function will iterate through the upload directory of a column family,
// and will do the following for each sstable found:
// 1) Mutate sstable level to 0.
// 2) Create hard links to its components in column family dir.
// 3) Remove all of its components in upload directory.
// At the end, it's expected that upload dir is empty and all of its
// previous content was moved to column family dir.
//
// Only TOC components are used to discover sstables. Each sstable is
// processed on the shard given by
// column_family::calculate_shard_from_sstable_generation() for its
// generation, and receives a new generation from that shard's column family.
// Loading fails with std::runtime_error for a counter table when the sstable
// has no Scylla component.
//
// Returns a vector containing the descriptors of the sstables to be loaded,
// updated with their new generation and directory.
future>
distributed_loader::flush_upload_dir(distributed& db, sstring ks_name, sstring cf_name) {
struct work {
// TOC descriptors found in the upload directory, keyed by generation.
std::unordered_map descriptors;
// Descriptors of the sstables already moved to the column family directory.
std::vector flushed;
};
return do_with(work(), [&db, ks_name = std::move(ks_name), cf_name = std::move(cf_name)] (work& work) {
auto& cf = db.local().find_column_family(ks_name, cf_name);
return lister::scan_dir(lister::path(cf._config.datadir) / "upload", { directory_entry_type::regular },
[&work] (lister::path parent_dir, directory_entry de) {
auto comps = sstables::entry_descriptor::make_descriptor(parent_dir.native(), de.name);
if (comps.component != component_type::TOC) {
return make_ready_future<>();
}
work.descriptors.emplace(comps.generation, std::move(comps));
return make_ready_future<>();
}, &column_family::manifest_json_filter).then([&db, ks_name = std::move(ks_name), cf_name = std::move(cf_name), &work] {
work.flushed.reserve(work.descriptors.size());
// Sstables are handled one at a time; each one is processed on the shard derived from its generation.
return do_for_each(work.descriptors, [&db, ks_name, cf_name, &work] (auto& pair) {
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(pair.first),
[ks_name, cf_name, &work, comps = pair.second] (database& db) {
auto& cf = db.find_column_family(ks_name, cf_name);
auto sst = sstables::make_sstable(cf.schema(), cf._config.datadir + "/upload", comps.generation,
comps.version, comps.format, gc_clock::now(),
[] (disk_error_signal_type&) { return error_handler_for_upload_dir(); });
// Generation the sstable will have once linked into the column family directory.
auto gen = cf.calculate_generation_for_new_table();
// Read toc content as it will be needed for moving and deleting a sstable.
return sst->read_toc().then([sst, s = cf.schema()] {
if (s->is_counter() && !sst->has_scylla_component()) {
return make_exception_future<>(std::runtime_error("Loading non-Scylla SSTables containing counters is not supported. Use sstableloader instead."));
}
return sst->mutate_sstable_level(0);
}).then([&cf, sst, gen] {
return sst->create_links(cf._config.datadir, gen);
}).then([sst] {
return sstables::remove_by_toc_name(sst->toc_filename(), error_handler_for_upload_dir());
}).then([sst, &cf, gen, comps = comps, &work] () mutable {
// Report the sstable under its new generation and location.
comps.generation = gen;
comps.sstdir = cf._config.datadir;
return make_ready_future(std::move(comps));
});
}).then([&work] (sstables::entry_descriptor comps) mutable {
// Runs back on the calling shard, which owns `work`.
work.flushed.push_back(std::move(comps));
return make_ready_future<>();
});
});
}).then([&work] {
return make_ready_future>(std::move(work.flushed));
});
});
}
// Renumbers sstables found in the table's data directory that are not yet
// known to the system.
//
// all_generations: generations of all live sstables; TOC files with one of
//                  these generations are skipped.
// start:           first generation to assign; 0 is treated as 1.
//
// The directory is scanned for TOC components. For every unknown sstable the
// TOC is read, and then, one sstable at a time in ascending order of their
// current generation, each sstable is assigned the next generation counting
// up from `start` (set_generation() is only called when the generation
// actually changes).
//
// Returns the descriptors of the processed sstables, carrying their new
// generations.
future>
table::reshuffle_sstables(std::set all_generations, int64_t start) {
    struct work {
        int64_t current_gen;
        std::set all_generations; // Stores generation of all live sstables in the system.
        std::map sstables;
        std::unordered_map descriptors;
        std::vector reshuffled;
        work(int64_t start, std::set gens)
            : current_gen(start ? start : 1)
            , all_generations(std::move(gens)) {}
    };

    return do_with(work(start, std::move(all_generations)), [this] (work& work) {
        return lister::scan_dir(_config.datadir, { directory_entry_type::regular }, [this, &work] (lister::path parent_dir, directory_entry de) {
            auto comps = sstables::entry_descriptor::make_descriptor(parent_dir.native(), de.name);
            if (comps.component != component_type::TOC) {
                return make_ready_future<>();
            }
            // Read the generation once, up front: `comps` is moved into
            // work.descriptors below and must not be inspected afterwards.
            const auto generation = comps.generation;
            // Skip generations that were already loaded by Scylla at a previous stage.
            if (work.all_generations.count(generation) != 0) {
                return make_ready_future<>();
            }
            auto sst = sstables::make_sstable(_schema,
                                             _config.datadir, generation,
                                             comps.version, comps.format);
            // Keep the position of the entry so that no second lookup is needed.
            // If the generation is already present, this refers to the existing entry.
            auto pos = work.sstables.emplace(generation, std::move(sst)).first;
            work.descriptors.emplace(generation, std::move(comps));
            // FIXME: This is the only place in which we actually issue disk activity aside from
            // directory metadata operations.
            //
            // But without the TOC information, we don't know which files we should link.
            // The alternative to that would be to change create link to try creating a
            // link for all possible files and handling the failures gracefully, but that's not
            // exactly fast either.
            //
            // Those SSTables are not known by anyone in the system. So we don't have any kind of
            // object describing them. There isn't too much of a choice.
            return pos->second->read_toc();
        }, &manifest_json_filter).then([&work] {
            // Note: cannot be parallel because we will be shuffling things around at this stage. Can't race.
            return do_for_each(work.sstables, [&work] (auto& pair) {
                auto&& comps = std::move(work.descriptors.at(pair.first));
                comps.generation = work.current_gen;
                work.reshuffled.push_back(std::move(comps));
                if (pair.first == work.current_gen) {
                    // Already has the generation it would be given; nothing to rename.
                    ++work.current_gen;
                    return make_ready_future<>();
                }
                return pair.second->set_generation(work.current_gen++);
            });
        }).then([&work] {
            return make_ready_future>(std::move(work.reshuffled));
        });
    });
}
// Metric labels attached to every per-table metric registered by
// table::set_metrics(): "cf" carries the table name, "ks" the keyspace name.
seastar::metrics::label column_family_label{"cf"};
seastar::metrics::label keyspace_label{"ks"};
// Registers this table's metrics in the "column_family" group, labelled with
// the table and keyspace names. Nothing is registered unless
// _config.enable_metrics_reporting is set.
//
// Registered groups: general flush/disk/compaction statistics, row-lock
// statistics for each of the four lock kinds, view-update counters (base
// tables only) and latency/cache-hit-rate metrics (skipped for the system,
// schema-tables v3 and "system_traces" keyspaces).
void table::set_metrics() {
    auto cf = column_family_label(_schema->cf_name());
    auto ks = keyspace_label(_schema->ks_name());
    namespace ms = seastar::metrics;

    if (!_config.enable_metrics_reporting) {
        return;
    }

    _metrics.add_group("column_family", {
        ms::make_derive("memtable_switch", ms::description("Number of times flush has resulted in the memtable being switched out"), _stats.memtable_switch_count)(cf)(ks),
        ms::make_gauge("pending_tasks", ms::description("Estimated number of tasks pending for this column family"), _stats.pending_flushes)(cf)(ks),
        ms::make_gauge("live_disk_space", ms::description("Live disk space used"), _stats.live_disk_space_used)(cf)(ks),
        ms::make_gauge("total_disk_space", ms::description("Total disk space used"), _stats.total_disk_space_used)(cf)(ks),
        ms::make_gauge("live_sstable", ms::description("Live sstable count"), _stats.live_sstable_count)(cf)(ks),
        ms::make_gauge("pending_compaction", ms::description("Estimated number of compactions pending for this column family"), _stats.pending_compactions)(cf)(ks)
    });

    // Metrics related to row locking: one set of metrics per lock kind.
    auto register_row_lock_group = [this, ks, cf] (row_locker::single_lock_stats& stats, sstring stat_name) {
        _metrics.add_group("column_family", {
            ms::make_total_operations(sprint("row_lock_%s_acquisitions", stat_name), stats.lock_acquisitions, ms::description(sprint("Row lock acquisitions for %s lock", stat_name)))(cf)(ks),
            ms::make_queue_length(sprint("row_lock_%s_operations_currently_waiting_for_lock", stat_name), stats.operations_currently_waiting_for_lock, ms::description(sprint("Operations currently waiting for %s lock", stat_name)))(cf)(ks),
            ms::make_histogram(sprint("row_lock_%s_waiting_time", stat_name), ms::description(sprint("Histogram representing time that operations spent on waiting for %s lock", stat_name)),
                    [&stats] {return stats.estimated_waiting_for_lock.get_histogram(std::chrono::microseconds(100));})(cf)(ks)
        });
    };
    register_row_lock_group(_row_locker_stats.exclusive_row, "exclusive_row");
    register_row_lock_group(_row_locker_stats.shared_row, "shared_row");
    register_row_lock_group(_row_locker_stats.exclusive_partition, "exclusive_partition");
    register_row_lock_group(_row_locker_stats.shared_partition, "shared_partition");

    // View metrics are created only for base tables, so there's no point in adding them to views (which cannot act as base tables for other views)
    const bool is_base_table = !_schema->is_view();
    if (is_base_table) {
        _metrics.add_group("column_family", {
            ms::make_total_operations("view_updates_pushed_remote", _view_stats.view_updates_pushed_remote, ms::description("Number of updates (mutations) pushed to remote view replicas"))(cf)(ks),
            ms::make_total_operations("view_updates_failed_remote", _view_stats.view_updates_failed_remote, ms::description("Number of updates (mutations) that failed to be pushed to remote view replicas"))(cf)(ks),
            ms::make_total_operations("view_updates_pushed_local", _view_stats.view_updates_pushed_local, ms::description("Number of updates (mutations) pushed to local view replicas"))(cf)(ks),
            ms::make_total_operations("view_updates_failed_local", _view_stats.view_updates_failed_local, ms::description("Number of updates (mutations) that failed to be pushed to local view replicas"))(cf)(ks),
        });
    }

    const bool excluded_keyspace = _schema->ks_name() == db::system_keyspace::NAME
            || _schema->ks_name() == db::schema_tables::v3::NAME
            || _schema->ks_name() == "system_traces";
    if (!excluded_keyspace) {
        _metrics.add_group("column_family", {
            ms::make_histogram("read_latency", ms::description("Read latency histogram"), [this] {return _stats.estimated_read.get_histogram(std::chrono::microseconds(100));})(cf)(ks),
            ms::make_histogram("write_latency", ms::description("Write latency histogram"), [this] {return _stats.estimated_write.get_histogram(std::chrono::microseconds(100));})(cf)(ks),
            ms::make_gauge("cache_hit_rate", ms::description("Cache hit rate"), [this] {return float(_global_cache_hit_rate);})(cf)(ks)
        });
    }
}
// Recomputes the live disk space and live sstable count from scratch, after
// the sstable list has been re-created. Both the current sstables and the
// ones that were compacted but not yet deleted are accounted for.
// Note that total_disk_space_used is not reset here; it keeps accumulating.
void table::rebuild_statistics() {
    _stats.live_sstable_count = 0;
    _stats.live_disk_space_used = 0;

    // The "move" below might seem dangerous, but it just avoids constness,
    // making the two ranges compatible when compiling with boost 1.55.
    // No one is actually moving anything...
    for (auto&& sst : boost::range::join(_sstables_compacted_but_not_deleted,
                                         std::move(*_sstables->all()))) {
        const auto& owners = sst->get_shards_for_this_sstable();
        update_stats_for_new_sstable(sst->bytes_on_disk(), owners);
    }
}
// Replaces the table's sstable set with a newly built one that contains
// new_sstables plus every current sstable, except those listed in
// old_sstables. The current set object itself is not modified.
void
table::rebuild_sstable_list(const std::vector& new_sstables,
                            const std::vector& old_sstables) {
    auto current_sstables = _sstables;
    auto rebuilt_set = _compaction_strategy.make_sstable_set(_schema);
    std::unordered_set s(old_sstables.begin(), old_sstables.end());

    // The "move" below might seem dangerous, but it just avoids constness,
    // making the two ranges compatible when compiling with boost 1.55.
    // No one is actually moving anything...
    for (auto&& candidate : boost::range::join(new_sstables, std::move(*current_sstables->all()))) {
        if (s.count(candidate)) {
            continue;
        }
        rebuilt_set.insert(candidate);
    }
    _sstables = make_lw_shared(std::move(rebuilt_set));
}
// Called when a compaction has finished: swaps the compacted sstables for
// the newly produced ones and starts deleting the old ones in the background.
//
// new_sstables: sstables produced by the compaction.
// sstables_to_remove: sstables that were compacted and are to be deleted.
//
// Throws std::runtime_error if any sstable to remove is shared by more than
// one shard or does not belong to the current shard.
void
table::on_compaction_completion(const std::vector& new_sstables,
const std::vector& sstables_to_remove) {
// Build a new list of _sstables: We remove from the existing list the
// tables we compacted (by now, there might be more sstables flushed
// later), and we add the new tables generated by the compaction.
// We create a new list rather than modifying it in-place, so that
// on-going reads can continue to use the old list.
//
// We only remove old sstables after they are successfully deleted,
// to avoid a new compaction from ignoring data in the old sstables
// if the deletion fails (note deletion of shared sstables can take
// unbounded time, because all shards must agree on the deletion).
// make sure all old sstables belong *ONLY* to current shard before we proceed to their deletion.
for (auto& sst : sstables_to_remove) {
auto shards = sst->get_shards_for_this_sstable();
if (shards.size() > 1) {
throw std::runtime_error(sprint("A regular compaction for %s.%s INCORRECTLY used shared sstable %s. Only resharding work with those!",
_schema->ks_name(), _schema->cf_name(), sst->toc_filename()));
}
if (!belongs_to_current_shard(shards)) {
throw std::runtime_error(sprint("A regular compaction for %s.%s INCORRECTLY used sstable %s which doesn't belong to this shard!",
_schema->ks_name(), _schema->cf_name(), sst->toc_filename()));
}
}
auto new_compacted_but_not_deleted = _sstables_compacted_but_not_deleted;
// rebuilding _sstables_compacted_but_not_deleted first to make the entire rebuild operation exception safe.
new_compacted_but_not_deleted.insert(new_compacted_but_not_deleted.end(), sstables_to_remove.begin(), sstables_to_remove.end());
rebuild_sstable_list(new_sstables, sstables_to_remove);
_sstables_compacted_but_not_deleted = std::move(new_compacted_but_not_deleted);
rebuild_statistics();
// This is done in the background, so we can consider this compaction completed.
// The returned future is intentionally not awaited here; _sstable_deletion_gate tracks the deletion instead.
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
return sstables::delete_atomically(sstables_to_remove, *get_large_partition_handler()).then_wrapped([this, sstables_to_remove] (future<> f) {
// Remember a deletion failure so that the bookkeeping below runs first and the error is propagated afterwards.
std::exception_ptr eptr;
try {
f.get();
} catch(...) {
eptr = std::current_exception();
}
// unconditionally remove compacted sstables from _sstables_compacted_but_not_deleted,
// or they could stay forever in the set, resulting in deleted files remaining
// opened and disk space not being released until shutdown.
std::unordered_set s(
sstables_to_remove.begin(), sstables_to_remove.end());
auto e = boost::range::remove_if(_sstables_compacted_but_not_deleted, [&] (sstables::shared_sstable sst) -> bool {
return s.count(sst);
});
_sstables_compacted_but_not_deleted.erase(e, _sstables_compacted_but_not_deleted.end());
rebuild_statistics();
if (eptr) {
return make_exception_future<>(eptr);
}
return make_ready_future<>();
}).then([this] {
// refresh underlying data source in row cache to prevent it from holding reference
// to sstables files which were previously deleted.
_cache.refresh_snapshot();
});
});
}
// For replace/remove_ancestors_needed_rewrite, note that we need to update the compaction backlog
// manually. The new tables will be coming from a remote shard and thus unaccounted for in our
// list so far, and the removed ones will no longer be needed by us.
//
// Adds new_sstables to the table and drops, from both the sstable list and
// _sstables_need_rewrite, every pending-rewrite sstable that is an ancestor
// of one of them. Statistics are rebuilt afterwards.
void table::replace_ancestors_needed_rewrite(std::vector new_sstables) {
    std::vector old_sstables;
    std::unordered_set ancestors;

    for (auto& fresh : new_sstables) {
        auto fresh_ancestors = fresh->ancestors();
        ancestors.insert(fresh_ancestors.begin(), fresh_ancestors.end());
        _compaction_strategy.get_backlog_tracker().add_sstable(fresh);
    }

    for (auto& ancestor : ancestors) {
        auto pending = _sstables_need_rewrite.find(ancestor);
        if (pending == _sstables_need_rewrite.end()) {
            continue;
        }
        old_sstables.push_back(pending->second);
        _compaction_strategy.get_backlog_tracker().remove_sstable(pending->second);
        _sstables_need_rewrite.erase(pending);
    }

    rebuild_sstable_list(new_sstables, old_sstables);
    rebuild_statistics();
}
// Drops every sstable in _sstables_need_rewrite whose key is listed in
// `ancestors`: it is removed from the compaction backlog tracker, from the
// pending-rewrite map and from the table's sstable list. Statistics are
// rebuilt afterwards. Keys with no pending-rewrite entry are ignored.
void table::remove_ancestors_needed_rewrite(std::unordered_set ancestors) {
    std::vector old_sstables;

    for (auto& ancestor : ancestors) {
        auto pending = _sstables_need_rewrite.find(ancestor);
        if (pending == _sstables_need_rewrite.end()) {
            continue;
        }
        old_sstables.push_back(pending->second);
        _compaction_strategy.get_backlog_tracker().remove_sstable(pending->second);
        _sstables_need_rewrite.erase(pending);
    }

    rebuild_sstable_list({}, old_sstables);
    rebuild_statistics();
}
// Compacts the sstables named by `descriptor` under the read side of
// _sstables_lock.
//
// Returns immediately when the descriptor lists no sstables. After the
// compaction, the compaction strategy is notified and
// on_compaction_completion() replaces the inputs with the outputs. For
// compactions of type Compaction, and only when a query context exists, an
// entry is also written to the compaction history.
future<>
table::compact_sstables(sstables::compaction_descriptor descriptor, bool cleanup) {
    if (descriptor.sstables.empty()) {
        // if there is nothing to compact, just return.
        return make_ready_future<>();
    }

    return with_lock(_sstables_lock.for_read(), [this, descriptor = std::move(descriptor), cleanup] () mutable {
        // Factory for the output sstables of the compaction.
        auto make_output_sstable = [this] {
            auto gen = this->calculate_generation_for_new_table();
            auto output = sstables::make_sstable(_schema, _config.datadir, gen,
                    get_highest_supported_format(),
                    sstables::sstable::format_types::big);
            output->set_unshared();
            return output;
        };
        // Keep our own copy of the inputs: the descriptor is moved into the compaction.
        auto inputs = descriptor.sstables;
        return sstables::compact_sstables(std::move(descriptor), *this, make_output_sstable,
                cleanup).then([this, inputs = std::move(inputs)] (auto info) {
            _compaction_strategy.notify_completion(inputs, info.new_sstables);
            this->on_compaction_completion(info.new_sstables, inputs);
            return info;
        });
    }).then([this] (auto info) {
        // Skip the history update for other compaction types, and when running
        // without a query context, for example, when running a test case.
        const bool record_history = info.type == sstables::compaction_type::Compaction && db::qctx;
        if (!record_history) {
            return make_ready_future<>();
        }
        // FIXME: add support to merged_rows. merged_rows is a histogram that
        // shows how many sstables each row is merged from. This information
        // cannot be accessed until we make combined_reader more generic,
        // for example, by adding a reducer method.
        return db::system_keyspace::update_compaction_history(info.ks, info.cf, info.ended_at,
                info.start_size, info.end_size, std::unordered_map{});
    });
}
// Tells whether an sstable needs cleanup with respect to a set of owned
// token ranges.
//
// Returns false if the token range spanned by the sstable's first and last
// partition keys is fully contained in at least one of owned_ranges, and
// true otherwise (including when owned_ranges is empty).
static bool needs_cleanup(const sstables::shared_sstable& sst,
                          const dht::token_range_vector& owned_ranges,
                          schema_ptr s) {
    auto first_key = sst->get_first_partition_key();
    auto last_key = sst->get_last_partition_key();
    auto lowest_token = dht::global_partitioner().get_token(*s, first_key);
    auto highest_token = dht::global_partitioner().get_token(*s, last_key);
    dht::token_range sstable_span = dht::token_range::make(lowest_token, highest_token);

    for (auto& owned : owned_ranges) {
        const bool fully_covered = owned.contains(sstable_span, dht::token_comparator());
        if (fully_covered) {
            return false;
        }
    }
    return true;
}
// Runs a cleanup compaction, one sstable at a time, on the sstables of
// `descriptor`.
//
// An sstable is skipped when the node's local ranges for this keyspace are
// non-empty and needs_cleanup() reports that the sstable is fully covered by
// one of them. Every other sstable is compacted on its own, at its current
// level, with the cleanup flag set.
future<> table::cleanup_sstables(sstables::compaction_descriptor descriptor) {
    dht::token_range_vector local_ranges = service::get_local_storage_service().get_local_ranges(_schema->ks_name());
    return do_with(std::move(descriptor.sstables), std::move(local_ranges), [this] (auto& candidates, auto& owned_ranges) {
        return do_for_each(candidates, [this, &owned_ranges] (auto& sst) {
            const bool can_skip = !owned_ranges.empty() && !needs_cleanup(sst, owned_ranges, _schema);
            if (can_skip) {
                return make_ready_future<>();
            }
            // this semaphore ensures that only one cleanup will run per shard.
            // That's to prevent node from running out of space when almost all sstables
            // need cleanup, so if sstables are cleaned in parallel, we may need almost
            // twice the disk space used by those sstables.
            static thread_local semaphore sem(1);
            return with_semaphore(sem, 1, [this, &sst] {
                sstables::compaction_descriptor single_sstable({ sst }, sst->get_sstable_level());
                return this->compact_sstables(std::move(single_sstable), true);
            });
        });
    });
}
// Submits a major compaction of this table to the compaction manager and
// returns the future the manager hands back.
// Note: We assume that the column_family does not get destroyed during compaction.
future<>
table::compact_all_sstables() {
    auto major_compaction = _compaction_manager.submit_major_compaction(this);
    return major_compaction;
}
// Installs the compaction strategy named by the table's schema.
void table::start_compaction() {
    const auto strategy_from_schema = _schema->compaction_strategy();
    set_compaction_strategy(strategy_from_schema);
}
void table::trigger_compaction() {
// Submitting compaction job to compaction manager.
do_trigger_compaction(); // see below
}
void table::try_trigger_compaction() noexcept {
try {
trigger_compaction();
} catch (...) {
dblog.error("Failed to trigger compaction: {}", std::current_exception());
}
}
// Submits this table to the compaction manager, unless compaction is
// currently disabled for it.
void table::do_trigger_compaction() {
    if (_compaction_disabled) {
        // We're locked out; don't submit.
        return;
    }
    _compaction_manager.submit(this);
}
// Runs the compaction described by `descriptor`; forwards to
// compact_sstables() without passing the cleanup argument.
future<> table::run_compaction(sstables::compaction_descriptor descriptor) {
    auto compaction_done = compact_sstables(std::move(descriptor));
    return compaction_done;
}
// Switches the table to a new compaction strategy.
//
// A strategy of the requested type is created from the schema's options, its
// backlog tracker is registered with the compaction manager and receives the
// ongoing charges of the current tracker (read charges are only moved when
// the strategy type does not change). All current sstables are added to the
// new tracker and to a new sstable set built by the new strategy. If read
// charges were not moved, the compaction manager stops tracking this table's
// ongoing compactions. Only then are the strategy and the sstable set
// replaced.
void table::set_compaction_strategy(sstables::compaction_strategy_type strategy) {
    dblog.debug("Setting compaction strategy of {}.{} to {}", _schema->ks_name(), _schema->cf_name(), sstables::compaction_strategy::name(strategy));
    auto new_cs = make_compaction_strategy(strategy, _schema->compaction_strategy_options());
    auto&& incoming_tracker = new_cs.get_backlog_tracker();

    _compaction_manager.register_backlog_tracker(incoming_tracker);
    auto move_read_charges = new_cs.type() == _compaction_strategy.type();
    _compaction_strategy.get_backlog_tracker().transfer_ongoing_charges(incoming_tracker, move_read_charges);

    auto new_sstables = new_cs.make_sstable_set(_schema);
    for (auto&& existing : *_sstables->all()) {
        incoming_tracker.add_sstable(existing);
        new_sstables.insert(existing);
    }

    if (!move_read_charges) {
        _compaction_manager.stop_tracking_ongoing_compactions(this);
    }

    // now exception safe:
    _compaction_strategy = std::move(new_cs);
    _sstables = std::move(new_sstables);
}
// Returns the number of sstables currently in the table's sstable set.
size_t table::sstables_count() const {
    auto all_sstables = _sstables->all();
    return all_sstables->size();
}
// Returns, for every sstable level up to the highest one in use, the number
// of sstables at that level. The result is indexed by level and is empty
// when the table has no sstables.
std::vector table::sstable_count_per_level() const {
    std::vector count_per_level;
    for (auto&& sst : *_sstables->all()) {
        auto level = sst->get_sstable_level();
        // Grow the histogram on demand so that `level` is a valid index.
        const auto required_size = level + 1;
        if (required_size > count_per_level.size()) {
            count_per_level.resize(required_size, 0UL);
        }
        count_per_level[level]++;
    }
    return count_per_level;
}
// Returns the number of unleveled sstables; currently always zero.
int64_t table::get_unleveled_sstables() const {
    // TODO: when we support leveled compaction, we should return the number of
    // SSTables in L0. If leveled compaction is enabled in this column family,
    // then we should return zero, as we currently do.
    const int64_t unleveled_count = 0;
    return unleveled_count;
}
// Finds the sstables that contain a given partition.
//
// key: partition key in nodetool-style string form, parsed with
//      partition_key::from_nodetool_style_string().
//
// The key is decorated with the global partitioner, the sstable set's
// incremental selector picks the candidate sstables for it, and each
// candidate is asked, one at a time, whether it has the partition key.
//
// Returns the file names (get_filename()) of the sstables that reported
// containing the key.
future> table::get_sstables_by_partition_key(const sstring& key) const {
    return do_with(std::unordered_set(), lw_shared_ptr(make_lw_shared(get_sstable_set().make_incremental_selector())),
            partition_key(partition_key::from_nodetool_style_string(_schema, key)),
            [this] (std::unordered_set& filenames, lw_shared_ptr& sel, partition_key& pk) {
        return do_with(dht::decorated_key(dht::global_partitioner().decorate_key(*_schema, pk)),
                [this, &filenames, &sel, &pk](dht::decorated_key& dk) mutable {
            auto sst = sel->select(dk).sstables;
            auto hk = sstables::sstable::make_hashed_key(*_schema, dk.key());
            return do_for_each(sst, [this, &filenames, &dk, hk = std::move(hk)] (std::vector::const_iterator::reference s) mutable {
                auto name = s->get_filename();
                return s->has_partition_key(hk, dk).then([name = std::move(name), &filenames] (bool contains) mutable {
                    if (contains) {
                        filenames.insert(name);
                    }
                });
            });
        }).then([&filenames] {
            // The set is owned by the enclosing do_with and is not used again,
            // so hand it over to the result instead of copying it.
            return make_ready_future>(std::move(filenames));
        });
    });
}
// Returns a reference to the table's current sstable set.
const sstables::sstable_set& table::get_sstable_set() const {
    const sstables::sstable_set& current_set = *_sstables;
    return current_set;
}
// Returns the list of all sstables in the table's current sstable set.
lw_shared_ptr table::get_sstables() const {
    auto all_sstables = _sstables->all();
    return all_sstables;
}
// Returns the sstables that the current sstable set selects for the given
// partition range.
std::vector table::select_sstables(const dht::partition_range& range) const {
    auto selected = _sstables->select(range);
    return selected;
}
// Returns the table's sstables that may be compacted: all current sstables
// except those whose generation is recorded in _sstables_need_rewrite.
std::vector table::candidates_for_compaction() const {
    auto not_awaiting_rewrite = [this] (auto& sst) {
        return !_sstables_need_rewrite.count(sst->generation());
    };
    return boost::copy_range>(*get_sstables()
            | boost::adaptors::filtered(not_awaiting_rewrite));
}
// Returns the sstables stored as values in _sstables_need_rewrite.
std::vector table::sstables_need_rewrite() const {
    auto pending_rewrite = _sstables_need_rewrite | boost::adaptors::map_values;
    return boost::copy_range>(pending_rewrite);
}
// Gets the list of all sstables in the column family, including ones that are
// not used for active queries because they have already been compacted, but are
// waiting for delete_atomically() to return.
//
// As long as we haven't deleted them, compaction needs to ensure it doesn't
// garbage-collect a tombstone that covers data in an sstable that may not be
// successfully deleted.
lw_shared_ptr table::get_sstables_including_compacted_undeleted() const {
// Fast path: nothing is pending deletion, so the regular list can be returned as is.
if (_sstables_compacted_but_not_deleted.empty()) {
return get_sstables();
}
// Otherwise build a fresh copy of the current list and add the
// compacted-but-undeleted sstables to that copy.
auto ret = make_lw_shared(*_sstables->all());
for (auto&& s : _sstables_compacted_but_not_deleted) {
ret->insert(s);
}
return ret;
}
// Gives read-only access to _sstables_compacted_but_not_deleted. The result is
// a reference to the member, not a copy.
const std::vector& table::compacted_undeleted_sstables() const {
return _sstables_compacted_but_not_deleted;
}
// Directory-listing predicate: returns false only for a non-directory entry
// named "manifest.json", and true for every other entry. The path argument is
// not consulted.
inline bool table::manifest_json_filter(const lister::path&, const directory_entry& entry) {
    // An entry whose type is unknown is treated as a regular file, so for such
    // entries the decision rests on the name alone.
    const auto entry_type = entry.type.value_or(directory_entry_type::regular);
    const bool is_manifest_file = entry_type != directory_entry_type::directory && entry.name == "manifest.json";
    return !is_manifest_file;
}
// TODO: possibly move it to seastar
//
// Invokes `func` on every shard listed in `shards` through s.invoke_on(),
// passing it the shard's Service instance together with `ptr` wrapped by
// make_foreign(). The invocations are started via parallel_for_each(), whose
// future is returned.
template
static future<> invoke_shards_with_ptr(std::unordered_set shards, distributed& s, PtrType ptr, Func&& func) {
// `s` and `func` are captured by reference here; the lambda handed to
// invoke_on() below takes its own copy of `func`.
return parallel_for_each(std::move(shards), [&s, &func, ptr] (shard_id id) {
return s.invoke_on(id, [func, foreign = make_foreign(ptr)] (Service& s) mutable {
return func(s, std::move(foreign));
});
});
}
// Loads the shared components of the sstable described by `comps` on one
// shard and then calls `func` on every shard interested in that sstable,
// passing the shard's column family and a foreign_sstable_open_info built from
// the loaded components. `pc` is the I/O priority class forwarded to
// load_shared_components().
future<> distributed_loader::open_sstable(distributed& db, sstables::entry_descriptor comps,
std::function (column_family&, sstables::foreign_sstable_open_info)> func, const io_priority_class& pc) {
// loads components of a sstable from shard S and share it with all other
// shards. Which shard a sstable will be opened at is decided using
// calculate_shard_from_sstable_generation(), which is the inverse of
// calculate_generation_for_new_table(). That ensures every sstable is
// shard-local if reshard wasn't performed. This approach is also expected
// to distribute evenly the resource usage among all shards.
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
[&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
// Each load takes one unit of the shard's sstable_load_concurrency_sem() for its duration.
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
auto& cf = local.find_column_family(comps.ks, comps.cf);
auto f = sstables::sstable::load_shared_components(cf.schema(), comps.sstdir, comps.generation, comps.version, comps.format, pc);
return f.then([&db, comps = std::move(comps), func = std::move(func)] (sstables::sstable_open_info info) {
// shared components loaded, now opening sstable in all shards that own it with shared components
return do_with(std::move(info), [&db, comps = std::move(comps), func = std::move(func)] (auto& info) {
// All shards that own the sstable is interested in it in addition to shard that
// is responsible for its generation. We may need to add manually this shard
// because sstable may not contain data that belong to it.
auto shards_interested_in_this_sstable = boost::copy_range>(info.owners);
shard_id shard_responsible_for_generation = column_family::calculate_shard_from_sstable_generation(comps.generation);
shards_interested_in_this_sstable.insert(shard_responsible_for_generation);
// The lambda below carries its own copy of the owners plus dup()'ed
// data and index handles; the components are moved out of `info`.
return invoke_shards_with_ptr(std::move(shards_interested_in_this_sstable), db, std::move(info.components),
[owners = info.owners, data = info.data.dup(), index = info.index.dup(), comps, func] (database& db, auto components) {
auto& cf = db.find_column_family(comps.ks, comps.cf);
return func(cf, sstables::foreign_sstable_open_info{std::move(components), owners, data, index});
});
});
});
});
});
}
// global_column_family_ptr provides a way to easily retrieve local instance of a given column family.
//
// The table is identified by the schema id resolved once at construction time;
// every dereference looks that id up again in _db.local().
class global_column_family_ptr {
distributed& _db;
utils::UUID _id;
private:
// Looks the column family up by id in the local database instance.
column_family& get() const { return _db.local().find_column_family(_id); }
public:
// Resolves ks_name.cf_name through the local database instance and stores the table's schema id.
global_column_family_ptr(distributed& db, sstring ks_name, sstring cf_name)
: _db(db)
, _id(_db.local().find_column_family(ks_name, cf_name).schema()->id()) {
}
column_family* operator->() const {
return &get();
}
column_family& operator*() const {
return get();
}
};
// Creates and loads one sstable object for every element of `ssts_info` that
// `pred` accepts, using schema `s` and directory `dir`, and resolves to the
// collection of loaded sstables. Elements rejected by `pred` are skipped.
template
static future>
load_sstables_with_open_info(std::vector ssts_info, schema_ptr s, sstring dir, Pred&& pred) {
// The result vector is owned by do_with() so the parallel continuations can append to it.
return do_with(std::vector(), [ssts_info = std::move(ssts_info), s, dir, pred] (auto& ssts) mutable {
return parallel_for_each(std::move(ssts_info), [&ssts, s, dir, pred] (auto& info) mutable {
if (!pred(info)) {
return make_ready_future<>();
}
auto sst = sstables::make_sstable(s, dir, info.generation, info.version, info.format);
// The sstable is appended only after its load() has resolved.
return sst->load(std::move(info)).then([&ssts, sst] {
ssts.push_back(std::move(sst));
return make_ready_future<>();
});
}).then([&ssts] () mutable {
return std::move(ssts);
});
});
}
// Return all sstables that need resharding in the system. Only one instance of a shared sstable is returned.
//
// Every shard contributes the open info of its sstables_need_rewrite() entries
// located in `sstdir`; the reducer loads them and keeps one sstable per generation.
static future> get_all_shared_sstables(distributed& db, sstring sstdir, global_column_family_ptr cf) {
// Reducer for map_reduce(): accumulates loaded sstables keyed by generation.
class all_shared_sstables {
schema_ptr _schema;
sstring _dir;
std::unordered_map _result;
public:
all_shared_sstables(const sstring& sstdir, global_column_family_ptr cf) : _schema(cf->schema()), _dir(sstdir) {}
// Called with the open info produced by one shard.
future<> operator()(std::vector ssts_info) {
return load_sstables_with_open_info(std::move(ssts_info), _schema, _dir, [this] (auto& info) {
// skip loading of shared sstable that is already stored in _result.
return !_result.count(info.generation);
}).then([this] (std::vector sstables) {
for (auto& sst : sstables) {
auto gen = sst->generation();
_result.emplace(gen, std::move(sst));
}
return make_ready_future<>();
});
}
// Extracts the accumulated sstables (the mapped values of _result).
std::vector get() && {
return boost::copy_range>(std::move(_result) | boost::adaptors::map_values);
}
};
return db.map_reduce(all_shared_sstables(sstdir, cf), [cf, sstdir] (database& db) mutable {
// Runs inside seastar::async() because get_open_info() is waited on with get0().
return seastar::async([cf, sstdir] {
return boost::copy_range>(cf->sstables_need_rewrite()
| boost::adaptors::filtered([sstdir] (auto&& sst) { return sst->get_dir() == sstdir; })
| boost::adaptors::transformed([] (auto&& sst) { return sst->get_open_info().get0(); }));
});
});
}
// checks whether or not a given column family is worth resharding by checking if any of its
// sstables has more than one owner shard.
//
// Each shard reports cf->has_shared_sstables(); the per-shard answers are
// combined with logical OR, starting from false.
static future worth_resharding(distributed& db, global_column_family_ptr cf) {
auto has_shared_sstables = [cf] (database& db) {
return cf->has_shared_sstables();
};
return db.map_reduce0(has_shared_sstables, bool(false), std::logical_or());
}
// make a set of sstables available at another shard.
//
// Collects the open info of `sstables` on the calling shard, then on `shard`
// loads sstable objects from that info (schema taken from `cf`, files under
// `directory`) and passes them to `func`. The returned future resolves once
// the work submitted to `shard` has completed.
template
static future<> forward_sstables_to(shard_id shard, sstring directory, std::vector sstables, global_column_family_ptr cf, Func&& func) {
// Runs inside seastar::async() so that futures can be waited on with get()/get0().
return seastar::async([sstables = std::move(sstables), directory, shard, cf, func] () mutable {
auto infos = boost::copy_range>(sstables
| boost::adaptors::transformed([] (auto&& sst) { return sst->get_open_info().get0(); }));
smp::submit_to(shard, [cf, func, infos = std::move(infos), directory] () mutable {
// The predicate accepts every entry: all forwarded sstables are loaded.
return load_sstables_with_open_info(std::move(infos), cf->schema(), directory, [] (auto& p) {
return true;
}).then([func] (std::vector sstables) {
return func(std::move(sstables));
});
}).get();
});
}
// invokes each descriptor at its target shard, which involves forwarding sstables too.
//
// For every job, its sstables are forwarded to job.reshard_at, where `func` is
// run through the compaction manager's run_resharding_job() with the forwarded
// sstables, the job's level and its max_sstable_bytes.
template
static future<> invoke_all_resharding_jobs(global_column_family_ptr cf, sstring directory, std::vector jobs, Func&& func) {
return parallel_for_each(std::move(jobs), [cf, func, &directory] (sstables::resharding_descriptor& job) mutable {
return forward_sstables_to(job.reshard_at, directory, std::move(job.sstables), cf,
[cf, func, level = job.level, max_sstable_bytes = job.max_sstable_bytes] (auto sstables) {
// compaction manager ensures that only one reshard operation will run per shard.
auto job = [func, sstables = std::move(sstables), level, max_sstable_bytes] () mutable {
return func(std::move(sstables), level, max_sstable_bytes);
};
return cf->get_compaction_manager().run_resharding_job(&*cf, std::move(job));
});
});
}
// Returns the subset of `sstables` whose get_shards_for_this_sstable() list
// contains `shard`.
static std::vector sstables_for_shard(const std::vector& sstables, shard_id shard) {
auto belongs_to_shard = [] (const sstables::shared_sstable& sst, unsigned shard) {
auto& shards = sst->get_shards_for_this_sstable();
return boost::range::find(shards, shard) != shards.end();
};
return boost::copy_range>(sstables
| boost::adaptors::filtered([&] (auto& sst) { return belongs_to_shard(sst, shard); }));
}
// Reshards the sstables of table ks_name.cf_name that are tracked as needing a
// rewrite. Must be called on shard 0 (asserted below).
//
// The function returns void: the future produced by with_semaphore() is not
// handed back to the caller, so the resharding work is not awaited here.
void distributed_loader::reshard(distributed& db, sstring ks_name, sstring cf_name) {
assert(engine().cpu_id() == 0); // NOTE: should always run on shard 0!
// ensures that only one column family is resharded at a time (that's okay because
// actual resharding is parallelized), and that's needed to prevent the same column
// family from being resharded in parallel (that could happen, for example, if
// refresh (triggers resharding) is issued by user while resharding is going on).
static semaphore sem(1);
with_semaphore(sem, 1, [&db, ks_name = std::move(ks_name), cf_name = std::move(cf_name)] () mutable {
return seastar::async([&db, ks_name = std::move(ks_name), cf_name = std::move(cf_name)] () mutable {
global_column_family_ptr cf(db, ks_name, cf_name);
// Nothing is done when the compaction manager reports it is stopped.
if (cf->get_compaction_manager().stopped()) {
return;
}
// fast path to detect that this column family doesn't need reshard.
if (!worth_resharding(db, cf).get0()) {
dblog.debug("Nothing to reshard for {}.{}", cf->schema()->ks_name(), cf->schema()->cf_name());
return;
}
// Each data directory is handled separately: collect candidates, ask the
// compaction strategy for jobs, then run the jobs.
parallel_for_each(cf->_config.all_datadirs, [&db, cf] (const sstring& directory) {
auto candidates = get_all_shared_sstables(db, directory, cf).get0();
dblog.debug("{} candidates for resharding for {}.{}", candidates.size(), cf->schema()->ks_name(), cf->schema()->cf_name());
auto jobs = cf->get_compaction_strategy().get_resharding_jobs(*cf, std::move(candidates));
dblog.debug("{} resharding jobs for {}.{}", jobs.size(), cf->schema()->ks_name(), cf->schema()->cf_name());
return invoke_all_resharding_jobs(cf, directory, std::move(jobs), [directory, &cf] (auto sstables, auto level, auto max_sstable_bytes) {
// Factory used by reshard_sstables() to create the output sstable for a given shard.
auto creator = [&cf, directory] (shard_id shard) mutable {
// we need generation calculated by instance of cf at requested shard,
// or resource usage wouldn't be fairly distributed among shards.
auto gen = smp::submit_to(shard, [&cf] () {
return cf->calculate_generation_for_new_table();
}).get0();
auto sst = sstables::make_sstable(cf->schema(), directory, gen,
get_highest_supported_format(), sstables::sstable::format_types::big,
gc_clock::now(), default_io_error_handler_gen());
return sst;
};
auto f = sstables::reshard_sstables(sstables, *cf, creator, max_sstable_bytes, level);
return f.then([&cf, sstables = std::move(sstables), directory] (std::vector new_sstables) mutable {
// an input sstable may belong to shard 1 and 2 and only have data which
// token belongs to shard 1. That means resharding will only create a
// sstable for shard 1, but both shards opened the sstable. So our code
// below should ask both shards to remove the resharded table, or it
// wouldn't be deleted by our deletion manager, and resharding would be
// triggered again in the subsequent boot.
return parallel_for_each(boost::irange(0u, smp::count), [&cf, directory, sstables, new_sstables] (auto shard) {
auto old_sstables_for_shard = sstables_for_shard(sstables, shard);
// nothing to do if no input sstable belongs to this shard.
if (old_sstables_for_shard.empty()) {
return make_ready_future<>();
}
auto new_sstables_for_shard = sstables_for_shard(new_sstables, shard);
// sanity checks
for (auto& sst : new_sstables_for_shard) {
auto& shards = sst->get_shards_for_this_sstable();
if (shards.size() != 1) {
throw std::runtime_error(sprint("resharded sstable %s doesn't belong to only one shard", sst->get_filename()));
}
if (shards.front() != shard) {
throw std::runtime_error(sprint("resharded sstable %s should belong to shard %d", sst->get_filename(), shard));
}
}
if (new_sstables_for_shard.empty()) {
// handles case where sstable needing rewrite doesn't produce any sstable
// for a shard it belongs to when resharded (the reason is explained above).
std::unordered_set ancestors;
boost::range::transform(old_sstables_for_shard, std::inserter(ancestors, ancestors.end()),
std::mem_fn(&sstables::sstable::generation));
return smp::submit_to(shard, [cf, ancestors = std::move(ancestors)] () mutable {
cf->remove_ancestors_needed_rewrite(ancestors);
});
} else {
// The new sstables are forwarded to their shard, which replaces the ancestors with them.
return forward_sstables_to(shard, directory, new_sstables_for_shard, cf, [cf] (auto sstables) {
cf->replace_ancestors_needed_rewrite(sstables);
});
}
}).then([&cf, sstables] {
// schedule deletion of shared sstables after we're certain that new unshared ones were successfully forwarded to respective shards.
// A deletion failure is only logged as a warning; the future of delete_atomically() is not returned from this continuation.
sstables::delete_atomically(std::move(sstables), *cf->get_large_partition_handler()).handle_exception([op = sstables::background_jobs().start()] (std::exception_ptr eptr) {
try {
std::rethrow_exception(eptr);
} catch (...) {
dblog.warn("Exception in resharding when deleting sstable file: {}", eptr);
}
});
});
});
});
}).get();
});
});
}
// Opens the sstables described by `new_tables`, loads them into table ks.cf on
// all shards, and finally calls reshard() for that table on shard 0.
//
// The work happens in three chained steps:
//  1. every descriptor is opened through open_sstable(); each opened sstable
//     is queued in the owning column family's _sstables_opened_but_not_loaded;
//  2. on every shard, the queued sstables are loaded inside a row cache
//     invalidate() call, the queue is cleared and compaction is triggered;
//  3. reshard() is submitted to shard 0.
future<> distributed_loader::load_new_sstables(distributed& db, sstring ks, sstring cf, std::vector new_tables) {
return parallel_for_each(new_tables, [&db] (auto comps) {
auto cf_sstable_open = [comps] (column_family& cf, sstables::foreign_sstable_open_info info) {
auto f = cf.open_sstable(std::move(info), cf._config.datadir, comps.generation, comps.version, comps.format);
return f.then([&cf] (sstables::shared_sstable sst) mutable {
// A null sst is skipped rather than queued.
if (sst) {
cf._sstables_opened_but_not_loaded.push_back(sst);
}
return make_ready_future<>();
});
};
return distributed_loader::open_sstable(db, comps, cf_sstable_open, service::get_local_compaction_priority());
}).then([&db, ks, cf] {
return db.invoke_on_all([ks = std::move(ks), cfname = std::move(cf)] (database& db) {
auto& cf = db.find_column_family(ks, cfname);
return cf.get_row_cache().invalidate([&cf] () noexcept {
// FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
// atomically load all opened sstables into column family.
for (auto& sst : cf._sstables_opened_but_not_loaded) {
cf.load_sstable(sst, true);
}
cf._sstables_opened_but_not_loaded.clear();
cf.trigger_compaction();
});
});
}).then([&db, ks, cf] () mutable {
return smp::submit_to(0, [&db, ks = std::move(ks), cf = std::move(cf)] () mutable {
distributed_loader::reshard(db, std::move(ks), std::move(cf));
});
});
}
// Examines one file `fname` found in `sstdir` and resolves to the entry
// descriptor parsed from its name.
//
// For any component other than the TOC the descriptor is returned right away.
// For a TOC file the sstable is opened through open_sstable() and loaded into
// the column family on each shard that receives it. Failures are logged and
// rethrown.
future distributed_loader::probe_file(distributed& db, sstring sstdir, sstring fname) {
using namespace sstables;
entry_descriptor comps = entry_descriptor::make_descriptor(sstdir, fname);
// Every table will have a TOC. Using a specific file as a criteria, as
// opposed to, say verifying _sstables.count() to be zero is more robust
// against parallel loading of the directory contents.
if (comps.component != component_type::TOC) {
return make_ready_future(std::move(comps));
}
auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
cf.update_sstables_known_generation(comps.generation);
{
// Refuse to add a generation that the column family already holds.
auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
if (i != cf._sstables->all()->end()) {
auto new_toc = sstdir + "/" + fname;
throw std::runtime_error(sprint("Attempted to add sstable generation %d twice: new=%s existing=%s",
comps.generation, new_toc, (*i)->toc_filename()));
}
}
return cf.open_sstable(std::move(info), sstdir, comps.generation, comps.version, comps.format).then([&cf] (sstables::shared_sstable sst) mutable {
// A null sst is skipped; otherwise it is loaded inside a row cache invalidate() call.
if (sst) {
return cf.get_row_cache().invalidate([&cf, sst = std::move(sst)] () mutable noexcept {
// FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
cf.load_sstable(sst);
});
}
return make_ready_future<>();
});
};
return distributed_loader::open_sstable(db, comps, cf_sstable_open).then_wrapped([fname] (future<> f) {
// Both handlers log and rethrow, so the returned future still fails.
try {
f.get();
} catch (malformed_sstable_exception& e) {
dblog.error("malformed sstable {}: {}. Refusing to boot", fname, e.what());
throw;
} catch(...) {
dblog.error("Unrecognized error while processing {}: {}. Refusing to boot",
fname, std::current_exception());
throw;
}
return make_ready_future<>();
}).then([comps] () mutable {
return make_ready_future(std::move(comps));
});
}
// Scans `sstdir` for regular files, probes each one with probe_file() and
// records, per sstable generation, which kind of TOC file (if any) was seen.
// Once every probe has finished, each recorded generation is checked:
//  - a generation with only a temporary TOC is removed via
//    remove_sstable_with_temp_toc() (on shard 0 only);
//  - a generation without a TOC raises malformed_sstable_exception.
// `ks` and `cf` are only used for that removal call.
future<> distributed_loader::populate_column_family(distributed& db, sstring sstdir, sstring ks, sstring cf) {
// We can catch most errors when we try to load an sstable. But if the TOC
// file is the one missing, we won't try to load the sstable at all. This
// case is still an invalid case, but it is way easier for us to treat it
// by waiting for all files to be loaded, and then checking if we saw a
// file during scan_dir, without its corresponding TOC.
enum class component_status {
has_some_file,
has_toc_file,
has_temporary_toc_file,
};
// What is known so far about one sstable generation.
struct sstable_descriptor {
component_status status;
sstables::sstable::version_types version;
sstables::sstable::format_types format;
};
// Generation -> descriptor; shared by all the continuations created below.
auto verifier = make_lw_shared>();
return do_with(std::vector>(), [&db, sstdir = std::move(sstdir), verifier, ks, cf] (std::vector>& futures) {
return lister::scan_dir(sstdir, { directory_entry_type::regular }, [&db, verifier, &futures] (lister::path sstdir, directory_entry de) {
// FIXME: The secondary indexes are in this level, but with a directory type, (starting with ".")
auto f = distributed_loader::probe_file(db, sstdir.native(), de.name).then([verifier, sstdir, de] (auto entry) {
// A TemporaryStatistics component is simply deleted.
if (entry.component == component_type::TemporaryStatistics) {
return remove_file(sstables::sstable::filename(sstdir.native(), entry.ks, entry.cf, entry.version, entry.generation,
entry.format, component_type::TemporaryStatistics));
}
if (verifier->count(entry.generation)) {
// The generation was seen before: either report a conflicting TOC or upgrade its status.
if (verifier->at(entry.generation).status == component_status::has_toc_file) {
lister::path file_path(sstdir / de.name.c_str());
if (entry.component == component_type::TOC) {
throw sstables::malformed_sstable_exception("Invalid State encountered. TOC file already processed", file_path.native());
} else if (entry.component == component_type::TemporaryTOC) {
throw sstables::malformed_sstable_exception("Invalid State encountered. Temporary TOC file found after TOC file was processed", file_path.native());
}
} else if (entry.component == component_type::TOC) {
verifier->at(entry.generation).status = component_status::has_toc_file;
} else if (entry.component == component_type::TemporaryTOC) {
verifier->at(entry.generation).status = component_status::has_temporary_toc_file;
}
} else {
// First file seen for this generation: record it with the matching status.
if (entry.component == component_type::TOC) {
verifier->emplace(entry.generation, sstable_descriptor{component_status::has_toc_file, entry.version, entry.format});
} else if (entry.component == component_type::TemporaryTOC) {
verifier->emplace(entry.generation, sstable_descriptor{component_status::has_temporary_toc_file, entry.version, entry.format});
} else {
verifier->emplace(entry.generation, sstable_descriptor{component_status::has_some_file, entry.version, entry.format});
}
}
return make_ready_future<>();
});
// push future returned by probe_file into an array of futures,
// so that the supplied callback will not block scan_dir() from
// reading the next entry in the directory.
futures.push_back(std::move(f));
return make_ready_future<>();
}, &column_family::manifest_json_filter).then([&futures] {
return when_all(futures.begin(), futures.end()).then([] (std::vector> ret) {
// Keep the first exception encountered; the remaining futures are
// drained with ignore_ready_future().
std::exception_ptr eptr;
for (auto& f : ret) {
try {
if (eptr) {
f.ignore_ready_future();
} else {
f.get();
}
} catch(...) {
eptr = std::current_exception();
}
}
if (eptr) {
return make_exception_future<>(eptr);
}
return make_ready_future<>();
});
}).then([verifier, sstdir, ks = std::move(ks), cf = std::move(cf)] {
return do_for_each(*verifier, [sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), verifier] (auto v) {
if (v.second.status == component_status::has_temporary_toc_file) {
unsigned long gen = v.first;
sstables::sstable::version_types version = v.second.version;
sstables::sstable::format_types format = v.second.format;
if (engine().cpu_id() != 0) {
dblog.debug("At directory: {}, partial SSTable with generation {} not relevant for this shard, ignoring", sstdir, v.first);
return make_ready_future<>();
}
// shard 0 is the responsible for removing a partial sstable.
return sstables::sstable::remove_sstable_with_temp_toc(ks, cf, sstdir, gen, version, format);
} else if (v.second.status != component_status::has_toc_file) {
throw sstables::malformed_sstable_exception(sprint("At directory: %s: no TOC found for SSTable with generation %d!. Refusing to boot", sstdir, v.first));
}
return make_ready_future<>();
});
});
});
}
// Builds the memtable flush controller. A positive
// memtable_flush_static_shares setting selects the constructor that takes
// that share count; otherwise the controller is built with a 50ms period,
// the virtual dirty soft limit and the supplied callback `fn`.
inline
flush_controller
make_flush_controller(db::config& cfg, seastar::scheduling_group sg, const ::io_priority_class& iop, std::function fn) {
    const bool use_static_shares = cfg.memtable_flush_static_shares() > 0;
    if (!use_static_shares) {
        return flush_controller(sg, iop, 50ms, cfg.virtual_dirty_soft_limit(), std::move(fn));
    }
    return flush_controller(sg, iop, cfg.memtable_flush_static_shares());
}
// Builds the compaction manager from the compaction scheduling group, the
// local compaction I/O priority and the available memory. When
// compaction_static_shares is positive, it is passed as an extra constructor
// argument.
inline
std::unique_ptr
make_compaction_manager(db::config& cfg, database_config& dbcfg) {
if (cfg.compaction_static_shares() > 0) {
return std::make_unique(dbcfg.compaction_scheduling_group, service::get_local_compaction_priority(), dbcfg.available_memory, cfg.compaction_static_shares());
}
return std::make_unique(dbcfg.compaction_scheduling_group, service::get_local_compaction_priority(), dbcfg.available_memory);
}
// Name-based UUID derived from an empty byte string; the two-argument
// constructor below uses it as the initial value of _version.
utils::UUID database::empty_version = utils::UUID_gen::get_name_UUID(bytes{});
// Default constructor: delegates to the two-argument constructor with
// default-constructed db::config and database_config objects.
database::database() : database(db::config(), database_config())
{}
// Constructs the database from a configuration `cfg` (copied into _cfg) and
// the per-instance settings in `dbcfg` (scheduling groups, available memory).
// Besides initializing the members, the body binds the local schema registry
// to this object, starts the compaction manager and registers metrics.
database::database(const db::config& cfg, database_config dbcfg)
: _stats(make_lw_shared())
, _cl_stats(std::make_unique())
, _cfg(std::make_unique(cfg))
// Allow system tables a pool of 10 MB memory to write, but never block on other regions.
, _system_dirty_memory_manager(*this, 10 << 20, cfg.virtual_dirty_soft_limit(), default_scheduling_group())
// Regular writes get 45% of the available memory, streaming writes 10%.
, _dirty_memory_manager(*this, dbcfg.available_memory * 0.45, cfg.virtual_dirty_soft_limit(), dbcfg.statement_scheduling_group)
, _streaming_dirty_memory_manager(*this, dbcfg.available_memory * 0.10, cfg.virtual_dirty_soft_limit(), dbcfg.streaming_scheduling_group)
, _dbcfg(dbcfg)
// The callback handed to the flush controller is the regular manager's
// virtual dirty memory divided by its throttle threshold (captured once here).
, _memtable_controller(make_flush_controller(*_cfg, dbcfg.memtable_scheduling_group, service::get_local_memtable_flush_priority(), [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
return (_dirty_memory_manager.virtual_dirty_memory()) / limit;
}))
, _read_concurrency_sem(max_count_concurrent_reads,
max_memory_concurrent_reads(),
max_inactive_queue_length(),
// First callback: counts the overload and produces the exception to report.
[this] {
++_stats->sstable_read_queue_overloaded;
return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
},
// Second callback: asks the querier cache to evict one entry.
[this] {
return _querier_cache.evict_one();
})
// No timeouts or queue length limits - a failure here can kill an entire repair.
// Trust the caller to limit concurrency.
, _streaming_concurrency_sem(max_count_streaming_concurrent_reads, max_memory_streaming_concurrent_reads())
, _system_read_concurrency_sem(max_count_system_concurrent_reads, max_memory_system_concurrent_reads())
, _data_query_stage("data_query", &column_family::query)
, _mutation_query_stage()
, _apply_stage("db_apply", &database::do_apply)
, _version(empty_version)
, _compaction_manager(make_compaction_manager(*_cfg, dbcfg))
, _enable_incremental_backups(cfg.incremental_backups())
// The querier cache is sized at 4% of the available memory.
, _querier_cache(dbcfg.available_memory * 0.04)
// The configured threshold is given in MB and converted to bytes here.
, _large_partition_handler(std::make_unique(_cfg->compaction_large_partition_warning_threshold_mb()*1024*1024))
, _result_memory_limiter(dbcfg.available_memory / 10)
{
local_schema_registry().init(*this); // TODO: we're never unbound.
_compaction_manager->start();
setup_metrics();
_row_cache_tracker.set_compaction_scheduling_group(dbcfg.memory_compaction_scheduling_group);
dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
}
void backlog_controller::adjust() {
auto backlog = _current_backlog();
if (backlog >= _control_points.back().input) {
update_controller(_control_points.back().output);
return;
}
// interpolate to find out which region we are. This run infrequently and there are a fixed
// number of points so a simple loop will do.
size_t idx = 1;
while ((idx < _control_points.size() - 1) && (_control_points[idx].input < backlog)) {
idx++;
}
control_point& cp = _control_points[idx];
control_point& last = _control_points[idx - 1];
float result = last.output + (backlog - last.input) * (cp.output - last.output)/(cp.input - last.input);
update_controller(result);
}
float backlog_controller::backlog_of_shares(float shares) const {
size_t idx = 1;
while ((idx < _control_points.size() - 1) && (_control_points[idx].output < shares)) {
idx++;
}
const control_point& cp = _control_points[idx];
const control_point& last = _control_points[idx - 1];
// Compute the inverse function of the backlog in the interpolation interval that we fall
// into.
//
// The formula for the backlog inside an interpolation point is y = a + bx, so the inverse
// function is x = (y - a) / b
return last.input + (shares - last.output) * (cp.input - last.input) / (cp.output - last.output);
}
// Applies `shares` to the scheduling group and, unless a previous I/O shares
// update is still in flight, starts an update of the I/O priority class too.
void backlog_controller::update_controller(float shares) {
    _scheduling_group.set_shares(shares);
    // When the previous update has not completed yet this round is skipped;
    // the next timer will fix it.
    if (_inflight_update.available()) {
        _inflight_update = engine().update_shares_for_class(_io_priority, uint32_t(shares));
    }
}
// Registers two gauges in the "memory" metrics group, both prefixed with
// `namestr`:
//   <namestr>_dirty_bytes          reads real_dirty_memory()
//   <namestr>_virtual_dirty_bytes  reads virtual_dirty_memory()
// Both callbacks capture `this`.
void
dirty_memory_manager::setup_collectd(sstring namestr) {
namespace sm = seastar::metrics;
_metrics.add_group("memory", {
sm::make_gauge(namestr + "_dirty_bytes", [this] { return real_dirty_memory(); },
sm::description("Holds the current size of a all non-free memory in bytes: used memory + released memory that hasn't been returned to a free memory pool yet. "
"Total memory size minus this value represents the amount of available memory. "
"If this value minus virtual_dirty_bytes is too high then this means that the dirty memory eviction lags behind.")),
sm::make_gauge(namestr +"_virtual_dirty_bytes", [this] { return virtual_dirty_memory(); },
sm::description("Holds the size of used memory in bytes. Compare it to \"dirty_bytes\" to see how many memory is wasted (neither used nor available).")),
});
}
// Metrics label named "class"; database::setup_metrics() instantiates it with
// the values "user", "streaming" and "system".
static const metrics::label class_label("class");
// Registers the database metrics with _metrics.
//
// First asks each of the three dirty memory managers (regular, system and
// streaming) to register its own gauges, then adds the aggregated "memory"
// group, the "memtables" group and the "database" group defined below.
// The per-class read metrics (active_reads, active_reads_memory_consumption,
// queued_reads) are registered three times under the same name and are
// distinguished by the "class" label instance attached to each of them.
void
database::setup_metrics() {
_dirty_memory_manager.setup_collectd("regular");
_system_dirty_memory_manager.setup_collectd("system");
_streaming_dirty_memory_manager.setup_collectd("streaming");
namespace sm = seastar::metrics;
// Label instances used to tag the per-class read metrics below.
auto user_label_instance = class_label("user");
auto streaming_label_instance = class_label("streaming");
auto system_label_instance = class_label("system");
// Sums of the real/virtual dirty memory over all three dirty memory managers.
_metrics.add_group("memory", {
sm::make_gauge("dirty_bytes", [this] { return _dirty_memory_manager.real_dirty_memory() + _system_dirty_memory_manager.real_dirty_memory() + _streaming_dirty_memory_manager.real_dirty_memory(); },
sm::description("Holds the current size of all (\"regular\", \"system\" and \"streaming\") non-free memory in bytes: used memory + released memory that hasn't been returned to a free memory pool yet. "
"Total memory size minus this value represents the amount of available memory. "
"If this value minus virtual_dirty_bytes is too high then this means that the dirty memory eviction lags behind.")),
sm::make_gauge("virtual_dirty_bytes", [this] { return _dirty_memory_manager.virtual_dirty_memory() + _system_dirty_memory_manager.virtual_dirty_memory() + _streaming_dirty_memory_manager.virtual_dirty_memory(); },
sm::description("Holds the size of all (\"regular\", \"system\" and \"streaming\") used memory in bytes. Compare it to \"dirty_bytes\" to see how many memory is wasted (neither used nor available).")),
});
// Memtable flush counters taken from _cf_stats.
_metrics.add_group("memtables", {
sm::make_gauge("pending_flushes", _cf_stats.pending_memtables_flushes_count,
sm::description("Holds the current number of memtables that are currently being flushed to sstables. "
"High value in this metric may be an indication of storage being a bottleneck.")),
sm::make_gauge("pending_flushes_bytes", _cf_stats.pending_memtables_flushes_bytes,
sm::description("Holds the current number of bytes in memtables that are currently being flushed to sstables. "
"High value in this metric may be an indication of storage being a bottleneck.")),
});
_metrics.add_group("database", {
// Requests blocked on the regular dirty memory manager's region group.
// NOTE(review): the derive counter below reuses the gauge's "current number" description text.
sm::make_gauge("requests_blocked_memory_current", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
sm::description(
seastar::format("Holds the current number of requests blocked due to reaching the memory quota ({}B). "
"Non-zero value indicates that our bottleneck is memory and more specifically - the memory quota allocated for the \"database\" component.", _dirty_memory_manager.throttle_threshold()))),
sm::make_derive("requests_blocked_memory", [this] { return _dirty_memory_manager.region_group().blocked_requests_counter(); },
sm::description(seastar::format("Holds the current number of requests blocked due to reaching the memory quota ({}B). "
"Non-zero value indicates that our bottleneck is memory and more specifically - the memory quota allocated for the \"database\" component.", _dirty_memory_manager.throttle_threshold()))),
// Clustering filter counters taken from _cf_stats.
sm::make_derive("clustering_filter_count", _cf_stats.clustering_filter_count,
sm::description("Counts bloom filter invocations.")),
sm::make_derive("clustering_filter_sstables_checked", _cf_stats.sstables_checked_by_clustering_filter,
sm::description("Counts sstables checked after applying the bloom filter. "
"High value indicates that bloom filter is not very efficient.")),
sm::make_derive("clustering_filter_fast_path_count", _cf_stats.clustering_filter_fast_path_count,
sm::description("Counts number of times bloom filtering short cut to include all sstables when only one full range was specified.")),
sm::make_derive("clustering_filter_surviving_sstables", _cf_stats.surviving_sstables_after_clustering_filter,
sm::description("Counts sstables that survived the clustering key filtering. "
"High value indicates that bloom filter is not very efficient and still have to access a lot of sstables to get data.")),
// Read/write totals taken from _stats.
sm::make_derive("total_writes", _stats->total_writes,
sm::description("Counts the total number of successful write operations performed by this shard.")),
sm::make_derive("total_writes_failed", _stats->total_writes_failed,
sm::description("Counts the total number of failed write operations. "
"A sum of this value plus total_writes represents a total amount of writes attempted on this shard.")),
sm::make_derive("total_writes_timedout", _stats->total_writes_timedout,
sm::description("Counts write operations failed due to a timeout. A positive value is a sign of storage being overloaded.")),
sm::make_derive("total_reads", _stats->total_reads,
sm::description("Counts the total number of successful reads on this shard.")),
sm::make_derive("total_reads_failed", _stats->total_reads_failed,
sm::description("Counts the total number of failed read operations. "
"Add the total_reads to this value to get the total amount of reads issued on this shard.")),
// Querier cache statistics taken from _querier_cache.get_stats().
sm::make_derive("querier_cache_lookups", _querier_cache.get_stats().lookups,
sm::description("Counts querier cache lookups (paging queries)")),
sm::make_derive("querier_cache_misses", _querier_cache.get_stats().misses,
sm::description("Counts querier cache lookups that failed to find a cached querier")),
sm::make_derive("querier_cache_drops", _querier_cache.get_stats().drops,
sm::description("Counts querier cache lookups that found a cached querier but had to drop it due to position mismatch")),
sm::make_derive("querier_cache_time_based_evictions", _querier_cache.get_stats().time_based_evictions,
sm::description("Counts querier cache entries that timed out and were evicted.")),
sm::make_derive("querier_cache_resource_based_evictions", _querier_cache.get_stats().resource_based_evictions,
sm::description("Counts querier cache entries that were evicted to free up resources "
"(limited by reader concurency limits) necessary to create new readers.")),
sm::make_derive("querier_cache_memory_based_evictions", _querier_cache.get_stats().memory_based_evictions,
sm::description("Counts querier cache entries that were evicted because the memory usage "
"of the cached queriers were above the limit.")),
sm::make_gauge("querier_cache_population", _querier_cache.get_stats().population,
sm::description("The number of entries currently in the querier cache.")),
sm::make_derive("sstable_read_queue_overloads", _stats->sstable_read_queue_overloaded,
sm::description("Counts the number of times the sstable read queue was overloaded. "
"A non-zero value indicates that we have to drop read requests because they arrive faster than we can serve them.")),
// Per-class read metrics: "user" class, computed from _read_concurrency_sem
// as (configured maximum - currently available resources).
sm::make_gauge("active_reads", [this] { return max_count_concurrent_reads - _read_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations. "),
{user_label_instance}),
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_concurrent_reads() - _read_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations. "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_concurrent_reads())),
{user_label_instance}),
sm::make_gauge("queued_reads", [this] { return _read_concurrency_sem.waiters(); },
sm::description("Holds the number of currently queued read operations."),
{user_label_instance}),
// Per-class read metrics: "streaming" class, computed from _streaming_concurrency_sem.
sm::make_gauge("active_reads", [this] { return max_count_streaming_concurrent_reads - _streaming_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations issued on behalf of streaming "),
{streaming_label_instance}),
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_streaming_concurrent_reads() - _streaming_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations issued on behalf of streaming "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_streaming_concurrent_reads())),
{streaming_label_instance}),
sm::make_gauge("queued_reads", [this] { return _streaming_concurrency_sem.waiters(); },
sm::description("Holds the number of currently queued read operations on behalf of streaming."),
{streaming_label_instance}),
// Per-class read metrics: "system" class, computed from _system_read_concurrency_sem.
sm::make_gauge("active_reads", [this] { return max_count_system_concurrent_reads - _system_read_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations from \"system\" keyspace tables. "),
{system_label_instance}),
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_system_concurrent_reads() - _system_read_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations from \"system\" keyspace tables. "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_system_concurrent_reads())),
{system_label_instance}),
sm::make_gauge("queued_reads", [this] { return _system_read_concurrency_sem.waiters(); },
sm::description("Holds the number of currently queued read operations from \"system\" keyspace tables."),
{system_label_instance}),
// Result memory usage and short-read counters.
sm::make_gauge("total_result_bytes", [this] { return get_result_memory_limiter().total_used_memory(); },
sm::description("Holds the current amount of memory used for results.")),
sm::make_derive("short_data_queries", _stats->short_data_queries,
sm::description("The rate of data queries (data or digest reads) that returned less rows than requested due to result size limiting.")),
sm::make_derive("short_mutation_queries", _stats->short_mutation_queries,
sm::description("The rate of mutation queries that returned less rows than requested due to result size limiting.")),
// Multishard query counters taken from _stats.
sm::make_derive("multishard_query_unpopped_fragments", _stats->multishard_query_unpopped_fragments,
sm::description("The total number of fragments that were extracted from the shard reader but were unconsumed by the query and moved back into the reader.")),
sm::make_derive("multishard_query_unpopped_bytes", _stats->multishard_query_unpopped_bytes,
sm::description("The total number of bytes that were extracted from the shard reader but were unconsumed by the query and moved back into the reader.")),
sm::make_derive("multishard_query_failed_reader_stops", _stats->multishard_query_failed_reader_stops,
sm::description("The number of times the stopping of a shard reader failed.")),
sm::make_derive("multishard_query_failed_reader_saves", _stats->multishard_query_failed_reader_saves,
sm::description("The number of times the saving of a shard reader failed.")),
// Counter cell locking statistics taken from _cl_stats.
sm::make_total_operations("counter_cell_lock_acquisition", _cl_stats->lock_acquisitions,
sm::description("The number of acquired counter cell locks.")),
sm::make_queue_length("counter_cell_lock_pending", _cl_stats->operations_waiting_for_lock,
sm::description("The number of counter updates waiting for a lock.")),
// Large partition statistics reported by _large_partition_handler.
sm::make_counter("large_partition_exceeding_threshold", [this] { return _large_partition_handler->stats().partitions_bigger_than_threshold; },
sm::description("Number of large partitions exceeding compaction_large_partition_warning_threshold_mb. "
"Large partitions have performance impact and should be avoided, check the documentation for details.")),
});
}
// Out-of-line destructor; performs no work beyond destroying the members.
database::~database() = default;
// Stores `version` as the value subsequently returned by get_version().
void database::update_version(const utils::UUID& version) {
    _version = version;
}
// Returns the version last stored by update_version().
const utils::UUID& database::get_version() const {
    return _version;
}
// Populates every column family of keyspace `ks_name` from the directory
// `datadir`/`ks_name`.
//
// If the keyspace is not known to the local database a warning is logged and a
// ready future is returned. Otherwise, for each schema in the keyspace metadata
// (in parallel) the column family directory is created if needed and
// distributed_loader::populate_column_family() is invoked on it.
//
// Any failure for a column family is logged and rethrown as std::runtime_error
// carrying the same message.
future<> distributed_loader::populate_keyspace(distributed& db, sstring datadir, sstring ks_name) {
auto ksdir = datadir + "/" + ks_name;
auto& keyspaces = db.local().get_keyspaces();
auto i = keyspaces.find(ks_name);
if (i == keyspaces.end()) {
dblog.warn("Skipping undefined keyspace: {}", ks_name);
return make_ready_future<>();
} else {
dblog.info("Populating Keyspace {}", ks_name);
auto& ks = i->second;
auto& column_families = db.local().get_column_families();
return parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values,
[ks_name, ksdir, &ks, &column_families, &db] (schema_ptr s) {
utils::UUID uuid = s->id();
// NOTE(review): operator[] is used for the lookup; presumably the uuid is always
// present here, otherwise `cf` would be dereferenced while empty - confirm.
lw_shared_ptr cf = column_families[uuid];
sstring cfname = cf->schema()->cf_name();
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
// Build the message once for the rethrown exception and log the same text.
std::string msg =
sprint("Exception while populating keyspace '%s' with column family '%s' from file '%s': %s",
ks_name, cfname, sstdir, eptr);
dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
throw std::runtime_error(msg.c_str());
});
});
}
}
// Scans `datadir` for sub-directories and treats each directory name as a
// keyspace name. System keyspaces are skipped; every other keyspace is handed
// to distributed_loader::populate_keyspace().
static future<> populate(distributed& db, sstring datadir) {
return lister::scan_dir(datadir, { directory_entry_type::directory }, [&db] (lister::path datadir, directory_entry de) {
auto& ks_name = de.name;
if (is_system_keyspace(ks_name)) {
return make_ready_future<>();
}
return distributed_loader::populate_keyspace(db, datadir.native(), ks_name);
});
}
// Helper that walks one schema table and applies `func` per keyspace.
//
// Queries the schema table `_cf_name`, collects the distinct "keyspace_name"
// values of the returned rows and, for each non-system keyspace (in parallel),
// reads that keyspace's schema partition and passes it to `func`.
//
// A std::exception produced by `func` is logged and swallowed, so one broken
// entry is skipped rather than aborting the whole pass.
template
static future<>
do_parse_schema_tables(distributed& proxy, const sstring& _cf_name, Func&& func) {
using namespace db::schema_tables;
static_assert(std::is_same, std::result_of_t>::value,
"bad Func signature");
// Shared copy of the table name so the continuations below can outlive the caller's reference.
auto cf_name = make_lw_shared(_cf_name);
return db::system_keyspace::query(proxy, db::schema_tables::NAME, *cf_name).then([] (auto rs) {
auto names = std::set();
for (auto& r : rs->rows()) {
auto keyspace_name = r.template get_nonnull("keyspace_name");
names.emplace(keyspace_name);
}
return std::move(names);
}).then([&proxy, cf_name, func = std::forward(func)] (std::set&& names) mutable {
return parallel_for_each(names.begin(), names.end(), [&proxy, cf_name, func = std::forward(func)] (sstring name) mutable {
if (is_system_keyspace(name)) {
return make_ready_future<>();
}
return read_schema_partition_for_keyspace(proxy, *cf_name, name).then([func, cf_name] (auto&& v) mutable {
// do_with keeps the schema partition alive until func's future resolves.
return do_with(std::move(v), [func = std::forward(func), cf_name] (auto& v) {
return func(v).then_wrapped([cf_name, &v] (future<> f) {
try {
f.get();
} catch (std::exception& e) {
dblog.error("Skipping: {}. Exception occurred when loading system table {}: {}", v.first, *cf_name, e.what());
}
});
});
});
});
});
}
// Rebuilds the in-memory schema from the schema tables, one table after
// another in this order: KEYSPACES, TYPES, TABLES, VIEWS.
//
// - KEYSPACES: creates each keyspace from its schema partition.
// - TYPES: adds the user types to the already created keyspace.
// - TABLES / VIEWS: adds each column family and makes its directory.
future<> database::parse_system_tables(distributed& proxy) {
using namespace db::schema_tables;
return do_parse_schema_tables(proxy, db::schema_tables::KEYSPACES, [this] (schema_result_value_type &v) {
auto ksm = create_keyspace_from_schema_partition(v);
return create_keyspace(ksm);
}).then([&proxy, this] {
return do_parse_schema_tables(proxy, db::schema_tables::TYPES, [this, &proxy] (schema_result_value_type &v) {
auto&& user_types = create_types_from_schema_partition(v);
// v.first is used as the keyspace name for the lookup.
auto& ks = this->find_keyspace(v.first);
for (auto&& type : user_types) {
ks.add_user_type(type);
}
return make_ready_future<>();
});
}).then([&proxy, this] {
return do_parse_schema_tables(proxy, db::schema_tables::TABLES, [this, &proxy] (schema_result_value_type &v) {
return create_tables_from_tables_partition(proxy, v.second).then([this] (std::map tables) {
return parallel_for_each(tables.begin(), tables.end(), [this] (auto& t) {
return this->add_column_family_and_make_directory(t.second);
});
});
});
}).then([&proxy, this] {
return do_parse_schema_tables(proxy, db::schema_tables::VIEWS, [this, &proxy] (schema_result_value_type &v) {
return create_views_from_schema_partition(proxy, v.second).then([this] (std::vector views) {
return parallel_for_each(views.begin(), views.end(), [this] (auto&& v) {
return this->add_column_family_and_make_directory(v);
});
});
});
});
}
// Initializes the commitlog and the system keyspaces on all shards.
//
// Runs inside seastar::async, so each step below blocks (via .get()) until the
// previous one has completed:
//  1. init the commitlog on shard 0, then on all remaining shards;
//  2. create the system keyspace on every shard;
//  3. for every data directory and every system keyspace, make sure the
//     keyspace directory exists and populate the keyspace from it;
//  4. mark every system column family as ready for writes on every shard.
future<> distributed_loader::init_system_keyspace(distributed& db) {
return seastar::async([&db] {
// We need to init commitlog on shard0 before it is inited on other shards
// because it obtains the list of pre-existing segments for replay, which must
// not include reserve segments created by active commitlogs.
db.invoke_on(0, [] (database& db) {
return db.init_commitlog();
}).get();
db.invoke_on_all([] (database& db) {
// Shard 0 was already initialized above.
if (engine().cpu_id() == 0) {
return make_ready_future<>();
}
return db.init_commitlog();
}).get();
db.invoke_on_all([] (database& db) {
auto& cfg = db.get_config();
// The system keyspace is durable only when at least one data directory is configured.
bool durable = cfg.data_file_directories().size() > 0;
db::system_keyspace::make(db, durable, cfg.volatile_system_keyspace_for_testing());
}).get();
const auto& cfg = db.local().get_config();
for (auto& data_dir : cfg.data_file_directories()) {
for (auto ksname : system_keyspaces) {
io_check(touch_directory, data_dir + "/" + ksname).get();
distributed_loader::populate_keyspace(db, data_dir, ksname).get();
}
}
db.invoke_on_all([] (database& db) {
for (auto ksname : system_keyspaces) {
auto& ks = db.find_keyspace(ksname);
for (auto& pair : ks.metadata()->cf_meta_data()) {
auto cfm = pair.second;
auto& cf = db.find_column_family(cfm);
cf.mark_ready_for_writes();
}
}
return make_ready_future<>();
}).get();
});
}
// For every system keyspace known to the local database, makes the directory
// of each of its column families (in parallel).
future<> distributed_loader::ensure_system_table_directories(distributed& db) {
return parallel_for_each(system_keyspaces, [&db](sstring ksname) {
auto& ks = db.local().find_keyspace(ksname);
return parallel_for_each(ks.metadata()->cf_meta_data(), [&ks] (auto& pair) {
auto cfm = pair.second;
return ks.make_directory_for_column_family(cfm->cf_name(), cfm->id());
});
});
}
// Loads the non-system keyspaces on all shards.
//
// Runs inside seastar::async; the steps are executed one after another:
//  1. every shard parses the schema tables (database::parse_system_tables);
//  2. every configured data directory is populated (in parallel);
//  3. every non-system column family on every shard is marked ready for writes.
future<> distributed_loader::init_non_system_keyspaces(distributed& db, distributed& proxy) {
return seastar::async([&db, &proxy] {
db.invoke_on_all([&proxy] (database& db) {
return db.parse_system_tables(proxy);
}).get();
const auto& cfg = db.local().get_config();
parallel_for_each(cfg.data_file_directories(), [&db] (sstring directory) {
return populate(db, directory);
}).get();
db.invoke_on_all([] (database& db) {
return parallel_for_each(db.get_non_system_column_families(), [] (lw_shared_ptr table) {
// Make sure this is called even if the table is empty
table->mark_ready_for_writes();
return make_ready_future<>();
});
}).get();
});
}
// Creates the commitlog from the database configuration, stores it in
// _commitlog and registers a flush handler on it.
//
// The flush handler is called with a column family id and a replay position:
// if the column family no longer exists its completed segments are discarded,
// otherwise the column family is asked to flush.
future<>
database::init_commitlog() {
return db::commitlog::create_commitlog(db::commitlog::config::from_db_config(*_cfg, _dbcfg.available_memory)).then([this](db::commitlog&& log) {
_commitlog = std::make_unique(std::move(log));
_commitlog->add_flush_handler([this](db::cf_id_type id, db::replay_position pos) {
if (_column_families.count(id) == 0) {
// the CF has been removed.
_commitlog->discard_completed_segments(id);
return;
}
// NOTE(review): whatever flush() returns is not used here.
_column_families[id]->flush();
}).release(); // we have longer life time than CL. Ignore reg anchor
});
}
// Returns the shard owning token `t`, as computed by dht::shard_of().
unsigned
database::shard_of(const dht::token& t) {
    return dht::shard_of(t);
}
// Returns the shard owning the token of mutation `m`.
unsigned
database::shard_of(const mutation& m) {
    return shard_of(m.token());
}
unsigned
database::shard_of(const frozen_mutation& m) {
// FIXME: This lookup wouldn't be necessary if we
// sent the partition key in legacy form or together
// with token.
schema_ptr schema = find_schema(m.column_family_id());
return shard_of(dht::global_partitioner().get_token(*schema, m.key(*schema)));
}
// Registers keyspace `k` under `name`.
// Throws std::invalid_argument if a keyspace with that name already exists.
void database::add_keyspace(sstring name, keyspace k) {
    const bool already_registered = _keyspaces.count(name) != 0;
    if (already_registered) {
        throw std::invalid_argument("Keyspace " + name + " already exists");
    }
    _keyspaces.emplace(std::move(name), std::move(k));
}
// Refreshes the in-memory metadata of keyspace `name` from the KEYSPACES
// schema table and notifies the migration manager about the update.
//
// The new metadata takes name, strategy name, strategy options and
// durable_writes from the freshly read schema partition, while the column
// families and user types are carried over from the existing keyspace.
future<> database::update_keyspace(const sstring& name) {
auto& proxy = service::get_storage_proxy();
return db::schema_tables::read_schema_partition_for_keyspace(proxy, db::schema_tables::KEYSPACES, name).then([this, name](db::schema_tables::schema_result_value_type&& v) {
auto& ks = find_keyspace(name);
auto tmp_ksm = db::schema_tables::create_keyspace_from_schema_partition(v);
auto new_ksm = ::make_lw_shared(tmp_ksm->name(), tmp_ksm->strategy_name(), tmp_ksm->strategy_options(), tmp_ksm->durable_writes(),
boost::copy_range>(ks.metadata()->cf_meta_data() | boost::adaptors::map_values), ks.metadata()->user_types());
ks.update_from(std::move(new_ksm));
return service::get_local_migration_manager().notify_update_keyspace(ks.metadata());
});
}
// Removes the keyspace called `name` from _keyspaces.
void database::drop_keyspace(const sstring& name) {
    _keyspaces.erase(name);
}
// Creates a column family for `schema` in keyspace `ks` and registers it in
// this database.
//
// The schema is first learned by the local schema registry and marked synced.
// The column family is attached to the commitlog only when `cfg` enables it
// and a commitlog exists. Throws std::invalid_argument if the schema id or
// the (keyspace, column family) name pair is already registered.
// If the schema is a view, it is also attached to its base column family.
void database::add_column_family(keyspace& ks, schema_ptr schema, column_family::config cfg) {
schema = local_schema_registry().learn(schema);
schema->registry_entry()->mark_synced();
lw_shared_ptr cf;
if (cfg.enable_commitlog && _commitlog) {
cf = make_lw_shared(schema, std::move(cfg), *_commitlog, *_compaction_manager, *_cl_stats, _row_cache_tracker);
} else {
cf = make_lw_shared(schema, std::move(cfg), column_family::no_commitlog(), *_compaction_manager, *_cl_stats, _row_cache_tracker);
}
auto uuid = schema->id();
if (_column_families.count(uuid) != 0) {
throw std::invalid_argument("UUID " + uuid.to_sstring() + " already mapped");
}
// Built once; used for the duplicate check and then moved into the name index.
auto kscf = std::make_pair(schema->ks_name(), schema->cf_name());
if (_ks_cf_to_uuid.count(kscf) != 0) {
throw std::invalid_argument("Column family " + schema->cf_name() + " exists");
}
ks.add_or_update_column_family(schema);
cf->start();
_column_families.emplace(uuid, std::move(cf));
_ks_cf_to_uuid.emplace(std::move(kscf), uuid);
if (schema->is_view()) {
find_column_family(schema->view_info()->base_id()).add_or_update_view(view_ptr(schema));
}
}
// Adds the column family described by `schema` to its keyspace, reloads the
// new column family's index manager and returns a future for the creation of
// its on-disk directory.
future<> database::add_column_family_and_make_directory(schema_ptr schema) {
    keyspace& owning_ks = find_keyspace(schema->ks_name());
    auto table_cfg = owning_ks.make_column_family_config(*schema, get_config(), get_large_partition_handler());
    add_column_family(owning_ks, schema, std::move(table_cfg));
    find_column_family(schema).get_index_manager().reload();
    return owning_ks.make_directory_for_column_family(schema->cf_name(), schema->id());
}
// Switches an existing column family over to `new_schema`.
//
// The schema is learned by the local schema registry and marked synced, set on
// the column family and recorded in the keyspace metadata. For a view, the
// base column family (if it still exists) is updated too. Finally the index
// manager is reloaded.
//
// Returns true when the columns of the old and new schema differ.
bool database::update_column_family(schema_ptr new_schema) {
    column_family& table = find_column_family(new_schema->id());
    const bool columns_changed = !table.schema()->equal_columns(*new_schema);

    auto synced = local_schema_registry().learn(new_schema);
    synced->registry_entry()->mark_synced();
    table.set_schema(synced);
    find_keyspace(synced->ks_name()).metadata()->add_or_update_column_family(synced);

    if (synced->is_view()) {
        try {
            find_column_family(synced->view_info()->base_id()).add_or_update_view(view_ptr(synced));
        } catch (no_such_column_family&) {
            // Update view mutations received after base table drop.
        }
    }

    table.get_index_manager().reload();
    return columns_changed;
}
// Unregisters column family `cf` from this database: evicts its cached
// queriers, removes it from the id and name indexes and from the keyspace
// metadata, and, for a view, detaches it from its base column family if that
// one still exists.
void database::remove(const column_family& cf) {
    // Keep our own handle on the schema for the rest of this function.
    auto schema = cf.schema();
    auto& owning_ks = find_keyspace(schema->ks_name());

    _querier_cache.evict_all_for_table(schema->id());
    _column_families.erase(schema->id());
    owning_ks.metadata()->remove_column_family(schema);
    _ks_cf_to_uuid.erase(std::make_pair(schema->ks_name(), schema->cf_name()));

    if (!schema->is_view()) {
        return;
    }
    try {
        find_column_family(schema->view_info()->base_id()).remove_view(view_ptr(schema));
    } catch (no_such_column_family&) {
        // Drop view mutations received after base table drop.
    }
}
// Drops column family `cf_name` of keyspace `ks_name`.
//
// The column family is unregistered immediately and its views are cleared.
// Once pending writes and reads have drained it is truncated (passing `tsf`
// and `snapshot` through to truncate()) and then stopped, whether or not the
// truncation succeeded.
future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf, bool snapshot) {
    auto table = _column_families.at(find_uuid(ks_name, cf_name));
    remove(*table);
    table->clear_views();
    auto& owning_ks = find_keyspace(ks_name);

    auto drained = when_all_succeed(table->await_pending_writes(), table->await_pending_reads());
    return drained.then([this, &owning_ks, table, tsf = std::move(tsf), snapshot] {
        return truncate(owning_ks, *table, std::move(tsf), snapshot).finally([this, table] {
            return table->stop();
        });
    }).finally([table] {
        // Holds a reference to the column family until the whole chain is done.
    });
}
// Returns the id of column family `cf` in keyspace `ks`.
//
// Throws std::out_of_range, with a message naming the missing column family
// and keyspace, when no such column family is registered. Only the "not found"
// case is reported this way; unrelated failures (for example an allocation
// failure while building the lookup key) propagate unchanged instead of being
// relabelled as an out_of_range with an empty message.
const utils::UUID& database::find_uuid(const sstring& ks, const sstring& cf) const {
    auto it = _ks_cf_to_uuid.find(std::make_pair(ks, cf));
    if (it == _ks_cf_to_uuid.end()) {
        throw std::out_of_range("Column family " + cf + " not found in keyspace " + ks);
    }
    return it->second;
}
// Convenience overload: looks the id up by the schema's keyspace and column
// family names.
const utils::UUID& database::find_uuid(const schema_ptr& schema) const {
    const auto& ks_name = schema->ks_name();
    const auto& cf_name = schema->cf_name();
    return find_uuid(ks_name, cf_name);
}
// Returns the keyspace called `name`.
// On lookup failure throws no_such_keyspace with the original lookup
// exception nested inside it (std::throw_with_nested).
keyspace& database::find_keyspace(const sstring& name) {
try {
return _keyspaces.at(name);
} catch (...) {
std::throw_with_nested(no_such_keyspace(name));
}
}
// Const overload: returns the keyspace called `name`.
// On lookup failure throws no_such_keyspace with the original lookup
// exception nested inside it (std::throw_with_nested).
const keyspace& database::find_keyspace(const sstring& name) const {
try {
return _keyspaces.at(name);
} catch (...) {
std::throw_with_nested(no_such_keyspace(name));
}
}
// Tells whether a keyspace called `name` is registered.
bool database::has_keyspace(const sstring& name) const {
    return _keyspaces.find(name) != _keyspaces.end();
}
// Returns the names of all registered keyspaces for which
// is_system_keyspace() is false.
std::vector database::get_non_system_keyspaces() const {
std::vector res;
for (auto const &i : _keyspaces) {
if (!is_system_keyspace(i.first)) {
res.push_back(i.first);
}
}
return res;
}
// Returns all column families whose schema belongs to a keyspace for which
// is_system_keyspace() is false.
std::vector> database::get_non_system_column_families() const {
return boost::copy_range>>(
get_column_families()
| boost::adaptors::map_values
| boost::adaptors::filtered([](const lw_shared_ptr& cf) {
return !is_system_keyspace(cf->schema()->ks_name());
}));
}
// Returns the column family `cf_name` of keyspace `ks_name`.
// Any failure of the name or id lookup is rethrown as no_such_column_family
// (built from the two names) with the original exception nested inside it.
column_family& database::find_column_family(const sstring& ks_name, const sstring& cf_name) {
try {
return find_column_family(find_uuid(ks_name, cf_name));
} catch (...) {
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
}
}
// Const overload: returns the column family `cf_name` of keyspace `ks_name`.
// Any failure of the name or id lookup is rethrown as no_such_column_family
// (built from the two names) with the original exception nested inside it.
const column_family& database::find_column_family(const sstring& ks_name, const sstring& cf_name) const {
try {
return find_column_family(find_uuid(ks_name, cf_name));
} catch (...) {
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
}
}
// Returns the column family registered under `uuid`.
// On lookup failure throws no_such_column_family (built from the uuid) with
// the original lookup exception nested inside it.
column_family& database::find_column_family(const utils::UUID& uuid) {
try {
return *_column_families.at(uuid);
} catch (...) {
std::throw_with_nested(no_such_column_family(uuid));
}
}
// Const overload: returns the column family registered under `uuid`.
// On lookup failure throws no_such_column_family (built from the uuid) with
// the original lookup exception nested inside it.
const column_family& database::find_column_family(const utils::UUID& uuid) const {
try {
return *_column_families.at(uuid);
} catch (...) {
std::throw_with_nested(no_such_column_family(uuid));
}
}
// Tells whether a column family is registered under `uuid`.
bool database::column_family_exists(const utils::UUID& uuid) const {
    return _column_families.count(uuid) != 0;
}
// Creates the replication strategy named in the keyspace metadata, using the
// token metadata of the local storage service and the given `options`, and
// stores it in _replication_strategy.
void
keyspace::create_replication_strategy(const std::map& options) {
using namespace locator;
auto& ss = service::get_local_storage_service();
_replication_strategy =
abstract_replication_strategy::create_replication_strategy(
_metadata->name(), _metadata->strategy_name(),
ss.get_token_metadata(), options);
}
// Returns the keyspace's current replication strategy.
// The strategy pointer is dereferenced unconditionally.
locator::abstract_replication_strategy&
keyspace::get_replication_strategy() {
    return *_replication_strategy;
}
// Const overload: returns the keyspace's current replication strategy.
// The strategy pointer is dereferenced unconditionally.
const locator::abstract_replication_strategy&
keyspace::get_replication_strategy() const {
    return *_replication_strategy;
}
// Takes ownership of `replication_strategy` and makes it the keyspace's
// replication strategy, replacing the previous one.
void
keyspace::set_replication_strategy(std::unique_ptr replication_strategy) {
_replication_strategy = std::move(replication_strategy);
}
// Replaces the keyspace metadata with `ksm` and recreates the replication
// strategy from the new metadata's strategy options.
void keyspace::update_from(::lw_shared_ptr ksm) {
_metadata = std::move(ksm);
create_replication_strategy(_metadata->strategy_options());
}
// Builds the configuration for a column family with schema `s` in this
// keyspace.
//
// Every keyspace data directory is mapped to the corresponding column family
// directory; the first of them, if any, becomes the primary `datadir`. The
// remaining settings are copied from the keyspace configuration, except for
// metrics reporting (taken from `db_config`) and the large partition handler
// (taken from `lp_handler`).
column_family::config
keyspace::make_column_family_config(const schema& s, const db::config& db_config, db::large_partition_handler* lp_handler) const {
    column_family::config cfg;
    for (auto& extra : _config.all_datadirs) {
        cfg.all_datadirs.push_back(column_family_directory(extra, s.cf_name(), s.id()));
    }
    // The keyspace may have no data directories at all; indexing an empty
    // container would be undefined behaviour, so in that case leave `datadir`
    // at its default value.
    if (!cfg.all_datadirs.empty()) {
        cfg.datadir = cfg.all_datadirs[0];
    }
    cfg.enable_disk_reads = _config.enable_disk_reads;
    cfg.enable_disk_writes = _config.enable_disk_writes;
    cfg.enable_commitlog = _config.enable_commitlog;
    cfg.enable_cache = _config.enable_cache;
    cfg.compaction_enforce_min_threshold = _config.compaction_enforce_min_threshold;
    cfg.dirty_memory_manager = _config.dirty_memory_manager;
    cfg.streaming_dirty_memory_manager = _config.streaming_dirty_memory_manager;
    cfg.read_concurrency_semaphore = _config.read_concurrency_semaphore;
    cfg.streaming_read_concurrency_semaphore = _config.streaming_read_concurrency_semaphore;
    cfg.cf_stats = _config.cf_stats;
    cfg.enable_incremental_backups = _config.enable_incremental_backups;
    cfg.compaction_scheduling_group = _config.compaction_scheduling_group;
    cfg.memory_compaction_scheduling_group = _config.memory_compaction_scheduling_group;
    cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
    cfg.memtable_to_cache_scheduling_group = _config.memtable_to_cache_scheduling_group;
    cfg.streaming_scheduling_group = _config.streaming_scheduling_group;
    cfg.statement_scheduling_group = _config.statement_scheduling_group;
    cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
    cfg.large_partition_handler = lp_handler;
    cfg.view_update_concurrency_semaphore = _config.view_update_concurrency_semaphore;
    return cfg;
}
// Returns the directory of column family `name`/`uuid` under the keyspace's
// primary data directory.
sstring
keyspace::column_family_directory(const sstring& name, utils::UUID uuid) const {
    const sstring& base_path = _config.datadir;
    return column_family_directory(base_path, name, uuid);
}
// Returns "<base_path>/<name>-<uuid>", where the uuid is rendered without its
// '-' separators.
sstring
keyspace::column_family_directory(const sstring& base_path, const sstring& name, utils::UUID uuid) const {
    auto compact_uuid = uuid.to_sstring();
    boost::erase_all(compact_uuid, "-");
    return sprint("%s/%s-%s", base_path, name, compact_uuid);
}
// Creates the directory of column family `name`/`uuid` under each of the
// keyspace's data directories, plus an "upload" sub-directory under the first
// one. The directory paths are computed up front and the I/O is then performed
// sequentially inside seastar::async.
future<>
keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid) {
std::vector cfdirs;
for (auto& extra : _config.all_datadirs) {
cfdirs.push_back(column_family_directory(extra, name, uuid));
}
return seastar::async([cfdirs = std::move(cfdirs)] {
for (auto& cfdir : cfdirs) {
io_check(recursive_touch_directory, cfdir).get();
}
// NOTE(review): cfdirs[0] assumes at least one data directory is configured - confirm.
io_check(touch_directory, cfdirs[0] + "/upload").get();
});
}
// Builds the exception with a message naming the missing keyspace.
no_such_keyspace::no_such_keyspace(const sstring& ks_name)
    : runtime_error{sprint("Can't find a keyspace %s", ks_name)} {
}
// Builds the exception with a message naming the id that was not found.
no_such_column_family::no_such_column_family(const utils::UUID& uuid)
    : runtime_error{sprint("Can't find a column family with UUID %s", uuid)} {
}
// Builds the exception with a message naming the missing column family and
// the keyspace it was looked up in.
no_such_column_family::no_such_column_family(const sstring& ks_name, const sstring& cf_name)
    : runtime_error{sprint("Can't find a column family %s in keyspace %s", cf_name, ks_name)} {
}
// Convenience overload: looks the column family up by the schema's id.
column_family& database::find_column_family(const schema_ptr& schema) {
    const auto& table_id = schema->id();
    return find_column_family(table_id);
}
// Const convenience overload: looks the column family up by the schema's id.
const column_family& database::find_column_family(const schema_ptr& schema) const {
    const auto& table_id = schema->id();
    return find_column_family(table_id);
}
// Class registry for replication strategies; used by the keyspace_metadata
// constructor to turn a strategy name into its qualified class name.
using strategy_class_registry = class_registry<
locator::abstract_replication_strategy,
const sstring&,
locator::token_metadata&,
locator::snitch_ptr&,
const std::map&>;
// Constructs the metadata of a keyspace.
//
// An empty `strategy_name` defaults to "NetworkTopologyStrategy"; in either
// case the name is stored in its qualified class name form. The schemas in
// `cf_defs` are indexed by their column family name.
keyspace_metadata::keyspace_metadata(sstring name,
sstring strategy_name,
std::map strategy_options,
bool durable_writes,
std::vector cf_defs,
lw_shared_ptr user_types)
: _name{std::move(name)}
, _strategy_name{strategy_class_registry::to_qualified_class_name(strategy_name.empty() ? "NetworkTopologyStrategy" : strategy_name)}
, _strategy_options{std::move(strategy_options)}
, _durable_writes{durable_writes}
, _user_types{std::move(user_types)}
{
for (auto&& s : cf_defs) {
_cf_meta_data.emplace(s->cf_name(), s);
}
}
void keyspace_metadata::validate() const {
using namespace locator;
auto& ss = service::get_local_storage_service();
abstract_replication_strategy::validate_replication_strategy(name(), strategy_name(), ss.get_token_metadata(), strategy_options());
}
// Returns the schemas of this keyspace that are not views.
std::vector keyspace_metadata::tables() const {
return boost::copy_range>(_cf_meta_data
| boost::adaptors::map_values
| boost::adaptors::filtered([] (auto&& s) { return !s->is_view(); }));
}
// Returns the schemas of this keyspace that are views, each wrapped in a
// view_ptr.
std::vector keyspace_metadata::views() const {
return boost::copy_range>(_cf_meta_data
| boost::adaptors::map_values
| boost::adaptors::filtered(std::mem_fn(&schema::is_view))
| boost::adaptors::transformed([] (auto&& s) { return view_ptr(s); }));
}
// Returns the schema of column family `cf_name` in keyspace `ks_name`.
// A std::out_of_range raised during the lookup is rethrown as
// no_such_column_family (built from the two names) with the original
// exception nested inside it; other exception types propagate unchanged.
schema_ptr database::find_schema(const sstring& ks_name, const sstring& cf_name) const {
try {
return find_schema(find_uuid(ks_name, cf_name));
} catch (std::out_of_range&) {
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
}
}
// Returns the schema of the column family registered under `uuid`.
schema_ptr database::find_schema(const utils::UUID& uuid) const {
    const column_family& table = find_column_family(uuid);
    return table.schema();
}
// Tells whether a column family `cf_name` is registered in keyspace `ks_name`.
bool database::has_schema(const sstring& ks_name, const sstring& cf_name) const {
    const auto name_key = std::make_pair(ks_name, cf_name);
    return _ks_cf_to_uuid.count(name_key) != 0;
}
// Returns a view_ptr for every non-system column family whose schema is a
// view.
std::vector database::get_views() const {
return boost::copy_range>(get_non_system_column_families()
| boost::adaptors::filtered([] (auto& cf) { return cf->schema()->is_view(); })
| boost::adaptors::transformed([] (auto& cf) { return view_ptr(cf->schema()); }));
}
// Builds a keyspace object from `ksm` (with a configuration obtained from
// make_keyspace_config), creates its replication strategy and registers it in
// _keyspaces under the metadata's name.
void database::create_in_memory_keyspace(const lw_shared_ptr& ksm) {
// NOTE(review): the std::move looks redundant if make_keyspace_config returns by value - confirm.
keyspace ks(ksm, std::move(make_keyspace_config(*ksm)));
ks.create_replication_strategy(ksm->strategy_options());
_keyspaces.emplace(ksm->name(), std::move(ks));
}
// Creates the keyspace described by `ksm` unless one with the same name is
// already registered (in which case nothing is done).
//
// After the in-memory keyspace is created, its data directory is touched if
// the keyspace has a non-empty datadir.
future<>
database::create_keyspace(const lw_shared_ptr& ksm) {
auto i = _keyspaces.find(ksm->name());
if (i != _keyspaces.end()) {
return make_ready_future<>();
}
create_in_memory_keyspace(ksm);
auto& datadir = _keyspaces.at(ksm->name()).datadir();
if (datadir != "") {
return io_check(touch_directory, datadir);
} else {
return make_ready_future<>();
}
}
// Collects the index names of the tables in keyspace `ks_name`.
//
// Every table returned by the keyspace metadata's tables() contributes the
// names from its schema's index_names(). When `cf_to_exclude` is non-empty,
// the table whose cf_name() equals it is skipped.
// NOTE(review): the element type of the returned std::set is not visible in
// this text; presumably sstring -- confirm against the header.
std::set
database::existing_index_names(const sstring& ks_name, const sstring& cf_to_exclude) const {
std::set names;
for (auto& schema : find_keyspace(ks_name).metadata()->tables()) {
if (!cf_to_exclude.empty() && schema->cf_name() == cf_to_exclude) {
continue;
}
for (const auto& index_name : schema->index_names()) {
names.emplace(index_name);
}
}
return names;
}
// Three-way comparison of two atomic cells, used when merging them.
//
// Returns 1, -1 or a value-comparison result according to the first rule that
// tells the cells apart, and 0 if none does:
//   1. the cell with the larger timestamp compares greater;
//   2. a dead cell compares greater than a live one;
//   3. both live: order by compare_unsigned() of the values, then a cell with
//      a TTL compares greater than one without, then the later expiry
//      compares greater;
//   4. both dead: the later deletion time compares greater, comparing the
//      epoch counts truncated to 32 bits.
//
// Based on:
// - org.apache.cassandra.db.AbstractCell#reconcile()
// - org.apache.cassandra.db.BufferExpiringCell#reconcile()
// - org.apache.cassandra.db.BufferDeletedCell#reconcile()
int
compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
    const auto left_ts = left.timestamp();
    const auto right_ts = right.timestamp();
    if (left_ts != right_ts) {
        return left_ts > right_ts ? 1 : -1;
    }

    const bool left_live = left.is_live();
    if (left_live != right.is_live()) {
        return left_live ? -1 : 1;
    }

    if (!left_live) {
        // Both are deleted
        const auto left_deleted_at = left.deletion_time();
        const auto right_deleted_at = right.deletion_time();
        if (left_deleted_at == right_deleted_at) {
            return 0;
        }
        // Origin compares big-endian serialized deletion time. That's because it
        // delegates to AbstractCell.reconcile() which compares values after
        // comparing timestamps, which in case of deleted cells will hold
        // serialized expiry.
        const auto left_secs = static_cast<uint32_t>(left_deleted_at.time_since_epoch().count());
        const auto right_secs = static_cast<uint32_t>(right_deleted_at.time_since_epoch().count());
        return left_secs < right_secs ? -1 : 1;
    }

    // Both are live
    const auto value_order = compare_unsigned(left.value(), right.value());
    if (value_order != 0) {
        return value_order;
    }
    const bool left_expiring = left.is_live_and_has_ttl();
    if (left_expiring != right.is_live_and_has_ttl()) {
        // prefer expiring cells.
        return left_expiring ? 1 : -1;
    }
    if (left_expiring && left.expiry() != right.expiry()) {
        return left.expiry() < right.expiry() ? -1 : 1;
    }
    return 0;
}
// Mutable state carried across the per-range steps of a single table query.
//
// Holds the schema, a reference to the read command (which must therefore
// outlive this object), the result builder, the row and partition limits taken
// from the command, and a cursor over the partition ranges still to be read.
struct query_state {
    explicit query_state(schema_ptr s,
                         const query::read_command& cmd,
                         query::result_options opts,
                         const dht::partition_range_vector& ranges,
                         query::result_memory_accounter memory_accounter = { })
        : schema(std::move(s))
        , cmd(cmd)
        , builder(cmd.slice, opts, std::move(memory_accounter))
        , limit(cmd.row_limit)
        , partition_limit(cmd.partition_limit)
        , current_partition_range(ranges.begin())
        , range_end(ranges.end()) {
    }

    schema_ptr schema;
    const query::read_command& cmd;
    query::result::builder builder;
    uint32_t limit;
    uint32_t partition_limit;
    bool range_empty = false; // Avoid ubsan false-positive when moving after construction
    dht::partition_range_vector::const_iterator current_partition_range;
    dht::partition_range_vector::const_iterator range_end;

    // Rows that may still be added before the row limit is reached.
    uint32_t remaining_rows() const {
        return limit - builder.row_count();
    }

    // Partitions that may still be added before the partition limit is reached.
    uint32_t remaining_partitions() const {
        return partition_limit - builder.partition_count();
    }

    // True once a limit is exhausted, every range has been consumed, or the
    // builder reports a short read.
    bool done() const {
        if (remaining_rows() == 0) {
            return true;
        }
        if (remaining_partitions() == 0) {
            return true;
        }
        if (current_partition_range == range_end) {
            return true;
        }
        return static_cast<bool>(builder.is_short_read());
    }
};
// Runs a data (or digest) query against this table over `partition_ranges`.
//
// A memory accounter is first obtained from `memory_limiter`: the digest
// variant when opts.request is only_digest, the data variant otherwise. A
// query_state is then built and data_query() is invoked once per partition
// range until query_state::done() becomes true. The builder's result is
// returned wrapped in a shared pointer.
//
// `cmd` and `partition_ranges` are captured by reference in the continuation,
// so the caller has to keep them alive until the returned future resolves.
// Read latency is recorded in _stats in the finally() block regardless of
// whether the query succeeded.
// NOTE(review): several template arguments (return type, make_unique,
// make_ready_future, make_lw_shared) are not visible in this text.
future>
table::query(schema_ptr s,
const query::read_command& cmd,
query::result_options opts,
const dht::partition_range_vector& partition_ranges,
tracing::trace_state_ptr trace_state,
query::result_memory_limiter& memory_limiter,
uint64_t max_size,
db::timeout_clock::time_point timeout,
query::querier_cache_context cache_ctx) {
utils::latency_counter lc;
_stats.reads.set_latency(lc);
auto f = opts.request == query::result_request::only_digest
? memory_limiter.new_digest_read(max_size) : memory_limiter.new_data_read(max_size);
return f.then([this, lc, s = std::move(s), &cmd, opts, &partition_ranges,
trace_state = std::move(trace_state), timeout, cache_ctx = std::move(cache_ctx)] (query::result_memory_accounter accounter) mutable {
// The state lives on the heap so that the reference `qs` stays valid while
// ownership of the pointer is moved into the continuation further down.
auto qs_ptr = std::make_unique(std::move(s), cmd, opts, partition_ranges, std::move(accounter));
auto& qs = *qs_ptr;
return do_until(std::bind(&query_state::done, &qs), [this, &qs, trace_state = std::move(trace_state), timeout, cache_ctx = std::move(cache_ctx)] {
// Take the current range and advance the cursor for the next iteration.
auto&& range = *qs.current_partition_range++;
return data_query(qs.schema, as_mutation_source(), range, qs.cmd.slice, qs.remaining_rows(),
qs.remaining_partitions(), qs.cmd.timestamp, qs.builder, trace_state, timeout, cache_ctx);
}).then([qs_ptr = std::move(qs_ptr), &qs] {
return make_ready_future>(
make_lw_shared(qs.builder.build()));
}).finally([lc, this]() mutable {
_stats.reads.mark(lc);
if (lc.is_start()) {
_stats.estimated_read.add(lc.latency(), _stats.reads.hist.count);
}
});
});
}
// Wraps this table in a mutation_source.
//
// The returned source forwards every request to make_reader() on this table.
// It captures `this`, so it must not be used after the table is gone.
mutation_source
table::as_mutation_source() const {
    auto reader_factory = [this] (schema_ptr schema,
                                  const dht::partition_range& pr,
                                  const query::partition_slice& ps,
                                  const io_priority_class& priority,
                                  tracing::trace_state_ptr tracer,
                                  streamed_mutation::forwarding sm_fwd,
                                  mutation_reader::forwarding mr_fwd) {
        return this->make_reader(std::move(schema), pr, ps, priority, std::move(tracer), sm_fwd, mr_fwd);
    };
    return mutation_source(std::move(reader_factory));
}
// Adds one coordinator read latency sample to _stats.estimated_coordinator_read.
//
// The latency is converted with duration_cast and its tick count is recorded.
// NOTE(review): the target duration of the cast is not visible in this text --
// confirm the unit against the original file.
void table::add_coordinator_read_latency(utils::estimated_histogram::duration latency) {
_stats.estimated_coordinator_read.add(std::chrono::duration_cast(latency).count());
}
// Returns the requested percentile of the coordinator read latency histogram,
// in milliseconds and never less than 1ms.
//
// The value is cached: it is recomputed only when a different percentile is
// requested or the cached value is more than one second old. Each
// recomputation also scales the histogram by 0.9.
std::chrono::milliseconds table::get_coordinator_read_latency_percentile(double percentile) {
    const bool cache_is_fresh = _cached_percentile == percentile
            && lowres_clock::now() - _percentile_cache_timestamp <= 1s;
    if (cache_is_fresh) {
        return _percentile_cache_value;
    }

    _percentile_cache_timestamp = lowres_clock::now();
    _cached_percentile = percentile;
    const auto scaled = _stats.estimated_coordinator_read.percentile(percentile) / 1000;
    _percentile_cache_value = std::max(scaled, int64_t(1)) * 1ms;
    // decay values a little to give new data points more weight
    _stats.estimated_coordinator_read *= 0.9;
    return _percentile_cache_value;
}
// Executes a data query for the column family named by cmd.cf_id.
//
// The query is dispatched through _data_query_stage with a querier cache
// context built from the command's query_uuid and is_first_page. `cmd` and
// `ranges` are passed by reference wrapper, so the caller has to keep them
// alive until the returned future resolves.
//
// When the stage completes, the statistics object is updated: a failure bumps
// total_reads_failed and propagates the exception; a success bumps total_reads,
// adds the short-read flag to short_data_queries, and yields the result
// together with the cache hit rate sampled when the query was issued.
// NOTE(review): `op = cf.read_in_progress()` is held by the continuation,
// presumably to keep the read registered until it finishes -- confirm.
// NOTE(review): template arguments of the future types are not visible in
// this text.
future, cache_temperature>
database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state, uint64_t max_result_size, db::timeout_clock::time_point timeout) {
column_family& cf = find_column_family(cmd.cf_id);
query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page);
return _data_query_stage(&cf,
std::move(s),
seastar::cref(cmd),
opts,
seastar::cref(ranges),
std::move(trace_state),
seastar::ref(get_result_memory_limiter()),
max_result_size,
timeout,
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
if (f.failed()) {
++s->total_reads_failed;
return make_exception_future, cache_temperature>(f.get_exception());
} else {
++s->total_reads;
auto result = f.get0();
s->short_data_queries += bool(result->is_short_read());
return make_ready_future, cache_temperature>(std::move(result), hit_rate);
}
});
}
// Executes a mutation query over a single partition range for the column
// family named by cmd.cf_id.
//
// The query is dispatched through _mutation_query_stage using the column
// family's mutation source, the command's slice, limits and timestamp, and a
// querier cache context built from the command's query_uuid and is_first_page.
// `range` and `cmd.slice` are passed by reference wrapper, so the caller has to
// keep them alive until the returned future resolves.
//
// When the stage completes, the statistics object is updated: a failure bumps
// total_reads_failed and propagates the exception; a success bumps total_reads,
// adds the short-read flag to short_mutation_queries, and yields the result
// together with the cache hit rate sampled when the query was issued.
// NOTE(review): `op = cf.read_in_progress()` is held by the continuation,
// presumably to keep the read registered until it finishes -- confirm.
// NOTE(review): template arguments of the future types are not visible in
// this text.
future
database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range,
query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
column_family& cf = find_column_family(cmd.cf_id);
query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page);
return _mutation_query_stage(std::move(s),
cf.as_mutation_source(),
seastar::cref(range),
seastar::cref(cmd.slice),
cmd.row_limit,
cmd.partition_limit,
cmd.timestamp,
std::move(accounter),
std::move(trace_state),
timeout,
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
if (f.failed()) {
++s->total_reads_failed;
return make_exception_future(f.get_exception());
} else {
++s->total_reads;
auto result = f.get0();
s->short_mutation_queries += bool(result.is_short_read());
return make_ready_future(std::move(result), hit_rate);
}
});
}
// Parses the `initial_token` configuration value into a set of token strings.
//
// The string is split at every ',' and ' ' character. Any exception raised
// while splitting is replaced by a std::runtime_error that quotes the raw
// configuration value.
// NOTE(review): the element type of the returned std::unordered_set is not
// visible in this text; presumably sstring -- confirm against the header.
std::unordered_set database::get_initial_tokens() {
std::unordered_set tokens;
sstring tokens_string = get_config().initial_token();
try {
boost::split(tokens, tokens_string, boost::is_any_of(sstring(", ")));
} catch (...) {
throw std::runtime_error(sprint("Unable to parse initial_token=%s", tokens_string));
}
// Drop the empty token, if the split produced one.
tokens.erase("");
return tokens;
}
// Returns the address this node is configured to replace, if any.
//
// `replace_address` takes precedence over `replace_address_first_boot`; the
// first non-empty one is converted to a gms::inet_address. An empty optional is
// returned when both settings are empty.
// NOTE(review): the optional's value type is not visible in this text;
// presumably gms::inet_address.
std::experimental::optional database::get_replace_address() {
auto& cfg = get_config();
sstring replace_address = cfg.replace_address();
sstring replace_address_first_boot = cfg.replace_address_first_boot();
try {
if (!replace_address.empty()) {
return gms::inet_address(replace_address);
} else if (!replace_address_first_boot.empty()) {
return gms::inet_address(replace_address_first_boot);
}
return std::experimental::nullopt;
} catch (...) {
// NOTE(review): any failure while building the address is swallowed here, so
// a malformed setting is indistinguishable from an unset one -- confirm that
// this best-effort behaviour is intended.
return std::experimental::nullopt;
}
}
bool database::is_replacing() {
sstring replace_address_first_boot = get_config().replace_address_first_boot();
if (!replace_address_first_boot.empty() && db::system_keyspace::bootstrap_complete()) {
dblog.info("Replace address on first boot requested; this node is already bootstrapped");
return false;
}
return bool(get_replace_address());
}
// Registers a callback with the messaging service that runs when a connection
// to an endpoint is dropped.
//
// The callback calls drop_hit_rate(endpoint) on every non-system column
// family. It captures `this`, so the database has to outlive the registration.
void database::register_connection_drop_notifier(netw::messaging_service& ms) {
    auto on_connection_drop = [this] (gms::inet_address endpoint) {
        dblog.debug("Drop hit rate info for {} because of disconnect", endpoint);
        for (auto&& table_ptr : get_non_system_column_families()) {
            table_ptr->drop_hit_rate(endpoint);
        }
    };
    ms.register_connection_drop_notifier(std::move(on_connection_drop));
}
// Prints a column family as "{column_family: <keyspace>/<table>}".
std::ostream& operator<<(std::ostream& out, const column_family& cf) {
    const auto& s = cf._schema;
    return fprint(out, "{column_family: %s/%s}", s->ks_name(), s->cf_name());
}
// Prints every column family known to the database, one per line, between
// braces. Each line has the form
// "(<uuid>, <table name>, <keyspace name>): <column family>".
std::ostream& operator<<(std::ostream& out, const database& db) {
    out << "{\n";
    for (const auto& entry : db._column_families) {
        const auto& id = entry.first;
        const auto& cf = *entry.second;
        const auto& s = cf.schema();
        out << "(" << id.to_sstring() << ", " << s->cf_name() << ", " << s->ks_name() << "): " << cf << "\n";
    }
    return out << "}";
}
// Applies a write to the active memtable and records write statistics.
//
// The replay position is taken from the handle `h` and checked with
// check_valid_rp() before the write. `args` are forwarded to the memtable's
// apply() followed by the handle itself; on success _highest_rp is raised to
// at least `rp`. Write latency is recorded only when the apply succeeds.
//
// If the memtable apply throws, _failed_counter_applies_to_memtable is
// incremented and the exception is rethrown.
// NOTE(review): the counter's name suggests counter writes only, yet it is
// incremented for any write that goes through this path -- confirm intent.
// NOTE(review): the template parameter list and the std::forward type argument
// are not visible in this text.
template
void table::do_apply(db::rp_handle&& h, Args&&... args) {
utils::latency_counter lc;
_stats.writes.set_latency(lc);
// Read the position before `h` is moved into the memtable below.
db::replay_position rp = h;
check_valid_rp(rp);
try {
_memtables->active_memtable().apply(std::forward(args)..., std::move(h));
_highest_rp = std::max(_highest_rp, rp);
} catch (...) {
_failed_counter_applies_to_memtable++;
throw;
}
_stats.writes.mark(lc);
if (lc.is_start()) {
_stats.estimated_write.add(lc.latency(), _stats.writes.hist.count);
}
}
void
table::apply(const mutation& m, db::rp_handle&& h) {
do_apply(std::move(h), m);
}
void
table::apply(const frozen_mutation& m, const schema_ptr& m_schema, db::rp_handle&& h) {
do_apply(std::move(h), m, m_schema);
}
// Applies a counter update to `cf`.
//
// Steps, in order:
//   1. unfreeze `fm` with `m_schema` and upgrade it to the column family's
//      current schema;
//   2. build a partition slice naming exactly the static columns, clustering
//      rows and regular columns that the mutation touches;
//   3. lock the affected counter cells (bounded by `timeout`);
//   4. read the current state of those cells with counter_write_query();
//   5. transform the mutation with transform_counter_updates_to_shards();
//   6. apply the transformed mutation with apply_with_commitlog().
// The returned future yields the transformed mutation.
//
// The slice, the mutation and the lock vector are kept alive by do_with()
// until the whole chain completes.
// NOTE(review): `op = cf.write_in_progress()` is held by the continuation,
// presumably to keep the write registered until it finishes -- confirm.
// NOTE(review): several template arguments (return type, std::vector element
// types) are not visible in this text.
future database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema,
db::timeout_clock::time_point timeout,tracing::trace_state_ptr trace_state) {
auto m = fm.unfreeze(m_schema);
m.upgrade(cf.schema());
// prepare partition slice
std::vector static_columns;
static_columns.reserve(m.partition().static_row().size());
m.partition().static_row().for_each_cell([&] (auto id, auto&&) {
static_columns.emplace_back(id);
});
query::clustering_row_ranges cr_ranges;
cr_ranges.reserve(8);
std::vector regular_columns;
regular_columns.reserve(32);
for (auto&& cr : m.partition().clustered_rows()) {
cr_ranges.emplace_back(query::clustering_range::make_singular(cr.key()));
cr.row().cells().for_each_cell([&] (auto id, auto&&) {
regular_columns.emplace_back(id);
});
}
// The same column can appear in several rows; sort and deduplicate the ids.
boost::sort(regular_columns);
regular_columns.erase(std::unique(regular_columns.begin(), regular_columns.end()),
regular_columns.end());
auto slice = query::partition_slice(std::move(cr_ranges), std::move(static_columns),
std::move(regular_columns), { }, { }, cql_serialization_format::internal(), query::max_rows);
return do_with(std::move(slice), std::move(m), std::vector(),
[this, &cf, timeout, trace_state = std::move(trace_state), op = cf.write_in_progress()] (const query::partition_slice& slice, mutation& m, std::vector& locks) mutable {
tracing::trace(trace_state, "Acquiring counter locks");
return cf.lock_counter_cells(m, timeout).then([&, m_schema = cf.schema(), trace_state = std::move(trace_state), timeout, this] (std::vector lcs) mutable {
// Store the locks in the do_with()-owned vector.
locks = std::move(lcs);
// Before counter update is applied it needs to be transformed from
// deltas to counter shards. To do that, we need to read the current
// counter state for each modified cell...
tracing::trace(trace_state, "Reading counter values from the CF");
// NOTE(review): `timeout` is not passed to counter_write_query() here.
return counter_write_query(m_schema, cf.as_mutation_source(), m.decorated_key(), slice, trace_state)
.then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) {
// ...now, that we got existing state of all affected counter
// cells we can look for our shard in each of them, increment
// its clock and apply the delta.
transform_counter_updates_to_shards(m, mopt ? &*mopt : nullptr, cf.failed_counter_applies_to_memtable());
tracing::trace(trace_state, "Applying counter update");
return this->apply_with_commitlog(cf, m, timeout);
}).then([&m] {
// `m` is a reference into do_with() storage, so it is moved out explicitly.
return std::move(m);
});
});
});
}
// Applies a mutation received through streaming.
//
// A fragmented mutation is routed to apply_streaming_big_mutation() for the
// given plan; any other mutation goes straight into the active streaming
// memtable. The mutation is logged first when trace logging is enabled.
void table::apply_streaming_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m, bool fragmented) {
    if (dblog.is_enabled(logging::log_level::trace)) {
        dblog.trace("streaming apply {}", m.pretty_printer(m_schema));
    }
    if (!fragmented) {
        _streaming_memtables->active_memtable().apply(m, m_schema);
        return;
    }
    apply_streaming_big_mutation(std::move(m_schema), plan_id, m);
}
void table::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m) {
auto it = _streaming_memtables_big.find(plan_id);
if (it == _streaming_memtables_big.end()) {
it = _streaming_memtables_big.emplace(plan_id, make_lw_shared