That happens because a single shard is used to calculate generations for new sstables in the upload directory, which results in that one shard sharing all of its resources with the other shards. Refresh without an upload dir currently works fine because we reshuffle the column family dir instead.

flush_upload_dir() is now a free function: it takes a distributed database object and uses calculate_shard_from_sstable_generation() to decide which shard will move each sstable, with each shard using its own generation namespace.

Fixes #2008.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <b0cccf7bbb61416ff8718bac92fdca90cc5fb9c9.1484253232.git.raphaelsc@scylladb.com>
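A minimal sketch of the shard mapping this description relies on (an assumption for illustration, not the verified body of the real helper): calculate_shard_from_sstable_generation() is assumed to map an sstable generation to the shard whose id equals the generation modulo the number of shards.

    // Sketch only (assumed implementation): generations are allocated per shard, so
    // generation g is taken to be owned by shard g % smp::count, where
    // seastar::smp::count is the number of shards on this node.
    static unsigned calculate_shard_from_sstable_generation(int64_t generation) {
        return generation % seastar::smp::count;
    }

Under that assumption, each shard hands out new generations from its own namespace, so sstables found in the upload directory get moved by all shards in parallel rather than by a single shard.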
/*
 * Copyright (C) 2014 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include "log.hh"
#include "database.hh"
#include "unimplemented.hh"
#include "core/future-util.hh"
#include "db/commitlog/commitlog_entry.hh"
#include "db/system_keyspace.hh"
#include "db/consistency_level.hh"
#include "db/commitlog/commitlog.hh"
#include "db/config.hh"
#include "to_string.hh"
#include "query-result-writer.hh"
#include "nway_merger.hh"
#include "cql3/column_identifier.hh"
#include "core/seastar.hh"
#include <seastar/core/sleep.hh>
#include <seastar/core/rwlock.hh>
#include <seastar/core/metrics.hh>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>
#include "sstables/sstables.hh"
#include "sstables/compaction.hh"
#include "sstables/remove.hh"
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/adaptor/map.hpp>
#include "locator/simple_snitch.hh"
#include <boost/algorithm/cxx11/all_of.hpp>
#include <boost/algorithm/cxx11/any_of.hpp>
#include <boost/function_output_iterator.hpp>
#include <boost/range/algorithm/heap_algorithm.hpp>
#include <boost/range/algorithm/remove_if.hpp>
#include <boost/range/algorithm/find.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include <boost/range/adaptor/map.hpp>
#include "frozen_mutation.hh"
#include "mutation_partition_applier.hh"
#include "core/do_with.hh"
#include "service/migration_manager.hh"
#include "service/storage_service.hh"
#include "mutation_query.hh"
#include "sstable_mutation_readers.hh"
#include <core/fstream.hh>
#include <seastar/core/enum.hh>
#include "utils/latency.hh"
#include "utils/flush_queue.hh"
#include "schema_registry.hh"
#include "service/priority_manager.hh"

#include "checked-file-impl.hh"
#include "disk-error-handler.hh"

using namespace std::chrono_literals;
|
|
|
|
logging::logger dblog("database");
|
|
|
|
// Slight extension to the flush_queue type.
|
|
class column_family::memtable_flush_queue : public utils::flush_queue<db::replay_position> {
|
|
public:
|
|
template<typename Func, typename Post>
|
|
auto run_cf_flush(db::replay_position rp, Func&& func, Post&& post) {
|
|
// special case: empty rp, yet still data.
|
|
// We generate a few memtables with no valid "high_rp", yet
|
|
// still containing data -> actual flush.
|
|
// And to make matters worse, we can initiate a flush of N such
|
|
// tables at the same time.
|
|
// Just queue them at the end of the queue and treat them as such.
|
|
if (rp == db::replay_position() && !empty()) {
|
|
rp = highest_key();
|
|
}
|
|
return run_with_ordered_post_op(rp, std::forward<Func>(func), std::forward<Post>(post));
|
|
}
|
|
};
|
|
|
|
// Used for tests where the CF exists without a database object. We need to pass a valid
|
|
// dirty_memory manager in that case.
|
|
thread_local dirty_memory_manager default_dirty_memory_manager;
|
|
|
|
lw_shared_ptr<memtable_list>
|
|
column_family::make_memory_only_memtable_list() {
|
|
auto get_schema = [this] { return schema(); };
|
|
return make_lw_shared<memtable_list>(std::move(get_schema), _config.dirty_memory_manager);
|
|
}
|
|
|
|
lw_shared_ptr<memtable_list>
|
|
column_family::make_memtable_list() {
|
|
auto seal = [this] (memtable_list::flush_behavior behavior) { return seal_active_memtable(behavior); };
|
|
auto get_schema = [this] { return schema(); };
|
|
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.dirty_memory_manager);
|
|
}
|
|
|
|
lw_shared_ptr<memtable_list>
|
|
column_family::make_streaming_memtable_list() {
|
|
auto seal = [this] (memtable_list::flush_behavior behavior) { return seal_active_streaming_memtable(behavior); };
|
|
auto get_schema = [this] { return schema(); };
|
|
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
|
|
}
|
|
|
|
lw_shared_ptr<memtable_list>
|
|
column_family::make_streaming_memtable_big_list(streaming_memtable_big& smb) {
|
|
auto seal = [this, &smb] (memtable_list::flush_behavior) { return seal_active_streaming_memtable_big(smb); };
|
|
auto get_schema = [this] { return schema(); };
|
|
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
|
|
}
|
|
|
|
column_family::column_family(schema_ptr schema, config config, db::commitlog* cl, compaction_manager& compaction_manager)
|
|
: _schema(std::move(schema))
|
|
, _config(std::move(config))
|
|
, _memtables(_config.enable_disk_writes ? make_memtable_list() : make_memory_only_memtable_list())
|
|
, _streaming_memtables(_config.enable_disk_writes ? make_streaming_memtable_list() : make_memory_only_memtable_list())
|
|
, _compaction_strategy(make_compaction_strategy(_schema->compaction_strategy(), _schema->compaction_strategy_options()))
|
|
, _sstables(make_lw_shared(_compaction_strategy.make_sstable_set(_schema)))
|
|
, _cache(_schema, sstables_as_mutation_source(), global_cache_tracker(), _config.max_cached_partition_size_in_bytes)
|
|
, _commitlog(cl)
|
|
, _compaction_manager(compaction_manager)
|
|
, _flush_queue(std::make_unique<memtable_flush_queue>())
|
|
{
|
|
if (!_config.enable_disk_writes) {
|
|
dblog.warn("Writes disabled, column family not durable.");
|
|
}
|
|
}
|
|
|
|
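// Builds a checker backed by the given sstable set's bloom filters: it selects the
// sstables covering the key's token and reports maybe_exists if any filter matches,
// definitely_doesnt_exist otherwise (false positives are possible, false negatives are not).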
partition_presence_checker
|
|
column_family::make_partition_presence_checker(lw_shared_ptr<sstables::sstable_set> sstables) {
|
|
auto sel = make_lw_shared(sstables->make_incremental_selector());
|
|
return [this, sstables = std::move(sstables), sel = std::move(sel)] (const dht::decorated_key& key) {
|
|
auto& sst = sel->select(key.token());
|
|
if (sst.empty()) {
|
|
return partition_presence_checker_result::definitely_doesnt_exist;
|
|
}
|
|
auto hk = sstables::sstable::make_hashed_key(*_schema, key.key());
|
|
for (auto&& s : sst) {
|
|
if (s->filter_has_key(hk)) {
|
|
return partition_presence_checker_result::maybe_exists;
|
|
}
|
|
}
|
|
return partition_presence_checker_result::definitely_doesnt_exist;
|
|
};
|
|
}
|
|
|
|
mutation_source
|
|
column_family::sstables_as_mutation_source() {
|
|
return mutation_source([this] (schema_ptr s,
|
|
const dht::partition_range& r,
|
|
const query::partition_slice& slice,
|
|
const io_priority_class& pc,
|
|
tracing::trace_state_ptr trace_state) {
|
|
return make_sstable_reader(std::move(s), r, slice, pc, std::move(trace_state));
|
|
});
|
|
}
|
|
|
|
// define in .cc, since sstable is forward-declared in .hh
|
|
column_family::~column_family() {
|
|
}
|
|
|
|
|
|
logalloc::occupancy_stats column_family::occupancy() const {
|
|
logalloc::occupancy_stats res;
|
|
for (auto m : *_memtables) {
|
|
res += m->region().occupancy();
|
|
}
|
|
for (auto m : *_streaming_memtables) {
|
|
res += m->region().occupancy();
|
|
}
|
|
for (auto smb : _streaming_memtables_big) {
|
|
for (auto m : *smb.second->memtables) {
|
|
res += m->region().occupancy();
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
static
|
|
bool belongs_to_current_shard(const streamed_mutation& m) {
|
|
return dht::shard_of(m.decorated_key().token()) == engine().cpu_id();
|
|
}
|
|
|
|
// Stores ranges for all components of the same clustering key, index 0 referring to component
|
|
// range 0, and so on.
|
|
using ck_filter_clustering_key_components = std::vector<nonwrapping_range<bytes_view>>;
|
|
// Stores an entry for each clustering key range specified by the filter.
|
|
using ck_filter_clustering_key_ranges = std::vector<ck_filter_clustering_key_components>;
|
|
|
|
// Used to split a clustering key range into a range for each component.
|
|
// If a range in ck_filtering_all_ranges is composite, a range will be created
|
|
// for each component. If it's not composite, a single range is created.
|
|
// This split is needed to check for overlap in each component individually.
|
|
static ck_filter_clustering_key_ranges
|
|
ranges_for_clustering_key_filter(const schema_ptr& schema, const query::clustering_row_ranges& ck_filtering_all_ranges) {
|
|
ck_filter_clustering_key_ranges ranges;
|
|
|
|
for (auto& r : ck_filtering_all_ranges) {
|
|
// this vector stores a range for each component of a key, only one if not composite.
|
|
ck_filter_clustering_key_components composite_ranges;
|
|
|
|
if (r.is_full()) {
|
|
ranges.push_back({ nonwrapping_range<bytes_view>::make_open_ended_both_sides() });
|
|
continue;
|
|
}
|
|
auto start = r.start() ? r.start()->value().components() : clustering_key_prefix::make_empty().components();
|
|
auto end = r.end() ? r.end()->value().components() : clustering_key_prefix::make_empty().components();
|
|
auto start_it = start.begin();
|
|
auto end_it = end.begin();
|
|
|
|
// This test is enough because equal bounds in nonwrapping_range are inclusive.
|
|
auto is_singular = [&schema] (const auto& type_it, const bytes_view& b1, const bytes_view& b2) {
|
|
if (type_it == schema->clustering_key_type()->types().end()) {
|
|
throw std::runtime_error(sprint("clustering key filter passed more components than defined in schema of %s.%s",
|
|
schema->ks_name(), schema->cf_name()));
|
|
}
|
|
return (*type_it)->compare(b1, b2) == 0;
|
|
};
|
|
auto type_it = schema->clustering_key_type()->types().begin();
|
|
composite_ranges.reserve(schema->clustering_key_size());
|
|
|
|
// the rule is to ignore any component cn if a preceding component ck (k < n) is not of the form [v, v].
|
|
// If we have [v1, v1], [v2, v2], ... {vl3, vr3}, ....
|
|
// then we generate [v1, v1], [v2, v2], ... {vl3, vr3}. Where { = '(' or '[', etc.
|
|
while (start_it != start.end() && end_it != end.end() && is_singular(type_it++, *start_it, *end_it)) {
|
|
composite_ranges.push_back(nonwrapping_range<bytes_view>({{ std::move(*start_it++), true }},
|
|
{{ std::move(*end_it++), true }}));
|
|
}
|
|
// handle a single non-singular tail element, if present
|
|
if (start_it != start.end() && end_it != end.end()) {
|
|
composite_ranges.push_back(nonwrapping_range<bytes_view>({{ std::move(*start_it), r.start()->is_inclusive() }},
|
|
{{ std::move(*end_it), r.end()->is_inclusive() }}));
|
|
} else if (start_it != start.end()) {
|
|
composite_ranges.push_back(nonwrapping_range<bytes_view>({{ std::move(*start_it), r.start()->is_inclusive() }}, {}));
|
|
} else if (end_it != end.end()) {
|
|
composite_ranges.push_back(nonwrapping_range<bytes_view>({}, {{ std::move(*end_it), r.end()->is_inclusive() }}));
|
|
}
|
|
|
|
ranges.push_back(std::move(composite_ranges));
|
|
}
|
|
return ranges;
|
|
}
|
|
|
|
// Return true if this sstable possibly stores clustering row(s) specified by ranges.
|
|
static inline bool
|
|
contains_rows(const sstables::sstable& sst, const schema_ptr& schema, const ck_filter_clustering_key_ranges& ranges) {
|
|
auto& clustering_key_types = schema->clustering_key_type()->types();
|
|
auto& clustering_components_ranges = sst.clustering_components_ranges();
|
|
|
|
if (!schema->clustering_key_size() || clustering_components_ranges.empty()) {
|
|
return true;
|
|
}
|
|
return boost::algorithm::any_of(ranges, [&] (const ck_filter_clustering_key_components& range) {
|
|
auto s = std::min(range.size(), clustering_components_ranges.size());
|
|
return boost::algorithm::all_of(boost::irange<unsigned>(0, s), [&] (unsigned i) {
|
|
auto& type = clustering_key_types[i];
|
|
return range[i].is_full() || range[i].overlaps(clustering_components_ranges[i], type->as_tri_comparator());
|
|
});
|
|
});
|
|
}
|
|
|
|
// Filter out sstables for reader using bloom filter and sstable metadata that keeps track
|
|
// of a range for each clustering component.
|
|
static std::vector<sstables::shared_sstable>
|
|
filter_sstable_for_reader(std::vector<sstables::shared_sstable>&& sstables, column_family& cf, const schema_ptr& schema,
|
|
const sstables::key& key, const query::partition_slice& slice) {
|
|
auto sstable_has_not_key = [&] (const sstables::shared_sstable& sst) {
|
|
return !sst->filter_has_key(key);
|
|
};
|
|
sstables.erase(boost::remove_if(sstables, sstable_has_not_key), sstables.end());
|
|
|
|
// no clustering filtering is applied if schema defines no clustering key or
|
|
// compaction strategy thinks it will not benefit from such an optimization.
|
|
if (!schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter()) {
|
|
return sstables;
|
|
}
|
|
::cf_stats* stats = cf.cf_stats();
|
|
stats->clustering_filter_count++;
|
|
stats->sstables_checked_by_clustering_filter += sstables.size();
|
|
|
|
auto ck_filtering_all_ranges = slice.get_all_ranges();
|
|
// fast path to include all sstables if only one full range was specified.
|
|
// For example, this happens if query only specifies a partition key.
|
|
if (ck_filtering_all_ranges.size() == 1 && ck_filtering_all_ranges[0].is_full()) {
|
|
stats->clustering_filter_fast_path_count++;
|
|
stats->surviving_sstables_after_clustering_filter += sstables.size();
|
|
return sstables;
|
|
}
|
|
auto ranges = ranges_for_clustering_key_filter(schema, ck_filtering_all_ranges);
|
|
if (ranges.empty()) {
|
|
return {};
|
|
}
|
|
|
|
int64_t min_timestamp = std::numeric_limits<int64_t>::max();
|
|
auto sstable_has_clustering_key = [&min_timestamp, &schema, &ranges] (const sstables::shared_sstable& sst) {
|
|
if (!contains_rows(*sst, schema, ranges)) {
|
|
return false; // ordered after sstables that contain clustering rows.
|
|
} else {
|
|
min_timestamp = std::min(min_timestamp, sst->get_stats_metadata().min_timestamp);
|
|
return true;
|
|
}
|
|
};
|
|
auto sstable_has_relevant_tombstone = [&min_timestamp] (const sstables::shared_sstable& sst) {
|
|
const auto& stats = sst->get_stats_metadata();
|
|
// re-add sstable as candidate if it contains a tombstone that may cover a row in an included sstable.
|
|
return (stats.max_timestamp > min_timestamp && stats.estimated_tombstone_drop_time.bin.map.size());
|
|
};
|
|
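// First pass: move sstables whose clustering ranges overlap the filter to the front.
// Second pass: among the skipped ones, keep those whose tombstones may still shadow
// rows in the sstables we kept; everything after that point is erased.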
auto skipped = std::partition(sstables.begin(), sstables.end(), sstable_has_clustering_key);
|
|
auto actually_skipped = std::partition(skipped, sstables.end(), sstable_has_relevant_tombstone);
|
|
sstables.erase(actually_skipped, sstables.end());
|
|
stats->surviving_sstables_after_clustering_filter += sstables.size();
|
|
|
|
return sstables;
|
|
}
|
|
|
|
class range_sstable_reader final : public combined_mutation_reader {
|
|
schema_ptr _s;
|
|
const dht::partition_range* _pr;
|
|
lw_shared_ptr<sstables::sstable_set> _sstables;
|
|
|
|
struct sstable_and_reader {
|
|
sstables::shared_sstable _sstable;
|
|
// This indirection is sad, but we need stable pointers to mutation
|
|
// readers. If this ever becomes a performance issue we could store
|
|
// mutation readers in an object pool (we don't need to preserve order
|
|
// and can have holes left in the container when elements are removed).
|
|
std::unique_ptr<mutation_reader> _reader;
|
|
|
|
bool operator<(const sstable_and_reader& other) const {
|
|
return _sstable < other._sstable;
|
|
}
|
|
|
|
struct less_compare {
|
|
bool operator()(const sstable_and_reader& a, const sstable_and_reader& b) {
|
|
return a < b;
|
|
}
|
|
bool operator()(const sstable_and_reader& a, const sstables::shared_sstable& b) {
|
|
return a._sstable < b;
|
|
}
|
|
bool operator()(const sstables::shared_sstable& a, const sstable_and_reader& b) {
|
|
return a < b._sstable;
|
|
}
|
|
};
|
|
};
|
|
std::vector<sstable_and_reader> _current_readers;
|
|
|
|
// Use a pointer instead of copying, so we don't need to regenerate the reader if
|
|
// the priority changes.
|
|
const io_priority_class& _pc;
|
|
tracing::trace_state_ptr _trace_state;
|
|
const query::partition_slice& _slice;
|
|
private:
|
|
std::unique_ptr<mutation_reader> create_reader(sstables::shared_sstable sst) {
|
|
tracing::trace(_trace_state, "Reading partition range {} from sstable {}", *_pr, seastar::value_of([&sst] { return sst->get_filename(); }));
|
|
// FIXME: make sstable::read_range_rows() return ::mutation_reader so that we can drop this wrapper.
|
|
mutation_reader reader =
|
|
make_mutation_reader<sstable_range_wrapping_reader>(sst, _s, *_pr, _slice, _pc);
|
|
if (sst->is_shared()) {
|
|
reader = make_filtering_reader(std::move(reader), belongs_to_current_shard);
|
|
}
|
|
return std::make_unique<mutation_reader>(std::move(reader));
|
|
}
|
|
public:
|
|
range_sstable_reader(schema_ptr s,
|
|
lw_shared_ptr<sstables::sstable_set> sstables,
|
|
const dht::partition_range& pr,
|
|
const query::partition_slice& slice,
|
|
const io_priority_class& pc,
|
|
tracing::trace_state_ptr trace_state)
|
|
: _s(s)
|
|
, _pr(&pr)
|
|
, _sstables(std::move(sstables))
|
|
, _pc(pc)
|
|
, _trace_state(std::move(trace_state))
|
|
, _slice(slice)
|
|
{
|
|
auto ssts = _sstables->select(pr);
|
|
std::vector<mutation_reader*> readers;
|
|
readers.reserve(ssts.size());
|
|
_current_readers.reserve(ssts.size());
|
|
for (auto& sst : ssts) {
|
|
auto reader = create_reader(sst);
|
|
readers.emplace_back(reader.get());
|
|
_current_readers.emplace_back(sstable_and_reader { sst, std::move(reader) });
|
|
}
|
|
init_mutation_reader_set(std::move(readers));
|
|
}
|
|
|
|
range_sstable_reader(range_sstable_reader&&) = delete; // reader takes reference to member fields
|
|
|
|
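// Computes the delta between the sstables selected for the new range and the ones we
// already have readers for: readers are created only for newly selected sstables,
// readers for sstables no longer selected are dropped, and the rest are reused.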
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
|
|
_pr = &pr;
|
|
|
|
auto new_sstables = _sstables->select(pr);
|
|
boost::range::sort(new_sstables);
|
|
boost::range::sort(_current_readers);
|
|
|
|
std::vector<sstables::shared_sstable> to_add;
|
|
std::vector<sstable_and_reader> to_remove, unchanged;
|
|
sstable_and_reader::less_compare cmp;
|
|
boost::set_difference(new_sstables, _current_readers, std::back_inserter(to_add), cmp);
|
|
std::set_difference(_current_readers.begin(), _current_readers.end(), new_sstables.begin(), new_sstables.end(),
|
|
boost::back_move_inserter(to_remove), cmp);
|
|
std::set_intersection(_current_readers.begin(), _current_readers.end(), new_sstables.begin(), new_sstables.end(),
|
|
boost::back_move_inserter(unchanged), cmp);
|
|
|
|
std::vector<sstable_and_reader> to_add_sar;
|
|
boost::transform(to_add, std::back_inserter(to_add_sar), [&] (const sstables::shared_sstable& sst) {
|
|
return sstable_and_reader { sst, create_reader(sst) };
|
|
});
|
|
|
|
auto get_mutation_readers = [] (std::vector<sstable_and_reader>& ssts) {
|
|
std::vector<mutation_reader*> mrs;
|
|
mrs.reserve(ssts.size());
|
|
boost::range::transform(ssts, std::back_inserter(mrs), [] (const sstable_and_reader& s_a_r) {
|
|
return s_a_r._reader.get();
|
|
});
|
|
return mrs;
|
|
};
|
|
|
|
auto to_add_mrs = get_mutation_readers(to_add_sar);
|
|
auto to_remove_mrs = get_mutation_readers(to_remove);
|
|
|
|
unchanged.insert(unchanged.end(), std::make_move_iterator(to_add_sar.begin()), std::make_move_iterator(to_add_sar.end()));
|
|
return combined_mutation_reader::fast_forward_to(std::move(to_add_mrs), std::move(to_remove_mrs), pr).then([this, new_readers = std::move(unchanged)] () mutable {
|
|
_current_readers = std::move(new_readers);
|
|
});
|
|
}
|
|
};
|
|
|
|
class single_key_sstable_reader final : public mutation_reader::impl {
|
|
column_family* _cf;
|
|
schema_ptr _schema;
|
|
dht::ring_position _rp;
|
|
sstables::key _key;
|
|
std::vector<streamed_mutation> _mutations;
|
|
bool _done = false;
|
|
lw_shared_ptr<sstables::sstable_set> _sstables;
|
|
utils::estimated_histogram& _sstable_histogram;
|
|
// Use a pointer instead of copying, so we don't need to regenerate the reader if
|
|
// the priority changes.
|
|
const io_priority_class& _pc;
|
|
const query::partition_slice& _slice;
|
|
tracing::trace_state_ptr _trace_state;
|
|
public:
|
|
single_key_sstable_reader(column_family* cf,
|
|
schema_ptr schema,
|
|
lw_shared_ptr<sstables::sstable_set> sstables,
|
|
utils::estimated_histogram& sstable_histogram,
|
|
const partition_key& key,
|
|
const query::partition_slice& slice,
|
|
const io_priority_class& pc,
|
|
tracing::trace_state_ptr trace_state)
|
|
: _cf(cf)
|
|
, _schema(std::move(schema))
|
|
, _rp(dht::global_partitioner().decorate_key(*_schema, key))
|
|
, _key(sstables::key::from_partition_key(*_schema, key))
|
|
, _sstables(std::move(sstables))
|
|
, _sstable_histogram(sstable_histogram)
|
|
, _pc(pc)
|
|
, _slice(slice)
|
|
, _trace_state(std::move(trace_state))
|
|
{ }
|
|
|
|
virtual future<streamed_mutation_opt> operator()() override {
|
|
if (_done) {
|
|
return make_ready_future<streamed_mutation_opt>();
|
|
}
|
|
auto candidates = filter_sstable_for_reader(_sstables->select(dht::partition_range(_rp)), *_cf, _schema, _key, _slice);
|
|
return parallel_for_each(std::move(candidates),
|
|
[this](const lw_shared_ptr<sstables::sstable>& sstable) {
|
|
tracing::trace(_trace_state, "Reading key {} from sstable {}", *_rp.key(), seastar::value_of([&sstable] { return sstable->get_filename(); }));
|
|
return sstable->read_row(_schema, _key, _slice, _pc).then([this](auto smo) {
|
|
if (smo) {
|
|
_mutations.emplace_back(std::move(*smo));
|
|
}
|
|
});
|
|
}).then([this] () -> streamed_mutation_opt {
|
|
_done = true;
|
|
if (_mutations.empty()) {
|
|
return { };
|
|
}
|
|
_sstable_histogram.add(_mutations.size());
|
|
return merge_mutations(std::move(_mutations));
|
|
});
|
|
}
|
|
};
|
|
|
|
mutation_reader
|
|
column_family::make_sstable_reader(schema_ptr s,
|
|
const dht::partition_range& pr,
|
|
const query::partition_slice& slice,
|
|
const io_priority_class& pc,
|
|
tracing::trace_state_ptr trace_state) const {
|
|
// restricts a reader's concurrency if the configuration specifies it
|
|
auto restrict_reader = [&] (mutation_reader&& in) {
|
|
auto&& config = [this, &pc] () -> const restricted_mutation_reader_config& {
|
|
if (service::get_local_streaming_read_priority().id() == pc.id()) {
|
|
return _config.streaming_read_concurrency_config;
|
|
}
|
|
return _config.read_concurrency_config;
|
|
}();
|
|
if (config.sem) {
|
|
return make_restricted_reader(config, 1, std::move(in));
|
|
} else {
|
|
return std::move(in);
|
|
}
|
|
};
|
|
|
|
if (pr.is_singular() && pr.start()->value().has_key()) {
|
|
const dht::ring_position& pos = pr.start()->value();
|
|
if (dht::shard_of(pos.token()) != engine().cpu_id()) {
|
|
return make_empty_reader(); // range doesn't belong to this shard
|
|
}
|
|
return restrict_reader(make_mutation_reader<single_key_sstable_reader>(const_cast<column_family*>(this), std::move(s), _sstables,
|
|
_stats.estimated_sstable_per_read, *pos.key(), slice, pc, std::move(trace_state)));
|
|
} else {
|
|
// range_sstable_reader is not movable so we need to wrap it
|
|
return restrict_reader(make_mutation_reader<range_sstable_reader>(std::move(s), _sstables, pr, slice, pc, std::move(trace_state)));
|
|
}
|
|
}
|
|
|
|
// Exposed for testing, not performance critical.
|
|
future<column_family::const_mutation_partition_ptr>
|
|
column_family::find_partition(schema_ptr s, const dht::decorated_key& key) const {
|
|
return do_with(dht::partition_range::make_singular(key), [s = std::move(s), this] (auto& range) {
|
|
return do_with(this->make_reader(s, range), [] (mutation_reader& reader) {
|
|
return reader().then([] (auto sm) {
|
|
return mutation_from_streamed_mutation(std::move(sm));
|
|
}).then([] (mutation_opt&& mo) -> std::unique_ptr<const mutation_partition> {
|
|
if (!mo) {
|
|
return {};
|
|
}
|
|
return std::make_unique<const mutation_partition>(std::move(mo->partition()));
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
future<column_family::const_mutation_partition_ptr>
|
|
column_family::find_partition_slow(schema_ptr s, const partition_key& key) const {
|
|
return find_partition(s, dht::global_partitioner().decorate_key(*s, key));
|
|
}
|
|
|
|
future<column_family::const_row_ptr>
|
|
column_family::find_row(schema_ptr s, const dht::decorated_key& partition_key, clustering_key clustering_key) const {
|
|
return find_partition(s, partition_key).then([clustering_key = std::move(clustering_key), s] (const_mutation_partition_ptr p) {
|
|
if (!p) {
|
|
return make_ready_future<const_row_ptr>();
|
|
}
|
|
auto r = p->find_row(*s, clustering_key);
|
|
if (r) {
|
|
// FIXME: remove copy if only one data source
|
|
return make_ready_future<const_row_ptr>(std::make_unique<row>(*r));
|
|
} else {
|
|
return make_ready_future<const_row_ptr>();
|
|
}
|
|
});
|
|
}
|
|
|
|
mutation_reader
|
|
column_family::make_reader(schema_ptr s,
|
|
const dht::partition_range& range,
|
|
const query::partition_slice& slice,
|
|
const io_priority_class& pc,
|
|
tracing::trace_state_ptr trace_state) const {
|
|
if (_virtual_reader) {
|
|
return _virtual_reader(s, range, slice, pc, trace_state);
|
|
}
|
|
|
|
std::vector<mutation_reader> readers;
|
|
readers.reserve(_memtables->size() + 1);
|
|
|
|
// We're assuming that cache and memtables are both read atomically
|
|
// for single-key queries, so we don't need to special case memtable
|
|
// undergoing a move to cache. At any given point in time between
|
|
// deferring points the sum of data in memtable and cache is coherent. If
|
|
// single-key queries for each data source were performed across deferring
|
|
// points, it would be possible that partitions which are ahead of the
|
|
// memtable cursor would be placed behind the cache cursor, resulting in
|
|
// those partitions being missing in the combined reader.
|
|
//
|
|
// We need to handle this in range queries though, as they are always
|
|
// deferring. scanning_reader from memtable.cc is falling back to reading
|
|
// the sstable when memtable is flushed. After memtable is moved to cache,
|
|
// new readers will no longer use the old memtable, but until then
|
|
// performance may suffer. We should fix this when we add support for
|
|
// range queries in cache, so that scans can always be satisfied from
|
|
// memtable and cache only, as long as data is not evicted.
|
|
//
|
|
// https://github.com/scylladb/scylla/issues/309
|
|
// https://github.com/scylladb/scylla/issues/185
|
|
|
|
for (auto&& mt : *_memtables) {
|
|
readers.emplace_back(mt->make_reader(s, range, slice, pc));
|
|
}
|
|
|
|
if (_config.enable_cache) {
|
|
readers.emplace_back(_cache.make_reader(s, range, slice, pc, std::move(trace_state)));
|
|
} else {
|
|
readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state)));
|
|
}
|
|
|
|
return make_combined_reader(std::move(readers));
|
|
}
|
|
|
|
mutation_reader
|
|
column_family::make_streaming_reader(schema_ptr s,
|
|
const dht::partition_range& range) const {
|
|
auto& slice = query::full_slice;
|
|
auto& pc = service::get_local_streaming_read_priority();
|
|
|
|
std::vector<mutation_reader> readers;
|
|
readers.reserve(_memtables->size() + 1);
|
|
|
|
for (auto&& mt : *_memtables) {
|
|
readers.emplace_back(mt->make_reader(s, range, slice, pc));
|
|
}
|
|
|
|
readers.emplace_back(make_sstable_reader(s, range, slice, pc, nullptr));
|
|
|
|
return make_combined_reader(std::move(readers));
|
|
}
|
|
|
|
mutation_reader
|
|
column_family::make_streaming_reader(schema_ptr s,
|
|
const dht::partition_range_vector& ranges) const {
|
|
auto& slice = query::full_slice;
|
|
auto& pc = service::get_local_streaming_read_priority();
|
|
|
|
auto source = mutation_source([this] (schema_ptr s, const dht::partition_range& range, const query::partition_slice& slice,
|
|
const io_priority_class& pc, tracing::trace_state_ptr trace_state) {
|
|
std::vector<mutation_reader> readers;
|
|
readers.reserve(_memtables->size() + 1);
|
|
for (auto&& mt : *_memtables) {
|
|
readers.emplace_back(mt->make_reader(s, range, slice, pc));
|
|
}
|
|
readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state)));
|
|
return make_combined_reader(std::move(readers));
|
|
});
|
|
|
|
return make_multi_range_reader(s, std::move(source), ranges, slice, pc, nullptr);
|
|
}
|
|
|
|
// Not performance critical. Currently used for testing only.
|
|
template <typename Func>
|
|
future<bool>
|
|
column_family::for_all_partitions(schema_ptr s, Func&& func) const {
|
|
static_assert(std::is_same<bool, std::result_of_t<Func(const dht::decorated_key&, const mutation_partition&)>>::value,
|
|
"bad Func signature");
|
|
|
|
struct iteration_state {
|
|
mutation_reader reader;
|
|
Func func;
|
|
bool ok = true;
|
|
bool empty = false;
|
|
public:
|
|
bool done() const { return !ok || empty; }
|
|
iteration_state(schema_ptr s, const column_family& cf, Func&& func)
|
|
: reader(cf.make_reader(std::move(s)))
|
|
, func(std::move(func))
|
|
{ }
|
|
};
|
|
|
|
return do_with(iteration_state(std::move(s), *this, std::move(func)), [] (iteration_state& is) {
|
|
return do_until([&is] { return is.done(); }, [&is] {
|
|
return is.reader().then([] (auto sm) {
|
|
return mutation_from_streamed_mutation(std::move(sm));
|
|
}).then([&is](mutation_opt&& mo) {
|
|
if (!mo) {
|
|
is.empty = true;
|
|
} else {
|
|
is.ok = is.func(mo->decorated_key(), mo->partition());
|
|
}
|
|
});
|
|
}).then([&is] {
|
|
return is.ok;
|
|
});
|
|
});
|
|
}
|
|
|
|
future<bool>
|
|
column_family::for_all_partitions_slow(schema_ptr s, std::function<bool (const dht::decorated_key&, const mutation_partition&)> func) const {
|
|
return for_all_partitions(std::move(s), std::move(func));
|
|
}
|
|
|
|
class lister {
|
|
public:
|
|
using dir_entry_types = std::unordered_set<directory_entry_type, enum_hash<directory_entry_type>>;
|
|
using walker_type = std::function<future<> (directory_entry)>;
|
|
using filter_type = std::function<bool (const sstring&)>;
|
|
private:
|
|
file _f;
|
|
walker_type _walker;
|
|
filter_type _filter;
|
|
dir_entry_types _expected_type;
|
|
subscription<directory_entry> _listing;
|
|
sstring _dirname;
|
|
|
|
public:
|
|
lister(file f, dir_entry_types type, walker_type walker, sstring dirname)
|
|
: _f(std::move(f))
|
|
, _walker(std::move(walker))
|
|
, _filter([] (const sstring& fname) { return true; })
|
|
, _expected_type(type)
|
|
, _listing(_f.list_directory([this] (directory_entry de) { return _visit(de); }))
|
|
, _dirname(dirname) {
|
|
}
|
|
|
|
lister(file f, dir_entry_types type, walker_type walker, filter_type filter, sstring dirname)
|
|
: lister(std::move(f), type, std::move(walker), dirname) {
|
|
_filter = std::move(filter);
|
|
}
|
|
|
|
static future<> scan_dir(sstring name, dir_entry_types type, walker_type walker, filter_type filter = [] (const sstring& fname) { return true; });
|
|
protected:
|
|
future<> _visit(directory_entry de) {
|
|
|
|
return guarantee_type(std::move(de)).then([this] (directory_entry de) {
|
|
// Hide all synthetic directories and hidden files.
|
|
if ((!_expected_type.count(*(de.type))) || (de.name[0] == '.')) {
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
// apply a filter
|
|
if (!_filter(_dirname + "/" + de.name)) {
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
return _walker(de);
|
|
});
|
|
|
|
}
|
|
future<> done() {
|
|
return _listing.done().then([this] {
|
|
return _f.close();
|
|
});
|
|
}
|
|
private:
|
|
future<directory_entry> guarantee_type(directory_entry de) {
|
|
if (de.type) {
|
|
return make_ready_future<directory_entry>(std::move(de));
|
|
} else {
|
|
auto f = engine().file_type(_dirname + "/" + de.name);
|
|
return f.then([de = std::move(de)] (std::experimental::optional<directory_entry_type> t) mutable {
|
|
de.type = t;
|
|
return make_ready_future<directory_entry>(std::move(de));
|
|
});
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
future<> lister::scan_dir(sstring name, lister::dir_entry_types type, walker_type walker, filter_type filter) {
|
|
return open_checked_directory(general_disk_error_handler, name).then([type, walker = std::move(walker), filter = std::move(filter), name] (file f) {
|
|
auto l = make_lw_shared<lister>(std::move(f), type, walker, filter, name);
|
|
return l->done().then([l] { });
|
|
});
|
|
}
|
|
|
|
static bool belongs_to_current_shard(const std::vector<shard_id>& shards) {
|
|
return boost::find(shards, engine().cpu_id()) != shards.end();
|
|
}
|
|
|
|
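// True if at least one shard other than the current one owns this sstable: for an
// unshared, locally-owned sstable the owner list has size 1 and belongs_to_current_shard()
// yields 1, so the comparison below is 1 != 1 and we return false.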
static bool belongs_to_other_shard(const std::vector<shard_id>& shards) {
|
|
return shards.size() != size_t(belongs_to_current_shard(shards));
|
|
}
|
|
|
|
future<sstables::shared_sstable>
|
|
column_family::open_sstable(sstables::foreign_sstable_open_info info, sstring dir, int64_t generation,
|
|
sstables::sstable::version_types v, sstables::sstable::format_types f) {
|
|
auto sst = make_lw_shared<sstables::sstable>(_schema, dir, generation, v, f);
|
|
if (!belongs_to_current_shard(info.owners)) {
|
|
dblog.debug("sstable {} not relevant for this shard, ignoring", sst->get_filename());
|
|
sst->mark_for_deletion();
|
|
return make_ready_future<sstables::shared_sstable>();
|
|
}
|
|
return sst->load(std::move(info)).then([sst] () mutable {
|
|
return make_ready_future<sstables::shared_sstable>(std::move(sst));
|
|
});
|
|
}
|
|
|
|
void column_family::load_sstable(sstables::shared_sstable& sst, bool reset_level) {
|
|
auto shards = sst->get_shards_for_this_sstable();
|
|
if (belongs_to_other_shard(shards)) {
|
|
// If we're here, this sstable is shared by this and other
|
|
// shard(s). Shared sstables cannot be deleted until all
|
|
// shards compacted them, so to reduce disk space usage we
|
|
// want to start splitting them now.
|
|
// However, we need to delay this compaction until we read all
|
|
// the sstables belonging to this CF, because we need all of
|
|
// them to know which tombstones we can drop, and what
|
|
// generation number is free.
|
|
_sstables_need_rewrite.push_back(sst);
|
|
}
|
|
if (reset_level) {
|
|
// When loading a migrated sstable, set level to 0 because
|
|
// it may overlap with existing tables in levels > 0.
|
|
// This step is optional, because even if we didn't do this
|
|
// scylla would detect the overlap, and bring back some of
|
|
// the sstables to level 0.
|
|
sst->set_sstable_level(0);
|
|
}
|
|
add_sstable(sst, std::move(shards));
|
|
}
|
|
|
|
// load_sstable() wants to start rewriting sstables which are shared between
|
|
// several shards, but we can't start any compaction before all the sstables
|
|
// of this CF were loaded. So call this function to start rewrites, if any.
|
|
void column_family::start_rewrite() {
|
|
// submit shared sstables in generation order to guarantee that all shards
|
|
// owning an sstable will agree on its deletion at nearly the same time,
|
|
// therefore, reducing disk space requirements.
|
|
boost::sort(_sstables_need_rewrite, [] (const sstables::shared_sstable& x, const sstables::shared_sstable& y) {
|
|
return x->generation() < y->generation();
|
|
});
|
|
for (auto sst : _sstables_need_rewrite) {
|
|
dblog.info("Splitting {} for shard", sst->get_filename());
|
|
_compaction_manager.submit_sstable_rewrite(this, sst);
|
|
}
|
|
_sstables_need_rewrite.clear();
|
|
}
|
|
|
|
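// Only the lowest-numbered shard owning the sstable accounts for its disk space and
// sstable count, so a shared sstable is not counted multiple times across shards.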
void column_family::update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, std::vector<unsigned>&& shards_for_the_sstable) {
|
|
assert(!shards_for_the_sstable.empty());
|
|
if (*boost::min_element(shards_for_the_sstable) == engine().cpu_id()) {
|
|
_stats.live_disk_space_used += disk_space_used_by_sstable;
|
|
_stats.total_disk_space_used += disk_space_used_by_sstable;
|
|
_stats.live_sstable_count++;
|
|
}
|
|
}
|
|
|
|
void column_family::add_sstable(lw_shared_ptr<sstables::sstable> sstable, std::vector<unsigned>&& shards_for_the_sstable) {
|
|
// allow in-progress reads to continue using old list
|
|
_sstables = make_lw_shared(*_sstables);
|
|
update_stats_for_new_sstable(sstable->bytes_on_disk(), std::move(shards_for_the_sstable));
|
|
_sstables->insert(std::move(sstable));
|
|
}
|
|
|
|
future<>
|
|
column_family::update_cache(memtable& m, lw_shared_ptr<sstables::sstable_set> old_sstables) {
|
|
if (_config.enable_cache) {
|
|
// be careful to use the old sstable list, since the new one will hit every
|
|
// mutation in m.
|
|
return _cache.update(m, make_partition_presence_checker(std::move(old_sstables)));
|
|
|
|
} else {
|
|
return m.clear_gently();
|
|
}
|
|
}
|
|
|
|
// FIXME: because we are coalescing, it could be that mutations belonging to the same
|
|
// range end up in two different tables. Technically, we should wait for both. However,
|
|
// the only way we have to make this happen now is to wait on all previous writes. This
|
|
// certainly is overkill, so we won't do it. We can fix this longer term by looking
|
|
// at the PREPARE messages, and then noting what is the minimum future we should be
|
|
// waiting for.
|
|
future<>
|
|
column_family::seal_active_streaming_memtable_delayed() {
|
|
auto old = _streaming_memtables->back();
|
|
if (old->empty()) {
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
if (!_delayed_streaming_flush.armed()) {
|
|
// We don't want to wait for too long, because the incoming mutations will not be available
|
|
// until we flush them to SSTables. On top of that, if the sender ran out of messages, it won't
|
|
// send more until we respond to some - which depends on these futures resolving. Sure enough,
|
|
// the real fix for that second one is to have better communication between sender and receiver,
|
|
// but that's not realistic ATM. If we did have better negotiation here, we would not need a timer
|
|
// at all.
|
|
_delayed_streaming_flush.arm(2s);
|
|
}
|
|
|
|
return with_gate(_streaming_flush_gate, [this, old] {
|
|
return _waiting_streaming_flushes.get_shared_future();
|
|
});
|
|
}
|
|
|
|
future<>
|
|
column_family::seal_active_streaming_memtable_immediate() {
|
|
auto old = _streaming_memtables->back();
|
|
if (old->empty()) {
|
|
return make_ready_future<>();
|
|
}
|
|
_streaming_memtables->add_memtable();
|
|
_streaming_memtables->erase(old);
|
|
|
|
auto guard = _streaming_flush_phaser.start();
|
|
return with_gate(_streaming_flush_gate, [this, old] {
|
|
_delayed_streaming_flush.cancel();
|
|
auto current_waiters = std::exchange(_waiting_streaming_flushes, shared_promise<>());
|
|
auto f = current_waiters.get_shared_future(); // for this seal
|
|
|
|
with_lock(_sstables_lock.for_read(), [this, old] {
|
|
auto newtab = make_lw_shared<sstables::sstable>(_schema,
|
|
_config.datadir, calculate_generation_for_new_table(),
|
|
sstables::sstable::version_types::ka,
|
|
sstables::sstable::format_types::big);
|
|
|
|
newtab->set_unshared();
|
|
|
|
auto&& priority = service::get_local_streaming_write_priority();
|
|
// This is somewhat similar to the main memtable flush, but with important differences.
|
|
//
|
|
// The first difference, is that we don't keep aggregate collectd statistics about this one.
|
|
// If we ever need to, we'll keep separate statistics for them, but we don't want to pollute the
|
|
// main stats about memtables with streaming memtables.
|
|
//
|
|
// Second, we will not bother touching the cache after this flush. The current streaming code
|
|
// will invalidate the ranges it touches, so we won't do it twice. Even when that changes, the
|
|
// cache management code in here will have to differ from the main memtable's one. Please see
|
|
// the comment at flush_streaming_mutations() for details.
|
|
//
|
|
// Lastly, we don't have any commitlog RP to update, and we don't need to manipulate the
|
|
// memtable list, since this memtable was not available for reading up until this point.
|
|
return newtab->write_components(*old, incremental_backups_enabled(), priority).then([this, newtab, old] {
|
|
return newtab->open_data();
|
|
}).then([this, old, newtab] () {
|
|
add_sstable(newtab, {engine().cpu_id()});
|
|
trigger_compaction();
|
|
}).handle_exception([] (auto ep) {
|
|
dblog.error("failed to write streamed sstable: {}", ep);
|
|
return make_exception_future<>(ep);
|
|
});
|
|
// We will also not have any retry logic. If we fail here, we'll fail the streaming and let
|
|
// the upper layers know. They can then apply any logic they want here.
|
|
}).then_wrapped([this, current_waiters = std::move(current_waiters)] (future <> f) mutable {
|
|
if (f.failed()) {
|
|
current_waiters.set_exception(f.get_exception());
|
|
} else {
|
|
current_waiters.set_value();
|
|
}
|
|
});
|
|
|
|
return f;
|
|
}).finally([guard = std::move(guard)] { });
|
|
}
|
|
|
|
future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_big& smb) {
|
|
auto old = smb.memtables->back();
|
|
if (old->empty()) {
|
|
return make_ready_future<>();
|
|
}
|
|
smb.memtables->add_memtable();
|
|
smb.memtables->erase(old);
|
|
return with_gate(_streaming_flush_gate, [this, old, &smb] {
|
|
return with_gate(smb.flush_in_progress, [this, old, &smb] {
|
|
return with_lock(_sstables_lock.for_read(), [this, old, &smb] {
|
|
auto newtab = make_lw_shared<sstables::sstable>(_schema,
|
|
_config.datadir, calculate_generation_for_new_table(),
|
|
sstables::sstable::version_types::ka,
|
|
sstables::sstable::format_types::big);
|
|
|
|
newtab->set_unshared();
|
|
|
|
auto&& priority = service::get_local_streaming_write_priority();
|
|
return newtab->write_components(*old, incremental_backups_enabled(), priority, true).then([this, newtab, old, &smb] {
|
|
smb.sstables.emplace_back(newtab);
|
|
}).handle_exception([] (auto ep) {
|
|
dblog.error("failed to write streamed sstable: {}", ep);
|
|
return make_exception_future<>(ep);
|
|
});
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
future<>
|
|
column_family::seal_active_memtable(memtable_list::flush_behavior ignored) {
|
|
auto old = _memtables->back();
|
|
dblog.debug("Sealing active memtable of {}.{}, partitions: {}, occupancy: {}", _schema->ks_name(), _schema->cf_name(), old->partition_count(), old->occupancy());
|
|
|
|
if (old->empty()) {
|
|
dblog.debug("Memtable is empty");
|
|
return make_ready_future<>();
|
|
}
|
|
_memtables->add_memtable();
|
|
|
|
assert(_highest_flushed_rp < old->replay_position()
|
|
|| (_highest_flushed_rp == db::replay_position() && old->replay_position() == db::replay_position())
|
|
);
|
|
_highest_flushed_rp = old->replay_position();
|
|
|
|
return _flush_queue->run_cf_flush(old->replay_position(), [old, this] {
|
|
auto memtable_size = old->occupancy().total_space();
|
|
|
|
_config.cf_stats->pending_memtables_flushes_count++;
|
|
_config.cf_stats->pending_memtables_flushes_bytes += memtable_size;
|
|
|
|
return repeat([this, old] {
|
|
return with_lock(_sstables_lock.for_read(), [this, old] {
|
|
_flush_queue->check_open_gate();
|
|
return try_flush_memtable_to_sstable(old);
|
|
});
|
|
}).then([this, memtable_size] {
|
|
_config.cf_stats->pending_memtables_flushes_count--;
|
|
_config.cf_stats->pending_memtables_flushes_bytes -= memtable_size;
|
|
});
|
|
}, [old, this] {
|
|
if (_commitlog) {
|
|
_commitlog->discard_completed_segments(_schema->id(), old->replay_position());
|
|
}
|
|
});
|
|
// FIXME: release commit log
|
|
// FIXME: provide back-pressure to upper layers
|
|
}
|
|
|
|
future<stop_iteration>
|
|
column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
|
|
auto gen = calculate_generation_for_new_table();
|
|
|
|
auto newtab = make_lw_shared<sstables::sstable>(_schema,
|
|
_config.datadir, gen,
|
|
sstables::sstable::version_types::ka,
|
|
sstables::sstable::format_types::big);
|
|
|
|
newtab->set_unshared();
|
|
dblog.debug("Flushing to {}", newtab->get_filename());
|
|
// Note that due to our sharded architecture, it is possible that
|
|
// in the face of a value change some shards will backup sstables
|
|
// while others won't.
|
|
//
|
|
// This is, in theory, possible to mitigate through a rwlock.
|
|
// However, this doesn't differ from the situation where all tables
|
|
// are coming from a single shard and the toggle happens in the
|
|
// middle of them.
|
|
//
|
|
// The code as is guarantees that we'll never partially backup a
|
|
// single sstable, so that is enough of a guarantee.
|
|
auto&& priority = service::get_local_memtable_flush_priority();
|
|
return newtab->write_components(*old, incremental_backups_enabled(), priority).then([this, newtab, old] {
|
|
return newtab->open_data();
|
|
}).then_wrapped([this, old, newtab] (future<> ret) {
|
|
dblog.debug("Flushing to {} done", newtab->get_filename());
|
|
try {
|
|
ret.get();
|
|
|
|
// Cache updates are serialized because partition_presence_checker
|
|
// is using data source snapshot created before the update starts, so that
|
|
// we can use incremental_selector. If updates were done concurrently we
|
|
// could mispopulate due to stale presence information.
|
|
return with_semaphore(_cache_update_sem, 1, [this, old, newtab] {
|
|
// We must add sstable before we call update_cache(), because
|
|
// memtable's data after moving to cache can be evicted at any time.
|
|
auto old_sstables = _sstables;
|
|
add_sstable(newtab, {engine().cpu_id()});
|
|
old->mark_flushed(newtab);
|
|
|
|
trigger_compaction();
|
|
return update_cache(*old, std::move(old_sstables));
|
|
}).then_wrapped([this, newtab, old] (future<> f) {
|
|
try {
|
|
f.get();
|
|
} catch(...) {
|
|
dblog.error("failed to move memtable for {} to cache: {}", newtab->get_filename(), std::current_exception());
|
|
}
|
|
|
|
_memtables->erase(old);
|
|
dblog.debug("Memtable for {} replaced", newtab->get_filename());
|
|
|
|
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
|
});
|
|
} catch (...) {
|
|
dblog.error("failed to write sstable {}: {}", newtab->get_filename(), std::current_exception());
|
|
// If we failed this write we will try the write again and that will create a new flush reader
|
|
// that will decrease dirty memory again. So we need to reset the accounting.
|
|
old->revert_flushed_memory();
|
|
}
|
|
return sleep(10s).then([] {
|
|
return make_ready_future<stop_iteration>(stop_iteration::no);
|
|
});
|
|
});
|
|
}
|
|
|
|
void
|
|
column_family::start() {
|
|
// FIXME: add option to disable automatic compaction.
|
|
start_compaction();
|
|
}
|
|
|
|
future<>
|
|
column_family::stop() {
|
|
return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
|
|
return _compaction_manager.remove(this).then([this] {
|
|
// Nest, instead of using when_all, so we don't lose any exceptions.
|
|
return _flush_queue->close().then([this] {
|
|
return _streaming_flush_gate.close();
|
|
});
|
|
}).then([this] {
|
|
return _sstable_deletion_gate.close();
|
|
});
|
|
});
|
|
}
|
|
|
|
static io_error_handler error_handler_for_upload_dir() {
|
|
return [] (std::exception_ptr eptr) {
|
|
// do nothing about sstable exception and caller will just rethrow it.
|
|
};
|
|
}
|
|
|
|
// This function will iterate through upload directory in column family,
|
|
// and will do the following for each sstable found:
|
|
// 1) Mutate sstable level to 0.
|
|
// 2) Create hard links to its components in column family dir.
|
|
// 3) Remove all of its components in upload directory.
|
|
// At the end, it's expected that upload dir is empty and all of its
|
|
// previous content was moved to column family dir.
|
|
//
|
|
// Return a vector containing descriptor of sstables to be loaded.
|
|
future<std::vector<sstables::entry_descriptor>>
|
|
distributed_loader::flush_upload_dir(distributed<database>& db, sstring ks_name, sstring cf_name) {
|
|
struct work {
|
|
std::unordered_map<int64_t, sstables::entry_descriptor> descriptors;
|
|
std::vector<sstables::entry_descriptor> flushed;
|
|
};
|
|
|
|
return do_with(work(), [&db, ks_name = std::move(ks_name), cf_name = std::move(cf_name)] (work& work) {
|
|
auto& cf = db.local().find_column_family(ks_name, cf_name);
|
|
|
|
return lister::scan_dir(cf._config.datadir + "/upload/", { directory_entry_type::regular },
|
|
[&work] (directory_entry de) {
|
|
auto comps = sstables::entry_descriptor::make_descriptor(de.name);
|
|
if (comps.component != sstables::sstable::component_type::TOC) {
|
|
return make_ready_future<>();
|
|
}
|
|
work.descriptors.emplace(comps.generation, std::move(comps));
|
|
return make_ready_future<>();
|
|
}, &column_family::manifest_json_filter).then([&db, ks_name = std::move(ks_name), cf_name = std::move(cf_name), &work] {
|
|
work.flushed.reserve(work.descriptors.size());
|
|
|
|
return do_for_each(work.descriptors, [&db, ks_name, cf_name, &work] (auto& pair) {
|
|
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(pair.first),
|
|
[ks_name, cf_name, comps = pair.second] (database& db) {
|
|
auto& cf = db.find_column_family(ks_name, cf_name);
|
|
|
|
auto sst = make_lw_shared<sstables::sstable>(cf.schema(), cf._config.datadir + "/upload", comps.generation,
|
|
comps.version, comps.format, gc_clock::now(),
|
|
[] (disk_error_signal_type&) { return error_handler_for_upload_dir(); });
|
|
auto gen = cf.calculate_generation_for_new_table();
|
|
|
|
// Read toc content as it will be needed for moving and deleting a sstable.
|
|
return sst->read_toc().then([sst] {
|
|
return sst->mutate_sstable_level(0);
|
|
}).then([&cf, sst, gen] {
|
|
return sst->create_links(cf._config.datadir, gen);
|
|
}).then([sst] {
|
|
return sstables::remove_by_toc_name(sst->toc_filename(), error_handler_for_upload_dir());
|
|
}).then([sst, gen] {
|
|
return make_ready_future<int64_t>(gen);
|
|
});
|
|
}).then([&work, comps = pair.second] (auto gen) mutable {
|
|
comps.generation = gen;
|
|
work.flushed.push_back(std::move(comps));
|
|
return make_ready_future<>();
|
|
});
|
|
});
|
|
}).then([&work] {
|
|
return make_ready_future<std::vector<sstables::entry_descriptor>>(std::move(work.flushed));
|
|
});
|
|
});
|
|
}
|
|
|
|
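// Scans the column family directory for sstables whose generations are not yet known
// to the system (i.e. not in all_generations) and renumbers them sequentially starting
// at 'start', so that their generations do not clash with already-loaded sstables.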
future<std::vector<sstables::entry_descriptor>>
|
|
column_family::reshuffle_sstables(std::set<int64_t> all_generations, int64_t start) {
|
|
struct work {
|
|
int64_t current_gen;
|
|
std::set<int64_t> all_generations; // Stores generation of all live sstables in the system.
|
|
std::map<int64_t, sstables::shared_sstable> sstables;
|
|
std::unordered_map<int64_t, sstables::entry_descriptor> descriptors;
|
|
std::vector<sstables::entry_descriptor> reshuffled;
|
|
work(int64_t start, std::set<int64_t> gens)
|
|
: current_gen(start ? start : 1)
|
|
, all_generations(gens) {}
|
|
};
|
|
|
|
return do_with(work(start, std::move(all_generations)), [this] (work& work) {
|
|
return lister::scan_dir(_config.datadir, { directory_entry_type::regular }, [this, &work] (directory_entry de) {
|
|
auto comps = sstables::entry_descriptor::make_descriptor(de.name);
|
|
if (comps.component != sstables::sstable::component_type::TOC) {
|
|
return make_ready_future<>();
|
|
}
|
|
// Skip generations that were already loaded by Scylla at a previous stage.
|
|
if (work.all_generations.count(comps.generation) != 0) {
|
|
return make_ready_future<>();
|
|
}
|
|
auto sst = make_lw_shared<sstables::sstable>(_schema,
|
|
_config.datadir, comps.generation,
|
|
comps.version, comps.format);
|
|
work.sstables.emplace(comps.generation, std::move(sst));
|
|
work.descriptors.emplace(comps.generation, std::move(comps));
|
|
// FIXME: This is the only place in which we actually issue disk activity aside from
|
|
// directory metadata operations.
|
|
//
|
|
// But without the TOC information, we don't know which files we should link.
|
|
// The alternative to that would be to change create link to try creating a
|
|
// link for all possible files and handling the failures gracefully, but that's not
|
|
// exactly fast either.
|
|
//
|
|
// Those SSTables are not known by anyone in the system. So we don't have any kind of
|
|
// object describing them. There isn't too much of a choice.
|
|
return work.sstables[comps.generation]->read_toc();
|
|
}, &manifest_json_filter).then([&work] {
|
|
// Note: cannot be parallel because we will be shuffling things around at this stage. Can't race.
|
|
return do_for_each(work.sstables, [&work] (auto& pair) {
|
|
auto&& comps = std::move(work.descriptors.at(pair.first));
|
|
comps.generation = work.current_gen;
|
|
work.reshuffled.push_back(std::move(comps));
|
|
|
|
if (pair.first == work.current_gen) {
|
|
++work.current_gen;
|
|
return make_ready_future<>();
|
|
}
|
|
return pair.second->set_generation(work.current_gen++);
|
|
});
|
|
}).then([&work] {
|
|
return make_ready_future<std::vector<sstables::entry_descriptor>>(std::move(work.reshuffled));
|
|
});
|
|
});
|
|
}
|
|
|
|
void column_family::rebuild_statistics() {
|
|
// zeroing live_disk_space_used and live_sstable_count because the
|
|
// sstable list was re-created
|
|
_stats.live_disk_space_used = 0;
|
|
_stats.live_sstable_count = 0;
|
|
|
|
for (auto&& tab : boost::range::join(_sstables_compacted_but_not_deleted,
|
|
// this might seem dangerous, but "move" here just avoids constness,
|
|
// making the two ranges compatible when compiling with boost 1.55.
|
|
// No one is actually moving anything...
|
|
std::move(*_sstables->all()))) {
|
|
update_stats_for_new_sstable(tab->data_size(), tab->get_shards_for_this_sstable());
|
|
}
|
|
}
|
|
|
|
void
|
|
column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
|
|
const std::vector<sstables::shared_sstable>& sstables_to_remove) {
|
|
// Build a new list of _sstables: We remove from the existing list the
|
|
// tables we compacted (by now, there might be more sstables flushed
|
|
// later), and we add the new tables generated by the compaction.
|
|
// We create a new list rather than modifying it in-place, so that
|
|
// on-going reads can continue to use the old list.
|
|
//
|
|
// We only remove old sstables after they are successfully deleted,
|
|
// to avoid a new compaction from ignoring data in the old sstables
|
|
// if the deletion fails (note deletion of shared sstables can take
|
|
// unbounded time, because all shards must agree on the deletion).
|
|
auto current_sstables = _sstables;
|
|
auto new_sstable_list = _compaction_strategy.make_sstable_set(_schema);
|
|
auto new_compacted_but_not_deleted = _sstables_compacted_but_not_deleted;
|
|
|
|
|
|
std::unordered_set<sstables::shared_sstable> s(
|
|
sstables_to_remove.begin(), sstables_to_remove.end());
|
|
|
|
// First, add the new sstables.
|
|
|
|
// this might seem dangerous, but "move" here just avoids constness,
|
|
// making the two ranges compatible when compiling with boost 1.55.
|
|
// No one is actually moving anything...
|
|
for (auto&& tab : boost::range::join(new_sstables, std::move(*current_sstables->all()))) {
|
|
// Checks if oldtab is a sstable not being compacted.
|
|
if (!s.count(tab)) {
|
|
new_sstable_list.insert(tab);
|
|
} else {
|
|
new_compacted_but_not_deleted.push_back(tab);
|
|
}
|
|
}
|
|
_sstables = make_lw_shared(std::move(new_sstable_list));
|
|
_sstables_compacted_but_not_deleted = std::move(new_compacted_but_not_deleted);
|
|
|
|
rebuild_statistics();
|
|
|
|
// Second, delete the old sstables. This is done in the background, so we can
|
|
// consider this compaction completed.
|
|
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
|
|
return sstables::delete_atomically(sstables_to_remove).then_wrapped([this, sstables_to_remove] (future<> f) {
|
|
std::exception_ptr eptr;
|
|
try {
|
|
f.get();
|
|
} catch(...) {
|
|
eptr = std::current_exception();
|
|
}
|
|
|
|
// unconditionally remove compacted sstables from _sstables_compacted_but_not_deleted,
|
|
// or they could stay forever in the set, resulting in deleted files remaining
|
|
// opened and disk space not being released until shutdown.
|
|
std::unordered_set<sstables::shared_sstable> s(
|
|
sstables_to_remove.begin(), sstables_to_remove.end());
|
|
auto e = boost::range::remove_if(_sstables_compacted_but_not_deleted, [&] (sstables::shared_sstable sst) -> bool {
|
|
return s.count(sst);
|
|
});
|
|
_sstables_compacted_but_not_deleted.erase(e, _sstables_compacted_but_not_deleted.end());
|
|
rebuild_statistics();
|
|
|
|
if (eptr) {
|
|
return make_exception_future<>(eptr);
|
|
}
|
|
return make_ready_future<>();
|
|
}).handle_exception([] (std::exception_ptr e) {
|
|
try {
|
|
std::rethrow_exception(e);
|
|
} catch (sstables::atomic_deletion_cancelled& adc) {
|
|
dblog.debug("Failed to delete sstables after compaction: {}", adc);
|
|
}
|
|
});
|
|
});
|
|
}
future<>
|
|
column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool cleanup) {
|
|
if (descriptor.sstables.empty()) {
|
|
// if there is nothing to compact, just return.
|
|
return make_ready_future<>();
|
|
}
return with_lock(_sstables_lock.for_read(), [this, descriptor = std::move(descriptor), cleanup] {
|
|
auto sstables_to_compact = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));
|
|
|
|
auto create_sstable = [this] {
|
|
auto gen = this->calculate_generation_for_new_table();
|
|
// FIXME: use "tmp" marker in names of incomplete sstable
|
|
auto sst = make_lw_shared<sstables::sstable>(_schema, _config.datadir, gen,
|
|
sstables::sstable::version_types::ka,
|
|
sstables::sstable::format_types::big);
|
|
sst->set_unshared();
|
|
return sst;
|
|
};
|
|
return sstables::compact_sstables(*sstables_to_compact, *this, create_sstable, descriptor.max_sstable_bytes, descriptor.level,
|
|
cleanup).then([this, sstables_to_compact] (auto new_sstables) {
|
|
_compaction_strategy.notify_completion(*sstables_to_compact, new_sstables);
|
|
return this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
|
|
});
|
|
});
|
|
}
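// Decides whether an sstable still needs a cleanup compaction: returns false
// only when the sstable's [first, last] token range is fully contained in one
// of the ranges this node owns, i.e. when none of its data has to be dropped.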
static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
|
|
const lw_shared_ptr<dht::token_range_vector>& owned_ranges,
|
|
schema_ptr s) {
|
|
auto first = sst->get_first_partition_key();
|
|
auto last = sst->get_last_partition_key();
|
|
auto first_token = dht::global_partitioner().get_token(*s, first);
|
|
auto last_token = dht::global_partitioner().get_token(*s, last);
|
|
dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
|
|
|
|
// return true iff sst partition range isn't fully contained in any of the owned ranges.
|
|
for (auto& r : *owned_ranges) {
|
|
if (r.contains(sst_token_range, dht::token_comparator())) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
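// Cleanup runs a single-sstable compaction (with the cleanup flag set) for each
// sstable that may contain tokens outside the node's owned ranges; sstables
// fully covered by the local ranges are skipped via needs_cleanup().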
future<> column_family::cleanup_sstables(sstables::compaction_descriptor descriptor) {
|
|
dht::token_range_vector r = service::get_local_storage_service().get_local_ranges(_schema->ks_name());
|
|
auto owned_ranges = make_lw_shared<dht::token_range_vector>(std::move(r));
|
|
auto sstables_to_cleanup = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));
|
|
|
|
return parallel_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
|
|
if (!owned_ranges->empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
std::vector<sstables::shared_sstable> sstable_to_compact({ sst });
|
|
return this->compact_sstables(sstables::compaction_descriptor(std::move(sstable_to_compact), sst->get_sstable_level()), true);
|
|
});
|
|
}
// FIXME: this is just an example, should be changed to something more general
|
|
// Note: We assume that the column_family does not get destroyed during compaction.
|
|
future<>
|
|
column_family::compact_all_sstables() {
|
|
std::vector<sstables::shared_sstable> sstables;
|
|
sstables.reserve(_sstables->all()->size());
|
|
for (auto&& sst : *_sstables->all()) {
|
|
sstables.push_back(sst);
|
|
}
|
|
// FIXME: check if the lower bound min_compaction_threshold() from schema
|
|
// should be taken into account before proceeding with compaction.
|
|
return compact_sstables(sstables::compaction_descriptor(std::move(sstables)));
|
|
}
void column_family::start_compaction() {
|
|
set_compaction_strategy(_schema->compaction_strategy());
|
|
}
void column_family::trigger_compaction() {
|
|
// Submitting compaction job to compaction manager.
|
|
do_trigger_compaction(); // see below
|
|
}
void column_family::do_trigger_compaction() {
|
|
// But only submit if we're not locked out
|
|
if (!_compaction_disabled) {
|
|
_compaction_manager.submit(this);
|
|
}
|
|
}
future<> column_family::run_compaction(sstables::compaction_descriptor descriptor) {
|
|
return compact_sstables(std::move(descriptor));
|
|
}
void column_family::set_compaction_strategy(sstables::compaction_strategy_type strategy) {
|
|
dblog.info("Setting compaction strategy of {}.{} to {}", _schema->ks_name(), _schema->cf_name(), sstables::compaction_strategy::name(strategy));
|
|
auto new_cs = make_compaction_strategy(strategy, _schema->compaction_strategy_options());
|
|
auto new_sstables = new_cs.make_sstable_set(_schema);
|
|
for (auto&& s : *_sstables->all()) {
|
|
new_sstables.insert(s);
|
|
}
|
|
// now exception safe:
|
|
_compaction_strategy = std::move(new_cs);
|
|
_sstables = std::move(new_sstables);
|
|
}
size_t column_family::sstables_count() const {
|
|
return _sstables->all()->size();
|
|
}
|
|
|
|
std::vector<uint64_t> column_family::sstable_count_per_level() const {
|
|
std::vector<uint64_t> count_per_level;
|
|
for (auto&& sst : *_sstables->all()) {
|
|
auto level = sst->get_sstable_level();
|
|
|
|
if (level + 1 > count_per_level.size()) {
|
|
count_per_level.resize(level + 1, 0UL);
|
|
}
|
|
count_per_level[level]++;
|
|
}
|
|
return count_per_level;
|
|
}
|
|
|
|
int64_t column_family::get_unleveled_sstables() const {
|
|
// TODO: when we support leveled compaction, we should return the number of
|
|
// SSTables in L0. If leveled compaction is enabled in this column family,
|
|
// then we should return zero, as we currently do.
|
|
return 0;
|
|
}
|
|
|
|
const sstables::sstable_set& column_family::get_sstable_set() const {
|
|
return *_sstables;
|
|
}
|
|
|
|
lw_shared_ptr<sstable_list> column_family::get_sstables() const {
|
|
return _sstables->all();
|
|
}
|
|
|
|
std::vector<sstables::shared_sstable> column_family::select_sstables(const dht::partition_range& range) const {
|
|
return _sstables->select(range);
|
|
}
|
|
|
|
// Gets the list of all sstables in the column family, including ones that are
|
|
// not used for active queries because they have already been compacted, but are
|
|
// waiting for delete_atomically() to return.
|
|
//
|
|
// As long as we haven't deleted them, compaction needs to ensure it doesn't
|
|
// garbage-collect a tombstone that covers data in an sstable that may not be
|
|
// successfully deleted.
|
|
lw_shared_ptr<sstable_list> column_family::get_sstables_including_compacted_undeleted() const {
|
|
if (_sstables_compacted_but_not_deleted.empty()) {
|
|
return get_sstables();
|
|
}
|
|
auto ret = make_lw_shared(*_sstables->all());
|
|
for (auto&& s : _sstables_compacted_but_not_deleted) {
|
|
ret->insert(s);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
const std::vector<sstables::shared_sstable>& column_family::compacted_undeleted_sstables() const {
|
|
return _sstables_compacted_but_not_deleted;
|
|
}
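// Directory-scan filter used when populating a column family: skips a regular
// file named "manifest.json" and accepts every other entry.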
inline bool column_family::manifest_json_filter(const sstring& fname) {
|
|
using namespace boost::filesystem;
|
|
|
|
path entry_path(fname);
|
|
if (!is_directory(status(entry_path)) && entry_path.filename() == path("manifest.json")) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// TODO: possibly move it to seastar
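// Runs func on every shard, handing each invocation a foreign_ptr wrapping ptr,
// so the pointee is shared across shards without copying and is destroyed back
// on the owning shard. A usage sketch (the callback name is hypothetical):
//
//   invoke_all_with_ptr(db, std::move(components), [] (database& db, auto ptr) {
//       return use_components(db, std::move(ptr));
//   });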
template <typename Service, typename PtrType, typename Func>
|
|
static future<> invoke_all_with_ptr(distributed<Service>& s, PtrType ptr, Func&& func) {
|
|
return parallel_for_each(smp::all_cpus(), [&s, &func, ptr] (unsigned id) {
|
|
return s.invoke_on(id, [func, foreign = make_foreign(ptr)] (Service& s) mutable {
|
|
return func(s, std::move(foreign));
|
|
});
|
|
});
|
|
}
|
|
|
|
future<> distributed_loader::open_sstable(distributed<database>& db, sstables::entry_descriptor comps,
|
|
std::function<future<> (column_family&, sstables::foreign_sstable_open_info)> func) {
|
|
// Loads the components of an sstable on shard S and shares them with all
// other shards. The shard an sstable will be opened at is decided using
// calculate_shard_from_sstable_generation(), which is the inverse of
// calculate_generation_for_new_table(). That ensures every sstable is
// shard-local if resharding wasn't performed. This approach is also expected
// to distribute resource usage evenly among all shards.
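// Illustration (a sketch, not the authoritative definition): with N shards,
// generations handed out on shard S are assumed to be congruent to S modulo N
// (S, S+N, S+2N, ...), so calculate_shard_from_sstable_generation(gen) can be
// as simple as gen % N; e.g. generation 10 with 4 shards maps to shard 2.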
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
|
|
[&db, comps = std::move(comps), func = std::move(func)] (database& local) {
|
|
auto& cf = local.find_column_family(comps.ks, comps.cf);
|
|
|
|
auto f = sstables::sstable::load_shared_components(cf.schema(), cf._config.datadir, comps.generation, comps.version, comps.format);
|
|
return f.then([&db, comps = std::move(comps), func = std::move(func)] (sstables::sstable_open_info info) {
|
|
// shared components loaded, now opening sstable in all shards with shared components
|
|
return do_with(std::move(info), [&db, comps = std::move(comps), func = std::move(func)] (auto& info) {
|
|
return invoke_all_with_ptr(db, std::move(info.components),
|
|
[owners = info.owners, data = info.data.dup(), index = info.index.dup(), comps, func] (database& db, auto components) {
|
|
auto& cf = db.find_column_family(comps.ks, comps.cf);
|
|
return func(cf, sstables::foreign_sstable_open_info{std::move(components), owners, data, index});
|
|
});
|
|
});
|
|
});
|
|
});
|
|
}
future<> distributed_loader::load_new_sstables(distributed<database>& db, sstring ks, sstring cf, std::vector<sstables::entry_descriptor> new_tables) {
|
|
return parallel_for_each(new_tables, [&db] (auto comps) {
|
|
auto cf_sstable_open = [comps] (column_family& cf, sstables::foreign_sstable_open_info info) {
|
|
auto f = cf.open_sstable(std::move(info), cf._config.datadir, comps.generation, comps.version, comps.format);
|
|
return f.then([&cf] (sstables::shared_sstable sst) mutable {
|
|
if (sst) {
|
|
cf._sstables_opened_but_not_loaded.push_back(sst);
|
|
}
|
|
return make_ready_future<>();
|
|
});
|
|
};
|
|
return distributed_loader::open_sstable(db, comps, cf_sstable_open);
|
|
}).then([&db, ks = std::move(ks), cf = std::move(cf)] {
|
|
return db.invoke_on_all([ks = std::move(ks), cfname = std::move(cf)] (database& db) {
|
|
auto& cf = db.find_column_family(ks, cfname);
|
|
// atomically load all opened sstables into column family.
|
|
for (auto& sst : cf._sstables_opened_but_not_loaded) {
|
|
cf.load_sstable(sst, true);
|
|
}
|
|
cf._sstables_opened_but_not_loaded.clear();
|
|
cf.start_rewrite();
|
|
cf.trigger_compaction();
|
|
// Drop entire cache for this column family because it may be populated
|
|
// with stale data.
|
|
return cf.get_row_cache().clear();
|
|
});
|
|
});
|
|
}
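// Examines a single file found in the column family directory. The descriptor
// is always returned to the caller, but the sstable itself is only opened (and
// loaded into the column family) when the file is the TOC component, so each
// sstable is opened exactly once no matter how many of its components are
// scanned.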
future<sstables::entry_descriptor> distributed_loader::probe_file(distributed<database>& db, sstring sstdir, sstring fname) {
|
|
using namespace sstables;
|
|
|
|
entry_descriptor comps = entry_descriptor::make_descriptor(fname);
|
|
|
|
// Every table will have a TOC. Using a specific file as a criterion, as
// opposed to, say, verifying _sstables.count() to be zero, is more robust
|
|
// against parallel loading of the directory contents.
|
|
if (comps.component != sstable::component_type::TOC) {
|
|
return make_ready_future<entry_descriptor>(std::move(comps));
|
|
}
|
|
auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
|
|
cf.update_sstables_known_generation(comps.generation);
|
|
{
|
|
auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
|
|
if (i != cf._sstables->all()->end()) {
|
|
auto new_toc = sstdir + "/" + fname;
|
|
throw std::runtime_error(sprint("Attempted to add sstable generation %d twice: new=%s existing=%s",
|
|
comps.generation, new_toc, (*i)->toc_filename()));
|
|
}
|
|
}
|
|
return cf.open_sstable(std::move(info), sstdir, comps.generation, comps.version, comps.format).then([&cf] (sstables::shared_sstable sst) mutable {
|
|
if (sst) {
|
|
cf.load_sstable(sst);
|
|
}
|
|
return make_ready_future<>();
|
|
});
|
|
};
|
|
|
|
return distributed_loader::open_sstable(db, comps, cf_sstable_open).then_wrapped([fname] (future<> f) {
|
|
try {
|
|
f.get();
|
|
} catch (malformed_sstable_exception& e) {
|
|
dblog.error("malformed sstable {}: {}. Refusing to boot", fname, e.what());
|
|
throw;
|
|
} catch(...) {
|
|
dblog.error("Unrecognized error while processing {}: {}. Refusing to boot",
|
|
fname, std::current_exception());
|
|
throw;
|
|
}
|
|
return make_ready_future<>();
|
|
}).then([comps] () mutable {
|
|
return make_ready_future<entry_descriptor>(std::move(comps));
|
|
});
|
|
}
future<> distributed_loader::populate_column_family(distributed<database>& db, sstring sstdir, sstring ks, sstring cf) {
|
|
// We can catch most errors when we try to load an sstable. But if the TOC
|
|
// file is the one missing, we won't try to load the sstable at all. This
|
|
// case is still invalid, but it is much easier for us to handle it
// by waiting for all files to be loaded, and then checking whether we saw a
// file during scan_dir without its corresponding TOC.
|
|
enum class status {
|
|
has_some_file,
|
|
has_toc_file,
|
|
has_temporary_toc_file,
|
|
};
|
|
|
|
struct sstable_descriptor {
|
|
std::experimental::optional<sstables::sstable::version_types> version;
|
|
std::experimental::optional<sstables::sstable::format_types> format;
|
|
};
|
|
|
|
auto verifier = make_lw_shared<std::unordered_map<unsigned long, status>>();
|
|
auto descriptor = make_lw_shared<sstable_descriptor>();
|
|
|
|
return do_with(std::vector<future<>>(), [&db, sstdir, verifier, descriptor, ks, cf] (std::vector<future<>>& futures) {
|
|
return lister::scan_dir(sstdir, { directory_entry_type::regular }, [&db, sstdir, verifier, descriptor, &futures] (directory_entry de) {
|
|
// FIXME: The secondary indexes are in this level, but with a directory type (starting with ".")
|
|
auto f = distributed_loader::probe_file(db, sstdir, de.name).then([verifier, descriptor, sstdir, de] (auto entry) {
|
|
auto filename = sstdir + "/" + de.name;
|
|
if (entry.component == sstables::sstable::component_type::TemporaryStatistics) {
|
|
return remove_file(sstables::sstable::filename(sstdir, entry.ks, entry.cf, entry.version, entry.generation,
|
|
entry.format, sstables::sstable::component_type::TemporaryStatistics));
|
|
}
|
|
|
|
if (verifier->count(entry.generation)) {
|
|
if (verifier->at(entry.generation) == status::has_toc_file) {
|
|
if (entry.component == sstables::sstable::component_type::TOC) {
|
|
throw sstables::malformed_sstable_exception("Invalid State encountered. TOC file already processed", filename);
|
|
} else if (entry.component == sstables::sstable::component_type::TemporaryTOC) {
|
|
throw sstables::malformed_sstable_exception("Invalid State encountered. Temporary TOC file found after TOC file was processed", filename);
|
|
}
|
|
} else if (entry.component == sstables::sstable::component_type::TOC) {
|
|
verifier->at(entry.generation) = status::has_toc_file;
|
|
} else if (entry.component == sstables::sstable::component_type::TemporaryTOC) {
|
|
verifier->at(entry.generation) = status::has_temporary_toc_file;
|
|
}
|
|
} else {
|
|
if (entry.component == sstables::sstable::component_type::TOC) {
|
|
verifier->emplace(entry.generation, status::has_toc_file);
|
|
} else if (entry.component == sstables::sstable::component_type::TemporaryTOC) {
|
|
verifier->emplace(entry.generation, status::has_temporary_toc_file);
|
|
} else {
|
|
verifier->emplace(entry.generation, status::has_some_file);
|
|
}
|
|
}
|
|
|
|
// Retrieve both version and format used for this column family.
|
|
if (!descriptor->version) {
|
|
descriptor->version = entry.version;
|
|
}
|
|
if (!descriptor->format) {
|
|
descriptor->format = entry.format;
|
|
}
|
|
return make_ready_future<>();
|
|
});
|
|
|
|
// push future returned by probe_file into an array of futures,
|
|
// so that the supplied callback will not block scan_dir() from
|
|
// reading the next entry in the directory.
|
|
futures.push_back(std::move(f));
|
|
|
|
return make_ready_future<>();
|
|
}, &column_family::manifest_json_filter).then([&futures] {
|
|
return when_all(futures.begin(), futures.end()).then([] (std::vector<future<>> ret) {
|
|
std::exception_ptr eptr;
|
|
|
|
for (auto& f : ret) {
|
|
try {
|
|
if (eptr) {
|
|
f.ignore_ready_future();
|
|
} else {
|
|
f.get();
|
|
}
|
|
} catch(...) {
|
|
eptr = std::current_exception();
|
|
}
|
|
}
|
|
|
|
if (eptr) {
|
|
return make_exception_future<>(eptr);
|
|
}
|
|
return make_ready_future<>();
|
|
});
|
|
}).then([verifier, sstdir, descriptor, ks = std::move(ks), cf = std::move(cf)] {
|
|
return parallel_for_each(*verifier, [sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), descriptor] (auto v) {
|
|
if (v.second == status::has_temporary_toc_file) {
|
|
unsigned long gen = v.first;
|
|
assert(descriptor->version);
|
|
sstables::sstable::version_types version = descriptor->version.value();
|
|
assert(descriptor->format);
|
|
sstables::sstable::format_types format = descriptor->format.value();
|
|
|
|
if (engine().cpu_id() != 0) {
|
|
dblog.debug("At directory: {}, partial SSTable with generation {} not relevant for this shard, ignoring", sstdir, v.first);
|
|
return make_ready_future<>();
|
|
}
|
|
// shard 0 is responsible for removing a partial sstable.
|
|
return sstables::sstable::remove_sstable_with_temp_toc(ks, cf, sstdir, gen, version, format);
|
|
} else if (v.second != status::has_toc_file) {
|
|
throw sstables::malformed_sstable_exception(sprint("At directory: %s: no TOC found for SSTable with generation %d! Refusing to boot", sstdir, v.first));
|
|
}
|
|
return make_ready_future<>();
|
|
});
|
|
});
|
|
}).then([&db, ks, cf] {
|
|
return db.invoke_on_all([ks = std::move(ks), cfname = std::move(cf)] (database& db) {
|
|
auto& cf = db.find_column_family(ks, cfname);
|
|
// Make sure this is called even if CF is empty
|
|
cf.mark_ready_for_writes();
|
|
});
|
|
});
|
|
|
|
}
utils::UUID database::empty_version = utils::UUID_gen::get_name_UUID(bytes{});
|
|
|
|
database::database() : database(db::config())
|
|
{}
|
|
|
|
database::database(const db::config& cfg)
|
|
: _stats(make_lw_shared<db_stats>())
|
|
, _cfg(std::make_unique<db::config>(cfg))
|
|
// Allow system tables a pool of 10 MB memory to write, but never block on other regions.
|
|
, _system_dirty_memory_manager(*this, 10 << 20)
|
|
, _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45)
|
|
, _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10)
|
|
, _version(empty_version)
|
|
, _enable_incremental_backups(cfg.incremental_backups())
|
|
{
|
|
_compaction_manager.start();
|
|
setup_metrics();
|
|
|
|
dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
|
|
}
|
|
|
|
|
|
void
|
|
dirty_memory_manager::setup_collectd(sstring namestr) {
|
|
namespace sm = seastar::metrics;
|
|
|
|
_metrics.add_group("memory", {
|
|
sm::make_gauge(namestr + "_dirty_bytes", [this] { return real_dirty_memory(); },
|
|
sm::description("Holds the current size of a all non-free memory in bytes: used memory + released memory that hasn't been returned to a free memory pool yet. "
|
|
"Total memory size minus this value represents the amount of available memory. "
|
|
"If this value minus virtual_dirty_bytes is too high then this means that the dirty memory eviction lags behind.")),
|
|
|
|
sm::make_gauge(namestr +"_virtual_dirty_bytes", [this] { return virtual_dirty_memory(); },
|
|
sm::description("Holds the size of used memory in bytes. Compare it to \"dirty_bytes\" to see how many memory is wasted (neither used nor available).")),
|
|
});
|
|
}
|
|
|
|
void
|
|
database::setup_metrics() {
|
|
_dirty_memory_manager.setup_collectd("regular");
|
|
_system_dirty_memory_manager.setup_collectd("system");
|
|
_streaming_dirty_memory_manager.setup_collectd("streaming");
|
|
|
|
namespace sm = seastar::metrics;
|
|
|
|
_metrics.add_group("memory", {
|
|
sm::make_gauge("dirty_bytes", [this] { return _dirty_memory_manager.real_dirty_memory() + _system_dirty_memory_manager.real_dirty_memory() + _streaming_dirty_memory_manager.real_dirty_memory(); },
|
|
sm::description("Holds the current size of all (\"regular\", \"system\" and \"streaming\") non-free memory in bytes: used memory + released memory that hasn't been returned to a free memory pool yet. "
|
|
"Total memory size minus this value represents the amount of available memory. "
|
|
"If this value minus virtual_dirty_bytes is too high then this means that the dirty memory eviction lags behind.")),
|
|
|
|
sm::make_gauge("virtual_dirty_bytes", [this] { return _dirty_memory_manager.virtual_dirty_memory() + _system_dirty_memory_manager.virtual_dirty_memory() + _streaming_dirty_memory_manager.virtual_dirty_memory(); },
|
|
sm::description("Holds the size of all (\"regular\", \"system\" and \"streaming\") used memory in bytes. Compare it to \"dirty_bytes\" to see how many memory is wasted (neither used nor available).")),
|
|
});
|
|
|
|
_metrics.add_group("memtables", {
|
|
sm::make_gauge("pending_flushes", _cf_stats.pending_memtables_flushes_count,
|
|
sm::description("Holds the current number of memtables that are currently being flushed to sstables. "
|
|
"High value in this mertic may be an indication of storage being a bottleneck.")),
|
|
|
|
sm::make_gauge("pending_flushes_bytes", _cf_stats.pending_memtables_flushes_bytes,
|
|
sm::description("Holds the current number of bytes in memtables that are currently being flushed to sstables. "
|
|
"High value in this mertic may be an indication of storage being a bottleneck.")),
|
|
});
|
|
|
|
_metrics.add_group("database", {
|
|
sm::make_gauge("requests_blocked_memory", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
|
|
sm::description(
|
|
seastar::format("Holds the current number of requests blocked due to reaching the memory quota ({}B). "
|
|
"Non-zero value indicates that our bottleneck is memory and more specifically - the memory quota allocated for the \"database\" component.", _dirty_memory_manager.throttle_threshold()))),
|
|
|
|
sm::make_derive("requests_blocked_memory", [this] { return _dirty_memory_manager.region_group().blocked_requests_counter(); },
|
|
sm::description(seastar::format("Holds the current number of requests blocked due to reaching the memory quota ({}B). "
|
|
"Non-zero value indicates that our bottleneck is memory and more specifically - the memory quota allocated for the \"database\" component.", _dirty_memory_manager.throttle_threshold()))),
|
|
|
|
sm::make_derive("clustering_filter_count", _cf_stats.clustering_filter_count,
|
|
sm::description("Counts bloom filter invocations.")),
|
|
|
|
sm::make_derive("clustering_filter_sstables_checked", _cf_stats.sstables_checked_by_clustering_filter,
|
|
sm::description("Counts sstables checked after applying the bloom filter. "
|
|
"High value indicates that bloom filter is not very efficient.")),
|
|
|
|
sm::make_derive("clustering_filter_fast_path_count", _cf_stats.clustering_filter_fast_path_count,
|
|
sm::description("Counts number of times bloom filtering short cut to include all sstables when only one full range was specified.")),
|
|
|
|
sm::make_derive("clustering_filter_surviving_sstables", _cf_stats.surviving_sstables_after_clustering_filter,
|
|
sm::description("Counts sstables that survived the clustering key filtering. "
|
|
"High value indicates that bloom filter is not very efficient and still have to access a lot of sstables to get data.")),
|
|
|
|
sm::make_derive("total_writes", _stats->total_writes,
|
|
sm::description("Counts the total number of successful write operations performed by this shard.")),
|
|
|
|
sm::make_derive("total_writes_failed", _stats->total_writes_failed,
|
|
sm::description("Counts the total number of failed write operations. "
|
|
"A sum of this value plus total_writes represents a total amount of writes attempted on this shard.")),
|
|
|
|
sm::make_derive("total_writes_timedout", _stats->total_writes_timedout,
|
|
sm::description("Counts write operations failed due to a timeout. None zero value is a sign of storage being overloaded.")),
|
|
|
|
sm::make_derive("total_reads", _stats->total_reads,
|
|
sm::description("Counts the total number of successful reads on this shard.")),
|
|
|
|
sm::make_derive("total_reads_failed", _stats->total_reads_failed,
|
|
sm::description("Counts the total number of failed read operations. "
|
|
"Add the total_reads to this value to get the total amount of reads issued on this shard.")),
|
|
|
|
sm::make_derive("sstable_read_queue_overloads", _stats->sstable_read_queue_overloaded,
|
|
sm::description("Counts the number of times the sstable read queue was overloaded. "
|
|
"A non-zero value indicates that we have to drop read requests because they arrive faster than we can serve them.")),
|
|
|
|
sm::make_gauge("active_reads", [this] { return max_concurrent_reads() - _read_concurrency_sem.current(); },
|
|
sm::description(seastar::format("Holds the number of currently active read operations. "
|
|
"If this vlaue gets close to {} we are likely to start dropping new read requests. "
|
|
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_concurrent_reads()))),
|
|
|
|
sm::make_gauge("queued_reads", [this] { return _read_concurrency_sem.waiters(); },
|
|
sm::description("Holds the number of currently queued read operations.")),
|
|
|
|
sm::make_gauge("active_reads_system_keyspace", [this] { return max_system_concurrent_reads() - _system_read_concurrency_sem.current(); },
|
|
sm::description(seastar::format("Holds the number of currently active read operations from \"system\" keyspace tables. "
|
|
"If this vlaue gets close to {} we are likely to start dropping new read requests. "
|
|
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_system_concurrent_reads()))),
|
|
|
|
sm::make_gauge("queued_reads_system_keyspace", [this] { return _system_read_concurrency_sem.waiters(); },
|
|
sm::description("Holds the number of currently queued read operations from \"system\" keyspace tables.")),
|
|
|
|
sm::make_gauge("total_result_bytes", [this] { return get_result_memory_limiter().total_used_memory(); },
|
|
sm::description("Holds the current amount of memory used for results.")),
|
|
|
|
sm::make_derive("short_data_queries", _stats->short_data_queries,
|
|
sm::description("The rate of data queries (data or digest reads) that returned less rows than requested due to result size limiting.")),
|
|
|
|
sm::make_derive("short_mutation_queries", _stats->short_mutation_queries,
|
|
sm::description("The rate of mutation queries that returned less rows than requested due to result size limiting.")),
|
|
});
|
|
}
|
|
|
|
database::~database() {
|
|
}
|
|
|
|
void database::update_version(const utils::UUID& version) {
|
|
_version = version;
|
|
}
|
|
|
|
const utils::UUID& database::get_version() const {
|
|
return _version;
|
|
}
|
|
|
|
future<> distributed_loader::populate_keyspace(distributed<database>& db, sstring datadir, sstring ks_name) {
|
|
auto ksdir = datadir + "/" + ks_name;
|
|
auto& keyspaces = db.local().get_keyspaces();
|
|
auto i = keyspaces.find(ks_name);
|
|
if (i == keyspaces.end()) {
|
|
dblog.warn("Skipping undefined keyspace: {}", ks_name);
|
|
return make_ready_future<>();
|
|
} else {
|
|
dblog.info("Populating Keyspace {}", ks_name);
|
|
auto& ks = i->second;
|
|
auto& column_families = db.local().get_column_families();
|
|
|
|
return parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values,
|
|
[ks_name, &ks, &column_families, &db] (schema_ptr s) {
|
|
utils::UUID uuid = s->id();
|
|
lw_shared_ptr<column_family> cf = column_families[uuid];
|
|
sstring cfname = cf->schema()->cf_name();
|
|
auto sstdir = ks.column_family_directory(cfname, uuid);
|
|
dblog.info("Keyspace {}: Reading CF {} ", ks_name, cfname);
|
|
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
|
|
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
|
|
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
|
|
std::string msg =
|
|
sprint("Exception while populating keyspace '%s' with column family '%s' from file '%s': %s",
|
|
ks_name, cfname, sstdir, eptr);
|
|
dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
|
|
ks_name, cfname, sstdir, eptr);
|
|
throw std::runtime_error(msg.c_str());
|
|
});
|
|
});
|
|
}
|
|
}
|
|
|
|
static future<> populate(distributed<database>& db, sstring datadir) {
|
|
return lister::scan_dir(datadir, { directory_entry_type::directory }, [&db, datadir] (directory_entry de) {
|
|
auto& ks_name = de.name;
|
|
if (ks_name == "system") {
|
|
return make_ready_future<>();
|
|
}
|
|
return distributed_loader::populate_keyspace(db, datadir, ks_name);
|
|
});
|
|
}
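// Queries the given schema table through the storage proxy, collects the
// keyspace names it mentions and then invokes func on the schema partition of
// every non-system keyspace. Failures inside func are logged and the offending
// keyspace is skipped instead of failing the whole parse.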
template <typename Func>
|
|
static future<>
|
|
do_parse_system_tables(distributed<service::storage_proxy>& proxy, const sstring& _cf_name, Func&& func) {
|
|
using namespace db::schema_tables;
|
|
static_assert(std::is_same<future<>, std::result_of_t<Func(schema_result_value_type&)>>::value,
|
|
"bad Func signature");
|
|
|
|
|
|
auto cf_name = make_lw_shared<sstring>(_cf_name);
|
|
return db::system_keyspace::query(proxy, *cf_name).then([] (auto rs) {
|
|
auto names = std::set<sstring>();
|
|
for (auto& r : rs->rows()) {
|
|
auto keyspace_name = r.template get_nonnull<sstring>("keyspace_name");
|
|
names.emplace(keyspace_name);
|
|
}
|
|
return std::move(names);
|
|
}).then([&proxy, cf_name, func = std::forward<Func>(func)] (std::set<sstring>&& names) mutable {
|
|
return parallel_for_each(names.begin(), names.end(), [&proxy, cf_name, func = std::forward<Func>(func)] (sstring name) mutable {
|
|
if (name == "system") {
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
return read_schema_partition_for_keyspace(proxy, *cf_name, name).then([func, cf_name] (auto&& v) mutable {
|
|
return do_with(std::move(v), [func = std::forward<Func>(func), cf_name] (auto& v) {
|
|
return func(v).then_wrapped([cf_name, &v] (future<> f) {
|
|
try {
|
|
f.get();
|
|
} catch (std::exception& e) {
|
|
dblog.error("Skipping: {}. Exception occurred when loading system table {}: {}", v.first, *cf_name, e.what());
|
|
}
|
|
});
|
|
});
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
future<> database::parse_system_tables(distributed<service::storage_proxy>& proxy) {
|
|
using namespace db::schema_tables;
|
|
return do_parse_system_tables(proxy, db::schema_tables::KEYSPACES, [this] (schema_result_value_type &v) {
|
|
auto ksm = create_keyspace_from_schema_partition(v);
|
|
return create_keyspace(ksm);
|
|
}).then([&proxy, this] {
|
|
return do_parse_system_tables(proxy, db::schema_tables::USERTYPES, [this, &proxy] (schema_result_value_type &v) {
|
|
auto&& user_types = create_types_from_schema_partition(v);
|
|
auto& ks = this->find_keyspace(v.first);
|
|
for (auto&& type : user_types) {
|
|
ks.add_user_type(type);
|
|
}
|
|
return make_ready_future<>();
|
|
});
|
|
}).then([&proxy, this] {
|
|
return do_parse_system_tables(proxy, db::schema_tables::VIEWS, [this, &proxy] (schema_result_value_type &v) {
|
|
return create_views_from_schema_partition(proxy, v.second).then([this] (std::vector<view_ptr> views) {
|
|
return parallel_for_each(views.begin(), views.end(), [this] (auto&& v) {
|
|
return this->add_column_family_and_make_directory(v);
|
|
});
|
|
});
|
|
});
|
|
}).then([&proxy, this] {
|
|
return do_parse_system_tables(proxy, db::schema_tables::COLUMNFAMILIES, [this, &proxy] (schema_result_value_type &v) {
|
|
return create_tables_from_tables_partition(proxy, v.second).then([this] (std::map<sstring, schema_ptr> tables) {
|
|
return parallel_for_each(tables.begin(), tables.end(), [this] (auto& t) {
|
|
return this->add_column_family_and_make_directory(t.second);
|
|
});
|
|
});
|
|
});
|
|
});
|
|
}
future<> distributed_loader::init_system_keyspace(distributed<database>& db) {
|
|
return seastar::async([&db] {
|
|
// We need to initialize the commitlog on shard 0 before it is initialized on
// other shards, because it obtains the list of pre-existing segments for
// replay, which must not include reserve segments created by active commitlogs.
|
|
db.invoke_on(0, [] (database& db) {
|
|
return db.init_commitlog();
|
|
}).get();
|
|
db.invoke_on_all([] (database& db) {
|
|
if (engine().cpu_id() == 0) {
|
|
return make_ready_future<>();
|
|
}
|
|
return db.init_commitlog();
|
|
}).get();
|
|
|
|
db.invoke_on_all([] (database& db) {
|
|
auto& cfg = db.get_config();
|
|
bool durable = cfg.data_file_directories().size() > 0;
|
|
db::system_keyspace::make(db, durable, cfg.volatile_system_keyspace_for_testing());
|
|
}).get();
|
|
|
|
// FIXME support multiple directories
|
|
const auto& cfg = db.local().get_config();
|
|
auto data_dir = cfg.data_file_directories()[0];
|
|
io_check(touch_directory, data_dir + "/" + db::system_keyspace::NAME).get();
|
|
distributed_loader::populate_keyspace(db, data_dir, db::system_keyspace::NAME).get();
|
|
|
|
db.invoke_on_all([] (database& db) {
|
|
auto& ks = db.find_keyspace(db::system_keyspace::NAME);
|
|
for (auto& pair : ks.metadata()->cf_meta_data()) {
|
|
auto cfm = pair.second;
|
|
auto& cf = db.find_column_family(cfm);
|
|
cf.mark_ready_for_writes();
|
|
}
|
|
return make_ready_future<>();
|
|
}).get();
|
|
});
|
|
}
|
|
|
|
future<> distributed_loader::init_non_system_keyspaces(distributed<database>& db, distributed<service::storage_proxy>& proxy) {
|
|
return seastar::async([&db, &proxy] {
|
|
db.invoke_on_all([&proxy] (database& db) {
|
|
return db.parse_system_tables(proxy);
|
|
}).get();
|
|
|
|
const auto& cfg = db.local().get_config();
|
|
populate(db, cfg.data_file_directories()[0]).get();
|
|
});
|
|
}
|
|
|
|
future<>
|
|
database::init_commitlog() {
|
|
return db::commitlog::create_commitlog(*_cfg).then([this](db::commitlog&& log) {
|
|
_commitlog = std::make_unique<db::commitlog>(std::move(log));
|
|
_commitlog->add_flush_handler([this](db::cf_id_type id, db::replay_position pos) {
|
|
if (_column_families.count(id) == 0) {
|
|
// the CF has been removed.
|
|
_commitlog->discard_completed_segments(id, pos);
|
|
return;
|
|
}
|
|
_column_families[id]->flush(pos);
|
|
}).release(); // we have a longer lifetime than the CL. Ignore the registration anchor.
|
|
});
|
|
}
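// The shard_of() overloads below map a token (or the token of a mutation) to
// the shard that owns it, delegating to the partitioner's sharding function.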
unsigned
|
|
database::shard_of(const dht::token& t) {
|
|
return dht::shard_of(t);
|
|
}
|
|
|
|
unsigned
|
|
database::shard_of(const mutation& m) {
|
|
return shard_of(m.token());
|
|
}
|
|
|
|
unsigned
|
|
database::shard_of(const frozen_mutation& m) {
|
|
// FIXME: This lookup wouldn't be necessary if we
|
|
// sent the partition key in legacy form or together
|
|
// with token.
|
|
schema_ptr schema = find_schema(m.column_family_id());
|
|
return shard_of(dht::global_partitioner().get_token(*schema, m.key(*schema)));
|
|
}
|
|
|
|
void database::add_keyspace(sstring name, keyspace k) {
|
|
if (_keyspaces.count(name) != 0) {
|
|
throw std::invalid_argument("Keyspace " + name + " already exists");
|
|
}
|
|
_keyspaces.emplace(std::move(name), std::move(k));
|
|
}
|
|
|
|
future<> database::update_keyspace(const sstring& name) {
|
|
auto& proxy = service::get_storage_proxy();
|
|
return db::schema_tables::read_schema_partition_for_keyspace(proxy, db::schema_tables::KEYSPACES, name).then([this, name](db::schema_tables::schema_result_value_type&& v) {
|
|
auto& ks = find_keyspace(name);
|
|
|
|
auto tmp_ksm = db::schema_tables::create_keyspace_from_schema_partition(v);
|
|
auto new_ksm = ::make_lw_shared<keyspace_metadata>(tmp_ksm->name(), tmp_ksm->strategy_name(), tmp_ksm->strategy_options(), tmp_ksm->durable_writes(),
|
|
boost::copy_range<std::vector<schema_ptr>>(ks.metadata()->cf_meta_data() | boost::adaptors::map_values), ks.metadata()->user_types());
|
|
ks.update_from(std::move(new_ksm));
|
|
return service::get_local_migration_manager().notify_update_keyspace(ks.metadata());
|
|
});
|
|
}
|
|
|
|
void database::drop_keyspace(const sstring& name) {
|
|
_keyspaces.erase(name);
|
|
}
|
|
|
|
void database::add_column_family(keyspace& ks, schema_ptr schema, column_family::config cfg) {
|
|
schema = local_schema_registry().learn(schema);
|
|
schema->registry_entry()->mark_synced();
|
|
|
|
lw_shared_ptr<column_family> cf;
|
|
if (cfg.enable_commitlog && _commitlog) {
|
|
cf = make_lw_shared<column_family>(schema, std::move(cfg), *_commitlog, _compaction_manager);
|
|
} else {
|
|
cf = make_lw_shared<column_family>(schema, std::move(cfg), column_family::no_commitlog(), _compaction_manager);
|
|
}
|
|
|
|
auto uuid = schema->id();
|
|
if (_column_families.count(uuid) != 0) {
|
|
throw std::invalid_argument("UUID " + uuid.to_sstring() + " already mapped");
|
|
}
|
|
auto kscf = std::make_pair(schema->ks_name(), schema->cf_name());
|
|
if (_ks_cf_to_uuid.count(kscf) != 0) {
|
|
throw std::invalid_argument("Column family " + schema->cf_name() + " exists");
|
|
}
|
|
ks.add_or_update_column_family(schema);
|
|
cf->start();
|
|
_column_families.emplace(uuid, std::move(cf));
|
|
_ks_cf_to_uuid.emplace(std::move(kscf), uuid);
|
|
if (schema->is_view()) {
|
|
find_column_family(schema->view_info()->base_id()).add_or_update_view(view_ptr(schema));
|
|
}
|
|
}
|
|
|
|
future<> database::add_column_family_and_make_directory(schema_ptr schema) {
|
|
auto& ks = find_keyspace(schema->ks_name());
|
|
add_column_family(ks, schema, ks.make_column_family_config(*schema, get_config()));
|
|
return ks.make_directory_for_column_family(schema->cf_name(), schema->id());
|
|
}
|
|
|
|
bool database::update_column_family(schema_ptr new_schema) {
|
|
column_family& cfm = find_column_family(new_schema->id());
|
|
bool columns_changed = !cfm.schema()->equal_columns(*new_schema);
|
|
auto s = local_schema_registry().learn(new_schema);
|
|
s->registry_entry()->mark_synced();
|
|
cfm.set_schema(s);
|
|
find_keyspace(s->ks_name()).metadata()->add_or_update_column_family(s);
|
|
if (s->is_view()) {
|
|
try {
|
|
find_column_family(s->view_info()->base_id()).add_or_update_view(view_ptr(s));
|
|
} catch (no_such_column_family&) {
|
|
// Update view mutations received after base table drop.
|
|
}
|
|
}
|
|
return columns_changed;
|
|
}
|
|
|
|
future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf) {
|
|
auto uuid = find_uuid(ks_name, cf_name);
|
|
auto& ks = find_keyspace(ks_name);
|
|
auto cf = _column_families.at(uuid);
|
|
auto&& s = cf->schema();
|
|
_column_families.erase(uuid);
|
|
ks.metadata()->remove_column_family(s);
|
|
_ks_cf_to_uuid.erase(std::make_pair(ks_name, cf_name));
|
|
if (s->is_view()) {
|
|
try {
|
|
find_column_family(s->view_info()->base_id()).remove_view(view_ptr(s));
|
|
} catch (no_such_column_family&) {
|
|
// Drop view mutations received after base table drop.
|
|
}
|
|
}
|
|
return truncate(ks, *cf, std::move(tsf)).then([this, cf] {
|
|
return cf->stop();
|
|
}).then([this, cf] {
|
|
return make_ready_future<>();
|
|
});
|
|
}
|
|
|
|
const utils::UUID& database::find_uuid(const sstring& ks, const sstring& cf) const {
|
|
try {
|
|
return _ks_cf_to_uuid.at(std::make_pair(ks, cf));
|
|
} catch (...) {
|
|
throw std::out_of_range("");
|
|
}
|
|
}
|
|
|
|
const utils::UUID& database::find_uuid(const schema_ptr& schema) const {
|
|
return find_uuid(schema->ks_name(), schema->cf_name());
|
|
}
|
|
|
|
keyspace& database::find_keyspace(const sstring& name) {
|
|
try {
|
|
return _keyspaces.at(name);
|
|
} catch (...) {
|
|
std::throw_with_nested(no_such_keyspace(name));
|
|
}
|
|
}
|
|
|
|
const keyspace& database::find_keyspace(const sstring& name) const {
|
|
try {
|
|
return _keyspaces.at(name);
|
|
} catch (...) {
|
|
std::throw_with_nested(no_such_keyspace(name));
|
|
}
|
|
}
|
|
|
|
bool database::has_keyspace(const sstring& name) const {
|
|
return _keyspaces.count(name) != 0;
|
|
}
|
|
|
|
std::vector<sstring> database::get_non_system_keyspaces() const {
|
|
std::vector<sstring> res;
|
|
for (auto const &i : _keyspaces) {
|
|
if (i.first != db::system_keyspace::NAME) {
|
|
res.push_back(i.first);
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
std::vector<lw_shared_ptr<column_family>> database::get_non_system_column_families() const {
|
|
return boost::copy_range<std::vector<lw_shared_ptr<column_family>>>(
|
|
get_column_families()
|
|
| boost::adaptors::map_values
|
|
| boost::adaptors::filtered([](const lw_shared_ptr<column_family>& cf) {
|
|
return cf->schema()->ks_name() != db::system_keyspace::NAME;
|
|
}));
|
|
}
|
|
|
|
column_family& database::find_column_family(const sstring& ks_name, const sstring& cf_name) {
|
|
try {
|
|
return find_column_family(find_uuid(ks_name, cf_name));
|
|
} catch (...) {
|
|
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
|
|
}
|
|
}
|
|
|
|
const column_family& database::find_column_family(const sstring& ks_name, const sstring& cf_name) const {
|
|
try {
|
|
return find_column_family(find_uuid(ks_name, cf_name));
|
|
} catch (...) {
|
|
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
|
|
}
|
|
}
|
|
|
|
column_family& database::find_column_family(const utils::UUID& uuid) {
|
|
try {
|
|
return *_column_families.at(uuid);
|
|
} catch (...) {
|
|
std::throw_with_nested(no_such_column_family(uuid));
|
|
}
|
|
}
|
|
|
|
const column_family& database::find_column_family(const utils::UUID& uuid) const {
|
|
try {
|
|
return *_column_families.at(uuid);
|
|
} catch (...) {
|
|
std::throw_with_nested(no_such_column_family(uuid));
|
|
}
|
|
}
|
|
|
|
bool database::column_family_exists(const utils::UUID& uuid) const {
|
|
return _column_families.count(uuid);
|
|
}
|
|
|
|
void
|
|
keyspace::create_replication_strategy(const std::map<sstring, sstring>& options) {
|
|
using namespace locator;
|
|
|
|
auto& ss = service::get_local_storage_service();
|
|
_replication_strategy =
|
|
abstract_replication_strategy::create_replication_strategy(
|
|
_metadata->name(), _metadata->strategy_name(),
|
|
ss.get_token_metadata(), options);
|
|
}
|
|
|
|
locator::abstract_replication_strategy&
|
|
keyspace::get_replication_strategy() {
|
|
return *_replication_strategy;
|
|
}
|
|
|
|
|
|
const locator::abstract_replication_strategy&
|
|
keyspace::get_replication_strategy() const {
|
|
return *_replication_strategy;
|
|
}
|
|
|
|
void
|
|
keyspace::set_replication_strategy(std::unique_ptr<locator::abstract_replication_strategy> replication_strategy) {
|
|
_replication_strategy = std::move(replication_strategy);
|
|
}
|
|
|
|
void keyspace::update_from(::lw_shared_ptr<keyspace_metadata> ksm) {
|
|
_metadata = std::move(ksm);
|
|
create_replication_strategy(_metadata->strategy_options());
|
|
}
|
|
|
|
column_family::config
|
|
keyspace::make_column_family_config(const schema& s, const db::config& db_config) const {
|
|
column_family::config cfg;
|
|
cfg.datadir = column_family_directory(s.cf_name(), s.id());
|
|
cfg.enable_disk_reads = _config.enable_disk_reads;
|
|
cfg.enable_disk_writes = _config.enable_disk_writes;
|
|
cfg.enable_commitlog = _config.enable_commitlog;
|
|
cfg.enable_cache = _config.enable_cache;
|
|
cfg.dirty_memory_manager = _config.dirty_memory_manager;
|
|
cfg.streaming_dirty_memory_manager = _config.streaming_dirty_memory_manager;
|
|
cfg.read_concurrency_config = _config.read_concurrency_config;
|
|
cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config;
|
|
cfg.cf_stats = _config.cf_stats;
|
|
cfg.enable_incremental_backups = _config.enable_incremental_backups;
|
|
cfg.max_cached_partition_size_in_bytes = db_config.max_cached_partition_size_in_kb() * 1024;
|
|
|
|
return cfg;
|
|
}
|
|
|
|
sstring
|
|
keyspace::column_family_directory(const sstring& name, utils::UUID uuid) const {
|
|
auto uuid_sstring = uuid.to_sstring();
|
|
boost::erase_all(uuid_sstring, "-");
|
|
return sprint("%s/%s-%s", _config.datadir, name, uuid_sstring);
|
|
}
|
|
|
|
future<>
|
|
keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid) {
|
|
auto cfdir = column_family_directory(name, uuid);
|
|
return seastar::async([cfdir = std::move(cfdir)] {
|
|
io_check(touch_directory, cfdir).get();
|
|
io_check(touch_directory, cfdir + "/upload").get();
|
|
});
|
|
}
|
|
|
|
no_such_keyspace::no_such_keyspace(const sstring& ks_name)
|
|
: runtime_error{sprint("Can't find a keyspace %s", ks_name)}
|
|
{
|
|
}
|
|
|
|
no_such_column_family::no_such_column_family(const utils::UUID& uuid)
|
|
: runtime_error{sprint("Can't find a column family with UUID %s", uuid)}
|
|
{
|
|
}
|
|
|
|
no_such_column_family::no_such_column_family(const sstring& ks_name, const sstring& cf_name)
|
|
: runtime_error{sprint("Can't find a column family %s in keyspace %s", cf_name, ks_name)}
|
|
{
|
|
}
|
|
|
|
column_family& database::find_column_family(const schema_ptr& schema) {
|
|
return find_column_family(schema->id());
|
|
}
|
|
|
|
const column_family& database::find_column_family(const schema_ptr& schema) const {
|
|
return find_column_family(schema->id());
|
|
}
|
|
|
|
void keyspace_metadata::validate() const {
|
|
using namespace locator;
|
|
|
|
auto& ss = service::get_local_storage_service();
|
|
abstract_replication_strategy::validate_replication_strategy(name(), strategy_name(), ss.get_token_metadata(), strategy_options());
|
|
}
|
|
|
|
std::vector<schema_ptr> keyspace_metadata::tables() const {
|
|
return boost::copy_range<std::vector<schema_ptr>>(_cf_meta_data
|
|
| boost::adaptors::map_values
|
|
| boost::adaptors::filtered([] (auto&& s) { return !s->is_view(); }));
|
|
}
|
|
|
|
std::vector<view_ptr> keyspace_metadata::views() const {
|
|
return boost::copy_range<std::vector<view_ptr>>(_cf_meta_data
|
|
| boost::adaptors::map_values
|
|
| boost::adaptors::filtered(std::mem_fn(&schema::is_view))
|
|
| boost::adaptors::transformed([] (auto&& s) { return view_ptr(s); }));
|
|
}
|
|
|
|
schema_ptr database::find_schema(const sstring& ks_name, const sstring& cf_name) const {
|
|
try {
|
|
return find_schema(find_uuid(ks_name, cf_name));
|
|
} catch (std::out_of_range&) {
|
|
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
|
|
}
|
|
}
|
|
|
|
schema_ptr database::find_schema(const utils::UUID& uuid) const {
|
|
return find_column_family(uuid).schema();
|
|
}
|
|
|
|
bool database::has_schema(const sstring& ks_name, const sstring& cf_name) const {
|
|
return _ks_cf_to_uuid.count(std::make_pair(ks_name, cf_name)) > 0;
|
|
}
|
|
|
|
|
|
void database::create_in_memory_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm) {
|
|
keyspace ks(ksm, std::move(make_keyspace_config(*ksm)));
|
|
ks.create_replication_strategy(ksm->strategy_options());
|
|
_keyspaces.emplace(ksm->name(), std::move(ks));
|
|
}
|
|
|
|
future<>
|
|
database::create_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm) {
|
|
auto i = _keyspaces.find(ksm->name());
|
|
if (i != _keyspaces.end()) {
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
create_in_memory_keyspace(ksm);
|
|
auto& datadir = _keyspaces.at(ksm->name()).datadir();
|
|
if (datadir != "") {
|
|
return io_check(touch_directory, datadir);
|
|
} else {
|
|
return make_ready_future<>();
|
|
}
|
|
}
|
|
|
|
std::set<sstring>
|
|
database::existing_index_names(const sstring& cf_to_exclude) const {
|
|
std::set<sstring> names;
|
|
for (auto& p : _column_families) {
|
|
auto& cf = *p.second;
|
|
if (!cf_to_exclude.empty() && cf.schema()->cf_name() == cf_to_exclude) {
|
|
continue;
|
|
}
|
|
for (auto& cd : cf.schema()->all_columns_in_select_order()) {
|
|
if (cd.idx_info.index_name) {
|
|
names.emplace(*cd.idx_info.index_name);
|
|
}
|
|
}
|
|
}
|
|
return names;
|
|
}
|
|
|
|
// Based on:
|
|
// - org.apache.cassandra.db.AbstractCell#reconcile()
|
|
// - org.apache.cassandra.db.BufferExpiringCell#reconcile()
|
|
// - org.apache.cassandra.db.BufferDeletedCell#reconcile()
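// Total order used for reconciliation: the cell with the higher timestamp wins;
// on a timestamp tie a tombstone beats a live cell, live cells are ordered by
// value and then by expiry, and tombstones by their (serialized) deletion time.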
int
|
|
compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
|
|
if (left.timestamp() != right.timestamp()) {
|
|
return left.timestamp() > right.timestamp() ? 1 : -1;
|
|
}
|
|
if (left.is_live() != right.is_live()) {
|
|
return left.is_live() ? -1 : 1;
|
|
}
|
|
if (left.is_live()) {
|
|
auto c = compare_unsigned(left.value(), right.value());
|
|
if (c != 0) {
|
|
return c;
|
|
}
|
|
if (left.is_live_and_has_ttl()
|
|
&& right.is_live_and_has_ttl()
|
|
&& left.expiry() != right.expiry())
|
|
{
|
|
return left.expiry() < right.expiry() ? -1 : 1;
|
|
}
|
|
} else {
|
|
// Both are deleted
|
|
if (left.deletion_time() != right.deletion_time()) {
|
|
// Origin compares big-endian serialized deletion time. That's because it
|
|
// delegates to AbstractCell.reconcile() which compares values after
|
|
// comparing timestamps, which in case of deleted cells will hold
|
|
// serialized expiry.
|
|
return (uint32_t) left.deletion_time().time_since_epoch().count()
|
|
< (uint32_t) right.deletion_time().time_since_epoch().count() ? -1 : 1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
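// Bookkeeping for a single data query: tracks how many rows and partitions may
// still be produced and which partition range is read next, so the loop in
// column_family::query() can stop as soon as a limit is reached, all ranges are
// exhausted, or the result becomes a short read.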
struct query_state {
|
|
explicit query_state(schema_ptr s,
|
|
const query::read_command& cmd,
|
|
query::result_request request,
|
|
const dht::partition_range_vector& ranges,
|
|
query::result_memory_accounter memory_accounter = { })
|
|
: schema(std::move(s))
|
|
, cmd(cmd)
|
|
, builder(cmd.slice, request, std::move(memory_accounter))
|
|
, limit(cmd.row_limit)
|
|
, partition_limit(cmd.partition_limit)
|
|
, current_partition_range(ranges.begin())
|
|
, range_end(ranges.end()) {
|
|
}
|
|
schema_ptr schema;
|
|
const query::read_command& cmd;
|
|
query::result::builder builder;
|
|
uint32_t limit;
|
|
uint32_t partition_limit;
|
|
bool range_empty = false; // Avoid ubsan false-positive when moving after construction
|
|
dht::partition_range_vector::const_iterator current_partition_range;
|
|
dht::partition_range_vector::const_iterator range_end;
|
|
mutation_reader reader;
|
|
uint32_t remaining_rows() const {
|
|
return limit - builder.row_count();
|
|
}
|
|
uint32_t remaining_partitions() const {
|
|
return partition_limit - builder.partition_count();
|
|
}
|
|
bool done() const {
|
|
return !remaining_rows() || !remaining_partitions() || current_partition_range == range_end || builder.is_short_read();
|
|
}
|
|
};
future<lw_shared_ptr<query::result>>
|
|
column_family::query(schema_ptr s, const query::read_command& cmd, query::result_request request,
|
|
const dht::partition_range_vector& partition_ranges,
|
|
tracing::trace_state_ptr trace_state, query::result_memory_limiter& memory_limiter,
|
|
uint64_t max_size) {
|
|
utils::latency_counter lc;
|
|
_stats.reads.set_latency(lc);
|
|
auto f = request == query::result_request::only_digest
|
|
? memory_limiter.new_digest_read(max_size) : memory_limiter.new_data_read(max_size);
|
|
return f.then([this, lc, s = std::move(s), &cmd, request, &partition_ranges, trace_state = std::move(trace_state)] (query::result_memory_accounter accounter) mutable {
|
|
auto qs_ptr = std::make_unique<query_state>(std::move(s), cmd, request, partition_ranges, std::move(accounter));
|
|
auto& qs = *qs_ptr;
|
|
return do_until(std::bind(&query_state::done, &qs), [this, &qs, trace_state = std::move(trace_state)] {
|
|
auto&& range = *qs.current_partition_range++;
|
|
return data_query(qs.schema, as_mutation_source(trace_state), range, qs.cmd.slice, qs.remaining_rows(),
|
|
qs.remaining_partitions(), qs.cmd.timestamp, qs.builder);
|
|
}).then([qs_ptr = std::move(qs_ptr), &qs] {
|
|
return make_ready_future<lw_shared_ptr<query::result>>(
|
|
make_lw_shared<query::result>(qs.builder.build()));
|
|
}).finally([lc, this]() mutable {
|
|
_stats.reads.mark(lc);
|
|
if (lc.is_start()) {
|
|
_stats.estimated_read.add(lc.latency(), _stats.reads.hist.count);
|
|
}
|
|
});
|
|
});
|
|
}
|
|
|
|
mutation_source
|
|
column_family::as_mutation_source(tracing::trace_state_ptr trace_state) const {
|
|
return mutation_source([this, trace_state = std::move(trace_state)] (schema_ptr s,
|
|
const dht::partition_range& range,
|
|
const query::partition_slice& slice,
|
|
const io_priority_class& pc) {
|
|
return this->make_reader(std::move(s), range, slice, pc, std::move(trace_state));
|
|
});
|
|
}
|
|
|
|
future<lw_shared_ptr<query::result>>
|
|
database::query(schema_ptr s, const query::read_command& cmd, query::result_request request, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state,
|
|
uint64_t max_result_size) {
|
|
column_family& cf = find_column_family(cmd.cf_id);
|
|
return cf.query(std::move(s), cmd, request, ranges, std::move(trace_state), get_result_memory_limiter(), max_result_size).then_wrapped([this, s = _stats] (auto f) {
|
|
if (f.failed()) {
|
|
++s->total_reads_failed;
|
|
} else {
|
|
++s->total_reads;
|
|
auto result = f.get0();
|
|
s->short_data_queries += bool(result->is_short_read());
|
|
return make_ready_future<lw_shared_ptr<query::result>>(std::move(result));
|
|
}
|
|
return f;
|
|
});
|
|
}
|
|
|
|
future<reconcilable_result>
|
|
database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range,
|
|
query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state) {
|
|
column_family& cf = find_column_family(cmd.cf_id);
|
|
return mutation_query(std::move(s), cf.as_mutation_source(std::move(trace_state)), range, cmd.slice, cmd.row_limit, cmd.partition_limit,
|
|
cmd.timestamp, std::move(accounter)).then_wrapped([this, s = _stats] (auto f) {
|
|
if (f.failed()) {
|
|
++s->total_reads_failed;
|
|
} else {
|
|
++s->total_reads;
|
|
auto result = f.get0();
|
|
s->short_mutation_queries += bool(result.is_short_read());
|
|
return make_ready_future<reconcilable_result>(std::move(result));
|
|
}
|
|
return f;
|
|
});
|
|
}
|
|
|
|
std::unordered_set<sstring> database::get_initial_tokens() {
|
|
std::unordered_set<sstring> tokens;
|
|
sstring tokens_string = get_config().initial_token();
|
|
try {
|
|
boost::split(tokens, tokens_string, boost::is_any_of(sstring(", ")));
|
|
} catch (...) {
|
|
throw std::runtime_error(sprint("Unable to parse initial_token=%s", tokens_string));
|
|
}
|
|
tokens.erase("");
|
|
return tokens;
|
|
}
|
|
|
|
std::experimental::optional<gms::inet_address> database::get_replace_address() {
|
|
auto& cfg = get_config();
|
|
sstring replace_address = cfg.replace_address();
|
|
sstring replace_address_first_boot = cfg.replace_address_first_boot();
|
|
try {
|
|
if (!replace_address.empty()) {
|
|
return gms::inet_address(replace_address);
|
|
} else if (!replace_address_first_boot.empty()) {
|
|
return gms::inet_address(replace_address_first_boot);
|
|
}
|
|
return std::experimental::nullopt;
|
|
} catch (...) {
|
|
return std::experimental::nullopt;
|
|
}
|
|
}
|
|
|
|
bool database::is_replacing() {
|
|
sstring replace_address_first_boot = get_config().replace_address_first_boot();
|
|
if (!replace_address_first_boot.empty() && db::system_keyspace::bootstrap_complete()) {
|
|
dblog.info("Replace address on first boot requested; this node is already bootstrapped");
|
|
return false;
|
|
}
|
|
return bool(get_replace_address());
|
|
}
|
|
|
|
std::ostream& operator<<(std::ostream& out, const atomic_cell_or_collection& c) {
|
|
return out << to_hex(c._data);
|
|
}
|
|
|
|
std::ostream& operator<<(std::ostream& os, const mutation& m) {
|
|
const ::schema& s = *m.schema();
|
|
fprint(os, "{%s.%s key %s data ", s.ks_name(), s.cf_name(), m.decorated_key());
|
|
os << m.partition() << "}";
|
|
return os;
|
|
}
std::ostream& operator<<(std::ostream& out, const column_family& cf) {
|
|
return fprint(out, "{column_family: %s/%s}", cf._schema->ks_name(), cf._schema->cf_name());
|
|
}
|
|
|
|
std::ostream& operator<<(std::ostream& out, const database& db) {
|
|
out << "{\n";
|
|
for (auto&& e : db._column_families) {
|
|
auto&& cf = *e.second;
|
|
out << "(" << e.first.to_sstring() << ", " << cf.schema()->cf_name() << ", " << cf.schema()->ks_name() << "): " << cf << "\n";
|
|
}
|
|
out << "}";
|
|
return out;
|
|
}
|
|
|
|
void
column_family::apply(const mutation& m, const db::replay_position& rp) {
    utils::latency_counter lc;
    _stats.writes.set_latency(lc);
    _memtables->active_memtable().apply(m, rp);
    _stats.writes.mark(lc);
    if (lc.is_start()) {
        _stats.estimated_write.add(lc.latency(), _stats.writes.hist.count);
    }
}

void
column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position& rp) {
    utils::latency_counter lc;
    _stats.writes.set_latency(lc);
    check_valid_rp(rp);
    _memtables->active_memtable().apply(m, m_schema, rp);
    _stats.writes.mark(lc);
    if (lc.is_start()) {
        _stats.estimated_write.add(lc.latency(), _stats.writes.hist.count);
    }
}

void column_family::apply_streaming_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m, bool fragmented) {
    if (fragmented) {
        apply_streaming_big_mutation(std::move(m_schema), plan_id, m);
        return;
    }
    _streaming_memtables->active_memtable().apply(m, m_schema);
}

void column_family::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m) {
    auto it = _streaming_memtables_big.find(plan_id);
    if (it == _streaming_memtables_big.end()) {
        it = _streaming_memtables_big.emplace(plan_id, make_lw_shared<streaming_memtable_big>()).first;
        it->second->memtables = _config.enable_disk_writes ? make_streaming_memtable_big_list(*it->second) : make_memory_only_memtable_list();
    }
    auto entry = it->second;
    entry->memtables->active_memtable().apply(m, m_schema);
}

void
column_family::check_valid_rp(const db::replay_position& rp) const {
    if (rp < _highest_flushed_rp) {
        throw replay_position_reordered_exception();
    }
}

future<> dirty_memory_manager::shutdown() {
    _db_shutdown_requested = true;
    _should_flush.signal();
    return std::move(_waiting_flush).then([this] {
        return _region_group.shutdown();
    });
}

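// Coalesces concurrent flush requests: the first caller grabs a flush permit
// and performs the flush, while callers that arrive before the permit is
// granted share the same shared_promise and simply wait for that flush to
// complete.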
future<> memtable_list::request_flush() {
    if (!may_flush()) {
        return make_ready_future<>();
    } else if (!_flush_coalescing) {
        _flush_coalescing = shared_promise<>();
        return _dirty_memory_manager->get_flush_permit().then([this] (auto permit) {
            auto current_flush = std::move(*_flush_coalescing);
            _flush_coalescing = {};
            return _dirty_memory_manager->flush_one(*this, std::move(permit)).then_wrapped([this, current_flush = std::move(current_flush)] (auto f) mutable {
                if (f.failed()) {
                    current_flush.set_exception(f.get_exception());
                } else {
                    current_flush.set_value();
                }
            });
        });
    } else {
        return _flush_coalescing->get_shared_future();
    }
}

lw_shared_ptr<memtable> memtable_list::new_memtable() {
    return make_lw_shared<memtable>(_current_schema(), *_dirty_memory_manager, this);
}

future<> dirty_memory_manager::flush_one(memtable_list& mtlist, semaphore_units<> permit) {
    if (mtlist.back()->empty()) {
        return make_ready_future<>();
    }

    auto* region = &(mtlist.back()->region());
    auto schema = mtlist.back()->schema();

    add_to_flush_manager(region, std::move(permit));
    return get_units(_background_work_flush_serializer, 1).then([this, &mtlist, region, schema] (auto permit) mutable {
        return mtlist.seal_active_memtable(memtable_list::flush_behavior::immediate).then_wrapped([this, region, schema, permit = std::move(permit)] (auto f) {
            // There are two cases in which we may still need to remove the permits from here.
            //
            // 1) Some exception happened, and we can't know at which point. It could be that because
            //    of that, the permits are still dangling. We have to remove them.
            // 2) If we are using a memory-only Column Family. That will never create a memtable
            //    flush object, and we'll never get rid of the permits. So we have to remove them
            //    here.
            this->remove_from_flush_manager(region);
            if (f.failed()) {
                dblog.error("Failed to flush memtable, {}:{}", schema->ks_name(), schema->cf_name());
            }
            return std::move(f);
        });
    });
}

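// Background loop driven by memory pressure: whenever the dirty memory region
// group signals pressure (see start_reclaiming() below), pick the column
// family holding the largest memtable and flush it, giving way to any
// explicit flush requests that are already waiting. Runs until shutdown is
// requested.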
future<> dirty_memory_manager::flush_when_needed() {
    if (!_db) {
        return make_ready_future<>();
    }
    // If there are explicit flushes requested, we must wait for them to finish before we stop.
    return do_until([this] { return _db_shutdown_requested; }, [this] {
        auto has_work = [this] { return has_pressure() || _db_shutdown_requested; };
        return _should_flush.wait(std::move(has_work)).then([this] {
            return get_flush_permit().then([this] (auto permit) {
                // We give priority to explicit flushes. They are mainly user-initiated flushes,
                // flushes coming from a DROP statement, or commitlog flushes.
                if (_flush_serializer.waiters()) {
                    return make_ready_future<>();
                }
                // condition abated while we waited for the semaphore
                if (!this->has_pressure() || _db_shutdown_requested) {
                    return make_ready_future<>();
                }
                // There are many criteria that can be used to select what is the best memtable to
                // flush. Most of the time we want some coordination with the commitlog to allow us to
                // release commitlog segments as early as we can.
                //
                // But under pressure conditions, we'll just pick the CF that holds the largest
                // memtable. The advantage of doing this is that this is objectively the one that will
                // release the biggest amount of memory and is less likely to be generating tiny
                // SSTables.
                memtable& candidate_memtable = memtable::from_region(*(this->_region_group.get_largest_region()));
                dirty_memory_manager* candidate_dirty_manager = &(dirty_memory_manager::from_region_group(candidate_memtable.region_group()));
                // Do not wait. The semaphore will protect us against a concurrent flush. But we
                // want to start a new one as soon as the permits are destroyed and the semaphore is
                // made ready again, not when we are done with the current one.
                candidate_dirty_manager->flush_one(*(candidate_memtable.get_memtable_list()), std::move(permit));
                return make_ready_future<>();
            });
        });
    }).finally([this] {
        // We'll try to acquire the permit here to make sure we only really stop when there are no
        // in-flight flushes. Our stop condition checks for the presence of waiters, but it could be
        // that we have no waiters, but a flush still in flight. We wait for all background work to
        // stop. When that stops, we know that the foreground work in the _flush_serializer has
        // stopped as well.
        return get_units(_background_work_flush_serializer, _max_background_work);
    });
}

void dirty_memory_manager::start_reclaiming() {
    _should_flush.signal();
}

future<> database::apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::replay_position rp, timeout_clock::time_point timeout) {
    return _dirty_memory_manager.region_group().run_when_memory_available([this, &m, m_schema = std::move(m_schema), rp = std::move(rp)] {
        try {
            auto& cf = find_column_family(m.column_family_id());
            cf.apply(m, m_schema, rp);
        } catch (no_such_column_family&) {
            dblog.error("Attempting to mutate non-existent table {}", m.column_family_id());
        }
    }, timeout);
}

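// Write path for a single mutation: when the column family has a commitlog,
// the mutation is appended there first and only then applied to the active
// memtable; a replay_position_reordered_exception from the memtable causes
// the whole operation to be retried, re-adding the entry to the commitlog.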
future<> database::do_apply(schema_ptr s, const frozen_mutation& m, timeout_clock::time_point timeout) {
    // I'm doing a nullcheck here since the init code path for db etc
    // is a little in flux and commitlog is created only when db is
    // initialized from datadir.
    auto uuid = m.column_family_id();
    auto& cf = find_column_family(uuid);
    if (!s->is_synced()) {
        throw std::runtime_error(sprint("attempted to mutate using not synced schema of %s.%s, version=%s",
                                 s->ks_name(), s->cf_name(), s->version()));
    }
    if (cf.commitlog() != nullptr) {
        commitlog_entry_writer cew(s, m);
        return cf.commitlog()->add_entry(uuid, cew, timeout).then([&m, this, s, timeout](auto rp) {
            return this->apply_in_memory(m, s, rp, timeout).handle_exception([this, s, &m, timeout] (auto ep) {
                try {
                    std::rethrow_exception(ep);
                } catch (replay_position_reordered_exception&) {
                    // expensive, but we're assuming this is super rare.
                    // if we failed to apply the mutation due to future re-ordering
                    // (which should be the only ever reason for rp mismatch in CF)
                    // let's just try again, add the mutation to the CL once more,
                    // and assume success is inevitable eventually.
                    dblog.debug("replay_position reordering detected");
                    return this->apply(s, m, timeout);
                }
            });
        });
    }
    return apply_in_memory(m, s, db::replay_position(), timeout);
}

future<> database::apply(schema_ptr s, const frozen_mutation& m, timeout_clock::time_point timeout) {
    if (dblog.is_enabled(logging::log_level::trace)) {
        dblog.trace("apply {}", m.pretty_printer(s));
    }
    return do_apply(std::move(s), m, timeout).then_wrapped([this, s = _stats] (auto f) {
        if (f.failed()) {
            ++s->total_writes_failed;
            try {
                f.get();
            } catch (const timed_out_error&) {
                ++s->total_writes_timedout;
                throw;
            }
            assert(0 && "should not reach");
        }
        ++s->total_writes;
        return f;
    });
}

future<> database::apply_streaming_mutation(schema_ptr s, utils::UUID plan_id, const frozen_mutation& m, bool fragmented) {
    if (!s->is_synced()) {
        throw std::runtime_error(sprint("attempted to mutate using not synced schema of %s.%s, version=%s",
                                 s->ks_name(), s->cf_name(), s->version()));
    }
    return _streaming_dirty_memory_manager.region_group().run_when_memory_available([this, &m, plan_id, fragmented, s = std::move(s)] {
        auto uuid = m.column_family_id();
        auto& cf = find_column_family(uuid);
        cf.apply_streaming_mutation(s, plan_id, std::move(m), fragmented);
    });
}

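// Builds the per-keyspace configuration from the global database config:
// data directory, disk/commitlog/cache switches, the dirty memory managers
// and the read concurrency limits shared by all column families of the
// keyspace.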
keyspace::config
database::make_keyspace_config(const keyspace_metadata& ksm) {
    // FIXME support multiple directories
    keyspace::config cfg;
    if (_cfg->data_file_directories().size() > 0) {
        cfg.datadir = sprint("%s/%s", _cfg->data_file_directories()[0], ksm.name());
        cfg.enable_disk_writes = !_cfg->enable_in_memory_data_store();
        cfg.enable_disk_reads = true; // we always read from disk
        cfg.enable_commitlog = ksm.durable_writes() && _cfg->enable_commitlog() && !_cfg->enable_in_memory_data_store();
        cfg.enable_cache = _cfg->enable_cache();
    } else {
        cfg.datadir = "";
        cfg.enable_disk_writes = false;
        cfg.enable_disk_reads = false;
        cfg.enable_commitlog = false;
        cfg.enable_cache = false;
    }
    cfg.dirty_memory_manager = &_dirty_memory_manager;
    cfg.streaming_dirty_memory_manager = &_streaming_dirty_memory_manager;
    cfg.read_concurrency_config.sem = &_read_concurrency_sem;
    cfg.read_concurrency_config.timeout = _cfg->read_request_timeout_in_ms() * 1ms;
    // Assume a queued read takes up 10kB of memory, and allow 2% of memory to be filled up with such reads.
    cfg.read_concurrency_config.max_queue_length = memory::stats().total_memory() * 0.02 / 10000;
    cfg.read_concurrency_config.raise_queue_overloaded_exception = [this] {
        ++_stats->sstable_read_queue_overloaded;
        throw std::runtime_error("sstable inactive read queue overloaded");
    };
    cfg.streaming_read_concurrency_config = cfg.read_concurrency_config;
    cfg.streaming_read_concurrency_config.timeout = {};
    cfg.cf_stats = &_cf_stats;
    cfg.enable_incremental_backups = _enable_incremental_backups;
    return cfg;
}

namespace db {

std::ostream& operator<<(std::ostream& os, const write_type& t) {
    switch (t) {
    case write_type::SIMPLE: os << "SIMPLE"; break;
    case write_type::BATCH: os << "BATCH"; break;
    case write_type::UNLOGGED_BATCH: os << "UNLOGGED_BATCH"; break;
    case write_type::COUNTER: os << "COUNTER"; break;
    case write_type::BATCH_LOG: os << "BATCH_LOG"; break;
    case write_type::CAS: os << "CAS"; break;
    default:
        assert(false);
    }
    return os;
}

std::ostream& operator<<(std::ostream& os, db::consistency_level cl) {
    switch (cl) {
    case db::consistency_level::ANY: return os << "ANY";
    case db::consistency_level::ONE: return os << "ONE";
    case db::consistency_level::TWO: return os << "TWO";
    case db::consistency_level::THREE: return os << "THREE";
    case db::consistency_level::QUORUM: return os << "QUORUM";
    case db::consistency_level::ALL: return os << "ALL";
    case db::consistency_level::LOCAL_QUORUM: return os << "LOCAL_QUORUM";
    case db::consistency_level::EACH_QUORUM: return os << "EACH_QUORUM";
    case db::consistency_level::SERIAL: return os << "SERIAL";
    case db::consistency_level::LOCAL_SERIAL: return os << "LOCAL_SERIAL";
    case db::consistency_level::LOCAL_ONE: return os << "LOCAL_ONE";
    default: abort();
    }
}

}

std::ostream&
operator<<(std::ostream& os, const exploded_clustering_prefix& ecp) {
    // Can't pass to_hex() to transformed(), since it is overloaded, so wrap:
    auto enhex = [] (auto&& x) { return to_hex(x); };
    return fprint(os, "prefix{%s}", ::join(":", ecp._v | boost::adaptors::transformed(enhex)));
}

std::ostream&
operator<<(std::ostream& os, const atomic_cell_view& acv) {
    if (acv.is_live()) {
        return fprint(os, "atomic_cell{%s;ts=%d;expiry=%d,ttl=%d}",
                      to_hex(acv.value()),
                      acv.timestamp(),
                      acv.is_live_and_has_ttl() ? acv.expiry().time_since_epoch().count() : -1,
                      acv.is_live_and_has_ttl() ? acv.ttl().count() : 0);
    } else {
        return fprint(os, "atomic_cell{DEAD;ts=%d;deletion_time=%d}",
                      acv.timestamp(), acv.deletion_time().time_since_epoch().count());
    }
}

std::ostream&
operator<<(std::ostream& os, const atomic_cell& ac) {
    return os << atomic_cell_view(ac);
}

future<>
database::stop() {
    return _compaction_manager.stop().then([this] {
        // try to ensure that CL has done disk flushing
        if (_commitlog != nullptr) {
            return _commitlog->shutdown();
        }
        return make_ready_future<>();
    }).then([this] {
        return parallel_for_each(_column_families, [this] (auto& val_pair) {
            return val_pair.second->stop();
        });
    }).then([this] {
        return _system_dirty_memory_manager.shutdown();
    }).then([this] {
        return _dirty_memory_manager.shutdown();
    }).then([this] {
        return _streaming_dirty_memory_manager.shutdown();
    });
}

future<> database::flush_all_memtables() {
    return parallel_for_each(_column_families, [this] (auto& cfp) {
        return cfp.second->flush();
    });
}

future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf) {
    auto& ks = find_keyspace(ksname);
    auto& cf = find_column_family(ksname, cfname);
    return truncate(ks, cf, std::move(tsf));
}

future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf)
{
    const auto durable = ks.metadata()->durable_writes();
    const auto auto_snapshot = get_config().auto_snapshot();

    future<> f = make_ready_future<>();
    if (durable || auto_snapshot) {
        // TODO:
        // this is not really a guarantee at all that we've actually
        // gotten all things to disk. Again, need queue-ish or something.
        f = cf.flush();
    } else {
        f = cf.clear();
    }

    return cf.run_with_compaction_disabled([f = std::move(f), &cf, auto_snapshot, tsf = std::move(tsf)]() mutable {
        return f.then([&cf, auto_snapshot, tsf = std::move(tsf)] {
            dblog.debug("Discarding sstable data for truncated CF + indexes");
            // TODO: notify truncation

            return tsf().then([&cf, auto_snapshot](db_clock::time_point truncated_at) {
                future<> f = make_ready_future<>();
                if (auto_snapshot) {
                    auto name = sprint("%d-%s", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
                    f = cf.snapshot(name);
                }
                return f.then([&cf, truncated_at] {
                    return cf.discard_sstables(truncated_at).then([&cf, truncated_at](db::replay_position rp) {
                        // TODO: indexes.
                        return db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
                    });
                });
            });
        });
    });
}

const sstring& database::get_snitch_name() const {
    return _cfg->endpoint_snitch();
}

// For the filesystem operations, this code will assume that all keyspaces are visible in all shards
// (as we have been doing for a lot of the other operations, like the snapshot itself).
future<> database::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names) {
    std::vector<std::reference_wrapper<keyspace>> keyspaces;

    if (keyspace_names.empty()) {
        // if keyspace names are not given - apply to all existing local keyspaces
        for (auto& ks: _keyspaces) {
            keyspaces.push_back(std::reference_wrapper<keyspace>(ks.second));
        }
    } else {
        for (auto& ksname: keyspace_names) {
            try {
                keyspaces.push_back(std::reference_wrapper<keyspace>(find_keyspace(ksname)));
            } catch (no_such_keyspace& e) {
                return make_exception_future(std::current_exception());
            }
        }
    }

    return parallel_for_each(keyspaces, [this, tag] (auto& ks) {
        return parallel_for_each(ks.get().metadata()->cf_meta_data(), [this, tag] (auto& pair) {
            auto& cf = this->find_column_family(pair.second);
            return cf.clear_snapshot(tag);
        }).then_wrapped([] (future<> f) {
            dblog.debug("Cleared out snapshot directories");
        });
    });
}

future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy)
{
    return db::schema_tables::calculate_schema_digest(proxy).then([&proxy] (utils::UUID uuid) {
        return proxy.local().get_db().invoke_on_all([uuid] (database& db) {
            db.update_version(uuid);
            return make_ready_future<>();
        }).then([uuid] {
            return db::system_keyspace::update_schema_version(uuid).then([uuid] {
                dblog.info("Schema version changed to {}", uuid);
                return service::get_local_migration_manager().passive_announce(uuid);
            });
        });
    });
}

// Snapshots: snapshotting the files themselves is easy: if more than one CF
// happens to link an SSTable twice, all but one will fail, and we will end up
// with one copy.
//
// The problem for us is that the snapshot procedure is supposed to leave a
// manifest file inside its directory. So if we just call snapshot() from
// multiple shards, only the last one will succeed, writing its own SSTables to
// the manifest leaving all other shards' SSTables unaccounted for.
//
// Moreover, for things like drop table, the operation should only proceed when the
// snapshot is complete. That includes the manifest file being correctly written,
// and for this reason we need to wait for all shards to finish their snapshotting
// before we can move on.
//
// To know which files we must account for in the manifest, we will keep an
// SSTable set. Theoretically, we could just rescan the snapshot directory and
// see what's in there. But we would need to wait for all shards to finish
// before we can do that anyway. That is the hard part, and once that is done
// keeping the files set is not really a big deal.
//
// This code assumes that all shards will be snapshotting at the same time. So
// far this is a safe assumption, but if we ever want to take snapshots from a
// group of shards only, this code will have to be updated to account for that.
struct snapshot_manager {
    std::unordered_set<sstring> files;
    semaphore requests;
    semaphore manifest_write;
    snapshot_manager() : requests(0), manifest_write(0) {}
};
static thread_local std::unordered_map<sstring, lw_shared_ptr<snapshot_manager>> pending_snapshots;

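// Writes the snapshot manifest once all shards have reported their files.
// The manifest is a single JSON object listing the SSTable file names that
// belong to the snapshot, e.g. (file names are illustrative):
//
//   {
//       "files" : [ "ks-cf-ka-1-Data.db", "ks-cf-ka-2-Data.db" ]
//   }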
static future<>
seal_snapshot(sstring jsondir) {
    std::ostringstream ss;
    int n = 0;
    ss << "{" << std::endl << "\t\"files\" : [ ";
    for (auto&& rf: pending_snapshots.at(jsondir)->files) {
        if (n++ > 0) {
            ss << ", ";
        }
        ss << "\"" << rf << "\"";
    }
    ss << " ]" << std::endl << "}" << std::endl;

    auto json = ss.str();
    auto jsonfile = jsondir + "/manifest.json";

    dblog.debug("Storing manifest {}", jsonfile);

    return io_check(recursive_touch_directory, jsondir).then([jsonfile, json = std::move(json)] {
        return open_checked_file_dma(general_disk_error_handler, jsonfile, open_flags::wo | open_flags::create | open_flags::truncate).then([json](file f) {
            return do_with(make_file_output_stream(std::move(f)), [json] (output_stream<char>& out) {
                return out.write(json.c_str(), json.size()).then([&out] {
                    return out.flush();
                }).then([&out] {
                    return out.close();
                });
            });
        });
    }).then([jsondir] {
        return io_check(sync_directory, std::move(jsondir));
    }).finally([jsondir] {
        pending_snapshots.erase(jsondir);
        return make_ready_future<>();
    });
}

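// Per-shard snapshot: flush, hard-link this shard's SSTables into the
// snapshot directory, then report the file names to a single coordinating
// shard (chosen by hashing the snapshot directory name). That shard waits for
// all smp::count reports, seals the manifest, and releases the other shards.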
future<> column_family::snapshot(sstring name) {
    return flush().then([this, name = std::move(name)]() {
        auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
        return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable>& tables) {
            auto jsondir = _config.datadir + "/snapshots/" + name;

            return parallel_for_each(tables, [name](sstables::shared_sstable sstable) {
                auto dir = sstable->get_dir() + "/snapshots/" + name;
                return io_check(recursive_touch_directory, dir).then([sstable, dir] {
                    return sstable->create_links(dir).then_wrapped([] (future<> f) {
                        // If the SSTables are shared, one of the CPUs will fail here.
                        // That is completely fine, though. We only need one link.
                        try {
                            f.get();
                        } catch (std::system_error& e) {
                            if (e.code() != std::error_code(EEXIST, std::system_category())) {
                                throw;
                            }
                        }
                        return make_ready_future<>();
                    });
                });
            }).then([jsondir, &tables] {
                // This is not just an optimization. If we have no files, jsondir may not have been created,
                // and sync_directory would throw.
                if (tables.size()) {
                    return io_check(sync_directory, std::move(jsondir));
                } else {
                    return make_ready_future<>();
                }
            }).finally([this, &tables, jsondir] {
                auto shard = std::hash<sstring>()(jsondir) % smp::count;
                std::unordered_set<sstring> table_names;
                for (auto& sst : tables) {
                    auto f = sst->get_filename();
                    auto rf = f.substr(sst->get_dir().size() + 1);
                    table_names.insert(std::move(rf));
                }
                return smp::submit_to(shard, [requester = engine().cpu_id(), jsondir = std::move(jsondir),
                                              tables = std::move(table_names), datadir = _config.datadir] {

                    if (pending_snapshots.count(jsondir) == 0) {
                        pending_snapshots.emplace(jsondir, make_lw_shared<snapshot_manager>());
                    }
                    auto snapshot = pending_snapshots.at(jsondir);
                    for (auto&& sst: tables) {
                        snapshot->files.insert(std::move(sst));
                    }

                    snapshot->requests.signal(1);
                    auto my_work = make_ready_future<>();
                    if (requester == engine().cpu_id()) {
                        my_work = snapshot->requests.wait(smp::count).then([jsondir = std::move(jsondir),
                                                                            snapshot] () mutable {
                            return seal_snapshot(jsondir).then([snapshot] {
                                snapshot->manifest_write.signal(smp::count);
                                return make_ready_future<>();
                            });
                        });
                    }
                    return my_work.then([snapshot] {
                        return snapshot->manifest_write.wait(1);
                    }).then([snapshot] {});
                });
            });
        });
    });
}

future<bool> column_family::snapshot_exists(sstring tag) {
    sstring jsondir = _config.datadir + "/snapshots/" + tag;
    return open_checked_directory(general_disk_error_handler, std::move(jsondir)).then_wrapped([] (future<file> f) {
        try {
            f.get0();
            return make_ready_future<bool>(true);
        } catch (std::system_error& e) {
            if (e.code() != std::error_code(ENOENT, std::system_category())) {
                throw;
            }
            return make_ready_future<bool>(false);
        }
    });
}

enum class missing { no, yes };
static missing
file_missing(future<> f) {
    try {
        f.get();
        return missing::no;
    } catch (std::system_error& e) {
        if (e.code() != std::error_code(ENOENT, std::system_category())) {
            throw;
        }
        return missing::yes;
    }
}

future<> column_family::clear_snapshot(sstring tag) {
    sstring jsondir = _config.datadir + "/snapshots/";
    sstring parent = _config.datadir;
    if (!tag.empty()) {
        jsondir += tag;
        parent += "/snapshots/";
    }

    lister::dir_entry_types dir_and_files = { directory_entry_type::regular, directory_entry_type::directory };
    return lister::scan_dir(jsondir, dir_and_files, [this, curr_dir = jsondir, dir_and_files, tag] (directory_entry de) {
        // FIXME: We really need a better directory walker. This should eventually be part of the seastar infrastructure.
        // It's hard to write this in a fully recursive manner because we need to keep information about the parent directory,
        // so we can remove the file. For now, we'll take advantage of the fact that we will at most visit 2 levels and keep
        // it ugly but simple.
        auto recurse = make_ready_future<>();
        if (de.type == directory_entry_type::directory) {
            // Should only recurse when tag is empty, meaning delete all snapshots
            if (!tag.empty()) {
                throw std::runtime_error(sprint("Unexpected directory %s found at %s! Aborting", de.name, curr_dir));
            }
            auto newdir = curr_dir + "/" + de.name;
            recurse = lister::scan_dir(newdir, dir_and_files, [this, curr_dir = newdir] (directory_entry de) {
                return io_check(remove_file, curr_dir + "/" + de.name);
            });
        }
        return recurse.then([fname = curr_dir + "/" + de.name] {
            return io_check(remove_file, fname);
        });
    }).then_wrapped([jsondir] (future<> f) {
        // Fine if directory does not exist. If it did, we delete it
        if (file_missing(std::move(f)) == missing::no) {
            return io_check(remove_file, jsondir);
        }
        return make_ready_future<>();
    }).then([parent] {
        return io_check(sync_directory, parent).then_wrapped([] (future<> f) {
            // Should always exist for empty tags, but may not exist for a single tag if we never took
            // snapshots. We will check this here just to mask out the exception, without silencing
            // unexpected ones.
            file_missing(std::move(f));
            return make_ready_future<>();
        });
    });
}

future<std::unordered_map<sstring, column_family::snapshot_details>> column_family::get_snapshot_details() {
    std::unordered_map<sstring, snapshot_details> all_snapshots;
    return do_with(std::move(all_snapshots), [this] (auto& all_snapshots) {
        return io_check([&] { return engine().file_exists(_config.datadir + "/snapshots"); }).then([this, &all_snapshots](bool file_exists) {
            if (!file_exists) {
                return make_ready_future<>();
            }
            return lister::scan_dir(_config.datadir + "/snapshots", { directory_entry_type::directory }, [this, &all_snapshots] (directory_entry de) {
                auto snapshot_name = de.name;
                auto snapshot = _config.datadir + "/snapshots/" + snapshot_name;
                all_snapshots.emplace(snapshot_name, snapshot_details());
                return lister::scan_dir(snapshot, { directory_entry_type::regular }, [this, &all_snapshots, snapshot, snapshot_name] (directory_entry de) {
                    return io_check(file_size, snapshot + "/" + de.name).then([this, &all_snapshots, snapshot_name, name = de.name] (auto size) {
                        // The manifest is the only file expected to be in this directory not belonging to the SSTable.
                        // For it, we account the total size, but zero it for the true size calculation.
                        //
                        // All the others should just generate an exception: there is something wrong, so don't blindly
                        // add it to the size.
                        if (name != "manifest.json") {
                            sstables::entry_descriptor::make_descriptor(name);
                            all_snapshots.at(snapshot_name).total += size;
                        } else {
                            size = 0;
                        }
                        return make_ready_future<uint64_t>(size);
                    }).then([this, &all_snapshots, snapshot_name, name = de.name] (auto size) {
                        // FIXME: When we support multiple data directories, the file may not necessarily
                        // live in this same location. May have to test others as well.
                        return io_check(file_size, _config.datadir + "/" + name).then_wrapped([&all_snapshots, snapshot_name, size] (auto fut) {
                            try {
                                // File exists in the main SSTable directory. Snapshots are not contributing to size
                                fut.get0();
                            } catch (std::system_error& e) {
                                if (e.code() != std::error_code(ENOENT, std::system_category())) {
                                    throw;
                                }
                                all_snapshots.at(snapshot_name).live += size;
                            }
                            return make_ready_future<>();
                        });
                    });
                });
            });
        }).then([&all_snapshots] {
            return std::move(all_snapshots);
        });
    });
}

future<> column_family::flush() {
    _stats.pending_flushes++;

    // highest_flushed_rp is only updated when we flush. If the memtable is currently alive, then
    // the most up-to-date replay position is the one that's in there now. Otherwise, if the memtable
    // hasn't received any writes yet, that's the one from the last flush we made.
    auto desired_rp = _memtables->back()->empty() ? _highest_flushed_rp : _memtables->back()->replay_position();
    return _memtables->request_flush().finally([this, desired_rp] {
        _stats.pending_flushes--;
        // In origin memtable_switch_count is incremented inside
        // ColumnFamilyMetrics Flush.run
        _stats.memtable_switch_count++;
        // wait for all up until us.
        return _flush_queue->wait_for_pending(desired_rp);
    });
}

future<> column_family::flush(const db::replay_position& pos) {
    // Technically possible if we've already issued the
    // sstable write, but it is not done yet.
    if (pos < _highest_flushed_rp) {
        return make_ready_future<>();
    }

    // TODO: Origin looks at "secondary" memtables
    // It also considers "minReplayPosition", which is simply where
    // the CL "started" (the first ever RP in this run).
    // We ignore this for now and just say that if we're asked for
    // a CF and it exists, we pretty much have to have data that needs
    // flushing. Let's do it.
    return _memtables->request_flush();
}

// FIXME: We can do much better than this in terms of cache management. Right
// now, we only have to flush the touched ranges because of the possibility of
// streaming containing token ownership changes.
//
// Right now we can't differentiate between that and a normal repair process,
// so we always flush. When we can differentiate those streams, we should not
// be indiscriminately touching the cache during repair. We will just have to
// invalidate the entries that are relevant to things we already have in the cache.
future<> column_family::flush_streaming_mutations(utils::UUID plan_id, dht::partition_range_vector ranges) {
    // This will effectively take the gate twice for this call. The proper way to fix that would
    // be to change seal_active_streaming_memtable_delayed to take a range parameter. However, we
    // need this code to go away as soon as we can (see FIXME above). So the double gate is a better
    // temporary countermeasure.
    return with_gate(_streaming_flush_gate, [this, plan_id, ranges = std::move(ranges)] {
        return flush_streaming_big_mutations(plan_id).then([this] {
            return _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::delayed);
        }).finally([this] {
            return _streaming_flush_phaser.advance_and_await();
        }).finally([this, ranges = std::move(ranges)] {
            if (!_config.enable_cache) {
                return make_ready_future<>();
            }
            return do_with(std::move(ranges), [this] (auto& ranges) {
                return parallel_for_each(ranges, [this](auto&& range) {
                    return _cache.invalidate(range);
                });
            });
        });
    });
}

future<> column_family::flush_streaming_big_mutations(utils::UUID plan_id) {
    auto it = _streaming_memtables_big.find(plan_id);
    if (it == _streaming_memtables_big.end()) {
        return make_ready_future<>();
    }
    auto entry = it->second;
    _streaming_memtables_big.erase(it);
    return entry->memtables->request_flush().then([entry] {
        return entry->flush_in_progress.close();
    }).then([this, entry] {
        return parallel_for_each(entry->sstables, [this] (auto& sst) {
            return sst->seal_sstable(this->incremental_backups_enabled()).then([sst] {
                return sst->open_data();
            });
        }).then([this, entry] {
            for (auto&& sst : entry->sstables) {
                // seal_active_streaming_memtable_big() ensures sst is unshared.
                add_sstable(sst, {engine().cpu_id()});
            }
            trigger_compaction();
        });
    });
}

future<> column_family::fail_streaming_mutations(utils::UUID plan_id) {
    auto it = _streaming_memtables_big.find(plan_id);
    if (it == _streaming_memtables_big.end()) {
        return make_ready_future<>();
    }
    auto entry = it->second;
    _streaming_memtables_big.erase(it);
    return entry->flush_in_progress.close().then([this, entry] {
        for (auto&& sst : entry->sstables) {
            sst->mark_for_deletion();
        }
    });
}

future<> column_family::clear() {
    _memtables->clear();
    _memtables->add_memtable();
    _streaming_memtables->clear();
    _streaming_memtables->add_memtable();
    _streaming_memtables_big.clear();
    return _cache.clear();
}

// NOTE: does not need to be futurized, but might eventually, depending on
// if we implement notifications, whatnot.
future<db::replay_position> column_family::discard_sstables(db_clock::time_point truncated_at) {
    assert(_compaction_disabled > 0);

    return with_lock(_sstables_lock.for_read(), [this, truncated_at] {
        db::replay_position rp;
        auto gc_trunc = to_gc_clock(truncated_at);

        auto pruned = make_lw_shared(_compaction_strategy.make_sstable_set(_schema));
        std::vector<sstables::shared_sstable> remove;

        for (auto& p : *_sstables->all()) {
            if (p->max_data_age() <= gc_trunc) {
                rp = std::max(p->get_stats_metadata().position, rp);
                remove.emplace_back(p);
                continue;
            }
            pruned->insert(p);
        }

        _sstables = std::move(pruned);
        dblog.debug("cleaning out row cache");
        return _cache.clear().then([rp, remove = std::move(remove)] () mutable {
            return parallel_for_each(remove, [](sstables::shared_sstable s) {
                return sstables::delete_atomically({s});
            }).then([rp] {
                return make_ready_future<db::replay_position>(rp);
            }).finally([remove] {}); // keep the objects alive until here.
        });
    });
}

std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
    os << "org.apache.cassandra.config.UTMetaData@" << &m;
    return os;
}

std::ostream& operator<<(std::ostream& os, const keyspace_metadata& m) {
    os << "KSMetaData{";
    os << "name=" << m._name;
    os << ", strategyClass=" << m._strategy_name;
    os << ", strategyOptions={";
    int n = 0;
    for (auto& p : m._strategy_options) {
        if (n++ != 0) {
            os << ", ";
        }
        os << p.first << "=" << p.second;
    }
    os << "}";
    os << ", cfMetaData={";
    n = 0;
    for (auto& p : m._cf_meta_data) {
        if (n++ != 0) {
            os << ", ";
        }
        os << p.first << "=" << p.second;
    }
    os << "}";
    os << ", durable_writes=" << m._durable_writes;
    os << ", userTypes=" << m._user_types;
    os << "}";
    return os;
}

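// Propagates a new schema version to every memtable this column family owns
// (regular, streaming, and the per-plan streaming memtables) as well as to
// the row cache, then re-evaluates the compaction strategy and triggers
// compaction under the new schema.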
void column_family::set_schema(schema_ptr s) {
    dblog.debug("Changing schema version of {}.{} ({}) from {} to {}",
        _schema->ks_name(), _schema->cf_name(), _schema->id(), _schema->version(), s->version());

    for (auto& m : *_memtables) {
        m->set_schema(s);
    }

    for (auto& m : *_streaming_memtables) {
        m->set_schema(s);
    }

    for (auto smb : _streaming_memtables_big) {
        for (auto m : *smb.second->memtables) {
            m->set_schema(s);
        }
    }

    _cache.set_schema(s);
    _schema = std::move(s);

    set_compaction_strategy(_schema->compaction_strategy());
    trigger_compaction();
}

void column_family::update_view_schemas() {
    _view_schemas = boost::copy_range<std::vector<view_ptr>>(_views | boost::adaptors::map_values | boost::adaptors::transformed([] (auto&& s) {
        return view_ptr(s.schema());
    }));
}

void column_family::add_or_update_view(view_ptr v) {
    auto e = _views.emplace(v->cf_name(), v);
    if (!e.second) {
        e.first->second.update(v);
    }
    update_view_schemas();
}

void column_family::remove_view(view_ptr v) {
    _views.erase(v->cf_name());
    update_view_schemas();
}

const std::vector<view_ptr>& column_family::views() const {
    return _view_schemas;
}