Compare commits
50 Commits
debug_form
...
scylla-1.7
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9b26a57288 | ||
|
|
31b5ef13c2 | ||
|
|
4bbee01288 | ||
|
|
3cc03f88fd | ||
|
|
4179d8f7c4 | ||
|
|
c20ddaf5af | ||
|
|
29dd48621b | ||
|
|
87de77a5ea | ||
|
|
66c4dcba8e | ||
|
|
7cfdc08af9 | ||
|
|
fdbe5caf41 | ||
|
|
522e62089b | ||
|
|
699648d5a1 | ||
|
|
698a4e62d9 | ||
|
|
63bec22d28 | ||
|
|
3d14e6e802 | ||
|
|
ea4a2dad96 | ||
|
|
655e6197cb | ||
|
|
1a1370d33e | ||
|
|
7f17424a4e | ||
|
|
dd56f1bec7 | ||
|
|
5df61797d6 | ||
|
|
b6db9e3d51 | ||
|
|
f2595bea85 | ||
|
|
e930ef0ee0 | ||
|
|
4cf0f88724 | ||
|
|
372f07b06e | ||
|
|
0ccc6630a8 | ||
|
|
b95a2338be | ||
|
|
f2d0ac9994 | ||
|
|
56725de0db | ||
|
|
6f479c8999 | ||
|
|
8c0488bce9 | ||
|
|
68dd11e275 | ||
|
|
a64c53d05f | ||
|
|
42e7a59cca | ||
|
|
2cd019ee47 | ||
|
|
bc8b553bec | ||
|
|
0ba98be899 | ||
|
|
d6899134a7 | ||
|
|
5253031110 | ||
|
|
a203c87f0d | ||
|
|
37fc0e6840 | ||
|
|
0429e5d8ea | ||
|
|
3c147437ac | ||
|
|
e4b3f02286 | ||
|
|
5a8013e155 | ||
|
|
fdba5b8eac | ||
|
|
558a52802a | ||
|
|
4f416c7272 |
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
VERSION=666.development
|
||||
VERSION=1.7.rc2
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -246,7 +246,8 @@ future<> auth::auth::setup() {
|
||||
std::map<sstring, sstring> opts;
|
||||
opts["replication_factor"] = "1";
|
||||
auto ksm = keyspace_metadata::new_keyspace(AUTH_KS, "org.apache.cassandra.locator.SimpleStrategy", opts, true);
|
||||
f = service::get_local_migration_manager().announce_new_keyspace(ksm, false);
|
||||
// We use min_timestamp so that default keyspace metadata will lose to any manual adjustments. See issue #2129.
|
||||
f = service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false);
|
||||
}
|
||||
|
||||
return f.then([] {
|
||||
|
||||
@@ -22,13 +22,28 @@
|
||||
#pragma once
|
||||
|
||||
#include <boost/intrusive/unordered_set.hpp>
|
||||
|
||||
#if __has_include(<boost/container/small_vector.hpp>)
|
||||
|
||||
#include <boost/container/small_vector.hpp>
|
||||
|
||||
template <typename T, size_t N>
|
||||
using small_vector = boost::container::small_vector<T, N>;
|
||||
|
||||
#else
|
||||
|
||||
#include <vector>
|
||||
template <typename T, size_t N>
|
||||
using small_vector = std::vector<T>;
|
||||
|
||||
#endif
|
||||
|
||||
#include "fnv1a_hasher.hh"
|
||||
#include "streamed_mutation.hh"
|
||||
#include "mutation_partition.hh"
|
||||
|
||||
class cells_range {
|
||||
using ids_vector_type = boost::container::small_vector<column_id, 5>;
|
||||
using ids_vector_type = small_vector<column_id, 5>;
|
||||
|
||||
position_in_partition_view _position;
|
||||
ids_vector_type _ids;
|
||||
@@ -147,7 +162,7 @@ class cell_locker {
|
||||
// temporarily removed from its parent partition_entry.
|
||||
// Returns true if the cell_entry still exists in the new schema and
|
||||
// should be reinserted.
|
||||
bool upgrade(const schema& from, const schema& to, column_kind kind) {
|
||||
bool upgrade(const schema& from, const schema& to, column_kind kind) noexcept {
|
||||
auto& old_column_mapping = from.get_column_mapping();
|
||||
auto& column = old_column_mapping.column_at(kind, _address.id);
|
||||
auto cdef = to.get_column_definition(column.name());
|
||||
@@ -170,7 +185,9 @@ class cell_locker {
|
||||
}
|
||||
|
||||
~cell_entry() {
|
||||
assert(is_linked());
|
||||
if (!is_linked()) {
|
||||
return;
|
||||
}
|
||||
unlink();
|
||||
if (!--_parent._cell_count) {
|
||||
delete &_parent;
|
||||
@@ -286,10 +303,9 @@ class cell_locker {
|
||||
};
|
||||
|
||||
class equal_compare {
|
||||
schema_ptr _schema;
|
||||
dht::decorated_key_equals_comparator _cmp;
|
||||
public:
|
||||
explicit equal_compare(const schema s) : _cmp(s) { }
|
||||
explicit equal_compare(const schema& s) : _cmp(s) { }
|
||||
bool operator()(const dht::decorated_key& dk, const partition_entry& pe) {
|
||||
return _cmp(dk, pe._key);
|
||||
}
|
||||
@@ -386,22 +402,19 @@ struct cell_locker::locker {
|
||||
|
||||
partition_cells_range _range;
|
||||
partition_cells_range::iterator _current_ck;
|
||||
cells_range _cells_range;
|
||||
cells_range::const_iterator _current_cell;
|
||||
|
||||
std::vector<locked_cell> _locks;
|
||||
private:
|
||||
void update_ck() {
|
||||
if (!is_done()) {
|
||||
_cells_range = *_current_ck;
|
||||
_current_cell = _cells_range.begin();
|
||||
_current_cell = _current_ck->begin();
|
||||
}
|
||||
}
|
||||
|
||||
future<> lock_next();
|
||||
|
||||
bool is_done() const { return _current_ck == _range.end(); }
|
||||
std::vector<locked_cell> get() && { return std::move(_locks); }
|
||||
public:
|
||||
explicit locker(const ::schema& s, partition_entry& pe, partition_cells_range&& range)
|
||||
: _hasher(s)
|
||||
@@ -413,18 +426,22 @@ public:
|
||||
update_ck();
|
||||
}
|
||||
|
||||
future<std::vector<locked_cell>> lock_all() && {
|
||||
locker(const locker&) = delete;
|
||||
locker(locker&&) = delete;
|
||||
|
||||
future<> lock_all() {
|
||||
// Cannot defer before first call to lock_next().
|
||||
return lock_next().then([this] {
|
||||
return do_until([this] { return is_done(); }, [this] {
|
||||
return lock_next();
|
||||
}).then([&] {
|
||||
return std::move(*this).get();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
std::vector<locked_cell> get() && { return std::move(_locks); }
|
||||
};
|
||||
|
||||
inline
|
||||
future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_key& dk, partition_cells_range&& range) {
|
||||
partition_entry::hasher pe_hash;
|
||||
partition_entry::equal_compare pe_eq(*_schema);
|
||||
@@ -460,14 +477,17 @@ future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_ke
|
||||
return make_ready_future<std::vector<locked_cell>>(std::move(locks));
|
||||
}
|
||||
|
||||
return do_with(locker(*_schema, *it, std::move(range)), [] (auto& locker) mutable {
|
||||
return std::move(locker).lock_all();
|
||||
auto l = std::make_unique<locker>(*_schema, *it, std::move(range));
|
||||
auto f = l->lock_all();
|
||||
return f.then([l = std::move(l)] {
|
||||
return std::move(*l).get();
|
||||
});
|
||||
}
|
||||
|
||||
inline
|
||||
future<> cell_locker::locker::lock_next() {
|
||||
while (!is_done()) {
|
||||
if (_current_cell == _cells_range.end() || _cells_range.empty()) {
|
||||
if (_current_cell == _current_ck->end()) {
|
||||
++_current_ck;
|
||||
update_ck();
|
||||
continue;
|
||||
@@ -475,7 +495,7 @@ future<> cell_locker::locker::lock_next() {
|
||||
|
||||
auto cid = *_current_cell++;
|
||||
|
||||
cell_address ca { position_in_partition(_cells_range.position()), cid };
|
||||
cell_address ca { position_in_partition(_current_ck->position()), cid };
|
||||
auto it = _partition_entry.cells().find(ca, _hasher, _eq_cmp);
|
||||
if (it != _partition_entry.cells().end()) {
|
||||
return it->lock().then([this, ce = it->shared_from_this()] () mutable {
|
||||
@@ -483,27 +503,25 @@ future<> cell_locker::locker::lock_next() {
|
||||
});
|
||||
}
|
||||
|
||||
auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_cells_range.position()), cid);
|
||||
auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_current_ck->position()), cid);
|
||||
_partition_entry.insert(cell);
|
||||
_locks.emplace_back(std::move(cell));
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
inline
|
||||
bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
|
||||
if (_schema == new_schema) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto buckets = std::make_unique<cells_type::bucket_type[]>(initial_bucket_count);
|
||||
auto buckets = std::make_unique<cells_type::bucket_type[]>(_cells.bucket_count());
|
||||
auto cells = cells_type(cells_type::bucket_traits(buckets.get(), _cells.bucket_count()),
|
||||
cell_entry::hasher(*new_schema), cell_entry::equal_compare(*new_schema));
|
||||
|
||||
while (!_cells.empty()) {
|
||||
auto it = _cells.begin();
|
||||
auto& cell = *it;
|
||||
_cells.erase(it);
|
||||
|
||||
_cells.clear_and_dispose([&] (cell_entry* cell_ptr) noexcept {
|
||||
auto& cell = *cell_ptr;
|
||||
auto kind = cell.position().is_static_row() ? column_kind::static_column
|
||||
: column_kind::regular_column;
|
||||
auto reinsert = cell.upgrade(*_schema, *new_schema, kind);
|
||||
@@ -512,9 +530,16 @@ bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
|
||||
} else {
|
||||
_cell_count--;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// bi::unordered_set move assignment is actually a swap.
|
||||
// Original _buckets cannot be destroyed before the container using them is
|
||||
// so we need to explicitly make sure that the original _cells is no more.
|
||||
_cells = std::move(cells);
|
||||
auto destroy = [] (auto) { };
|
||||
destroy(std::move(cells));
|
||||
|
||||
_buckets = std::move(buckets);
|
||||
_schema = new_schema;
|
||||
return _cell_count;
|
||||
}
|
||||
|
||||
@@ -788,3 +788,23 @@ commitlog_total_space_in_mb: -1
|
||||
# By default, Scylla binds all interfaces to the prometheus API
|
||||
# It is possible to restrict the listening address to a specific one
|
||||
# prometheus_address: 0.0.0.0
|
||||
|
||||
# Distribution of data among cores (shards) within a node
|
||||
#
|
||||
# Scylla distributes data within a node among shards, using a round-robin
|
||||
# strategy:
|
||||
# [shard0] [shard1] ... [shardN-1] [shard0] [shard1] ... [shardN-1] ...
|
||||
#
|
||||
# Scylla versions 1.6 and below used just one repetition of the pattern;
|
||||
# this interfered with data placement among nodes (vnodes).
|
||||
#
|
||||
# Scylla versions 1.7 and above use 4096 repetitions of the pattern; this
|
||||
# provides for better data distribution.
|
||||
#
|
||||
# the value below is log (base 2) of the number of repetitions.
|
||||
#
|
||||
# Set to 0 to avoid rewriting all data when upgrading from Scylla 1.6 and
|
||||
# below.
|
||||
#
|
||||
# Keep at 12 for new clusters.
|
||||
murmur3_partitioner_ignore_msb_bits: 12
|
||||
|
||||
@@ -230,6 +230,7 @@ scylla_tests = [
|
||||
'tests/virtual_reader_test',
|
||||
'tests/view_schema_test',
|
||||
'tests/counter_test',
|
||||
'tests/cell_locker_test',
|
||||
]
|
||||
|
||||
apps = [
|
||||
|
||||
@@ -67,6 +67,14 @@ functions::init() {
|
||||
declare(aggregate_fcts::make_max_function<int64_t>());
|
||||
declare(aggregate_fcts::make_min_function<int64_t>());
|
||||
|
||||
declare(aggregate_fcts::make_count_function<float>());
|
||||
declare(aggregate_fcts::make_max_function<float>());
|
||||
declare(aggregate_fcts::make_min_function<float>());
|
||||
|
||||
declare(aggregate_fcts::make_count_function<double>());
|
||||
declare(aggregate_fcts::make_max_function<double>());
|
||||
declare(aggregate_fcts::make_min_function<double>());
|
||||
|
||||
//FIXME:
|
||||
//declare(aggregate_fcts::make_count_function<bytes>());
|
||||
//declare(aggregate_fcts::make_max_function<bytes>());
|
||||
@@ -78,15 +86,17 @@ functions::init() {
|
||||
declare(make_blob_as_varchar_fct());
|
||||
declare(aggregate_fcts::make_sum_function<int32_t>());
|
||||
declare(aggregate_fcts::make_sum_function<int64_t>());
|
||||
declare(aggregate_fcts::make_avg_function<int32_t>());
|
||||
declare(aggregate_fcts::make_avg_function<int64_t>());
|
||||
declare(aggregate_fcts::make_sum_function<float>());
|
||||
declare(aggregate_fcts::make_sum_function<double>());
|
||||
#if 0
|
||||
declare(AggregateFcts.sumFunctionForFloat);
|
||||
declare(AggregateFcts.sumFunctionForDouble);
|
||||
declare(AggregateFcts.sumFunctionForDecimal);
|
||||
declare(AggregateFcts.sumFunctionForVarint);
|
||||
declare(AggregateFcts.avgFunctionForFloat);
|
||||
declare(AggregateFcts.avgFunctionForDouble);
|
||||
#endif
|
||||
declare(aggregate_fcts::make_avg_function<int32_t>());
|
||||
declare(aggregate_fcts::make_avg_function<int64_t>());
|
||||
declare(aggregate_fcts::make_avg_function<float>());
|
||||
declare(aggregate_fcts::make_avg_function<double>());
|
||||
#if 0
|
||||
declare(AggregateFcts.avgFunctionForVarint);
|
||||
declare(AggregateFcts.avgFunctionForDecimal);
|
||||
#endif
|
||||
|
||||
30
database.cc
30
database.cc
@@ -1379,13 +1379,20 @@ future<> column_family::cleanup_sstables(sstables::compaction_descriptor descrip
|
||||
auto owned_ranges = make_lw_shared<dht::token_range_vector>(std::move(r));
|
||||
auto sstables_to_cleanup = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));
|
||||
|
||||
return parallel_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
|
||||
return do_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
|
||||
if (!owned_ranges->empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable> sstable_to_compact({ sst });
|
||||
return this->compact_sstables(sstables::compaction_descriptor(std::move(sstable_to_compact), sst->get_sstable_level()), true);
|
||||
// this semaphore ensures that only one cleanup will run per shard.
|
||||
// That's to prevent node from running out of space when almost all sstables
|
||||
// need cleanup, so if sstables are cleaned in parallel, we may need almost
|
||||
// twice the disk space used by those sstables.
|
||||
static thread_local semaphore sem(1);
|
||||
|
||||
return with_semaphore(sem, 1, [this, &sst] {
|
||||
return this->compact_sstables(sstables::compaction_descriptor({ sst }, sst->get_sstable_level()), true);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1802,7 +1809,7 @@ database::setup_metrics() {
|
||||
});
|
||||
|
||||
_metrics.add_group("database", {
|
||||
sm::make_gauge("requests_blocked_memory", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
|
||||
sm::make_gauge("requests_blocked_memory_current", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
|
||||
sm::description(
|
||||
seastar::format("Holds the current number of requests blocked due to reaching the memory quota ({}B). "
|
||||
"Non-zero value indicates that our bottleneck is memory and more specifically - the memory quota allocated for the \"database\" component.", _dirty_memory_manager.throttle_threshold()))),
|
||||
@@ -2663,7 +2670,7 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
|
||||
do_apply(m, m_schema, rp);
|
||||
}
|
||||
|
||||
future<frozen_mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema) {
|
||||
future<frozen_mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, timeout_clock::time_point timeout) {
|
||||
auto m = fm.unfreeze(m_schema);
|
||||
m.upgrade(cf.schema());
|
||||
|
||||
@@ -2689,9 +2696,9 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
|
||||
cql_serialization_format::internal(), query::max_rows);
|
||||
|
||||
return do_with(std::move(slice), std::move(m), std::vector<locked_cell>(), stdx::optional<frozen_mutation>(),
|
||||
[this, &cf] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks,
|
||||
[this, &cf, timeout] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks,
|
||||
stdx::optional<frozen_mutation>& fm) mutable {
|
||||
return cf.lock_counter_cells(m).then([&, m_schema = cf.schema(), this] (std::vector<locked_cell> lcs) {
|
||||
return cf.lock_counter_cells(m).then([&, timeout, m_schema = cf.schema(), this] (std::vector<locked_cell> lcs) {
|
||||
locks = std::move(lcs);
|
||||
|
||||
// Before counter update is applied it needs to be transformed from
|
||||
@@ -2702,7 +2709,7 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
|
||||
return mutation_query(m_schema, cf.as_mutation_source({}),
|
||||
dht::partition_range::make_singular(m.decorated_key()),
|
||||
slice, query::max_rows, query::max_partitions,
|
||||
gc_clock::now(), { }).then([this, &cf, &m, &fm, m_schema] (auto result) {
|
||||
gc_clock::now(), { }).then([this, timeout, &cf, &m, &fm, m_schema] (auto result) {
|
||||
|
||||
// ...now, that we got existing state of all affected counter
|
||||
// cells we can look for our shard in each of them, increment
|
||||
@@ -2714,9 +2721,8 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
|
||||
transform_counter_updates_to_shards(m, mopt ? &*mopt : nullptr, cf.failed_counter_applies_to_memtable());
|
||||
|
||||
// FIXME: oh dear, another freeze
|
||||
// FIXME: timeout
|
||||
fm = freeze(m);
|
||||
return this->do_apply(m_schema, *fm, { });
|
||||
return this->do_apply(m_schema, *fm, timeout);
|
||||
}).then([&fm] {
|
||||
return std::move(*fm);
|
||||
});
|
||||
@@ -2854,7 +2860,7 @@ future<> dirty_memory_manager::flush_when_needed() {
|
||||
});
|
||||
}
|
||||
|
||||
void dirty_memory_manager::start_reclaiming() {
|
||||
void dirty_memory_manager::start_reclaiming() noexcept {
|
||||
_should_flush.signal();
|
||||
}
|
||||
|
||||
@@ -2876,7 +2882,7 @@ future<frozen_mutation> database::apply_counter_update(schema_ptr s, const froze
|
||||
}
|
||||
try {
|
||||
auto& cf = find_column_family(m.column_family_id());
|
||||
return do_apply_counter_update(cf, m, s);
|
||||
return do_apply_counter_update(cf, m, s, timeout);
|
||||
} catch (no_such_column_family&) {
|
||||
dblog.error("Attempting to mutate non-existent table {}", m.column_family_id());
|
||||
throw;
|
||||
|
||||
@@ -149,7 +149,7 @@ class dirty_memory_manager: public logalloc::region_group_reclaimer {
|
||||
std::unordered_map<const logalloc::region*, flush_permit> _flush_manager;
|
||||
|
||||
future<> _waiting_flush;
|
||||
virtual void start_reclaiming() override;
|
||||
virtual void start_reclaiming() noexcept override;
|
||||
|
||||
bool has_pressure() const {
|
||||
return over_soft_limit();
|
||||
@@ -1126,7 +1126,7 @@ private:
|
||||
|
||||
query::result_memory_limiter _result_memory_limiter;
|
||||
|
||||
future<frozen_mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema);
|
||||
future<frozen_mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, timeout_clock::time_point timeout);
|
||||
public:
|
||||
static utils::UUID empty_version;
|
||||
|
||||
|
||||
@@ -1588,7 +1588,7 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
|
||||
bool failed = false;
|
||||
|
||||
work(file f, position_type o = 0)
|
||||
: f(f), fin(make_file_input_stream(f, o, make_file_input_stream_options())), start_off(o) {
|
||||
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
|
||||
}
|
||||
work(work&&) = default;
|
||||
|
||||
|
||||
@@ -61,13 +61,19 @@
|
||||
|
||||
static logging::logger logger("commitlog_replayer");
|
||||
|
||||
struct column_mappings {
|
||||
std::unordered_map<table_schema_version, column_mapping> map;
|
||||
future<> stop() { return make_ready_future<>(); }
|
||||
};
|
||||
|
||||
class db::commitlog_replayer::impl {
|
||||
seastar::sharded<column_mappings> _column_mappings;
|
||||
struct column_mappings {
|
||||
std::unordered_map<table_schema_version, column_mapping> map;
|
||||
future<> stop() { return make_ready_future<>(); }
|
||||
};
|
||||
|
||||
// we want the processing methods to be const, since they use
|
||||
// shard-sharing of data -> read only
|
||||
// this one is special since it is thread local.
|
||||
// Should actually make sharded::local a const function (it does
|
||||
// not modify content), but...
|
||||
mutable seastar::sharded<column_mappings> _column_mappings;
|
||||
|
||||
friend class db::commitlog_replayer;
|
||||
public:
|
||||
impl(seastar::sharded<cql3::query_processor>& db);
|
||||
@@ -94,13 +100,35 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
future<> process(stats*, temporary_buffer<char> buf, replay_position rp);
|
||||
future<stats> recover(sstring file);
|
||||
// move start/stop of the thread local bookkeeping to "top level"
|
||||
// and also make sure to assert on it actually being started.
|
||||
future<> start() {
|
||||
return _column_mappings.start();
|
||||
}
|
||||
future<> stop() {
|
||||
return _column_mappings.stop();
|
||||
}
|
||||
|
||||
future<> process(stats*, temporary_buffer<char> buf, replay_position rp) const;
|
||||
future<stats> recover(sstring file) const;
|
||||
|
||||
typedef std::unordered_map<utils::UUID, replay_position> rp_map;
|
||||
typedef std::unordered_map<unsigned, rp_map> shard_rpm_map;
|
||||
typedef std::unordered_map<unsigned, replay_position> shard_rp_map;
|
||||
|
||||
replay_position min_pos(unsigned shard) const {
|
||||
auto i = _min_pos.find(shard);
|
||||
return i != _min_pos.end() ? i->second : replay_position();
|
||||
}
|
||||
replay_position cf_min_pos(const utils::UUID& uuid, unsigned shard) const {
|
||||
auto i = _rpm.find(shard);
|
||||
if (i == _rpm.end()) {
|
||||
return replay_position();
|
||||
}
|
||||
auto j = i->second.find(uuid);
|
||||
return j != i->second.end() ? j->second : replay_position();
|
||||
}
|
||||
|
||||
seastar::sharded<cql3::query_processor>&
|
||||
_qp;
|
||||
shard_rpm_map
|
||||
@@ -175,7 +203,6 @@ future<> db::commitlog_replayer::impl::init() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto&p : _min_pos) {
|
||||
logger.debug("minimum position for shard {}: {}", p.first, p.second);
|
||||
}
|
||||
@@ -188,9 +215,11 @@ future<> db::commitlog_replayer::impl::init() {
|
||||
}
|
||||
|
||||
future<db::commitlog_replayer::impl::stats>
|
||||
db::commitlog_replayer::impl::recover(sstring file) {
|
||||
db::commitlog_replayer::impl::recover(sstring file) const {
|
||||
assert(_column_mappings.local_is_initialized());
|
||||
|
||||
replay_position rp{commitlog::descriptor(file)};
|
||||
auto gp = _min_pos[rp.shard_id()];
|
||||
auto gp = min_pos(rp.shard_id());
|
||||
|
||||
if (rp.id < gp.id) {
|
||||
logger.debug("skipping replay of fully-flushed {}", file);
|
||||
@@ -220,7 +249,7 @@ db::commitlog_replayer::impl::recover(sstring file) {
|
||||
});
|
||||
}
|
||||
|
||||
future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char> buf, replay_position rp) {
|
||||
future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char> buf, replay_position rp) const {
|
||||
try {
|
||||
|
||||
commitlog_entry_reader cer(buf);
|
||||
@@ -238,17 +267,16 @@ future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char>
|
||||
const column_mapping& src_cm = cm_it->second;
|
||||
|
||||
auto shard_id = rp.shard_id();
|
||||
if (rp < _min_pos[shard_id]) {
|
||||
if (rp < min_pos(shard_id)) {
|
||||
logger.trace("entry {} is less than global min position. skipping", rp);
|
||||
s->skipped_mutations++;
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
auto uuid = fm.column_family_id();
|
||||
auto& map = _rpm[shard_id];
|
||||
auto i = map.find(uuid);
|
||||
if (i != map.end() && rp <= i->second) {
|
||||
logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, i->second);
|
||||
auto cf_rp = cf_min_pos(uuid, shard_id);
|
||||
if (rp <= cf_rp) {
|
||||
logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, cf_rp);
|
||||
s->skipped_mutations++;
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -323,42 +351,55 @@ future<db::commitlog_replayer> db::commitlog_replayer::create_replayer(seastar::
|
||||
}
|
||||
|
||||
future<> db::commitlog_replayer::recover(std::vector<sstring> files) {
|
||||
return _impl->_column_mappings.start().then([this, files = std::move(files)] {
|
||||
typedef std::unordered_multimap<unsigned, sstring> shard_file_map;
|
||||
|
||||
logger.info("Replaying {}", join(", ", files));
|
||||
return map_reduce(files, [this](auto f) {
|
||||
logger.debug("Replaying {}", f);
|
||||
return _impl->recover(f).then([f](impl::stats stats) {
|
||||
if (stats.corrupt_bytes != 0) {
|
||||
logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
|
||||
}
|
||||
logger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
|
||||
, f
|
||||
, stats.applied_mutations
|
||||
, stats.invalid_mutations
|
||||
, stats.skipped_mutations
|
||||
|
||||
// pre-compute work per shard already.
|
||||
auto map = ::make_lw_shared<shard_file_map>();
|
||||
for (auto& f : files) {
|
||||
commitlog::descriptor d(f);
|
||||
replay_position p = d;
|
||||
map->emplace(p.shard_id() % smp::count, std::move(f));
|
||||
}
|
||||
|
||||
return _impl->start().then([this, map] {
|
||||
return map_reduce(smp::all_cpus(), [this, map](unsigned id) {
|
||||
return smp::submit_to(id, [this, id, map]() {
|
||||
auto total = ::make_lw_shared<impl::stats>();
|
||||
// TODO: or something. For now, we do this serialized per shard,
|
||||
// to reduce mutation congestion. We could probably (says avi)
|
||||
// do 2 segments in parallel or something, but let's use this first.
|
||||
auto range = map->equal_range(id);
|
||||
return do_for_each(range.first, range.second, [this, total](const std::pair<unsigned, sstring>& p) {
|
||||
auto&f = p.second;
|
||||
logger.debug("Replaying {}", f);
|
||||
return _impl->recover(f).then([f, total](impl::stats stats) {
|
||||
if (stats.corrupt_bytes != 0) {
|
||||
logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
|
||||
}
|
||||
logger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
|
||||
, f
|
||||
, stats.applied_mutations
|
||||
, stats.invalid_mutations
|
||||
, stats.skipped_mutations
|
||||
);
|
||||
*total += stats;
|
||||
});
|
||||
}).then([total] {
|
||||
return make_ready_future<impl::stats>(*total);
|
||||
});
|
||||
});
|
||||
}, impl::stats(), std::plus<impl::stats>()).then([](impl::stats totals) {
|
||||
logger.info("Log replay complete, {} replayed mutations ({} invalid, {} skipped)"
|
||||
, totals.applied_mutations
|
||||
, totals.invalid_mutations
|
||||
, totals.skipped_mutations
|
||||
);
|
||||
return make_ready_future<impl::stats>(stats);
|
||||
}).handle_exception([f](auto ep) -> future<impl::stats> {
|
||||
logger.error("Error recovering {}: {}", f, ep);
|
||||
try {
|
||||
std::rethrow_exception(ep);
|
||||
} catch (std::invalid_argument&) {
|
||||
logger.error("Scylla cannot process {}. Make sure to fully flush all Cassandra commit log files to sstable before migrating.", f);
|
||||
throw;
|
||||
} catch (...) {
|
||||
throw;
|
||||
}
|
||||
});
|
||||
}, impl::stats(), std::plus<impl::stats>()).then([](impl::stats totals) {
|
||||
logger.info("Log replay complete, {} replayed mutations ({} invalid, {} skipped)"
|
||||
, totals.applied_mutations
|
||||
, totals.invalid_mutations
|
||||
, totals.skipped_mutations
|
||||
);
|
||||
}).finally([this] {
|
||||
return _impl->_column_mappings.stop();
|
||||
return _impl->stop();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> db::commitlog_replayer::recover(sstring f) {
|
||||
|
||||
@@ -77,6 +77,15 @@ namespace schema_tables {
|
||||
|
||||
logging::logger logger("schema_tables");
|
||||
|
||||
struct push_back_and_return {
|
||||
std::vector<mutation> muts;
|
||||
|
||||
std::vector<mutation> operator()(mutation&& m) {
|
||||
muts.emplace_back(std::move(m));
|
||||
return std::move(muts);
|
||||
}
|
||||
};
|
||||
|
||||
struct qualified_name {
|
||||
sstring keyspace_name;
|
||||
sstring table_name;
|
||||
@@ -547,6 +556,14 @@ read_schema_partition_for_table(distributed<service::storage_proxy>& proxy, sche
|
||||
return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key));
|
||||
}
|
||||
|
||||
future<mutation>
|
||||
read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring& keyspace_name) {
|
||||
schema_ptr s = keyspaces();
|
||||
auto key = partition_key::from_singular(*s, keyspace_name);
|
||||
auto cmd = make_lw_shared<query::read_command>(s->id(), s->version(), query::full_slice);
|
||||
return query_partition_mutation(proxy.local(), std::move(s), std::move(cmd), std::move(key));
|
||||
}
|
||||
|
||||
static semaphore the_merge_lock {1};
|
||||
|
||||
future<> merge_lock() {
|
||||
@@ -1182,19 +1199,18 @@ void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp,
|
||||
mutations.emplace_back(std::move(m));
|
||||
}
|
||||
|
||||
std::vector<mutation> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
|
||||
future<std::vector<mutation>> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
|
||||
{
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
|
||||
std::vector<mutation> mutations;
|
||||
add_type_to_schema_mutation(type, timestamp, mutations);
|
||||
return mutations;
|
||||
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
|
||||
}
|
||||
|
||||
std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
|
||||
future<std::vector<mutation>> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
|
||||
{
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
|
||||
|
||||
std::vector<mutation> mutations;
|
||||
schema_ptr s = usertypes();
|
||||
auto pkey = partition_key::from_singular(*s, type->_keyspace);
|
||||
auto ckey = clustering_key::from_singular(*s, type->get_name_as_string());
|
||||
@@ -1202,19 +1218,21 @@ std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata>
|
||||
m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now()));
|
||||
mutations.emplace_back(std::move(m));
|
||||
|
||||
return mutations;
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
|
||||
}
|
||||
|
||||
/*
|
||||
* Table metadata serialization/deserialization.
|
||||
*/
|
||||
|
||||
std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
|
||||
future<std::vector<mutation>> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
|
||||
{
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
|
||||
std::vector<mutation> mutations;
|
||||
add_table_or_view_to_schema_mutation(table, timestamp, true, mutations);
|
||||
return mutations;
|
||||
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
|
||||
}
|
||||
|
||||
static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers)
|
||||
@@ -1347,15 +1365,13 @@ static void make_update_columns_mutations(schema_ptr old_table,
|
||||
mutations.emplace_back(std::move(columns_mutation));
|
||||
}
|
||||
|
||||
std::vector<mutation> make_update_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
|
||||
future<std::vector<mutation>> make_update_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
|
||||
schema_ptr old_table,
|
||||
schema_ptr new_table,
|
||||
api::timestamp_type timestamp,
|
||||
bool from_thrift)
|
||||
{
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
|
||||
|
||||
std::vector<mutation> mutations;
|
||||
add_table_or_view_to_schema_mutation(new_table, timestamp, false, mutations);
|
||||
|
||||
make_update_columns_mutations(std::move(old_table), std::move(new_table), timestamp, from_thrift, mutations);
|
||||
@@ -1373,7 +1389,8 @@ std::vector<mutation> make_update_table_mutations(lw_shared_ptr<keyspace_metadat
|
||||
addTriggerToSchemaMutation(newTable, trigger, timestamp, mutation);
|
||||
|
||||
#endif
|
||||
return mutations;
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
|
||||
}
|
||||
|
||||
static void make_drop_table_or_view_mutations(schema_ptr schema_table,
|
||||
@@ -1390,10 +1407,9 @@ static void make_drop_table_or_view_mutations(schema_ptr schema_table,
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
|
||||
future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
|
||||
{
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
|
||||
std::vector<mutation> mutations;
|
||||
make_drop_table_or_view_mutations(columnfamilies(), std::move(table), timestamp, mutations);
|
||||
|
||||
#if 0
|
||||
@@ -1405,7 +1421,8 @@ std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata>
|
||||
for (String indexName : Keyspace.open(keyspace.name).getColumnFamilyStore(table.cfName).getBuiltIndexes())
|
||||
indexCells.addTombstone(indexCells.getComparator().makeCellName(indexName), ldt, timestamp);
|
||||
#endif
|
||||
return mutations;
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
|
||||
}
|
||||
|
||||
static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s)
|
||||
@@ -1899,37 +1916,39 @@ schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timesta
|
||||
return s->is_view() ? make_view_mutations(view_ptr(s), timestamp, with_columns) : make_table_mutations(s, timestamp, with_columns);
|
||||
}
|
||||
|
||||
std::vector<mutation> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp)
|
||||
future<std::vector<mutation>> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp)
|
||||
{
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
|
||||
std::vector<mutation> mutations;
|
||||
// And also the serialized base table.
|
||||
auto base = keyspace->cf_meta_data().at(view->view_info()->base_name());
|
||||
add_table_or_view_to_schema_mutation(base, timestamp, true, mutations);
|
||||
add_table_or_view_to_schema_mutation(view, timestamp, true, mutations);
|
||||
return mutations;
|
||||
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
|
||||
}
|
||||
|
||||
std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
|
||||
future<std::vector<mutation>> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
|
||||
view_ptr old_view,
|
||||
view_ptr new_view,
|
||||
api::timestamp_type timestamp)
|
||||
{
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
|
||||
std::vector<mutation> mutations;
|
||||
// And also the serialized base table.
|
||||
auto base = keyspace->cf_meta_data().at(new_view->view_info()->base_name());
|
||||
add_table_or_view_to_schema_mutation(base, timestamp, true, mutations);
|
||||
add_table_or_view_to_schema_mutation(new_view, timestamp, false, mutations);
|
||||
make_update_columns_mutations(old_view, new_view, timestamp, false, mutations);
|
||||
return mutations;
|
||||
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
|
||||
}
|
||||
|
||||
std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp) {
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
|
||||
future<std::vector<mutation>> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp) {
|
||||
std::vector<mutation> mutations;
|
||||
make_drop_table_or_view_mutations(views(), view, timestamp, mutations);
|
||||
return mutations;
|
||||
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
|
||||
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
@@ -80,6 +80,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
|
||||
|
||||
future<schema_result_value_type>
|
||||
read_schema_partition_for_keyspace(distributed<service::storage_proxy>& proxy, const sstring& schema_table_name, const sstring& keyspace_name);
|
||||
future<mutation> read_keyspace_mutation(distributed<service::storage_proxy>&, const sstring& keyspace_name);
|
||||
|
||||
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations);
|
||||
|
||||
@@ -95,17 +96,17 @@ std::vector<mutation> make_drop_keyspace_mutations(lw_shared_ptr<keyspace_metada
|
||||
|
||||
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result_value_type& partition);
|
||||
|
||||
std::vector<mutation> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
|
||||
future<std::vector<mutation>> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
|
||||
|
||||
std::vector<user_type> create_types_from_schema_partition(const schema_result_value_type& result);
|
||||
|
||||
std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
|
||||
future<std::vector<mutation>> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
|
||||
|
||||
void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp, std::vector<mutation>& mutations);
|
||||
|
||||
std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
|
||||
future<std::vector<mutation>> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
|
||||
|
||||
std::vector<mutation> make_update_table_mutations(
|
||||
future<std::vector<mutation>> make_update_table_mutations(
|
||||
lw_shared_ptr<keyspace_metadata> keyspace,
|
||||
schema_ptr old_table,
|
||||
schema_ptr new_table,
|
||||
@@ -114,7 +115,7 @@ std::vector<mutation> make_update_table_mutations(
|
||||
|
||||
future<std::map<sstring, schema_ptr>> create_tables_from_tables_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result);
|
||||
|
||||
std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
|
||||
future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
|
||||
|
||||
future<schema_ptr> create_table_from_name(distributed<service::storage_proxy>& proxy, const sstring& keyspace, const sstring& table);
|
||||
|
||||
@@ -149,11 +150,11 @@ schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timesta
|
||||
|
||||
void add_table_or_view_to_schema_mutation(schema_ptr view, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations);
|
||||
|
||||
std::vector<mutation> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
|
||||
future<std::vector<mutation>> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
|
||||
|
||||
std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp);
|
||||
future<std::vector<mutation>> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp);
|
||||
|
||||
std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
|
||||
future<std::vector<mutation>> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
|
||||
|
||||
sstring serialize_kind(column_kind kind);
|
||||
column_kind deserialize_kind(sstring kind);
|
||||
|
||||
2
dist/ami/files/scylla-ami
vendored
2
dist/ami/files/scylla-ami
vendored
Submodule dist/ami/files/scylla-ami updated: d5a439759d...407e8f37ca
1
dist/common/modprobe.d/scylla-raid0.conf
vendored
Normal file
1
dist/common/modprobe.d/scylla-raid0.conf
vendored
Normal file
@@ -0,0 +1 @@
|
||||
options raid0 devices_discard_performance=Y
|
||||
74
dist/common/scripts/scylla_raid_setup
vendored
74
dist/common/scripts/scylla_raid_setup
vendored
@@ -5,15 +5,20 @@
|
||||
. /usr/lib/scylla/scylla_lib.sh
|
||||
|
||||
print_usage() {
|
||||
echo "scylla-raid-setup --disks /dev/hda,/dev/hdb... --raiddev /dev/md0 --update-fstab"
|
||||
echo "scylla-raid-setup --disks /dev/hda,/dev/hdb... --raiddev /dev/md0 --update-fstab --root /var/lib/scylla --volume-role [all|data|commitlog]"
|
||||
echo " --disks specify disks for RAID"
|
||||
echo " --raiddev MD device name for RAID"
|
||||
echo " --update-fstab update /etc/fstab for RAID"
|
||||
echo " --root specify the root of the tree"
|
||||
echo " --volume-role specify how will this device be used (data, commitlog, or all)"
|
||||
exit 1
|
||||
}
|
||||
|
||||
RAID=/dev/md0
|
||||
FSTAB=0
|
||||
ROOT=/var/lib/scylla
|
||||
ROLE="all"
|
||||
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--disks")
|
||||
@@ -29,12 +34,37 @@ while [ $# -gt 0 ]; do
|
||||
FSTAB=1
|
||||
shift 1
|
||||
;;
|
||||
"--root")
|
||||
ROOT="$2"
|
||||
shift 2
|
||||
;;
|
||||
"--volume-role")
|
||||
ROLE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
ROOT=${ROOT%/}
|
||||
case "$ROLE" in
|
||||
"all")
|
||||
MOUNT_AT=$ROOT
|
||||
;;
|
||||
"data")
|
||||
MOUNT_AT="$ROOT/data"
|
||||
;;
|
||||
"commitlog")
|
||||
MOUNT_AT="$ROOT/commitlog"
|
||||
;;
|
||||
*)
|
||||
echo "Invalid role specified ($ROLE)"
|
||||
print_usage
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ "$DISKS" = "" ]; then
|
||||
print_usage
|
||||
fi
|
||||
@@ -51,8 +81,8 @@ if [ -e $RAID ]; then
|
||||
echo "$RAID is already using"
|
||||
exit 1
|
||||
fi
|
||||
if [ "`mount|grep /var/lib/scylla`" != "" ]; then
|
||||
echo "/var/lib/scylla is already mounted"
|
||||
if mountpoint -q $MOUNT_AT; then
|
||||
echo "$MOUNT_AT is already mounted"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -61,18 +91,32 @@ if is_debian_variant; then
|
||||
else
|
||||
yum -y install mdadm xfsprogs
|
||||
fi
|
||||
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
|
||||
mkfs.xfs $RAID -f
|
||||
echo "DEVICE $DISKS" > /etc/mdadm.conf
|
||||
mdadm --detail --scan >> /etc/mdadm.conf
|
||||
if [ "$ID" = "ubuntu" ] && [ "$VERSION_ID" = "14.04" ]; then
|
||||
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
|
||||
mkfs.xfs $RAID -f
|
||||
else
|
||||
for dsk in $DISKS; do
|
||||
blkdiscard $dsk &
|
||||
done
|
||||
wait
|
||||
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
|
||||
mkfs.xfs $RAID -f -K
|
||||
fi
|
||||
mdadm --detail --scan > /etc/mdadm.conf
|
||||
|
||||
mkdir -p "$MOUNT_AT"
|
||||
mount -t xfs -o noatime $RAID "$MOUNT_AT"
|
||||
|
||||
# create this unconditionally so we are more robust about ordering
|
||||
# if the script is run multiple times. But must do after mount in case
|
||||
# we are mounting the root
|
||||
mkdir -p "$ROOT/data"
|
||||
mkdir -p "$ROOT/commitlog"
|
||||
mkdir -p "$ROOT/coredump"
|
||||
chown scylla:scylla "$ROOT"
|
||||
chown scylla:scylla "$ROOT"/*
|
||||
|
||||
if [ $FSTAB -ne 0 ]; then
|
||||
UUID=`blkid $RAID | awk '{print $2}'`
|
||||
echo "$UUID /var/lib/scylla xfs noatime 0 0" >> /etc/fstab
|
||||
echo "$UUID $MOUNT_AT xfs noatime 0 0" >> /etc/fstab
|
||||
fi
|
||||
mount -t xfs -o noatime $RAID /var/lib/scylla
|
||||
|
||||
mkdir -p /var/lib/scylla/data
|
||||
mkdir -p /var/lib/scylla/commitlog
|
||||
mkdir -p /var/lib/scylla/coredump
|
||||
chown scylla:scylla /var/lib/scylla/*
|
||||
chown scylla:scylla /var/lib/scylla/
|
||||
|
||||
11
dist/common/scripts/scylla_setup
vendored
11
dist/common/scripts/scylla_setup
vendored
@@ -81,7 +81,7 @@ verify_package() {
|
||||
}
|
||||
|
||||
list_block_devices() {
|
||||
if lsblk --help | grep -q -e -p; then
|
||||
if lsblk --help | grep -q -e '^\s*-p'; then
|
||||
lsblk -pnr | awk '{ print $1 }'
|
||||
else
|
||||
ls -1 /dev/sd* /dev/hd* /dev/xvd* /dev/nvme* /dev/mapper/* 2>/dev/null|grep -v control
|
||||
@@ -267,15 +267,18 @@ if [ $ENABLE_SERVICE -eq 1 ]; then
|
||||
printf "[housekeeping]\ncheck-version: True\n" > /etc/scylla.d/housekeeping.cfg
|
||||
fi
|
||||
if is_systemd; then
|
||||
systemctl unmask scylla-housekeeping.timer
|
||||
systemctl unmask scylla-housekeeping-daily.timer
|
||||
systemctl unmask scylla-housekeeping-restart.timer
|
||||
fi
|
||||
else
|
||||
if [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
|
||||
printf "[housekeeping]\ncheck-version: False\n" > /etc/scylla.d/housekeeping.cfg
|
||||
fi
|
||||
if is_systemd; then
|
||||
systemctl mask scylla-housekeeping.timer
|
||||
systemctl stop scylla-housekeeping.timer || true
|
||||
systemctl mask scylla-housekeeping-daily.timer
|
||||
systemctl mask scylla-housekeeping-restart.timer
|
||||
systemctl stop scylla-housekeeping-daily.timer || true
|
||||
systemctl stop scylla-housekeeping-restart.timer || true
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[Unit]
|
||||
Description=Scylla Housekeeping
|
||||
Description=Scylla Housekeeping daily mode
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
11
dist/common/systemd/scylla-housekeeping-daily.timer
vendored
Normal file
11
dist/common/systemd/scylla-housekeeping-daily.timer
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Run Scylla Housekeeping daily mode
|
||||
After=scylla-server.service
|
||||
BindsTo=scylla-server.service
|
||||
|
||||
[Timer]
|
||||
OnActiveSec=1d
|
||||
OnUnitActiveSec=1d
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
12
dist/common/systemd/scylla-housekeeping-restart.service
vendored
Normal file
12
dist/common/systemd/scylla-housekeeping-restart.service
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Scylla Housekeeping restart mode
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=scylla
|
||||
Group=scylla
|
||||
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg version --mode r
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -1,12 +1,11 @@
|
||||
[Unit]
|
||||
Description=Run Scylla Housekeeping daily
|
||||
Description=Run Scylla Housekeeping restart mode
|
||||
After=scylla-server.service
|
||||
BindsTo=scylla-server.service
|
||||
|
||||
[Timer]
|
||||
# set OnActiveSec to 3 to safely avoid issues/1846
|
||||
OnActiveSec=3
|
||||
OnUnitActiveSec=1d
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
3
dist/common/systemd/scylla-server.service.in
vendored
3
dist/common/systemd/scylla-server.service.in
vendored
@@ -2,7 +2,8 @@
|
||||
Description=Scylla Server
|
||||
After=network.target
|
||||
Wants=scylla-jmx.service
|
||||
Wants=scylla-housekeeping.timer
|
||||
Wants=scylla-housekeeping-restart.timer
|
||||
Wants=scylla-housekeeping-daily.timer
|
||||
|
||||
[Service]
|
||||
PermissionsStartOnly=true
|
||||
|
||||
12
dist/debian/build_deb.sh
vendored
12
dist/debian/build_deb.sh
vendored
@@ -84,7 +84,8 @@ if [ "$DISTRIBUTION" = "Debian" ]; then
|
||||
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
|
||||
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-5, libunwind-dev/g" debian/control
|
||||
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
|
||||
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
|
||||
elif [ "$VERSION_ID" = "14.04" ]; then
|
||||
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
|
||||
@@ -92,7 +93,8 @@ elif [ "$VERSION_ID" = "14.04" ]; then
|
||||
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
|
||||
sed -i -e "s/@@BUILD_DEPENDS@@/g++-5, libunwind8-dev/g" debian/control
|
||||
sed -i -e "s#@@INSTALL@@#dist/debian/sudoers.d/scylla etc/sudoers.d#g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER@@##g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER_D@@##g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER_R@@##g" debian/scylla-server.install
|
||||
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
|
||||
else
|
||||
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
|
||||
@@ -100,7 +102,8 @@ else
|
||||
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
|
||||
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev/g" debian/control
|
||||
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
|
||||
sed -i -e "s#@@SYSCTL@@##g" debian/scylla-server.install
|
||||
fi
|
||||
if [ $DIST -gt 0 ]; then
|
||||
@@ -116,7 +119,8 @@ fi
|
||||
|
||||
cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
|
||||
sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
|
||||
cp dist/common/systemd/scylla-housekeeping.service debian/scylla-server.scylla-housekeeping.service
|
||||
cp dist/common/systemd/scylla-housekeeping-daily.service debian/scylla-server.scylla-housekeeping-daily.service
|
||||
cp dist/common/systemd/scylla-housekeeping-restart.service debian/scylla-server.scylla-housekeeping-restart.service
|
||||
cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service
|
||||
|
||||
if [ "$VERSION_ID" = "14.04" ] && [ $REBUILD -eq 0 ]; then
|
||||
|
||||
2
dist/debian/control.in
vendored
2
dist/debian/control.in
vendored
@@ -4,7 +4,7 @@ Homepage: http://scylladb.com
|
||||
Section: database
|
||||
Priority: optional
|
||||
Standards-Version: 3.9.5
|
||||
Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, @@BUILD_DEPENDS@@
|
||||
Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, libtool, automake, @@BUILD_DEPENDS@@
|
||||
|
||||
Package: scylla-conf
|
||||
Architecture: any
|
||||
|
||||
7
dist/debian/dep/build_dependency.sh
vendored
7
dist/debian/dep/build_dependency.sh
vendored
@@ -77,10 +77,11 @@ fi
|
||||
|
||||
if [ "$DISTRIBUTION" = "Debian" ] && [ "$VERSION_ID" = "8" ]; then
|
||||
if [ ! -f build/gcc-5_*.deb ]; then
|
||||
sudo cp dist/debian/dep/debian-stretch-source.list /etc/apt/sources.list.d/
|
||||
sudo apt-get update
|
||||
cd build
|
||||
apt-get source gcc-5/stretch=5.4.1-2
|
||||
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.dsc
|
||||
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1.orig.tar.gz
|
||||
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.diff.gz
|
||||
dpkg-source -x gcc-5_5.4.1-5.dsc
|
||||
cd gcc-5-5.4.1
|
||||
# resolve build time dependencies manually, since mk-build-deps doesn't works for gcc package
|
||||
sudo apt-get install -y g++-multilib libc6-dev-i386 lib32gcc1 libc6-dev-x32 libx32gcc1 libc6-dbg m4 libtool autoconf2.64 autogen gawk zlib1g-dev systemtap-sdt-dev gperf bison flex gdb texinfo locales sharutils libantlr-java libffi-dev gnat-4.9 libisl-dev libmpc-dev libmpfr-dev libgmp-dev dejagnu realpath chrpath quilt doxygen graphviz ghostscript texlive-latex-base xsltproc libxml2-utils docbook-xsl-ns
|
||||
|
||||
20
dist/debian/dep/debian-gcc-5-jessie.diff
vendored
20
dist/debian/dep/debian-gcc-5-jessie.diff
vendored
@@ -1,6 +1,5 @@
|
||||
diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
|
||||
--- debian/rules.conf 2016-10-14 04:54:21.000000000 +0000
|
||||
+++ /home/syuu/gcc-5-5.4.1/debian/rules.conf 2016-10-12 17:28:54.138711378 +0000
|
||||
--- debian/rules.conf 2017-02-24 19:02:52.000000000 +0000
|
||||
+++ /home/syuu/gcc-5.5/gcc-5-5.4.1/debian/rules.conf 2017-02-24 18:13:59.000000000 +0000
|
||||
@@ -206,7 +206,7 @@
|
||||
ifneq (,$(filter $(distrelease),vivid))
|
||||
BINUTILSBDV = 2.25-3~
|
||||
@@ -10,14 +9,16 @@ diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
|
||||
else ifneq (,$(filter $(distrelease),sid stretch xenial))
|
||||
BINUTILSBDV = 2.26.1
|
||||
endif
|
||||
@@ -387,9 +387,9 @@
|
||||
@@ -386,10 +386,10 @@
|
||||
MPFR_BUILD_DEP = libmpfr-dev (>= 3.0.0-9~),
|
||||
endif
|
||||
|
||||
ISL_BUILD_DEP = libisl-dev,
|
||||
-ifneq (,$(filter $(distrelease),jessie sid experimental))
|
||||
-ISL_BUILD_DEP = libisl-dev,
|
||||
-ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
|
||||
- ISL_BUILD_DEP = libisl-dev (>= 0.14),
|
||||
-endif
|
||||
+#ifneq (,$(filter $(distrelease),jessie sid experimental))
|
||||
+#ISL_BUILD_DEP = libisl-dev,
|
||||
+#ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
|
||||
+# ISL_BUILD_DEP = libisl-dev (>= 0.14),
|
||||
+#endif
|
||||
|
||||
@@ -37,9 +38,8 @@ diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
|
||||
ifneq ($(DEB_CROSS),yes)
|
||||
# all archs for which to create b-d's
|
||||
any_archs = alpha amd64 armel armhf arm64 i386 mips mipsel mips64 mips64el powerpc ppc64 ppc64el m68k sh4 sparc64 s390x x32
|
||||
diff -Nur debian/rules.defs /home/syuu/gcc-5-5.4.1/debian/rules.defs
|
||||
--- debian/rules.defs 2016-10-14 04:54:21.000000000 +0000
|
||||
+++ /home/syuu/gcc-5-5.4.1/debian/rules.defs 2016-10-13 10:18:51.647631508 +0000
|
||||
--- debian/rules.defs 2017-02-24 19:02:52.000000000 +0000
|
||||
+++ /home/syuu/gcc-5.5/gcc-5-5.4.1/debian/rules.defs 2017-02-24 18:13:59.000000000 +0000
|
||||
@@ -412,7 +412,7 @@
|
||||
# gcc versions (fixincludes, libgcj-common) ...
|
||||
#with_common_pkgs := yes
|
||||
|
||||
2
dist/debian/dep/debian-stretch-source.list
vendored
2
dist/debian/dep/debian-stretch-source.list
vendored
@@ -1,2 +0,0 @@
|
||||
deb-src http://httpredir.debian.org/debian stretch main
|
||||
deb-src http://httpredir.debian.org/debian stretch-updates main
|
||||
3
dist/debian/rules.in
vendored
3
dist/debian/rules.in
vendored
@@ -11,7 +11,8 @@ override_dh_auto_clean:
|
||||
|
||||
override_dh_installinit:
|
||||
dh_installinit --no-start @@DH_INSTALLINIT@@
|
||||
dh_installinit --no-start --name scylla-housekeeping @@DH_INSTALLINIT@@
|
||||
dh_installinit --no-start --name scylla-housekeeping-daily @@DH_INSTALLINIT@@
|
||||
dh_installinit --no-start --name scylla-housekeeping-restart @@DH_INSTALLINIT@@
|
||||
dh_installinit --no-start --name node-exporter @@DH_INSTALLINIT@@
|
||||
|
||||
override_dh_strip:
|
||||
|
||||
3
dist/debian/scylla-server.install.in
vendored
3
dist/debian/scylla-server.install.in
vendored
@@ -15,6 +15,7 @@ build/release/iotune usr/bin
|
||||
dist/common/bin/scyllatop usr/bin
|
||||
dist/common/sbin/* usr/sbin
|
||||
@@ADDHKCFG@@
|
||||
@@HKDOTTIMER@@
|
||||
@@HKDOTTIMER_D@@
|
||||
@@HKDOTTIMER_R@@
|
||||
@@INSTALL@@
|
||||
@@SYSCTL@@
|
||||
|
||||
4
dist/docker/redhat/Dockerfile
vendored
4
dist/docker/redhat/Dockerfile
vendored
@@ -7,7 +7,7 @@ ENV container docker
|
||||
VOLUME [ "/sys/fs/cgroup" ]
|
||||
|
||||
#install scylla
|
||||
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
|
||||
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-1.7.repo -o /etc/yum.repos.d/scylla.repo
|
||||
RUN yum -y install epel-release
|
||||
RUN yum -y clean expire-cache
|
||||
RUN yum -y update
|
||||
@@ -38,6 +38,6 @@ ADD commandlineparser.py /commandlineparser.py
|
||||
ADD docker-entrypoint.py /docker-entrypoint.py
|
||||
ENTRYPOINT ["/docker-entrypoint.py"]
|
||||
|
||||
EXPOSE 10000 9042 9160 7000 7001
|
||||
EXPOSE 10000 9042 9160 9180 7000 7001
|
||||
VOLUME [ "/var/lib/scylla" ]
|
||||
RUN chown -R scylla.scylla /var/lib/scylla
|
||||
|
||||
11
dist/redhat/centos_dep/build_dependency.sh
vendored
11
dist/redhat/centos_dep/build_dependency.sh
vendored
@@ -28,10 +28,6 @@ if [ ! -f boost-1.58.0-11.fc23.src.rpm ]; then
|
||||
wget -nv https://kojipkgs.fedoraproject.org//packages/boost/1.58.0/11.fc23/src/boost-1.58.0-11.fc23.src.rpm
|
||||
fi
|
||||
|
||||
if [ ! -f ninja-build-1.6.0-2.fc23.src.rpm ]; then
|
||||
wget -nv https://kojipkgs.fedoraproject.org//packages/ninja-build/1.6.0/2.fc23/src/ninja-build-1.6.0-2.fc23.src.rpm
|
||||
fi
|
||||
|
||||
if [ ! -f ragel-6.8-5.fc23.src.rpm ]; then
|
||||
wget -nv https://kojipkgs.fedoraproject.org//packages/ragel/6.8/5.fc23/src/ragel-6.8-5.fc23.src.rpm
|
||||
fi
|
||||
@@ -94,13 +90,6 @@ if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-boost-1.58.0-11.el7*.x86_64.rpm ]; then
|
||||
fi
|
||||
do_install scylla-boost*
|
||||
|
||||
if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ninja-build-1.6.0-2.el7*.x86_64.rpm ]; then
|
||||
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ninja-build-1.6.0-2.fc23.src.rpm
|
||||
patch $RPMBUILD/SPECS/ninja-build.spec < dist/redhat/centos_dep/ninja-build.diff
|
||||
rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ninja-build.spec
|
||||
fi
|
||||
do_install scylla-ninja-build-1.6.0-2.el7*.x86_64.rpm
|
||||
|
||||
if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ragel-6.8-5.el7*.x86_64.rpm ]; then
|
||||
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ragel-6.8-5.fc23.src.rpm
|
||||
patch $RPMBUILD/SPECS/ragel.spec < dist/redhat/centos_dep/ragel.diff
|
||||
|
||||
56
dist/redhat/centos_dep/ninja-build.diff
vendored
56
dist/redhat/centos_dep/ninja-build.diff
vendored
@@ -1,56 +0,0 @@
|
||||
--- ninja-build.spec.orig 2016-01-20 14:41:16.892802134 +0000
|
||||
+++ ninja-build.spec 2016-01-20 14:44:42.453227192 +0000
|
||||
@@ -1,19 +1,18 @@
|
||||
-Name: ninja-build
|
||||
+Name: scylla-ninja-build
|
||||
Version: 1.6.0
|
||||
Release: 2%{?dist}
|
||||
Summary: A small build system with a focus on speed
|
||||
License: ASL 2.0
|
||||
URL: http://martine.github.com/ninja/
|
||||
Source0: https://github.com/martine/ninja/archive/v%{version}.tar.gz#/ninja-%{version}.tar.gz
|
||||
-Source1: ninja.vim
|
||||
# Rename mentions of the executable name to be ninja-build.
|
||||
Patch1000: ninja-1.6.0-binary-rename.patch
|
||||
+Requires: scylla-env
|
||||
BuildRequires: asciidoc
|
||||
BuildRequires: gtest-devel
|
||||
BuildRequires: python2-devel
|
||||
-BuildRequires: re2c >= 0.11.3
|
||||
-Requires: emacs-filesystem
|
||||
-Requires: vim-filesystem
|
||||
+#BuildRequires: scylla-re2c >= 0.11.3
|
||||
+%define _prefix /opt/scylladb
|
||||
|
||||
%description
|
||||
Ninja is a small build system with a focus on speed. It differs from other
|
||||
@@ -32,15 +31,8 @@
|
||||
./ninja -v ninja_test
|
||||
|
||||
%install
|
||||
-# TODO: Install ninja_syntax.py?
|
||||
-mkdir -p %{buildroot}/{%{_bindir},%{_datadir}/bash-completion/completions,%{_datadir}/emacs/site-lisp,%{_datadir}/vim/vimfiles/syntax,%{_datadir}/vim/vimfiles/ftdetect,%{_datadir}/zsh/site-functions}
|
||||
-
|
||||
+mkdir -p %{buildroot}/opt/scylladb/bin
|
||||
install -pm755 ninja %{buildroot}%{_bindir}/ninja-build
|
||||
-install -pm644 misc/bash-completion %{buildroot}%{_datadir}/bash-completion/completions/ninja-bash-completion
|
||||
-install -pm644 misc/ninja-mode.el %{buildroot}%{_datadir}/emacs/site-lisp/ninja-mode.el
|
||||
-install -pm644 misc/ninja.vim %{buildroot}%{_datadir}/vim/vimfiles/syntax/ninja.vim
|
||||
-install -pm644 %{SOURCE1} %{buildroot}%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
|
||||
-install -pm644 misc/zsh-completion %{buildroot}%{_datadir}/zsh/site-functions/_ninja
|
||||
|
||||
%check
|
||||
# workaround possible too low default limits
|
||||
@@ -50,12 +42,6 @@
|
||||
%files
|
||||
%doc COPYING HACKING.md README doc/manual.html
|
||||
%{_bindir}/ninja-build
|
||||
-%{_datadir}/bash-completion/completions/ninja-bash-completion
|
||||
-%{_datadir}/emacs/site-lisp/ninja-mode.el
|
||||
-%{_datadir}/vim/vimfiles/syntax/ninja.vim
|
||||
-%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
|
||||
-# zsh does not have a -filesystem package
|
||||
-%{_datadir}/zsh/
|
||||
|
||||
%changelog
|
||||
* Mon Nov 16 2015 Ben Boeckel <mathstuf@gmail.com> - 1.6.0-2
|
||||
28
dist/redhat/scylla.spec.in
vendored
28
dist/redhat/scylla.spec.in
vendored
@@ -27,9 +27,9 @@ Group: Applications/Databases
|
||||
Summary: The Scylla database server
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel
|
||||
%{?fedora:BuildRequires: boost-devel ninja-build ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
|
||||
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-boost-static scylla-ninja-build scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
|
||||
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel libtool automake ninja-build
|
||||
%{?fedora:BuildRequires: boost-devel ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
|
||||
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-boost-static scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
|
||||
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils
|
||||
%{?rhel:Requires: python34 python34-PyYAML}
|
||||
Conflicts: abrt
|
||||
@@ -63,6 +63,9 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
|
||||
%if 0%{?rhel}
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
|
||||
%endif
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_unitdir}
|
||||
@@ -73,6 +76,9 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
|
||||
install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
|
||||
install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
|
||||
install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
|
||||
%if 0%{?rhel}
|
||||
install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
|
||||
%endif
|
||||
install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
|
||||
install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
@@ -151,10 +157,8 @@ rm -rf $RPM_BUILD_ROOT
|
||||
%{_docdir}/scylla/NOTICE.txt
|
||||
%{_docdir}/scylla/ORIGIN
|
||||
%{_docdir}/scylla/licenses/
|
||||
%{_unitdir}/scylla-server.service
|
||||
%{_unitdir}/scylla-housekeeping.service
|
||||
%{_unitdir}/scylla-housekeeping.timer
|
||||
%{_unitdir}/node-exporter.service
|
||||
%{_unitdir}/*.service
|
||||
%{_unitdir}/*.timer
|
||||
%{_bindir}/scylla
|
||||
%{_bindir}/iotune
|
||||
%{_bindir}/scyllatop
|
||||
@@ -228,6 +232,7 @@ Group: Applications/Databases
|
||||
Summary: Scylla configuration package for the Linux kernel
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
Requires: kmod
|
||||
|
||||
%description kernel-conf
|
||||
This package contains Linux kernel configuration changes for the Scylla database. Install this package
|
||||
@@ -237,9 +242,18 @@ if Scylla is the main application on your server and you wish to optimize its la
|
||||
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
|
||||
# following is a "manual" expansion
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
|
||||
# Write modprobe.d params when module already loaded
|
||||
%if 0%{?rhel}
|
||||
if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
|
||||
echo Y > /sys/module/raid0/parameters/devices_discard_performance
|
||||
fi
|
||||
%endif
|
||||
|
||||
%files kernel-conf
|
||||
%defattr(-,root,root)
|
||||
%if 0%{?rhel}
|
||||
%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
|
||||
%endif
|
||||
%{_sysctldir}/*.conf
|
||||
|
||||
%changelog
|
||||
|
||||
@@ -50,6 +50,12 @@ public:
|
||||
// for real time waits.
|
||||
};
|
||||
|
||||
// Returns the time point that is earlier than t by d, or the minimum
// representable time point if t - d cannot be represented.
// NOTE(review): decltype(t)::min() + d looks like it would itself overflow
// for a negative d — confirm callers only pass non-negative durations.
|
||||
template<typename Clock, typename Duration, typename Rep, typename Period>
|
||||
inline
|
||||
auto saturating_subtract(std::chrono::time_point<Clock, Duration> t, std::chrono::duration<Rep, Period> d) -> decltype(t) {
|
||||
// Raise t to at least min() + d before subtracting, so the result is
// never below min().
return std::max(t, decltype(t)::min() + d) - d;
|
||||
}
|
||||
|
||||
using expiry_opt = std::experimental::optional<gc_clock::time_point>;
|
||||
using ttl_opt = std::experimental::optional<gc_clock::duration>;
|
||||
|
||||
69
memtable.cc
69
memtable.cc
@@ -65,17 +65,15 @@ future<> memtable::clear_gently() noexcept {
|
||||
auto t = std::make_unique<seastar::thread>(attr, [this] {
|
||||
auto& alloc = allocator();
|
||||
|
||||
// entries can no longer be moved after unlink_leftmost_without_rebalance()
|
||||
// so need to disable compaction.
|
||||
logalloc::reclaim_lock rl(*this);
|
||||
|
||||
auto p = std::move(partitions);
|
||||
while (!p.empty()) {
|
||||
auto batch_size = std::min<size_t>(p.size(), 32);
|
||||
auto dirty_before = dirty_size();
|
||||
with_allocator(alloc, [&] () noexcept {
|
||||
while (batch_size--) {
|
||||
alloc.destroy(p.unlink_leftmost_without_rebalance());
|
||||
p.erase_and_dispose(p.begin(), [&] (auto e) {
|
||||
alloc.destroy(e);
|
||||
});
|
||||
}
|
||||
});
|
||||
remove_flushed_memory(dirty_before - dirty_size());
|
||||
@@ -205,19 +203,23 @@ protected:
|
||||
, _range(&range)
|
||||
{ }
|
||||
|
||||
memtable_entry* fetch_next_entry() {
|
||||
memtable_entry* fetch_entry() {
|
||||
update_iterators();
|
||||
if (_i == _end) {
|
||||
return nullptr;
|
||||
} else {
|
||||
memtable_entry& e = *_i;
|
||||
++_i;
|
||||
_last = e.key();
|
||||
_memtable->upgrade_entry(e);
|
||||
return &e;
|
||||
}
|
||||
}
|
||||
|
||||
// Stores the key of the entry that _i currently refers to in _last and then
// moves _i to the next entry.
// _i is dereferenced without an end check in this function.
void advance() {
|
||||
memtable_entry& e = *_i;
|
||||
_last = e.key();
|
||||
++_i;
|
||||
}
|
||||
|
||||
// Returns a reference to the _read_section member of the memtable this
// reader points at.
logalloc::allocating_section& read_section() {
|
||||
return _memtable->_read_section;
|
||||
}
|
||||
@@ -287,14 +289,18 @@ public:
|
||||
return _delegate();
|
||||
}
|
||||
|
||||
logalloc::reclaim_lock _(region());
|
||||
managed_bytes::linearization_context_guard lcg;
|
||||
memtable_entry* e = fetch_next_entry();
|
||||
if (!e) {
|
||||
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
|
||||
} else {
|
||||
return make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
|
||||
}
|
||||
return read_section()(region(), [&] {
|
||||
return with_linearized_managed_bytes([&] {
|
||||
memtable_entry* e = fetch_entry();
|
||||
if (!e) {
|
||||
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
|
||||
} else {
|
||||
auto ret = make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
|
||||
advance();
|
||||
return ret;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -391,19 +397,24 @@ public:
|
||||
flush_reader& operator=(const flush_reader&) = delete;
|
||||
|
||||
virtual future<streamed_mutation_opt> operator()() override {
|
||||
logalloc::reclaim_lock _(region());
|
||||
managed_bytes::linearization_context_guard lcg;
|
||||
memtable_entry* e = fetch_next_entry();
|
||||
if (!e) {
|
||||
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
|
||||
} else {
|
||||
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
|
||||
auto snp = e->partition().read(schema());
|
||||
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr), snp, region(), read_section(), mtbl(), _flushed_memory);
|
||||
_flushed_memory.account_component(*e);
|
||||
_flushed_memory.account_component(*snp);
|
||||
return make_ready_future<streamed_mutation_opt>(std::move(mpsr));
|
||||
}
|
||||
return read_section()(region(), [&] {
|
||||
return with_linearized_managed_bytes([&] {
|
||||
memtable_entry* e = fetch_entry();
|
||||
if (!e) {
|
||||
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
|
||||
} else {
|
||||
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
|
||||
auto snp = e->partition().read(schema());
|
||||
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr),
|
||||
snp, region(), read_section(), mtbl(), _flushed_memory);
|
||||
_flushed_memory.account_component(*e);
|
||||
_flushed_memory.account_component(*snp);
|
||||
auto ret = make_ready_future<streamed_mutation_opt>(std::move(mpsr));
|
||||
advance();
|
||||
return ret;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -274,7 +274,13 @@ void messaging_service::start_listen() {
|
||||
if (listen_to_bc) {
|
||||
_server_tls[1] = listen(utils::fb_utilities::get_broadcast_address());
|
||||
}
|
||||
|
||||
}
|
||||
// Do this on just cpu 0, to avoid duplicate logs.
|
||||
if (engine().cpu_id() == 0) {
|
||||
if (_server_tls[0]) {
|
||||
logger.info("Starting Encrypted Messaging Service on SSL port {}", _ssl_port);
|
||||
}
|
||||
logger.info("Starting Messaging Service on port {}", _port);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -308,14 +314,6 @@ messaging_service::messaging_service(gms::inet_address ip
|
||||
if (listen_now) {
|
||||
start_listen();
|
||||
}
|
||||
|
||||
// Do this on just cpu 0, to avoid duplicate logs.
|
||||
if (engine().cpu_id() == 0) {
|
||||
if (_server_tls[0]) {
|
||||
logger.info("Starting Encrypted Messaging Service on SSL port {}", _ssl_port);
|
||||
}
|
||||
logger.info("Starting Messaging Service on port {}", _port);
|
||||
}
|
||||
}
|
||||
|
||||
msg_addr messaging_service::get_source(const rpc::client_info& cinfo) {
|
||||
|
||||
@@ -123,7 +123,7 @@ public:
|
||||
uint32_t partition_limit, CompactedMutationsConsumer consumer)
|
||||
: _schema(s)
|
||||
, _query_time(query_time)
|
||||
, _gc_before(query_time - s.gc_grace_seconds())
|
||||
, _gc_before(saturating_subtract(query_time, s.gc_grace_seconds()))
|
||||
, _can_gc(always_gc)
|
||||
, _slice(slice)
|
||||
, _row_limit(limit)
|
||||
@@ -139,7 +139,7 @@ public:
|
||||
std::function<api::timestamp_type(const dht::decorated_key&)> get_max_purgeable)
|
||||
: _schema(s)
|
||||
, _query_time(compaction_time)
|
||||
, _gc_before(_query_time - s.gc_grace_seconds())
|
||||
, _gc_before(saturating_subtract(_query_time, s.gc_grace_seconds()))
|
||||
, _get_max_purgeable(std::move(get_max_purgeable))
|
||||
, _can_gc([this] (tombstone t) { return can_gc(t); })
|
||||
, _slice(query::full_slice)
|
||||
|
||||
@@ -1183,7 +1183,7 @@ uint32_t mutation_partition::do_compact(const schema& s,
|
||||
{
|
||||
assert(row_limit > 0);
|
||||
|
||||
auto gc_before = query_time - s.gc_grace_seconds();
|
||||
auto gc_before = saturating_subtract(query_time, s.gc_grace_seconds());
|
||||
|
||||
auto should_purge_tombstone = [&] (const tombstone& t) {
|
||||
return t.deletion_time < gc_before && can_gc(t);
|
||||
|
||||
@@ -345,7 +345,7 @@ public:
|
||||
: _w(std::move(w))
|
||||
, _row_count(c)
|
||||
, _short_read(sr)
|
||||
, _memory_tracker(std::move(_memory_tracker))
|
||||
, _memory_tracker(std::move(memory_tracker))
|
||||
, _partition_count(pc)
|
||||
{
|
||||
w.reduce_chunk_count();
|
||||
|
||||
@@ -492,6 +492,13 @@ static void split_and_add(std::vector<::dht::token_range>& ranges,
|
||||
auto midpoint = dht::global_partitioner().midpoint(
|
||||
range.start() ? range.start()->value() : dht::minimum_token(),
|
||||
range.end() ? range.end()->value() : dht::minimum_token());
|
||||
// This shouldn't happen, but if the range included just one token, we
|
||||
// can't split further (split() may actually fail with assertion failure)
|
||||
if ((range.start() && midpoint == range.start()->value()) ||
|
||||
(range.end() && midpoint == range.end()->value())) {
|
||||
ranges.push_back(range);
|
||||
return;
|
||||
}
|
||||
auto halves = range.split(midpoint, dht::token_comparator());
|
||||
ranges.push_back(halves.first);
|
||||
ranges.push_back(halves.second);
|
||||
|
||||
14
schema.cc
14
schema.cc
@@ -145,6 +145,20 @@ void schema::rebuild() {
|
||||
|
||||
thrift()._compound = is_compound();
|
||||
thrift()._is_dynamic = clustering_key_size() > 0;
|
||||
|
||||
if (default_validator()->is_counter()) {
|
||||
for (auto&& cdef : boost::range::join(static_columns(), regular_columns())) {
|
||||
if (!cdef.type->is_counter()) {
|
||||
throw exceptions::configuration_exception(sprint("Cannot add a non counter column (%s) in a counter column family", cdef.name_as_text()));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto&& cdef : all_columns()) {
|
||||
if (cdef.second->type->is_counter()) {
|
||||
throw exceptions::configuration_exception(sprint("Cannot add a counter column (%s) in a non counter column family", cdef.second->name_as_text()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const column_mapping& schema::get_column_mapping() const {
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: f07f8ed68d...f391f9e94a
@@ -481,8 +481,10 @@ future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool anno
|
||||
throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
|
||||
}
|
||||
logger.info("Create new ColumnFamily: {}", cfm);
|
||||
auto mutations = db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp());
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp())
|
||||
.then([announce_locally, this] (auto&& mutations) {
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
} catch (const no_such_keyspace& e) {
|
||||
throw exceptions::configuration_exception(sprint("Cannot add table '%s' to non existing keyspace '%s'.", cfm->cf_name(), cfm->ks_name()));
|
||||
}
|
||||
@@ -501,8 +503,10 @@ future<> migration_manager::announce_column_family_update(schema_ptr cfm, bool f
|
||||
#endif
|
||||
logger.info("Update table '{}.{}' From {} To {}", cfm->ks_name(), cfm->cf_name(), *old_schema, *cfm);
|
||||
auto&& keyspace = db.find_keyspace(cfm->ks_name());
|
||||
auto mutations = db::schema_tables::make_update_table_mutations(keyspace.metadata(), old_schema, cfm, api::new_timestamp(), from_thrift);
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
return db::schema_tables::make_update_table_mutations(keyspace.metadata(), old_schema, cfm, api::new_timestamp(), from_thrift)
|
||||
.then([announce_locally] (auto&& mutations) {
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
} catch (const no_such_column_family& e) {
|
||||
throw exceptions::configuration_exception(sprint("Cannot update non existing table '%s' in keyspace '%s'.",
|
||||
cfm->cf_name(), cfm->ks_name()));
|
||||
@@ -512,8 +516,10 @@ future<> migration_manager::announce_column_family_update(schema_ptr cfm, bool f
|
||||
static future<> do_announce_new_type(user_type new_type, bool announce_locally) {
|
||||
auto& db = get_local_storage_proxy().get_db().local();
|
||||
auto&& keyspace = db.find_keyspace(new_type->_keyspace);
|
||||
auto mutations = db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, api::new_timestamp());
|
||||
return migration_manager::announce(std::move(mutations), announce_locally);
|
||||
return db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, api::new_timestamp())
|
||||
.then([announce_locally] (auto&& mutations) {
|
||||
return migration_manager::announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
}
|
||||
|
||||
future<> migration_manager::announce_new_type(user_type new_type, bool announce_locally) {
|
||||
@@ -609,8 +615,10 @@ future<> migration_manager::announce_column_family_drop(const sstring& ks_name,
|
||||
ks_name, ::join(", ", views | boost::adaptors::transformed([](auto&& v) { return v->cf_name(); }))));
|
||||
}
|
||||
logger.info("Drop table '{}.{}'", schema->ks_name(), schema->cf_name());
|
||||
auto mutations = db::schema_tables::make_drop_table_mutations(db.find_keyspace(ks_name).metadata(), schema, api::new_timestamp());
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
return db::schema_tables::make_drop_table_mutations(db.find_keyspace(ks_name).metadata(), schema, api::new_timestamp())
|
||||
.then([announce_locally] (auto&& mutations) {
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
} catch (const no_such_column_family& e) {
|
||||
throw exceptions::configuration_exception(sprint("Cannot drop non existing table '%s' in keyspace '%s'.", cf_name, ks_name));
|
||||
}
|
||||
@@ -621,8 +629,10 @@ future<> migration_manager::announce_type_drop(user_type dropped_type, bool anno
|
||||
auto& db = get_local_storage_proxy().get_db().local();
|
||||
auto&& keyspace = db.find_keyspace(dropped_type->_keyspace);
|
||||
logger.info("Drop User Type: {}", dropped_type->get_name_as_string());
|
||||
auto mutations = db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, api::new_timestamp());
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
return db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, api::new_timestamp())
|
||||
.then([announce_locally] (auto&& mutations) {
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
}
|
||||
|
||||
future<> migration_manager::announce_new_view(view_ptr view, bool announce_locally)
|
||||
@@ -637,8 +647,10 @@ future<> migration_manager::announce_new_view(view_ptr view, bool announce_local
|
||||
throw exceptions::already_exists_exception(view->ks_name(), view->cf_name());
|
||||
}
|
||||
logger.info("Create new view: {}", view);
|
||||
auto mutations = db::schema_tables::make_create_view_mutations(keyspace, std::move(view), api::new_timestamp());
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
return db::schema_tables::make_create_view_mutations(keyspace, std::move(view), api::new_timestamp())
|
||||
.then([announce_locally] (auto&& mutations) {
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
} catch (const no_such_keyspace& e) {
|
||||
throw exceptions::configuration_exception(sprint("Cannot add view '%s' to non existing keyspace '%s'.", view->cf_name(), view->ks_name()));
|
||||
}
|
||||
@@ -660,8 +672,10 @@ future<> migration_manager::announce_view_update(view_ptr view, bool announce_lo
|
||||
oldCfm.validateCompatility(cfm);
|
||||
#endif
|
||||
logger.info("Update view '{}.{}' From {} To {}", view->ks_name(), view->cf_name(), *old_view, *view);
|
||||
auto mutations = db::schema_tables::make_update_view_mutations(std::move(keyspace), view_ptr(old_view), std::move(view), api::new_timestamp());
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
return db::schema_tables::make_update_view_mutations(std::move(keyspace), view_ptr(old_view), std::move(view), api::new_timestamp())
|
||||
.then([announce_locally] (auto&& mutations) {
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
} catch (const std::out_of_range& e) {
|
||||
throw exceptions::configuration_exception(sprint("Cannot update non existing materialized view '%s' in keyspace '%s'.",
|
||||
view->cf_name(), view->ks_name()));
|
||||
@@ -680,8 +694,10 @@ future<> migration_manager::announce_view_drop(const sstring& ks_name,
|
||||
}
|
||||
auto keyspace = db.find_keyspace(ks_name).metadata();
|
||||
logger.info("Drop view '{}.{}'", view->ks_name(), view->cf_name());
|
||||
auto mutations = db::schema_tables::make_drop_view_mutations(std::move(keyspace), view_ptr(std::move(view)), api::new_timestamp());
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
return db::schema_tables::make_drop_view_mutations(std::move(keyspace), view_ptr(std::move(view)), api::new_timestamp())
|
||||
.then([announce_locally] (auto&& mutations) {
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
} catch (const no_such_column_family& e) {
|
||||
throw exceptions::configuration_exception(sprint("Cannot drop non existing materialized view '%s' in keyspace '%s'.",
|
||||
cf_name, ks_name));
|
||||
|
||||
@@ -478,7 +478,6 @@ inline uint64_t& storage_proxy::split_stats::get_ep_stat(gms::inet_address ep) {
|
||||
storage_proxy::~storage_proxy() {}
|
||||
storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
|
||||
namespace sm = seastar::metrics;
|
||||
|
||||
_metrics.add_group(COORDINATOR_STATS_CATEGORY, {
|
||||
sm::make_queue_length("foreground_writes", [this] { return _stats.writes - _stats.background_writes; },
|
||||
sm::description("number of currently pending foreground write requests")),
|
||||
@@ -486,7 +485,7 @@ storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
|
||||
sm::make_queue_length("background_writes", [this] { return _stats.background_writes; },
|
||||
sm::description("number of currently pending background write requests")),
|
||||
|
||||
sm::make_queue_length("throttled_writes", [this] { return _throttled_writes.size(); },
|
||||
sm::make_queue_length("current_throttled_writes", [this] { return _throttled_writes.size(); },
|
||||
sm::description("number of currently throttled write requests")),
|
||||
|
||||
sm::make_total_operations("throttled_writes", [this] { return _stats.throttled_writes; },
|
||||
@@ -1733,14 +1732,14 @@ protected:
|
||||
size_t _targets_count;
|
||||
promise<> _done_promise; // all target responded
|
||||
bool _timedout = false; // will be true if request timeouts
|
||||
timer<lowres_clock> _timeout;
|
||||
timer<storage_proxy::clock_type> _timeout;
|
||||
size_t _responses = 0;
|
||||
schema_ptr _schema;
|
||||
|
||||
virtual void on_timeout() {}
|
||||
virtual size_t response_count() const = 0;
|
||||
public:
|
||||
abstract_read_resolver(schema_ptr schema, db::consistency_level cl, size_t target_count, lowres_clock::time_point timeout)
|
||||
abstract_read_resolver(schema_ptr schema, db::consistency_level cl, size_t target_count, storage_proxy::clock_type::time_point timeout)
|
||||
: _cl(cl)
|
||||
, _targets_count(target_count)
|
||||
, _schema(std::move(schema))
|
||||
@@ -1796,7 +1795,7 @@ class digest_read_resolver : public abstract_read_resolver {
|
||||
return _digest_results.size();
|
||||
}
|
||||
public:
|
||||
digest_read_resolver(schema_ptr schema, db::consistency_level cl, size_t block_for, lowres_clock::time_point timeout) : abstract_read_resolver(std::move(schema), cl, 0, timeout), _block_for(block_for) {}
|
||||
digest_read_resolver(schema_ptr schema, db::consistency_level cl, size_t block_for, storage_proxy::clock_type::time_point timeout) : abstract_read_resolver(std::move(schema), cl, 0, timeout), _block_for(block_for) {}
|
||||
void add_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<query::result>> result) {
|
||||
if (!_timedout) {
|
||||
// if only one target was queried digest_check() will be skipped so we can also skip digest calculation
|
||||
@@ -2143,7 +2142,7 @@ private:
|
||||
return false;
|
||||
}
|
||||
public:
|
||||
data_read_resolver(schema_ptr schema, db::consistency_level cl, size_t targets_count, lowres_clock::time_point timeout) : abstract_read_resolver(std::move(schema), cl, targets_count, timeout) {
|
||||
data_read_resolver(schema_ptr schema, db::consistency_level cl, size_t targets_count, storage_proxy::clock_type::time_point timeout) : abstract_read_resolver(std::move(schema), cl, targets_count, timeout) {
|
||||
_data_results.reserve(targets_count);
|
||||
}
|
||||
void add_mutate_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<reconcilable_result>> result) {
|
||||
@@ -2330,7 +2329,7 @@ protected:
|
||||
using targets_iterator = std::vector<gms::inet_address>::iterator;
|
||||
using digest_resolver_ptr = ::shared_ptr<digest_read_resolver>;
|
||||
using data_resolver_ptr = ::shared_ptr<data_read_resolver>;
|
||||
using clock_type = lowres_clock;
|
||||
using clock_type = storage_proxy::clock_type;
|
||||
|
||||
schema_ptr _schema;
|
||||
shared_ptr<storage_proxy> _proxy;
|
||||
@@ -2454,7 +2453,7 @@ protected:
|
||||
uint32_t original_partition_limit() const {
|
||||
return _cmd->partition_limit;
|
||||
}
|
||||
void reconcile(db::consistency_level cl, lowres_clock::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
|
||||
void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
|
||||
data_resolver_ptr data_resolver = ::make_shared<data_read_resolver>(_schema, cl, _targets.size(), timeout);
|
||||
auto exec = shared_from_this();
|
||||
|
||||
@@ -2529,12 +2528,12 @@ protected:
|
||||
}
|
||||
});
|
||||
}
|
||||
void reconcile(db::consistency_level cl, lowres_clock::time_point timeout) {
|
||||
void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout) {
|
||||
reconcile(cl, timeout, _cmd);
|
||||
}
|
||||
|
||||
public:
|
||||
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(lowres_clock::time_point timeout) {
|
||||
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
|
||||
digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for, timeout);
|
||||
auto exec = shared_from_this();
|
||||
|
||||
@@ -2604,7 +2603,7 @@ public:
|
||||
class always_speculating_read_executor : public abstract_read_executor {
|
||||
public:
|
||||
using abstract_read_executor::abstract_read_executor;
|
||||
virtual future<> make_requests(digest_resolver_ptr resolver, lowres_clock::time_point timeout) {
|
||||
virtual future<> make_requests(digest_resolver_ptr resolver, storage_proxy::clock_type::time_point timeout) {
|
||||
resolver->add_wait_targets(_targets.size());
|
||||
// FIXME: consider disabling for CL=*ONE
|
||||
bool want_digest = true;
|
||||
@@ -2615,10 +2614,10 @@ public:
|
||||
|
||||
// this executor sends request to an additional replica after some time below timeout
|
||||
class speculating_read_executor : public abstract_read_executor {
|
||||
timer<> _speculate_timer;
|
||||
timer<storage_proxy::clock_type> _speculate_timer;
|
||||
public:
|
||||
using abstract_read_executor::abstract_read_executor;
|
||||
virtual future<> make_requests(digest_resolver_ptr resolver, lowres_clock::time_point timeout) {
|
||||
virtual future<> make_requests(digest_resolver_ptr resolver, storage_proxy::clock_type::time_point timeout) {
|
||||
_speculate_timer.set_callback([this, resolver, timeout] {
|
||||
if (!resolver->is_completed()) { // at the time the callback runs request may be completed already
|
||||
resolver->add_wait_targets(1); // we send one more request so wait for it too
|
||||
@@ -2664,7 +2663,7 @@ class range_slice_read_executor : public abstract_read_executor {
|
||||
public:
|
||||
range_slice_read_executor(schema_ptr s, shared_ptr<storage_proxy> proxy, lw_shared_ptr<query::read_command> cmd, dht::partition_range pr, db::consistency_level cl, std::vector<gms::inet_address> targets, tracing::trace_state_ptr trace_state) :
|
||||
abstract_read_executor(std::move(s), std::move(proxy), std::move(cmd), std::move(pr), cl, targets.size(), std::move(targets), std::move(trace_state)) {}
|
||||
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(lowres_clock::time_point timeout) override {
|
||||
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) override {
|
||||
reconcile(_cl, timeout);
|
||||
return _result_promise.get_future();
|
||||
}
|
||||
@@ -2795,7 +2794,7 @@ future<foreign_ptr<lw_shared_ptr<query::result>>>
|
||||
storage_proxy::query_singular(lw_shared_ptr<query::read_command> cmd, dht::partition_range_vector&& partition_ranges, db::consistency_level cl, tracing::trace_state_ptr trace_state) {
|
||||
std::vector<::shared_ptr<abstract_read_executor>> exec;
|
||||
exec.reserve(partition_ranges.size());
|
||||
auto timeout = lowres_clock::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
|
||||
auto timeout = storage_proxy::clock_type::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
|
||||
|
||||
for (auto&& pr: partition_ranges) {
|
||||
if (!pr.is_singular()) {
|
||||
@@ -2819,7 +2818,7 @@ storage_proxy::query_singular(lw_shared_ptr<query::read_command> cmd, dht::parti
|
||||
}
|
||||
|
||||
future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>>
|
||||
storage_proxy::query_partition_key_range_concurrent(lowres_clock::time_point timeout, std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
|
||||
storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::time_point timeout, std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
|
||||
lw_shared_ptr<query::read_command> cmd, db::consistency_level cl, dht::partition_range_vector::iterator&& i,
|
||||
dht::partition_range_vector&& ranges, int concurrency_factor, tracing::trace_state_ptr trace_state,
|
||||
uint32_t remaining_row_count, uint32_t remaining_partition_count) {
|
||||
@@ -2923,7 +2922,7 @@ storage_proxy::query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
|
||||
schema_ptr schema = local_schema_registry().get(cmd->schema_version);
|
||||
keyspace& ks = _db.local().find_keyspace(schema->ks_name());
|
||||
dht::partition_range_vector ranges;
|
||||
auto timeout = lowres_clock::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
|
||||
auto timeout = storage_proxy::clock_type::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
|
||||
|
||||
// when dealing with LocalStrategy keyspaces, we can skip the range splitting and merging (which can be
|
||||
// expensive in clusters with vnodes)
|
||||
|
||||
@@ -71,7 +71,7 @@ public:
|
||||
private:
|
||||
struct rh_entry {
|
||||
::shared_ptr<abstract_write_response_handler> handler;
|
||||
timer<lowres_clock> expire_timer;
|
||||
timer<clock_type> expire_timer;
|
||||
rh_entry(::shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb);
|
||||
};
|
||||
|
||||
@@ -253,7 +253,7 @@ private:
|
||||
dht::partition_range_vector get_restricted_ranges(keyspace& ks, const schema& s, dht::partition_range range);
|
||||
float estimate_result_rows_per_range(lw_shared_ptr<query::read_command> cmd, keyspace& ks);
|
||||
static std::vector<gms::inet_address> intersection(const std::vector<gms::inet_address>& l1, const std::vector<gms::inet_address>& l2);
|
||||
future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>> query_partition_key_range_concurrent(lowres_clock::time_point timeout,
|
||||
future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>> query_partition_key_range_concurrent(clock_type::time_point timeout,
|
||||
std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results, lw_shared_ptr<query::read_command> cmd, db::consistency_level cl, dht::partition_range_vector::iterator&& i,
|
||||
dht::partition_range_vector&& ranges, int concurrency_factor, tracing::trace_state_ptr trace_state,
|
||||
uint32_t remaining_row_count, uint32_t remaining_partition_count);
|
||||
|
||||
@@ -71,6 +71,12 @@ void compression::set_compressor(compressor c) {
|
||||
}
|
||||
}
|
||||
|
||||
// locate() takes a byte position in the uncompressed stream, and finds the
|
||||
// location of the compressed chunk on disk which contains it, and the
|
||||
// offset in this chunk.
|
||||
// locate() may only be used for offsets of actual bytes, and in particular
|
||||
// the end-of-file position (one past the last byte) MUST NOT be used. If the
|
||||
// caller wants to read from the end of file, it should simply read nothing.
|
||||
compression::chunk_and_offset
|
||||
compression::locate(uint64_t position) const {
|
||||
auto ucl = uncompressed_chunk_length();
|
||||
@@ -310,6 +316,9 @@ public:
|
||||
virtual future<temporary_buffer<char>> skip(uint64_t n) override {
|
||||
_pos += n;
|
||||
assert(_pos <= _end_pos);
|
||||
if (_pos == _end_pos) {
|
||||
return make_ready_future<temporary_buffer<char>>();
|
||||
}
|
||||
auto addr = _compression_metadata->locate(_pos);
|
||||
auto underlying_n = addr.chunk_start - _underlying_pos;
|
||||
_underlying_pos = addr.chunk_start;
|
||||
|
||||
@@ -1951,19 +1951,20 @@ void sstable_writer::prepare_file_writer()
|
||||
options.write_behind = 10;
|
||||
|
||||
if (!_compression_enabled) {
|
||||
_writer = make_shared<checksummed_file_writer>(std::move(_sst._data_file), std::move(options), true);
|
||||
_writer = std::make_unique<checksummed_file_writer>(std::move(_sst._data_file), std::move(options), true);
|
||||
} else {
|
||||
prepare_compression(_sst._components->compression, _schema);
|
||||
_writer = make_shared<file_writer>(make_compressed_file_output_stream(std::move(_sst._data_file), std::move(options), &_sst._components->compression));
|
||||
_writer = std::make_unique<file_writer>(make_compressed_file_output_stream(std::move(_sst._data_file), std::move(options), &_sst._components->compression));
|
||||
}
|
||||
}
|
||||
|
||||
void sstable_writer::finish_file_writer()
|
||||
{
|
||||
_writer->close().get();
|
||||
auto writer = std::move(_writer);
|
||||
writer->close().get();
|
||||
|
||||
if (!_compression_enabled) {
|
||||
auto chksum_wr = static_pointer_cast<checksummed_file_writer>(_writer);
|
||||
auto chksum_wr = static_cast<checksummed_file_writer*>(writer.get());
|
||||
write_digest(_sst._write_error_handler, _sst.filename(sstable::component_type::Digest), chksum_wr->full_checksum());
|
||||
write_crc(_sst._write_error_handler, _sst.filename(sstable::component_type::CRC), chksum_wr->finalize_checksum());
|
||||
} else {
|
||||
@@ -1971,6 +1972,16 @@ void sstable_writer::finish_file_writer()
|
||||
}
|
||||
}
|
||||
|
||||
// Best-effort cleanup of the data file writer.
// If _writer is still set when the object is destroyed, it is closed here. Any failure
// while closing is logged and swallowed so that nothing escapes the destructor.
sstable_writer::~sstable_writer() {
    if (!_writer) {
        // No writer is held, so there is nothing to close.
        return;
    }
    try {
        _writer->close().get();
    } catch (...) {
        sstlog.error("sstable_writer failed to close file: {}", std::current_exception());
    }
}
|
||||
|
||||
sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
|
||||
uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc)
|
||||
: _sst(sst)
|
||||
|
||||
@@ -803,7 +803,7 @@ class sstable_writer {
|
||||
bool _backup;
|
||||
bool _leave_unsealed;
|
||||
bool _compression_enabled;
|
||||
shared_ptr<file_writer> _writer;
|
||||
std::unique_ptr<file_writer> _writer;
|
||||
stdx::optional<components_writer> _components_writer;
|
||||
private:
|
||||
void prepare_file_writer();
|
||||
@@ -811,6 +811,10 @@ private:
|
||||
public:
|
||||
sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
|
||||
uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc);
|
||||
~sstable_writer();
|
||||
sstable_writer(sstable_writer&& o) : _sst(o._sst), _schema(o._schema), _pc(o._pc), _backup(o._backup),
|
||||
_leave_unsealed(o._leave_unsealed), _compression_enabled(o._compression_enabled), _writer(std::move(o._writer)),
|
||||
_components_writer(std::move(o._components_writer)) {}
|
||||
void consume_new_partition(const dht::decorated_key& dk) { return _components_writer->consume_new_partition(dk); }
|
||||
void consume(tombstone t) { _components_writer->consume(t); }
|
||||
stop_iteration consume(static_row&& sr) { return _components_writer->consume(std::move(sr)); }
|
||||
|
||||
1
test.py
1
test.py
@@ -78,6 +78,7 @@ boost_tests = [
|
||||
'virtual_reader_test',
|
||||
'view_schema_test',
|
||||
'counter_test',
|
||||
'cell_locker_test',
|
||||
]
|
||||
|
||||
other_tests = [
|
||||
|
||||
@@ -55,13 +55,13 @@ SEASTAR_TEST_CASE(test_reading_with_different_schemas) {
|
||||
canonical_mutation cm1(m1);
|
||||
canonical_mutation cm2(m2);
|
||||
|
||||
{
|
||||
if (can_upgrade_schema(m1.schema(), m2.schema())) {
|
||||
auto m = cm1.to_mutation(m1.schema());
|
||||
m.upgrade(m2.schema());
|
||||
assert_that(cm1.to_mutation(m2.schema())).is_equal_to(m);
|
||||
}
|
||||
|
||||
{
|
||||
if (can_upgrade_schema(m2.schema(), m1.schema())) {
|
||||
auto m = cm2.to_mutation(m2.schema());
|
||||
m.upgrade(m1.schema());
|
||||
assert_that(cm2.to_mutation(m1.schema())).is_equal_to(m);
|
||||
|
||||
218
tests/cell_locker_test.cc
Normal file
218
tests/cell_locker_test.cc
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright (C) 2017 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "tests/test-utils.hh"
|
||||
#include "disk-error-handler.hh"
|
||||
|
||||
#include <seastar/core/thread.hh>
|
||||
|
||||
#include "cell_locking.hh"
|
||||
#include "mutation.hh"
|
||||
#include "schema_builder.hh"
|
||||
|
||||
// Per-thread disk error signal objects defined by this test binary.
// NOTE(review): presumably these satisfy declarations in disk-error-handler.hh that the
// linked code expects every executable to define — confirm.
thread_local disk_error_signal_type commit_error;
|
||||
thread_local disk_error_signal_type general_disk_error;
|
||||
|
||||
// Builds the baseline test schema "ks"."cf": partition key "pk", clustering key "ck",
// static columns s1, s2, s3 and regular columns r1, r2, r3. Every column is bytes_type.
static schema_ptr make_schema()
{
    auto builder = schema_builder("ks", "cf");

    // Key columns.
    builder.with_column("pk", bytes_type, column_kind::partition_key);
    builder.with_column("ck", bytes_type, column_kind::clustering_key);

    // Static columns.
    builder.with_column("s1", bytes_type, column_kind::static_column);
    builder.with_column("s2", bytes_type, column_kind::static_column);
    builder.with_column("s3", bytes_type, column_kind::static_column);

    // Regular columns (default column kind).
    builder.with_column("r1", bytes_type);
    builder.with_column("r2", bytes_type);
    builder.with_column("r3", bytes_type);

    return builder.build();
}
|
||||
|
||||
// Builds a variant of the "ks"."cf" test schema with a different column set: static columns
// s0, s1, s2.5, s3 and regular columns r0, r1, r2.5, r3 (so s2/r2 are absent and
// s0/s2.5/r0/r2.5 are new compared to make_schema()). Every column is bytes_type.
static schema_ptr make_alternative_schema()
{
    auto builder = schema_builder("ks", "cf");

    // Key columns.
    builder.with_column("pk", bytes_type, column_kind::partition_key);
    builder.with_column("ck", bytes_type, column_kind::clustering_key);

    // Static columns.
    builder.with_column("s0", bytes_type, column_kind::static_column);
    builder.with_column("s1", bytes_type, column_kind::static_column);
    builder.with_column("s2.5", bytes_type, column_kind::static_column);
    builder.with_column("s3", bytes_type, column_kind::static_column);

    // Regular columns (default column kind).
    builder.with_column("r0", bytes_type);
    builder.with_column("r1", bytes_type);
    builder.with_column("r2.5", bytes_type);
    builder.with_column("r3", bytes_type);

    return builder.build();
}
|
||||
|
||||
// Builds a "ks"."cf" schema whose non-key columns (static s8, s9 and regular r8, r9) share
// no name with the columns of make_schema() or make_alternative_schema().
static schema_ptr make_schema_disjoint_with_others()
{
    auto builder = schema_builder("ks", "cf");

    // Key columns.
    builder.with_column("pk", bytes_type, column_kind::partition_key);
    builder.with_column("ck", bytes_type, column_kind::clustering_key);

    // Static columns.
    builder.with_column("s8", bytes_type, column_kind::static_column);
    builder.with_column("s9", bytes_type, column_kind::static_column);

    // Regular columns (default column kind).
    builder.with_column("r8", bytes_type);
    builder.with_column("r9", bytes_type);

    return builder.build();
}
|
||||
|
||||
// Empty bytes value; make_mutation() writes it into every cell it creates.
static data_value empty_value = data_value(to_bytes(""));
|
||||
|
||||
static auto make_row(const sstring& key, std::initializer_list<sstring> cells) {
|
||||
return std::pair<sstring, std::initializer_list<sstring>>(key, cells);
|
||||
}
|
||||
|
||||
// Creates a mutation for partition key `pk` in schema `s`.
//   static_cells     - names of the static columns to set.
//   clustering_cells - (clustering key, column names) pairs, as produced by make_row().
// Every cell is written with empty_value and a timestamp from api::new_timestamp().
static mutation make_mutation(schema_ptr s, const sstring& pk, std::initializer_list<sstring> static_cells,
                              std::initializer_list<std::pair<sstring, std::initializer_list<sstring>>> clustering_cells)
{
    mutation result(partition_key::from_single_value(*s, to_bytes(pk)), s);

    for (const auto& column_name : static_cells) {
        result.set_static_cell(to_bytes(column_name), empty_value, api::new_timestamp());
    }

    for (const auto& row_spec : clustering_cells) {
        auto row_key = clustering_key::from_single_value(*s, to_bytes(row_spec.first));
        for (const auto& column_name : row_spec.second) {
            result.set_clustered_cell(row_key, to_bytes(column_name), empty_value, api::new_timestamp());
        }
    }

    return result;
}
|
||||
|
||||
// Locking the same set of cells twice: the second request must not complete until the
// first lock has been dropped.
SEASTAR_TEST_CASE(test_simple_locking_cells) {
    return seastar::async([&] {
        // Takes its argument by value and lets it go out of scope, i.e. destroys it.
        auto destroy = [] (auto) { };

        auto schema = make_schema();
        cell_locker locker(schema);

        // Requests a lock on every cell present in the given mutation.
        auto lock_all = [&locker] (mutation& mut) {
            return locker.lock_cells(mut.decorated_key(), partition_cells_range(mut.partition()));
        };

        auto mut = make_mutation(schema, "0", { "s1", "s3" }, {
            make_row("one", { "r1", "r2" }),
            make_row("two", { "r2", "r3" }),
        });

        auto first_lock = lock_all(mut).get0();

        // Same cells again: this request has to wait.
        auto f2 = lock_all(mut);
        BOOST_REQUIRE(!f2.available());

        destroy(std::move(first_lock));
        destroy(f2.get0());
    });
}
|
||||
|
||||
// Mutations that touch no common cell (different columns per row, or a different
// partition) must all be lockable at the same time: each get0() below has to return
// while the earlier locks are still held.
SEASTAR_TEST_CASE(test_disjoint_mutations) {
    return seastar::async([&] {
        auto schema = make_schema();
        cell_locker locker(schema);

        // Requests a lock on every cell present in the given mutation.
        auto lock_all = [&locker] (mutation& mut) {
            return locker.lock_cells(mut.decorated_key(), partition_cells_range(mut.partition()));
        };

        auto first = make_mutation(schema, "0", { "s1" }, {
            make_row("one", { "r1", "r2" }),
            make_row("two", { "r3" }),
        });
        // Same partition as `first`, but different cells in every row.
        auto second = make_mutation(schema, "0", { "s2" }, {
            make_row("two", { "r1", "r2" }),
            make_row("one", { "r3" }),
        });

        // Same cells as `first`, but in partition "1".
        auto third = mutation(partition_key::from_single_value(*schema, to_bytes("1")), schema);
        third.partition() = first.partition();

        auto first_lock = lock_all(first).get0();
        auto second_lock = lock_all(second).get0();
        auto third_lock = lock_all(third).get0();
    });
}
|
||||
|
||||
// A lock request that overlaps with a held lock must wait until that lock is dropped:
// m2 shares static cell s1 with m1, and m3 shares row cells with m2.
SEASTAR_TEST_CASE(test_single_cell_overlap) {
    return seastar::async([&] {
        // Takes its argument by value and lets it go out of scope, i.e. destroys it.
        auto destroy = [] (auto) { };

        auto schema = make_schema();
        cell_locker locker(schema);

        // Requests a lock on every cell present in the given mutation.
        auto lock_all = [&locker] (mutation& mut) {
            return locker.lock_cells(mut.decorated_key(), partition_cells_range(mut.partition()));
        };

        auto m1 = make_mutation(schema, "0", { "s1" }, {
            make_row("one", { "r1", "r2" }),
            make_row("two", { "r3" }),
        });
        auto m2 = make_mutation(schema, "0", { "s1" }, {
            make_row("two", { "r1", "r2" }),
            make_row("one", { "r3" }),
        });
        auto m3 = make_mutation(schema, "0", { "s2" }, {
            make_row("two", { "r1" }),
            make_row("one", { "r2", "r3" }),
        });

        auto lock1 = lock_all(m1).get0();

        // m2 has to wait for m1's lock.
        auto f2 = lock_all(m2);
        BOOST_REQUIRE(!f2.available());
        destroy(std::move(lock1));
        auto lock2 = f2.get0();

        // m3 has to wait for m2's lock.
        auto f3 = lock_all(m3);
        BOOST_REQUIRE(!f3.available());
        destroy(std::move(lock2));
        auto lock3 = f3.get0();
    });
}
|
||||
|
||||
// Locks taken under one schema must still be honoured after cell_locker::set_schema():
// a lock taken with the first schema blocks an overlapping request made with the second,
// while disjoint requests, including ones under a third unrelated schema, go through.
SEASTAR_TEST_CASE(test_schema_change) {
    return seastar::async([&] {
        // Takes its argument by value and lets it go out of scope, i.e. destroys it.
        auto destroy = [] (auto) { };

        auto s1 = make_schema();
        auto s2 = make_alternative_schema();
        cell_locker locker(s1);

        // Requests a lock on every cell present in the given mutation.
        auto lock_all = [&locker] (mutation& mut) {
            return locker.lock_cells(mut.decorated_key(), partition_cells_range(mut.partition()));
        };

        auto m1 = make_mutation(s1, "0", { "s1", "s2", "s3"}, {
            make_row("one", { "r1", "r2", "r3" }),
        });

        // disjoint with m1
        auto m2 = make_mutation(s2, "0", { "s0", "s2.5"}, {
            make_row("one", { "r0", "r2.5" }),
            make_row("two", { "r1", "r3" }),
        });

        // overlaps with m1
        auto m3 = make_mutation(s2, "0", { "s1" }, {
            make_row("one", { "r1", "r3" }),
        });

        auto lock1 = lock_all(m1).get0();

        // Drop the first mutation and this test's reference to the first schema before
        // switching the locker to the second schema.
        destroy(std::move(m1));
        destroy(std::move(s1));
        locker.set_schema(s2);

        auto lock2 = lock_all(m2).get0();

        auto f3 = lock_all(m3);
        BOOST_REQUIRE(!f3.available());
        destroy(std::move(lock1));
        auto lock3 = f3.get0();

        // Switch to a schema that shares no non-key column with the previous ones.
        auto s3 = make_schema_disjoint_with_others();
        locker.set_schema(s3);

        auto m4 = make_mutation(s3, "0", { "s8", "s9"}, {
            make_row("one", { "r8", "r9" }),
            make_row("two", { "r8", "r9" }),
        });
        auto lock4 = lock_all(m4).get0();
    });
}
|
||||
@@ -279,7 +279,7 @@ public:
|
||||
auto stop_ms = defer([&ms] { ms.stop().get(); });
|
||||
|
||||
auto& ss = service::get_storage_service();
|
||||
ss.start(std::ref(*db));
|
||||
ss.start(std::ref(*db)).get();
|
||||
auto stop_storage_service = defer([&ss] { ss.stop().get(); });
|
||||
|
||||
db->start(std::move(*cfg)).get();
|
||||
|
||||
@@ -29,7 +29,9 @@
|
||||
#include <seastar/core/timer.hh>
|
||||
#include <seastar/core/sleep.hh>
|
||||
#include <seastar/tests/test-utils.hh>
|
||||
#include <seastar/util/defer.hh>
|
||||
#include <deque>
|
||||
#include "utils/phased_barrier.hh"
|
||||
|
||||
#include "utils/logalloc.hh"
|
||||
#include "utils/managed_ref.hh"
|
||||
@@ -529,11 +531,7 @@ inline void quiesce(FutureType&& fut) {
|
||||
// a request may be broken into many continuations. While we could just yield many times, the
|
||||
// exact amount needed to guarantee execution would be dependent on the internals of the
|
||||
// implementation, we want to avoid that.
|
||||
timer<> tmr;
|
||||
tmr.set_callback([] { BOOST_FAIL("The future we were waiting for took too long to get ready"); });
|
||||
tmr.arm(2s);
|
||||
fut.get();
|
||||
tmr.cancel();
|
||||
with_timeout(lowres_clock::now() + 2s, std::move(fut)).get();
|
||||
}
|
||||
|
||||
// Simple RAII structure that wraps around a region_group
|
||||
@@ -859,15 +857,22 @@ class test_reclaimer: public region_group_reclaimer {
|
||||
region_group _rg;
|
||||
std::vector<size_t> _reclaim_sizes;
|
||||
bool _shutdown = false;
|
||||
shared_promise<> _unleash_reclaimer;
|
||||
seastar::gate _reclaimers_done;
|
||||
public:
|
||||
virtual void start_reclaiming() override {
|
||||
while (this->under_pressure()) {
|
||||
size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
|
||||
_result_accumulator->_reclaim_sizes.push_back(reclaimed);
|
||||
}
|
||||
virtual void start_reclaiming() noexcept override {
|
||||
with_gate(_reclaimers_done, [this] {
|
||||
return _unleash_reclaimer.get_shared_future().then([this] {
|
||||
while (this->under_pressure()) {
|
||||
size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
|
||||
_result_accumulator->_reclaim_sizes.push_back(reclaimed);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Tear-down order matters: first close the _reclaimers_done gate and wait for it, so the
// work start_reclaiming() entered through with_gate() has finished, and only then shut
// down the region group.
~test_reclaimer() {
|
||||
_reclaimers_done.close().get();
|
||||
_rg.shutdown().get();
|
||||
}
|
||||
|
||||
@@ -881,6 +886,10 @@ public:
|
||||
|
||||
test_reclaimer(size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(this), _rg(*this) {}
|
||||
test_reclaimer(test_reclaimer& parent, size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(&parent), _rg(&parent._rg, *this) {}
|
||||
|
||||
// Resolves the _unleash_reclaimer shared promise. The reclaim loop in start_reclaiming()
// is chained on that promise's shared future, so it only runs once this has been called.
void unleash() {
|
||||
_unleash_reclaimer.set_value();
|
||||
}
|
||||
};
|
||||
|
||||
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
|
||||
@@ -888,6 +897,7 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
|
||||
// allocate a single region to exhaustion, and make sure active reclaim is activated.
|
||||
test_reclaimer simple(logalloc::segment_size);
|
||||
test_async_reclaim_region simple_region(simple.rg(), logalloc::segment_size);
|
||||
simple.unleash();
|
||||
|
||||
// Can't run this function until we have reclaimed something
|
||||
auto fut = simple.rg().run_when_memory_available([] {});
|
||||
@@ -912,6 +922,7 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_worst_offen
|
||||
test_async_reclaim_region small_region(simple.rg(), logalloc::segment_size);
|
||||
test_async_reclaim_region medium_region(simple.rg(), 2 * logalloc::segment_size);
|
||||
test_async_reclaim_region big_region(simple.rg(), 3 * logalloc::segment_size);
|
||||
simple.unleash();
|
||||
|
||||
// Can't run this function until we have reclaimed
|
||||
auto fut = simple.rg().run_when_memory_available([&simple] {
|
||||
@@ -941,6 +952,9 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_leaf_offend
|
||||
test_async_reclaim_region small_region(small_leaf.rg(), logalloc::segment_size);
|
||||
test_async_reclaim_region medium_region(root.rg(), 2 * logalloc::segment_size);
|
||||
test_async_reclaim_region big_region(large_leaf.rg(), 3 * logalloc::segment_size);
|
||||
root.unleash();
|
||||
large_leaf.unleash();
|
||||
small_leaf.unleash();
|
||||
|
||||
// Can't run this function until we have reclaimed. Try at the root, and we'll make sure
|
||||
// that the leaves are forced correctly.
|
||||
@@ -967,6 +981,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_ancestor_bl
|
||||
test_reclaimer leaf(root, logalloc::segment_size);
|
||||
|
||||
test_async_reclaim_region root_region(root.rg(), logalloc::segment_size);
|
||||
root.unleash();
|
||||
leaf.unleash();
|
||||
|
||||
// Can't run this function until we have reclaimed. Try at the leaf, and we'll make sure
|
||||
// that the root reclaims
|
||||
@@ -992,6 +1008,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_big_region_
|
||||
test_async_reclaim_region root_region(root.rg(), 4 * logalloc::segment_size);
|
||||
test_async_reclaim_region big_leaf_region(leaf.rg(), 3 * logalloc::segment_size);
|
||||
test_async_reclaim_region small_leaf_region(leaf.rg(), 2 * logalloc::segment_size);
|
||||
root.unleash();
|
||||
leaf.unleash();
|
||||
|
||||
auto fut = root.rg().run_when_memory_available([&root] {
|
||||
BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 3);
|
||||
@@ -1018,6 +1036,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_r
|
||||
test_reclaimer leaf(root, logalloc::segment_size);
|
||||
|
||||
test_async_reclaim_region leaf_region(leaf.rg(), logalloc::segment_size);
|
||||
root.unleash();
|
||||
leaf.unleash();
|
||||
|
||||
auto fut_root = root.rg().run_when_memory_available([&root] {
|
||||
BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 1);
|
||||
@@ -1037,3 +1057,117 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_r
|
||||
BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[0], logalloc::segment_size);
|
||||
});
|
||||
}
|
||||
|
||||
// Reproduces issue #2021
|
||||
SEASTAR_TEST_CASE(test_no_crash_when_a_lot_of_requests_released_which_change_region_group_size) {
|
||||
return seastar::async([] {
|
||||
#ifndef DEFAULT_ALLOCATOR // Because we need memory::stats().free_memory();
|
||||
logging::logger_registry().set_logger_level("lsa", seastar::log_level::debug);
|
||||
|
||||
auto free_space = memory::stats().free_memory();
|
||||
size_t threshold = size_t(0.75 * free_space);
|
||||
region_group_reclaimer recl(threshold, threshold);
|
||||
region_group gr(recl);
|
||||
auto close_gr = defer([&gr] { gr.shutdown().get(); });
|
||||
region r(gr);
|
||||
|
||||
with_allocator(r.allocator(), [&] {
|
||||
std::vector<managed_bytes> objs;
|
||||
|
||||
r.make_evictable([&] {
|
||||
if (objs.empty()) {
|
||||
return memory::reclaiming_result::reclaimed_nothing;
|
||||
}
|
||||
with_allocator(r.allocator(), [&] {
|
||||
objs.pop_back();
|
||||
});
|
||||
return memory::reclaiming_result::reclaimed_something;
|
||||
});
|
||||
|
||||
auto fill_to_pressure = [&] {
|
||||
while (!recl.under_pressure()) {
|
||||
objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), 1024));
|
||||
}
|
||||
};
|
||||
|
||||
utils::phased_barrier request_barrier;
|
||||
auto wait_for_requests = defer([&] { request_barrier.advance_and_await().get(); });
|
||||
|
||||
for (int i = 0; i < 1000000; ++i) {
|
||||
fill_to_pressure();
|
||||
future<> f = gr.run_when_memory_available([&, op = request_barrier.start()] {
|
||||
// Trigger group size change (Refs issue #2021)
|
||||
gr.update(-10);
|
||||
gr.update(+10);
|
||||
});
|
||||
BOOST_REQUIRE(!f.available());
|
||||
}
|
||||
|
||||
// Release
|
||||
while (recl.under_pressure()) {
|
||||
objs.pop_back();
|
||||
}
|
||||
});
|
||||
#endif
|
||||
});
|
||||
}
|
||||
|
||||
// Checks the reclaim notifications of region_group_reclaimer against its two thresholds:
// reclaiming has to be reported from the moment the soft limit is exceeded, has to stay on
// while usage is above the soft limit (including after dropping back below the hard
// limit), and has to stop only once usage is no longer over the soft limit.
SEASTAR_TEST_CASE(test_reclaiming_runs_as_long_as_there_is_soft_pressure) {
    return seastar::async([] {
        const size_t hard_limit = logalloc::segment_size * 8;
        const size_t soft_limit = hard_limit / 2;

        // Reclaimer that only records whether reclaiming is currently requested.
        class reclaimer : public region_group_reclaimer {
            bool _reclaim = false;
        protected:
            void start_reclaiming() noexcept override {
                _reclaim = true;
            }

            void stop_reclaiming() noexcept override {
                _reclaim = false;
            }
        public:
            reclaimer(size_t hard_threshold, size_t soft_threshold)
                : region_group_reclaimer(hard_threshold, soft_threshold)
            { }
            bool reclaiming() const { return _reclaim; }
        };

        reclaimer recl(hard_limit, soft_limit);
        region_group group(recl);
        auto close_group = defer([&group] { group.shutdown().get(); });
        region reg(group);

        with_allocator(reg.allocator(), [&] {
            std::vector<managed_bytes> objs;

            // Allocates one more segment-sized object.
            auto grow = [&objs] {
                objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), logalloc::segment_size));
            };

            BOOST_REQUIRE(!recl.reclaiming());

            // Cross the soft limit.
            while (!recl.over_soft_limit()) {
                grow();
            }

            BOOST_REQUIRE(recl.reclaiming());

            // Cross the hard limit.
            while (!recl.under_pressure()) {
                grow();
            }

            BOOST_REQUIRE(recl.reclaiming());

            // Drop back below the hard limit; still above the soft one.
            while (recl.under_pressure()) {
                objs.pop_back();
            }

            BOOST_REQUIRE(recl.over_soft_limit());
            BOOST_REQUIRE(recl.reclaiming());

            // Drop back below the soft limit.
            while (recl.over_soft_limit()) {
                objs.pop_back();
            }

            BOOST_REQUIRE(!recl.reclaiming());
        });
    });
}
|
||||
|
||||
@@ -76,13 +76,16 @@ int main(int argc, char** argv) {
|
||||
});
|
||||
|
||||
uint64_t counter = 0;
|
||||
logalloc::allocating_section alloc_sect;
|
||||
alloc_sect.set_lsa_reserve(0);
|
||||
alloc_sect.set_std_reserve(0);
|
||||
|
||||
while (counter < obj_count) {
|
||||
auto obj = managed_bytes(managed_bytes::initialized_later(), obj_size);
|
||||
{
|
||||
alloc_sect(r, [&] {
|
||||
auto obj = managed_bytes(managed_bytes::initialized_later(), obj_size);
|
||||
logalloc::reclaim_lock l(r);
|
||||
refs.push_back(std::move(obj));
|
||||
}
|
||||
});
|
||||
|
||||
++counter;
|
||||
|
||||
|
||||
@@ -191,7 +191,6 @@ static mutation_sets generate_mutation_sets() {
|
||||
.with_column("ck_col_2", bytes_type, column_kind::clustering_key)
|
||||
.with_column("regular_col_1", bytes_type)
|
||||
.with_column("regular_col_2", bytes_type)
|
||||
.with_column("regular_counter_col_1", counter_type)
|
||||
.with_column("static_col_1", bytes_type, column_kind::static_column)
|
||||
.with_column("static_col_2", bytes_type, column_kind::static_column);
|
||||
|
||||
@@ -300,9 +299,20 @@ static mutation_sets generate_mutation_sets() {
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr auto rmg_iterations = 10;
|
||||
|
||||
{
|
||||
random_mutation_generator gen;
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
random_mutation_generator gen(random_mutation_generator::generate_counters::no);
|
||||
for (int i = 0; i < rmg_iterations; ++i) {
|
||||
auto m = gen();
|
||||
result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
|
||||
result.equal.emplace_back(mutations{m, m});
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
random_mutation_generator gen(random_mutation_generator::generate_counters::yes);
|
||||
for (int i = 0; i < rmg_iterations; ++i) {
|
||||
auto m = gen();
|
||||
result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
|
||||
result.equal.emplace_back(mutations{m, m});
|
||||
@@ -364,6 +374,7 @@ bytes make_blob(size_t blob_size) {
|
||||
|
||||
class random_mutation_generator::impl {
|
||||
friend class random_mutation_generator;
|
||||
generate_counters _generate_counters;
|
||||
const size_t _external_blob_size = 128; // Should be enough to force use of external bytes storage
|
||||
const column_id column_count = row::max_vector_size * 2;
|
||||
std::mt19937 _gen;
|
||||
@@ -375,30 +386,33 @@ class random_mutation_generator::impl {
|
||||
return gc_clock::time_point() + std::chrono::seconds(dist(gen));
|
||||
}
|
||||
|
||||
public:
|
||||
schema_ptr make_schema() {
|
||||
schema_ptr do_make_schema(data_type type) {
|
||||
auto builder = schema_builder("ks", "cf")
|
||||
.with_column("pk", bytes_type, column_kind::partition_key)
|
||||
.with_column("ck1", bytes_type, column_kind::clustering_key)
|
||||
.with_column("ck2", bytes_type, column_kind::clustering_key)
|
||||
.with_column("c1", counter_type);
|
||||
.with_column("ck2", bytes_type, column_kind::clustering_key);
|
||||
|
||||
// Create enough columns so that row can overflow its vector storage
|
||||
for (column_id i = 0; i < column_count; ++i) {
|
||||
{
|
||||
auto column_name = sprint("v%d", i);
|
||||
builder.with_column(to_bytes(column_name), bytes_type, column_kind::regular_column);
|
||||
builder.with_column(to_bytes(column_name), type, column_kind::regular_column);
|
||||
}
|
||||
{
|
||||
auto column_name = sprint("s%d", i);
|
||||
builder.with_column(to_bytes(column_name), bytes_type, column_kind::static_column);
|
||||
builder.with_column(to_bytes(column_name), type, column_kind::static_column);
|
||||
}
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
impl() {
|
||||
// Builds the generator's schema. When counter generation is enabled, do_make_schema() is
// given counter_type; otherwise it is given bytes_type.
schema_ptr make_schema() {
    if (_generate_counters) {
        return do_make_schema(counter_type);
    }
    return do_make_schema(bytes_type);
}
|
||||
public:
|
||||
explicit impl(generate_counters counters) : _generate_counters(counters) {
|
||||
_schema = make_schema();
|
||||
|
||||
for (int i = 0; i < 1024; ++i) {
|
||||
@@ -424,8 +438,6 @@ public:
|
||||
auto pkey = partition_key::from_single_value(*_schema, _blobs[0]);
|
||||
mutation m(pkey, _schema);
|
||||
|
||||
auto& counter_column = *_schema->get_column_definition(utf8_type->decompose(sstring("c1")));
|
||||
|
||||
std::map<counter_id, std::set<int64_t>> counter_used_clock_values;
|
||||
std::vector<counter_id> counter_ids;
|
||||
std::generate_n(std::back_inserter(counter_ids), 8, counter_id::generate_random);
|
||||
@@ -459,16 +471,16 @@ public:
|
||||
auto columns_to_set = column_count_dist(_gen);
|
||||
for (column_id i = 0; i < columns_to_set; ++i) {
|
||||
auto cid = column_id_dist(_gen);
|
||||
if (kind == column_kind::regular_column && cid == counter_column.id) {
|
||||
auto cell = bool_dist(_gen)
|
||||
? random_counter_cell()
|
||||
: atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
|
||||
r.apply(_schema->column_at(kind, cid), std::move(cell));
|
||||
continue;
|
||||
}
|
||||
auto get_live_cell = [&] {
|
||||
if (_generate_counters) {
|
||||
return random_counter_cell();
|
||||
} else {
|
||||
return atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)]);
|
||||
}
|
||||
};
|
||||
// FIXME: generate expiring cells
|
||||
auto cell = bool_dist(_gen)
|
||||
? atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)])
|
||||
? get_live_cell()
|
||||
: atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
|
||||
r.apply(_schema->column_at(kind, cid), std::move(cell));
|
||||
}
|
||||
@@ -529,8 +541,8 @@ public:
|
||||
|
||||
// Out-of-line, empty destructor.
// NOTE(review): presumably defined here, after impl's definition, so that the
// std::unique_ptr<impl> member can be destroyed with impl being a complete type — confirm.
random_mutation_generator::~random_mutation_generator() {}
|
||||
|
||||
random_mutation_generator::random_mutation_generator()
|
||||
: _impl(std::make_unique<random_mutation_generator::impl>())
|
||||
random_mutation_generator::random_mutation_generator(generate_counters counters)
|
||||
: _impl(std::make_unique<random_mutation_generator::impl>(counters))
|
||||
{ }
|
||||
|
||||
mutation random_mutation_generator::operator()() {
|
||||
|
||||
@@ -37,11 +37,19 @@ void for_each_mutation_pair(std::function<void(const mutation&, const mutation&,
|
||||
// Calls the provided function on mutations. Is supposed to exercise as many differences as possible.
|
||||
void for_each_mutation(std::function<void(const mutation&)>);
|
||||
|
||||
// Returns true if mutations in schema s1 can be upgraded to s2.
|
||||
inline bool can_upgrade_schema(schema_ptr from, schema_ptr to) {
|
||||
return from->is_counter() == to->is_counter();
|
||||
}
|
||||
|
||||
class random_mutation_generator {
|
||||
class impl;
|
||||
std::unique_ptr<impl> _impl;
|
||||
public:
|
||||
random_mutation_generator();
|
||||
struct generate_counters_tag { };
|
||||
using generate_counters = bool_class<generate_counters_tag>;
|
||||
|
||||
explicit random_mutation_generator(generate_counters);
|
||||
~random_mutation_generator();
|
||||
mutation operator()();
|
||||
schema_ptr schema() const;
|
||||
|
||||
@@ -795,8 +795,7 @@ public:
|
||||
};
|
||||
|
||||
SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
|
||||
random_mutation_generator gen;
|
||||
|
||||
auto do_test = [] (auto&& gen) {
|
||||
failure_injecting_allocation_strategy alloc(standard_allocator());
|
||||
with_allocator(alloc, [&] {
|
||||
auto target = gen();
|
||||
@@ -857,7 +856,10 @@ SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
|
||||
}
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
do_test(random_mutation_generator(random_mutation_generator::generate_counters::no));
|
||||
do_test(random_mutation_generator(random_mutation_generator::generate_counters::yes));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
|
||||
@@ -238,7 +238,8 @@ future<> trace_keyspace_helper::start() {
|
||||
std::map<sstring, sstring> opts;
|
||||
opts["replication_factor"] = "2";
|
||||
auto ksm = keyspace_metadata::new_keyspace(KEYSPACE_NAME, "org.apache.cassandra.locator.SimpleStrategy", std::move(opts), true);
|
||||
service::get_local_migration_manager().announce_new_keyspace(ksm, false).get();
|
||||
// We use min_timestamp so that default keyspace metadata will lose to any manual adjustments. See issue #2129.
|
||||
service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false).get();
|
||||
}
|
||||
|
||||
// Create tables
|
||||
|
||||
@@ -655,9 +655,9 @@ future<> cql_server::connection::process_request() {
|
||||
auto bv = bytes_view{reinterpret_cast<const int8_t*>(buf.begin()), buf.size()};
|
||||
auto cpu = pick_request_cpu();
|
||||
return smp::submit_to(cpu, [this, bv = std::move(bv), op, stream, client_state = _client_state, tracing_requested] () mutable {
|
||||
return this->process_request_one(bv, op, stream, std::move(client_state), tracing_requested).then([](auto&& response) {
|
||||
return this->process_request_one(bv, op, stream, std::move(client_state), tracing_requested).then([tracing_requested] (auto&& response) {
|
||||
auto& tracing_session_id_ptr = response.second.tracing_session_id_ptr();
|
||||
if (tracing_session_id_ptr) {
|
||||
if (tracing_requested == tracing_request_type::write_on_close && tracing_session_id_ptr) {
|
||||
response.first->set_tracing_id(*tracing_session_id_ptr);
|
||||
}
|
||||
return std::make_pair(make_foreign(response.first), response.second);
|
||||
|
||||
12
types.hh
12
types.hh
@@ -1166,6 +1166,18 @@ shared_ptr<const abstract_type> data_type_for<bool>() {
|
||||
return boolean_type;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline
|
||||
shared_ptr<const abstract_type> data_type_for<float>() {
|
||||
return float_type;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline
|
||||
shared_ptr<const abstract_type> data_type_for<double>() {
|
||||
return double_type;
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
|
||||
@@ -2065,56 +2065,6 @@ uint64_t region_group::top_region_evictable_space() const {
|
||||
return _regions.empty() ? 0 : _regions.top()->evictable_occupancy().total_space();
|
||||
}
|
||||
|
||||
void region_group::release_requests() noexcept {
|
||||
// The later() statement is here to avoid executing the function in update() context. But
|
||||
// also guarantees that we won't dominate the CPU if we have many requests to release.
|
||||
//
|
||||
// However, both with_gate() and later() can ultimately call to schedule() and consequently
|
||||
// allocate memory, which (if that allocation triggers a compaction - that frees memory) would
|
||||
// defeat the very purpose of not executing this on update() context. Allocations should be rare
|
||||
// on those but can happen, so we need to at least make sure they will not reclaim.
|
||||
//
|
||||
// Whatever comes after later() is already in a safe context, so we don't need to keep the lock
|
||||
// alive until we are done with the whole execution - only until later is successfully executed.
|
||||
tracker_reclaimer_lock rl;
|
||||
|
||||
_reclaimer.notify_relief();
|
||||
if (_descendant_blocked_requests) {
|
||||
_descendant_blocked_requests->set_value();
|
||||
}
|
||||
_descendant_blocked_requests = {};
|
||||
|
||||
if (_blocked_requests.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
with_gate(_asynchronous_gate, [this, rl = std::move(rl)] () mutable {
|
||||
return later().then([this] {
|
||||
// Check again, we may have executed release_requests() in the meantime from another entry
|
||||
// point (for instance, a descendant notification)
|
||||
if (_blocked_requests.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto blocked_at = do_for_each_parent(this, [] (auto rg) {
|
||||
return rg->execution_permitted() ? stop_iteration::no : stop_iteration::yes;
|
||||
});
|
||||
|
||||
if (!blocked_at) {
|
||||
auto req = std::move(_blocked_requests.front());
|
||||
_blocked_requests.pop_front();
|
||||
req->allocate();
|
||||
release_requests();
|
||||
} else {
|
||||
// If someone blocked us in the mean time then we can't execute. We need to make
|
||||
// sure that we are listening to notifications, though. It could be that we used to
|
||||
// be blocked on ourselves and now we are blocking on an ancestor
|
||||
subscribe_for_ancestor_available_memory_notification(blocked_at);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
region* region_group::get_largest_region() {
|
||||
if (!_maximal_rg || _maximal_rg->_regions.empty()) {
|
||||
return nullptr;
|
||||
@@ -2148,6 +2098,88 @@ region_group::del(region_impl* child) {
|
||||
update(-child->occupancy().total_space());
|
||||
}
|
||||
|
||||
bool
|
||||
region_group::execution_permitted() noexcept {
|
||||
return do_for_each_parent(this, [] (auto rg) {
|
||||
return rg->under_pressure() ? stop_iteration::yes : stop_iteration::no;
|
||||
}) == nullptr;
|
||||
}
|
||||
|
||||
future<>
|
||||
region_group::start_releaser() {
|
||||
return later().then([this] {
|
||||
return repeat([this] () noexcept {
|
||||
if (_shutdown_requested) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
|
||||
if (!_blocked_requests.empty() && execution_permitted()) {
|
||||
auto req = std::move(_blocked_requests.front());
|
||||
_blocked_requests.pop_front();
|
||||
req->allocate();
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
} else {
|
||||
// Block reclaiming to prevent signal() from being called by reclaimer inside wait()
|
||||
// FIXME: handle allocation failures (not very likely) like allocating_section does
|
||||
tracker_reclaimer_lock rl;
|
||||
return _relief.wait().then([] {
|
||||
return stop_iteration::no;
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
region_group::region_group(region_group *parent, region_group_reclaimer& reclaimer)
|
||||
: _parent(parent)
|
||||
, _reclaimer(reclaimer)
|
||||
, _releaser(reclaimer_can_block() ? start_releaser() : make_ready_future<>())
|
||||
{
|
||||
if (_parent) {
|
||||
_parent->add(this);
|
||||
}
|
||||
}
|
||||
|
||||
bool region_group::reclaimer_can_block() const {
|
||||
return _reclaimer.throttle_threshold() != std::numeric_limits<size_t>::max();
|
||||
}
|
||||
|
||||
void region_group::notify_relief() {
|
||||
_relief.signal();
|
||||
for (region_group* child : _subgroups) {
|
||||
child->notify_relief();
|
||||
}
|
||||
}
|
||||
|
||||
void region_group::update(ssize_t delta) {
|
||||
// Most-enclosing group which was relieved.
|
||||
region_group* top_relief = nullptr;
|
||||
|
||||
do_for_each_parent(this, [&top_relief, delta] (region_group* rg) mutable {
|
||||
rg->update_maximal_rg();
|
||||
rg->_total_memory += delta;
|
||||
|
||||
if (rg->_total_memory >= rg->_reclaimer.soft_limit_threshold()) {
|
||||
rg->_reclaimer.notify_soft_pressure();
|
||||
} else {
|
||||
rg->_reclaimer.notify_soft_relief();
|
||||
}
|
||||
|
||||
if (rg->_total_memory > rg->_reclaimer.throttle_threshold()) {
|
||||
rg->_reclaimer.notify_pressure();
|
||||
} else if (rg->_reclaimer.under_pressure()) {
|
||||
rg->_reclaimer.notify_relief();
|
||||
top_relief = rg;
|
||||
}
|
||||
|
||||
return stop_iteration::no;
|
||||
});
|
||||
|
||||
if (top_relief) {
|
||||
top_relief->notify_relief();
|
||||
}
|
||||
}
|
||||
|
||||
allocating_section::guard::guard()
|
||||
: _prev(shard_segment_pool.emergency_reserve_max())
|
||||
{ }
|
||||
@@ -2196,6 +2228,14 @@ void allocating_section::on_alloc_failure() {
|
||||
|
||||
#endif
|
||||
|
||||
void allocating_section::set_lsa_reserve(size_t reserve) {
|
||||
_lsa_reserve = reserve;
|
||||
}
|
||||
|
||||
void allocating_section::set_std_reserve(size_t reserve) {
|
||||
_std_reserve = reserve;
|
||||
}
|
||||
|
||||
void region_group::on_request_expiry::operator()(std::unique_ptr<allocating_function>& func) noexcept {
|
||||
func->fail(std::make_exception_ptr(timed_out_error()));
|
||||
}
|
||||
|
||||
@@ -64,8 +64,20 @@ protected:
|
||||
size_t _soft_limit;
|
||||
bool _under_pressure = false;
|
||||
bool _under_soft_pressure = false;
|
||||
virtual void start_reclaiming() {}
|
||||
virtual void stop_reclaiming() {}
|
||||
// The following restrictions apply to implementations of start_reclaiming() and stop_reclaiming():
|
||||
//
|
||||
// - must not use any region or region_group objects, because they're invoked synchronously
|
||||
// with operations on those.
|
||||
//
|
||||
// - must be noexcept, because they're called on the free path.
|
||||
//
|
||||
// - the implementation may be called synchronously with any operation
|
||||
// which allocates memory, because these are called by memory reclaimer.
|
||||
// In particular, the implementation should not depend on memory allocation
|
||||
// because that may fail when in reclaiming context.
|
||||
//
|
||||
virtual void start_reclaiming() noexcept {}
|
||||
virtual void stop_reclaiming() noexcept {}
|
||||
public:
|
||||
bool under_pressure() const {
|
||||
return _under_pressure;
|
||||
@@ -75,32 +87,26 @@ public:
|
||||
return _under_soft_pressure;
|
||||
}
|
||||
|
||||
void notify_soft_pressure() {
|
||||
void notify_soft_pressure() noexcept {
|
||||
if (!_under_soft_pressure) {
|
||||
_under_soft_pressure = true;
|
||||
start_reclaiming();
|
||||
}
|
||||
}
|
||||
|
||||
void notify_soft_relief() {
|
||||
void notify_soft_relief() noexcept {
|
||||
if (_under_soft_pressure) {
|
||||
_under_soft_pressure = false;
|
||||
stop_reclaiming();
|
||||
}
|
||||
}
|
||||
|
||||
void notify_pressure() {
|
||||
if (!_under_pressure) {
|
||||
_under_pressure = true;
|
||||
start_reclaiming();
|
||||
}
|
||||
void notify_pressure() noexcept {
|
||||
_under_pressure = true;
|
||||
}
|
||||
|
||||
void notify_relief() {
|
||||
if (_under_pressure) {
|
||||
_under_pressure = false;
|
||||
stop_reclaiming();
|
||||
}
|
||||
void notify_relief() noexcept {
|
||||
_under_pressure = false;
|
||||
}
|
||||
|
||||
region_group_reclaimer()
|
||||
@@ -108,7 +114,9 @@ public:
|
||||
region_group_reclaimer(size_t threshold)
|
||||
: _threshold(threshold), _soft_limit(threshold) {}
|
||||
region_group_reclaimer(size_t threshold, size_t soft)
|
||||
: _threshold(threshold), _soft_limit(soft) {}
|
||||
: _threshold(threshold), _soft_limit(soft) {
|
||||
assert(_soft_limit <= _threshold);
|
||||
}
|
||||
|
||||
virtual ~region_group_reclaimer() {}
|
||||
|
||||
@@ -229,9 +237,13 @@ class region_group {
|
||||
// a different ancestor)
|
||||
std::experimental::optional<shared_promise<>> _descendant_blocked_requests = {};
|
||||
|
||||
region_group* _waiting_on_ancestor = nullptr;
|
||||
seastar::gate _asynchronous_gate;
|
||||
condition_variable _relief;
|
||||
future<> _releaser;
|
||||
bool _shutdown_requested = false;
|
||||
|
||||
bool reclaimer_can_block() const;
|
||||
future<> start_releaser();
|
||||
void notify_relief();
|
||||
public:
|
||||
// When creating a region_group, one can specify an optional throttle_threshold parameter. This
|
||||
// parameter won't affect normal allocations, but an API is provided, through the region_group's
|
||||
@@ -239,17 +251,13 @@ public:
|
||||
// the total memory for the region group (and all of its parents) is lower or equal to the
|
||||
// region_group's throttle_threshold (and respectively for its parents).
|
||||
region_group(region_group_reclaimer& reclaimer = no_reclaimer) : region_group(nullptr, reclaimer) {}
|
||||
region_group(region_group* parent, region_group_reclaimer& reclaimer = no_reclaimer) : _parent(parent), _reclaimer(reclaimer) {
|
||||
if (_parent) {
|
||||
_parent->add(this);
|
||||
}
|
||||
}
|
||||
region_group(region_group* parent, region_group_reclaimer& reclaimer = no_reclaimer);
|
||||
region_group(region_group&& o) = delete;
|
||||
region_group(const region_group&) = delete;
|
||||
~region_group() {
|
||||
// If we set a throttle threshold, we'd be postponing many operations. So shutdown must be
|
||||
// called.
|
||||
if (_reclaimer.throttle_threshold() != std::numeric_limits<size_t>::max()) {
|
||||
if (reclaimer_can_block()) {
|
||||
assert(_shutdown_requested);
|
||||
}
|
||||
if (_parent) {
|
||||
@@ -261,24 +269,7 @@ public:
|
||||
size_t memory_used() const {
|
||||
return _total_memory;
|
||||
}
|
||||
void update(ssize_t delta) {
|
||||
do_for_each_parent(this, [delta] (auto rg) mutable {
|
||||
rg->update_maximal_rg();
|
||||
rg->_total_memory += delta;
|
||||
// It is okay to call release_requests for a region_group that can't allow execution.
|
||||
// But that can generate various spurious messages to groups waiting on us that will be
|
||||
// then woken up just so they can go to wait again. So let's filter that.
|
||||
if (rg->execution_permitted()) {
|
||||
rg->release_requests();
|
||||
}
|
||||
if (rg->_total_memory >= rg->_reclaimer.soft_limit_threshold()) {
|
||||
rg->_reclaimer.notify_soft_pressure();
|
||||
} else if (rg->_total_memory < rg->_reclaimer.soft_limit_threshold()) {
|
||||
rg->_reclaimer.notify_soft_relief();
|
||||
}
|
||||
return stop_iteration::no;
|
||||
});
|
||||
}
|
||||
void update(ssize_t delta);
|
||||
|
||||
// It would be easier to call update, but it is unfortunately broken in boost versions up to at
|
||||
// least 1.59.
|
||||
@@ -324,36 +315,18 @@ public:
|
||||
using futurator = futurize<std::result_of_t<Func()>>;
|
||||
|
||||
auto blocked_at = do_for_each_parent(this, [] (auto rg) {
|
||||
return (rg->_blocked_requests.empty() && rg->execution_permitted()) ? stop_iteration::no : stop_iteration::yes;
|
||||
return (rg->_blocked_requests.empty() && !rg->under_pressure()) ? stop_iteration::no : stop_iteration::yes;
|
||||
});
|
||||
|
||||
if (!blocked_at) {
|
||||
return futurator::apply(func);
|
||||
}
|
||||
subscribe_for_ancestor_available_memory_notification(blocked_at);
|
||||
|
||||
auto fn = std::make_unique<concrete_allocating_function<Func>>(std::forward<Func>(func));
|
||||
auto fut = fn->get_future();
|
||||
_blocked_requests.push_back(std::move(fn), timeout);
|
||||
++_blocked_requests_counter;
|
||||
|
||||
// This is called here, and not at update(), for two reasons: the first, is that things that
|
||||
// are done during the free() path should be done carefully, in the sense that they can
|
||||
// trigger another update call and put us in a loop. Not to mention we would like to keep
|
||||
// those from having exceptions. We solve that for release_requests by using later(), but in
|
||||
// here we can do away with that need altogether.
|
||||
//
|
||||
// Second and most important, until we actually block a request, the pressure condition may
|
||||
// very well be transient. There are opportunities for compactions, the condition can go
|
||||
// away on its own, etc.
|
||||
//
|
||||
// The reason we check execution permitted(), is that we'll still block requests if we have
|
||||
// free memory but existing requests in the queue. That is so we can keep our FIFO ordering
|
||||
// guarantee. So we need to distinguish here the case in which we're blocking merely to
|
||||
// serialize requests, so that the caller does not evict more than it should.
|
||||
if (!blocked_at->execution_permitted()) {
|
||||
blocked_at->_reclaimer.notify_pressure();
|
||||
}
|
||||
return fut;
|
||||
}
|
||||
|
||||
@@ -363,9 +336,11 @@ public:
|
||||
region* get_largest_region();
|
||||
|
||||
// Shutdown is mandatory for every user who has set a threshold
|
||||
// Can be called at most once.
|
||||
future<> shutdown() {
|
||||
_shutdown_requested = true;
|
||||
return _asynchronous_gate.close();
|
||||
_relief.signal();
|
||||
return std::move(_releaser);
|
||||
}
|
||||
|
||||
size_t blocked_requests() {
|
||||
@@ -376,43 +351,9 @@ public:
|
||||
return _blocked_requests_counter;
|
||||
}
|
||||
private:
|
||||
// Make sure we get a notification and can call release_requests when one of our ancestors that
|
||||
// used to block us is no longer under memory pressure.
|
||||
void subscribe_for_ancestor_available_memory_notification(region_group *ancestor) {
|
||||
if ((this == ancestor) || (_waiting_on_ancestor)) {
|
||||
return; // already subscribed, or no need to
|
||||
}
|
||||
|
||||
_waiting_on_ancestor = ancestor;
|
||||
|
||||
with_gate(_asynchronous_gate, [this] {
|
||||
// We reevaluate _waiting_on_ancestor here so we make sure there is no deferring point
|
||||
// between determining the ancestor and registering with it for a notification. We start
|
||||
// with _waiting_on_ancestor set to the initial value, and after we are notified, we
|
||||
// will set _waiting_on_ancestor to nullptr to force this lambda to reevaluate it.
|
||||
auto evaluate_ancestor_and_stop = [this] {
|
||||
if (!_waiting_on_ancestor) {
|
||||
auto new_blocking_point = do_for_each_parent(this, [] (auto rg) {
|
||||
return (rg->execution_permitted()) ? stop_iteration::no : stop_iteration::yes;
|
||||
});
|
||||
if (!new_blocking_point) {
|
||||
release_requests();
|
||||
}
|
||||
_waiting_on_ancestor = (new_blocking_point == this) ? nullptr : new_blocking_point;
|
||||
}
|
||||
return _waiting_on_ancestor == nullptr;
|
||||
};
|
||||
|
||||
return do_until(evaluate_ancestor_and_stop, [this] {
|
||||
if (!_waiting_on_ancestor->_descendant_blocked_requests) {
|
||||
_waiting_on_ancestor->_descendant_blocked_requests = shared_promise<>();
|
||||
}
|
||||
return _waiting_on_ancestor->_descendant_blocked_requests->get_shared_future().then([this] {
|
||||
_waiting_on_ancestor = nullptr;
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
// Returns true if and only if constraints of this group are not violated.
|
||||
// That's taking into account any constraints imposed by enclosing (parent) groups.
|
||||
bool execution_permitted() noexcept;
|
||||
|
||||
// Executes the function func for each region_group upwards in the hierarchy, starting with the
|
||||
// parameter node. The function func may return stop_iteration::no, in which case it proceeds to
|
||||
@@ -432,11 +373,10 @@ private:
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
inline bool execution_permitted() const {
|
||||
return _total_memory <= _reclaimer.throttle_threshold();
|
||||
}
|
||||
|
||||
void release_requests() noexcept;
|
||||
inline bool under_pressure() const {
|
||||
return _reclaimer.under_pressure();
|
||||
}
|
||||
|
||||
uint64_t top_region_evictable_space() const;
|
||||
|
||||
@@ -687,6 +627,9 @@ private:
|
||||
};
|
||||
void on_alloc_failure();
|
||||
public:
|
||||
void set_lsa_reserve(size_t);
|
||||
void set_std_reserve(size_t);
|
||||
|
||||
//
|
||||
// Invokes func with reclaim_lock on region r. If LSA allocation fails
|
||||
// inside func it is retried after increasing LSA segment reserve. The
|
||||
|
||||
Reference in New Issue
Block a user