Currently timeout is opt-in, that is, all methods that even have it default it to `db::no_timeout`. This means that ensuring timeout is used where it should be is completely up to the author and the reviewers of the code. As humans are notoriously prone to mistakes, this has resulted in a very inconsistent usage of timeout, with many clients of `flat_mutation_reader` passing the timeout only to some members and only on certain call sites. This is small wonder considering that some core operations like `operator()()` only recently received a timeout parameter and others like `peek()` didn't even have one until this patch. Both of these methods call `fill_buffer()` which potentially talks to the lower layers and is supposed to propagate the timeout. All this makes the `flat_mutation_reader`'s timeout effectively useless. To make order in this chaos, make the timeout parameter a mandatory one on all `flat_mutation_reader` methods that need it. This ensures that humans now get a reminder from the compiler when they forget to pass the timeout. Clients can still opt out of passing a timeout by passing `db::no_timeout` (the previous default value) but this will now be explicit and developers should think before typing it. There were surprisingly few core call sites to fix up. Where a timeout was available nearby I propagated it to be able to pass it to the reader; where I couldn't, I passed `db::no_timeout`. Authors of the latter kind of code (view, streaming and repair are some of the notable examples) should maybe consider propagating down a timeout if needed. In the test code (the vast majority of the changes) I just used `db::no_timeout` everywhere. Tests: unit(release, debug) Signed-off-by: Botond Dénes <bdenes@scylladb.com> Message-Id: <1edc10802d5eb23de8af28c9f48b8d3be0f1a468.1536744563.git.bdenes@scylladb.com>
368 lines
14 KiB
C++
368 lines
14 KiB
C++
/*
|
|
* Copyright (C) 2017 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include <boost/range/irange.hpp>
|
|
#include "seastarx.hh"
|
|
#include "tests/simple_schema.hh"
|
|
#include "core/app-template.hh"
|
|
#include "memtable.hh"
|
|
#include "row_cache.hh"
|
|
#include "partition_slice_builder.hh"
|
|
#include "utils/int_range.hh"
|
|
#include "utils/div_ceil.hh"
|
|
#include "tests/memtable_snapshot_source.hh"
|
|
#include <seastar/core/reactor.hh>
|
|
|
|
// Logger used by every part of this test (table, consumers and main).
logging::logger test_log("test");
|
|
|
|
// Stop flag polled by the reader loops and the mutator loop in main(). It is
// set by the completion timer, by the at_exit hook and when a reader fails.
static thread_local bool cancelled = false;
|
|
|
|
// Brings in the duration literals (1s, 3s) used when arming the timers.
using namespace std::chrono_literals;
|
|
|
|
namespace row_cache_stress_test {
|
|
|
|
struct table {
|
|
simple_schema s;
|
|
std::vector<dht::decorated_key> p_keys;
|
|
std::vector<api::timestamp_type> p_writetime; // committed writes
|
|
std::vector<clustering_key> c_keys;
|
|
uint64_t mutation_phase = 0;
|
|
uint64_t mutations = 0;
|
|
uint64_t reads_started = 0;
|
|
uint64_t scans_started = 0;
|
|
|
|
lw_shared_ptr<memtable> mt;
|
|
lw_shared_ptr<memtable> prev_mt;
|
|
memtable_snapshot_source underlying;
|
|
cache_tracker tracker;
|
|
row_cache cache;
|
|
|
|
table(unsigned partitions, unsigned rows)
|
|
: mt(make_lw_shared<memtable>(s.schema()))
|
|
, underlying(s.schema())
|
|
, cache(s.schema(), snapshot_source([this] { return underlying(); }), tracker)
|
|
{
|
|
p_keys = s.make_pkeys(partitions);
|
|
p_writetime.resize(p_keys.size());
|
|
c_keys = s.make_ckeys(rows);
|
|
}
|
|
|
|
size_t index_of_key(const dht::decorated_key& dk) {
|
|
for (auto i : boost::irange<size_t>(0, p_keys.size())) {
|
|
if (p_keys[i].equal(*s.schema(), dk)) {
|
|
return i;
|
|
}
|
|
}
|
|
throw std::runtime_error(sprint("key not found: %s", dk));
|
|
}
|
|
|
|
sstring value_tag(int key, uint64_t phase) {
|
|
return sprint("k_0x%x_p_0x%x", key, phase);
|
|
}
|
|
|
|
mutation get_mutation(int key, api::timestamp_type t, const sstring& tag) {
|
|
mutation m(s.schema(), p_keys[key]);
|
|
for (auto ck : c_keys) {
|
|
s.add_row(m, ck, tag, t);
|
|
}
|
|
return m;
|
|
}
|
|
|
|
// Must not be called concurrently
|
|
void flush() {
|
|
test_log.trace("flushing");
|
|
prev_mt = std::exchange(mt, make_lw_shared<memtable>(s.schema()));
|
|
auto flushed = make_lw_shared<memtable>(s.schema());
|
|
flushed->apply(*prev_mt).get();
|
|
prev_mt->mark_flushed(flushed->as_data_source());
|
|
test_log.trace("updating cache");
|
|
cache.update([&] {
|
|
underlying.apply(flushed);
|
|
}, *prev_mt).get();
|
|
test_log.trace("flush done");
|
|
prev_mt = {};
|
|
}
|
|
|
|
void mutate_next_phase() {
|
|
test_log.trace("mutating, phase={}", mutation_phase);
|
|
for (auto i : boost::irange<int>(0, p_keys.size())) {
|
|
auto t = s.new_timestamp();
|
|
auto tag = value_tag(i, mutation_phase);
|
|
auto m = get_mutation(i, t, tag);
|
|
mt->apply(std::move(m));
|
|
p_writetime[i] = t;
|
|
test_log.trace("updated key {}, {} @{}", i, tag, t);
|
|
++mutations;
|
|
later().get();
|
|
}
|
|
test_log.trace("mutated whole ring");
|
|
++mutation_phase;
|
|
// FIXME: mutate concurrently with flush
|
|
flush();
|
|
}
|
|
|
|
struct reader {
|
|
dht::partition_range pr;
|
|
query::partition_slice slice;
|
|
flat_mutation_reader rd;
|
|
};
|
|
|
|
std::unique_ptr<reader> make_reader(dht::partition_range pr, query::partition_slice slice) {
|
|
test_log.trace("making reader, pk={} ck={}", pr, slice);
|
|
auto r = std::make_unique<reader>(reader{std::move(pr), std::move(slice), make_empty_flat_reader(s.schema())});
|
|
std::vector<flat_mutation_reader> rd;
|
|
if (prev_mt) {
|
|
rd.push_back(prev_mt->make_flat_reader(s.schema(), r->pr, r->slice));
|
|
}
|
|
rd.push_back(mt->make_flat_reader(s.schema(), r->pr, r->slice));
|
|
rd.push_back(cache.make_reader(s.schema(), r->pr, r->slice));
|
|
r->rd = make_combined_reader(s.schema(), std::move(rd), streamed_mutation::forwarding::yes, mutation_reader::forwarding::no);
|
|
return r;
|
|
}
|
|
|
|
std::unique_ptr<reader> make_single_key_reader(int pk, int_range ck_range) {
|
|
++reads_started;
|
|
auto slice = partition_slice_builder(*s.schema())
|
|
.with_range(ck_range.transform([this] (int key) { return c_keys[key]; }))
|
|
.build();
|
|
auto pr = dht::partition_range::make_singular(p_keys[pk]);
|
|
return make_reader(std::move(pr), std::move(slice));
|
|
}
|
|
|
|
std::unique_ptr<reader> make_scanning_reader() {
|
|
++scans_started;
|
|
return make_reader(query::full_partition_range, s.schema()->full_slice());
|
|
}
|
|
};
|
|
|
|
struct reader_id {
|
|
sstring name;
|
|
|
|
friend std::ostream& operator<<(std::ostream& out, reader_id id) {
|
|
return out << id.name;
|
|
}
|
|
};
|
|
|
|
class validating_consumer {
|
|
table& _t;
|
|
reader_id _id;
|
|
stdx::optional<sstring> _value;
|
|
size_t _row_count = 0;
|
|
size_t _key = 0;
|
|
std::vector<api::timestamp_type> _writetimes;
|
|
public:
|
|
validating_consumer(table& t, reader_id id)
|
|
: _t(t)
|
|
, _id(id)
|
|
, _writetimes(t.p_writetime)
|
|
{ }
|
|
|
|
void consume_new_partition(const dht::decorated_key& key) {
|
|
test_log.trace("reader {}: enters partition {}", _id, key);
|
|
_value = {};
|
|
_key = _t.index_of_key(key);
|
|
}
|
|
|
|
stop_iteration consume_end_of_partition() { return stop_iteration::no; }
|
|
stop_iteration consume(tombstone) { return stop_iteration::no; }
|
|
stop_iteration consume(const static_row&) { return stop_iteration::no; }
|
|
stop_iteration consume(const range_tombstone&) { return stop_iteration::no; }
|
|
|
|
stop_iteration consume(const clustering_row& row) {
|
|
++_row_count;
|
|
sstring value;
|
|
api::timestamp_type t;
|
|
std::tie(value, t) = _t.s.get_value(row);
|
|
test_log.trace("reader {}: {} @{}, {}", _id, value, t, row);
|
|
if (_value && value != _value) {
|
|
throw std::runtime_error(sprint("Saw values from two different writes in partition %d: %s and %s", _key, _value, value));
|
|
}
|
|
auto lowest_timestamp = _writetimes[_key];
|
|
if (t < lowest_timestamp) {
|
|
throw std::runtime_error(sprint("Expected to see the write @%d, but saw @%d (%s), c_key=%s", lowest_timestamp, t, value, row.key()));
|
|
}
|
|
_value = std::move(value);
|
|
return stop_iteration::no;
|
|
}
|
|
|
|
size_t consume_end_of_stream() {
|
|
test_log.trace("reader {}: done, {} rows", _id, _row_count);
|
|
return _row_count;
|
|
}
|
|
};
|
|
|
|
// Samples a value through a getter and reports how much it changed between
// consecutive samples.
template<typename T>
class monotonic_counter {
    std::function<T()> _getter;
    T _prev;
public:
    // Takes the first sample as the baseline. _prev is set in the initializer
    // list (members are initialized in declaration order, so _getter is
    // already usable), which avoids default-constructing T and then
    // overwriting it, and lets T be a type without a default constructor.
    monotonic_counter(std::function<T()> getter)
        : _getter(std::move(getter))
        , _prev(_getter())
    { }

    // Returns the change in value since the previous call to change(), or
    // since construction if this is the first call.
    auto change() {
        auto now = _getter();
        return now - std::exchange(_prev, now);
    }
};
|
|
|
|
}
|
|
|
|
using namespace row_cache_stress_test;
|
|
|
|
int main(int argc, char** argv) {
|
|
namespace bpo = boost::program_options;
|
|
app_template app;
|
|
app.add_options()
|
|
("trace", "Enables trace-level logging for the test actions")
|
|
("concurrency", bpo::value<unsigned>()->default_value(10), "Number of concurrent single partition readers")
|
|
("scan-concurrency", bpo::value<unsigned>()->default_value(2), "Number of concurrent ring scanners")
|
|
("partitions", bpo::value<unsigned>()->default_value(10), "Number of partitions")
|
|
("rows", bpo::value<unsigned>()->default_value(10000), "Number of rows in each partitions")
|
|
("seconds", bpo::value<unsigned>()->default_value(600), "Duration [s] after which the test terminates with a success")
|
|
;
|
|
|
|
return app.run(argc, argv, [&app] {
|
|
if (app.configuration().count("trace")) {
|
|
test_log.set_level(seastar::log_level::trace);
|
|
}
|
|
|
|
return seastar::async([&app] {
|
|
auto concurrency = app.configuration()["concurrency"].as<unsigned>();
|
|
auto scan_concurrency = app.configuration()["scan-concurrency"].as<unsigned>();
|
|
auto partitions = app.configuration()["partitions"].as<unsigned>();
|
|
auto rows = app.configuration()["rows"].as<unsigned>();
|
|
auto seconds = app.configuration()["seconds"].as<unsigned>();
|
|
|
|
row_cache_stress_test::table t(partitions, rows);
|
|
|
|
engine().at_exit([] {
|
|
cancelled = true;
|
|
return make_ready_future();
|
|
});
|
|
|
|
timer<> completion_timer;
|
|
completion_timer.set_callback([&] {
|
|
test_log.info("Test done.");
|
|
cancelled = true;
|
|
});
|
|
completion_timer.arm(std::chrono::seconds(seconds));
|
|
|
|
auto fail = [&] (sstring msg) {
|
|
test_log.error("{}", msg);
|
|
cancelled = true;
|
|
completion_timer.cancel();
|
|
};
|
|
|
|
// Stats printer
|
|
timer<> stats_printer;
|
|
monotonic_counter<uint64_t> reads([&] { return t.reads_started; });
|
|
monotonic_counter<uint64_t> scans([&] { return t.scans_started; });
|
|
monotonic_counter<uint64_t> mutations([&] { return t.mutations; });
|
|
monotonic_counter<uint64_t> flushes([&] { return t.mutation_phase; });
|
|
stats_printer.set_callback([&] {
|
|
auto MB = 1024 * 1024;
|
|
test_log.info("reads/s: {}, scans/s: {}, mutations/s: {}, flushes/s: {}, Cache: {}/{} [MB], LSA: {}/{} [MB], std free: {} [MB]",
|
|
reads.change(), scans.change(), mutations.change(), flushes.change(),
|
|
t.tracker.region().occupancy().used_space() / MB,
|
|
t.tracker.region().occupancy().total_space() / MB,
|
|
logalloc::shard_tracker().region_occupancy().used_space() / MB,
|
|
logalloc::shard_tracker().region_occupancy().total_space() / MB,
|
|
seastar::memory::stats().free_memory() / MB);
|
|
});
|
|
stats_printer.arm_periodic(1s);
|
|
|
|
auto single_partition_reader = [&] (int i, reader_id id) {
|
|
auto n_keys = t.c_keys.size();
|
|
|
|
// Assign ranges so that there is ~30% overlap between adjacent readers.
|
|
auto len = div_ceil(n_keys, concurrency);
|
|
len = std::min(n_keys, len + div_ceil(len, 3)); // so that read ranges overlap
|
|
auto start = (n_keys - len) * i / (std::max(concurrency - 1, 1u));
|
|
int_range ck_range = make_int_range(start, start + len);
|
|
|
|
int pk = t.p_keys.size() / 2; // FIXME: spread over 3 consecutive partitions
|
|
test_log.info("{} is using pk={} ck={}", id, pk, ck_range);
|
|
while (!cancelled) {
|
|
test_log.trace("{}: starting read", id);
|
|
auto rd = t.make_single_key_reader(pk, ck_range);
|
|
auto row_count = rd->rd.consume(validating_consumer(t, id), db::no_timeout).get0();
|
|
if (row_count != len) {
|
|
throw std::runtime_error(sprint("Expected %d fragments, got %d", len, row_count));
|
|
}
|
|
}
|
|
};
|
|
|
|
auto scanning_reader = [&] (reader_id id) {
|
|
auto expected_row_count = t.p_keys.size() * t.c_keys.size();
|
|
while (!cancelled) {
|
|
test_log.trace("{}: starting read", id);
|
|
auto rd = t.make_scanning_reader();
|
|
auto row_count = rd->rd.consume(validating_consumer(t, id), db::no_timeout).get0();
|
|
if (row_count != expected_row_count) {
|
|
throw std::runtime_error(sprint("Expected %d fragments, got %d", expected_row_count, row_count));
|
|
}
|
|
}
|
|
};
|
|
|
|
// populate the initial phase, readers expect constant fragment count.
|
|
t.mutate_next_phase();
|
|
|
|
auto readers = parallel_for_each(boost::irange(0u, concurrency), [&] (auto i) {
|
|
reader_id id{sprint("single-%d", i)};
|
|
return seastar::async([&, i, id] {
|
|
single_partition_reader(i, id);
|
|
}).handle_exception([&, id] (auto e) {
|
|
fail(sprint("%s failed: %s", id, e));
|
|
});
|
|
});
|
|
|
|
auto scanning_readers = parallel_for_each(boost::irange(0u, scan_concurrency), [&] (auto i) {
|
|
reader_id id{sprint("scan-%d", i)};
|
|
return seastar::async([&, id] {
|
|
scanning_reader(id);
|
|
}).handle_exception([&, id] (auto e) {
|
|
fail(sprint("%s failed: %s", id, e));
|
|
});
|
|
});
|
|
|
|
timer<> evictor;
|
|
evictor.set_callback([&] {
|
|
test_log.trace("evicting");
|
|
t.cache.evict();
|
|
});
|
|
evictor.arm_periodic(3s);
|
|
|
|
// Mutator
|
|
while (!cancelled) {
|
|
t.mutate_next_phase();
|
|
}
|
|
|
|
stats_printer.cancel();
|
|
completion_timer.cancel();
|
|
evictor.cancel();
|
|
readers.get();
|
|
scanning_readers.get();
|
|
});
|
|
});
|
|
}
|