Files
scylladb/test/boost/reader_concurrency_semaphore_test.cc
Andrzej Jackowski eb241a7048 test: make preemptive abort coverage deterministic
The test used a real-time sleep to move the queued permit into the
preemptive-abort window. If the reactor did not get CPU for long
enough, admission could run only after the permit's timeout had
expired, making the expected abort path flaky.

The test also exhausted memory together with count resources, so the
queued permit could wait for memory. Preemptive abort is intentionally
not applied to permits waiting for memory, so keep enough memory
available and assert that the permit is queued only on count.

Use an immediate preemptive-abort threshold and a long finite timeout
to exercise admission-time abort without relying on scheduler timing.

Fixes: SCYLLADB-1796

Closes scylladb/scylladb#29736
2026-05-07 09:59:53 +03:00

2638 lines
115 KiB
C++

/*
* Copyright (C) 2021-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
*/
#include "utils/assert.hh"
#include <seastar/util/closeable.hh>
#include <seastar/core/file.hh>
#include "reader_concurrency_semaphore.hh"
#include "sstables/sstables_manager.hh"
#include "reader_concurrency_semaphore_group.hh"
#include "test/lib/log.hh"
#include "test/lib/simple_schema.hh"
#include "test/lib/cql_assertions.hh"
#include "test/lib/cql_test_env.hh"
#include "test/lib/eventually.hh"
#include "test/lib/key_utils.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/random_schema.hh"
#include "test/lib/test_utils.hh"
#include "test/lib/tmpdir.hh"
#include <fmt/ranges.h>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/testing/on_internal_error.hh>
#undef SEASTAR_TESTING_MAIN
#include <seastar/testing/test_case.hh>
#include <seastar/testing/thread_test_case.hh>
#include <boost/test/unit_test.hpp>
#include "readers/empty.hh"
#include "readers/from_mutations.hh"
#include "replica/database.hh" // new_reader_base_cost is there :(
#include "db/config.hh"
// Provides access to private members of reader_concurrency_semaphore for testing.
struct reader_concurrency_semaphore_tester {
// Forwards `r` to `sem.signal()`, which tests cannot call directly.
// NOTE(review): relies on this struct being granted access (e.g. as a friend)
// by reader_concurrency_semaphore -- confirm in the class declaration.
static void signal(reader_concurrency_semaphore& sem, reader_resources r) {
sem.signal(r);
}
};
BOOST_AUTO_TEST_SUITE(reader_concurrency_semaphore_test)
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_clear_inactive_reads) {
    using handle_type = reader_concurrency_semaphore::inactive_read_handle;

    simple_schema s;
    std::vector<reader_permit> permits;
    std::vector<handle_type> handles;

    // True when the bool conversion of every handle equals `engaged`.
    const auto all_handles_are = [&handles] (bool engaged) {
        return std::all_of(handles.begin(), handles.end(), [engaged] (const handle_type& h) { return bool(h) == engaged; });
    };
    // True when every tracked permit reports the `expected` state.
    const auto all_permits_in = [&permits] (reader_permit::state expected) {
        return std::all_of(permits.begin(), permits.end(), [expected] (const reader_permit& p) { return p.get_state() == expected; });
    };

    {
        reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name(), reader_concurrency_semaphore::register_metrics::no);
        auto stop_sem = deferred_stop(semaphore);
        auto clear_permits = defer([&permits] { permits.clear(); });

        // First round: keep the permits around so their state can be inspected.
        for (int i = 0; i < 10; ++i) {
            auto& permit = permits.emplace_back(semaphore.make_tracking_only_permit(s.schema(), get_name(), db::no_timeout, {}));
            handles.emplace_back(semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), permit)));
        }

        BOOST_REQUIRE(all_handles_are(true));
        BOOST_REQUIRE(all_permits_in(reader_permit::state::inactive));

        semaphore.clear_inactive_reads();

        BOOST_REQUIRE(all_handles_are(false));
        BOOST_REQUIRE(all_permits_in(reader_permit::state::evicted));
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);

        permits.clear();
        handles.clear();

        // Second round: the inactive reads are left registered when the scope ends.
        for (int i = 0; i < 10; ++i) {
            auto permit = semaphore.make_tracking_only_permit(s.schema(), get_name(), db::no_timeout, {});
            handles.emplace_back(semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), std::move(permit))));
        }

        BOOST_REQUIRE(all_handles_are(true));
    }

    // Check that the destructor also clears inactive reads.
    BOOST_REQUIRE(all_handles_are(false));
}
// Verifies that destroying a permit -- whether or not it passed admission, and
// whether it was active or evicted while inactive -- returns all of its
// resources to the semaphore.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_destroyed_permit_releases_units) {
    simple_schema s;
    const reader_concurrency_semaphore::resources initial_resources{10, 1024 * 1024};
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
    auto stop_sem = deferred_stop(semaphore);

    // Scenario 1: tracking-only permit (not admitted), still active when destroyed.
    {
        auto tracking_permit = semaphore.make_tracking_only_permit(s.schema(), get_name(), db::no_timeout, {});
        auto extra_memory = tracking_permit.consume_memory(1024);
    }
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);

    // Scenario 2: tracking-only permit (not admitted), evicted while inactive.
    {
        auto tracking_permit = semaphore.make_tracking_only_permit(s.schema(), get_name(), db::no_timeout, {});
        auto extra_memory = tracking_permit.consume_memory(1024);
        auto inactive_handle = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), tracking_permit));
        BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
    }
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);

    // Scenario 3: admitted permit, still active when destroyed.
    {
        auto admitted_permit = semaphore.obtain_permit(s.schema(), get_name(), 1024, db::no_timeout, {}).get();
        auto extra_memory = admitted_permit.consume_memory(1024);
    }
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);

    // Scenario 4: admitted permit, evicted while inactive.
    {
        auto admitted_permit = semaphore.obtain_permit(s.schema(), get_name(), 1024, db::no_timeout, {}).get();
        auto extra_memory = admitted_permit.consume_memory(1024);
        auto inactive_handle = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), admitted_permit));
        BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
    }
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
}
// Registers an inactive read and drops its handle without unregistering it.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_abandoned_handle_closes_reader) {
    simple_schema s;
    reader_concurrency_semaphore sem(reader_concurrency_semaphore::no_limits{}, get_name(), reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(sem);

    auto permit = sem.make_tracking_only_permit(s.schema(), get_name(), db::no_timeout, {});

    {
        auto abandoned_handle = sem.register_inactive_read(make_empty_mutation_reader(s.schema(), permit));
        // Leaving this scope destroys the handle, which triggers the
        // destruction of the inactive read. Should the reader be destroyed
        // without having been closed first, a SCYLLA_ASSERT() fires and the
        // test fails.
    }
}
// This unit test passes a read through admission again and again, just
// like an evictable reader would be during its lifetime. When readmitted,
// the read sometimes has to wait and sometimes not. This is to check that
// readmitting a previously admitted reader doesn't leak any units.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves_units) {
simple_schema s;
const auto initial_resources = reader_concurrency_semaphore::resources{10, 1024 * 1024};
// What an admitted permit is expected to hold on its own: 1 count + the 1024 bytes requested below.
const auto base_resources = reader_concurrency_semaphore::resources{1, 1024};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
auto stop_sem = deferred_stop(semaphore);
reader_permit_opt permit = semaphore.obtain_permit(s.schema(), get_name(), 1024, db::no_timeout, {}).get();
BOOST_REQUIRE_EQUAL(permit->consumed_resources(), base_resources);
// Extra memory held by the permit on top of its base resources; re-created on every iteration.
std::optional<reader_permit::resource_units> residue_units;
for (int i = 0; i < 10; ++i) {
// The new units are consumed before emplace() destroys the previous ones.
residue_units.emplace(permit->consume_resources(reader_resources(0, 100)));
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
// Make the read inactive and evict it right away.
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), *permit));
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
// After eviction only the residue units are expected to remain on the permit.
BOOST_REQUIRE_EQUAL(permit->consumed_resources(), residue_units->resources());
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
if (i % 2) {
// Odd iterations: a sponge permit soaks up everything that is available,
// so readmission is expected to wait (asserted via !fut.available()).
auto sponge_permit = semaphore.make_tracking_only_permit(s.schema(), get_name(), db::no_timeout, {});
auto consumed_resources = sponge_permit.consume_resources(semaphore.available_resources());
auto fut = make_ready_future<>();
if (permit->needs_readmission()) {
fut = permit->wait_readmission();
}
BOOST_REQUIRE(!fut.available());
// Releasing the sponge's resources lets the pending readmission complete.
consumed_resources.reset_to_zero();
fut.get();
} else {
// Even iterations: nothing else holds resources, readmission is simply awaited.
if (permit->needs_readmission()) {
permit->wait_readmission().get();
}
}
// Readmitted: the permit holds its base resources again, plus the residue.
BOOST_REQUIRE_EQUAL(permit->consumed_resources(), residue_units->resources() + base_resources);
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
residue_units.reset();
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
// Dropping the permit must return everything to the semaphore.
permit = {};
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
}
// This unit test checks that the semaphore doesn't get into a deadlock
// when contended, in the presence of many memory-only reads (that don't
// wait for admission). This is tested by simulating the 3 kinds of reads we
// currently have in the system:
// * memory-only: reads that don't pass admission and only own memory.
// * admitted: reads that pass admission.
// * evictable: admitted reads that are furthermore evictable.
//
// The test creates and runs a large number of these reads in parallel,
// read kinds being selected randomly, guarded by a watchdog which
// kills the test if no progress is being made.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
    // Simulates a single read. Its state is one of:
    // * std::monostate: no reader was created yet;
    // * mutation_reader: an active reader;
    // * inactive_read_handle: the reader was registered as inactive with the semaphore.
    class reader {
        // Minimal reader implementation: each fill_buffer() consumes a random
        // amount of memory through the permit and sleeps for a millisecond.
        class skeleton_reader : public mutation_reader::impl {
            std::optional<reader_permit::resource_units> _resources;
        public:
            skeleton_reader(schema_ptr s, reader_permit permit)
                : impl(std::move(s), std::move(permit)) { }
            virtual future<> fill_buffer() override {
                reader_permit::awaits_guard _{_permit};
                _resources.emplace(_permit.consume_resources(reader_resources(0, tests::random::get_int(1024, 2048))));
                co_await sleep(std::chrono::milliseconds(1));
            }
            virtual future<> next_partition() override { return make_ready_future<>(); }
            virtual future<> fast_forward_to(const dht::partition_range& pr) override { return make_ready_future<>(); }
            virtual future<> fast_forward_to(position_range) override { return make_ready_future<>(); }
            virtual future<> close() noexcept override {
                _resources.reset();
                return make_ready_future<>();
            }
        };
        // Dispatches tick() to the overload matching the current state of `_reader`.
        struct reader_visitor {
            reader& r;
            future<> operator()(std::monostate& ms) { return r.tick(ms); }
            future<> operator()(mutation_reader& reader) { return r.tick(reader); }
            future<> operator()(reader_concurrency_semaphore::inactive_read_handle& handle) { return r.tick(handle); }
        };

    private:
        schema_ptr _schema;
        reader_concurrency_semaphore& _semaphore;
        bool _memory_only = true;
        bool _evictable = false;
        reader_permit_opt _permit;
        std::optional<reader_permit::resource_units> _units;
        std::variant<std::monostate, mutation_reader, reader_concurrency_semaphore::inactive_read_handle> _reader;

    private:
        void make_reader() {
            _reader = make_mutation_reader<skeleton_reader>(_schema, *_permit);
        }
        // First tick: create the reader, then tick it.
        future<> tick(std::monostate&) {
            make_reader();
            co_await tick(std::get<mutation_reader>(_reader));
        }
        // Active reader: fill the buffer, then park the reader as inactive if this read is evictable.
        future<> tick(mutation_reader& reader) {
            co_await reader.fill_buffer();
            if (_evictable) {
                _reader = _permit->semaphore().register_inactive_read(std::move(reader));
            }
        }
        // Inactive reader: resume it if it is still registered; otherwise go
        // through readmission (when needed) and create a fresh reader.
        future<> tick(reader_concurrency_semaphore::inactive_read_handle& handle) {
            if (auto reader = _permit->semaphore().unregister_inactive_read(std::move(handle)); reader) {
                _reader = std::move(*reader);
            } else {
                if (_permit->needs_readmission()) {
                    co_await _permit->wait_readmission();
                }
                make_reader();
            }
            co_await tick(std::get<mutation_reader>(_reader));
        }

    public:
        reader(schema_ptr s, reader_concurrency_semaphore& semaphore, bool memory_only, bool evictable)
            : _schema(std::move(s))
            , _semaphore(semaphore)
            , _memory_only(memory_only)
            , _evictable(evictable)
        {
        }
        // Memory-only reads get a tracking-only permit, the others go through
        // admission. Either way some random amount of memory is consumed afterwards.
        future<> obtain_permit() {
            if (_memory_only) {
                _permit = _semaphore.make_tracking_only_permit(_schema, "reader_m", db::no_timeout, {});
            } else {
                _permit = co_await _semaphore.obtain_permit(_schema, fmt::format("reader_{}", _evictable ? 'e' : 'a'), 1024, db::no_timeout, {});
            }
            _units = _permit->consume_memory(tests::random::get_int(128, 1024));
        }
        future<> tick() {
            return std::visit(reader_visitor{*this}, _reader);
        }
        // Closes the reader only if it is currently active; the other states hold no reader to close here.
        future<> close() noexcept {
            if (auto reader = std::get_if<mutation_reader>(&_reader)) {
                return reader->close();
            }
            return make_ready_future<>();
        }
    };

    const auto count = 10;
#ifdef DEBUG
    const auto num_readers = 512;
    const auto ticks = 200;
#else
    const auto num_readers = 128;
    const auto ticks = 10;
#endif

    simple_schema s;
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), count, count * 1024);
    auto stop_sem = deferred_stop(semaphore);

    std::vector<std::unique_ptr<reader>> readers;

    unsigned nr_memory_only = 0;
    unsigned nr_admitted = 0;
    unsigned nr_evictable = 0;

    for (auto i = 0; i < num_readers; ++i) {
        const auto memory_only = tests::random::get_bool();
        const auto evictable = !memory_only && tests::random::get_bool();
        if (memory_only) {
            ++nr_memory_only;
        } else if (evictable) {
            ++nr_evictable;
        } else {
            ++nr_admitted;
        }
        readers.emplace_back(std::make_unique<reader>(s.schema(), semaphore, memory_only, evictable));
    }

    testlog.info("Created {} readers, memory_only={}, admitted={}, evictable={}", readers.size(), nr_memory_only, nr_admitted, nr_evictable);

    // Readers set this flag whenever they make progress; the watchdog clears
    // it on every period. A period in which nobody set it is treated as a deadlock.
    bool watchdog_touched = false;
    auto watchdog = timer<db::timeout_clock>([&semaphore, &watchdog_touched] {
        if (!watchdog_touched) {
            testlog.error("Watchdog detected a deadlock, dumping diagnostics before killing the test: {}", semaphore.dump_diagnostics());
            semaphore.broken(std::make_exception_ptr(std::runtime_error("test killed by watchdog")));
        }
        watchdog_touched = false;
    });
    watchdog.arm_periodic(std::chrono::seconds(30));

    // A read timing out fails the whole test: break the semaphore so all other reads fail too.
    const auto fail_on_timeout = [&semaphore] {
        semaphore.broken(std::make_exception_ptr(std::runtime_error("test failed due to read timeout")));
    };

    // NOTE: the coroutine lambda is a temporary, so .get() has to stay in this
    // same full-expression to keep the lambda (and its captures) alive while
    // the coroutines run.
    parallel_for_each(readers, [&] (std::unique_ptr<reader>& r_) -> future<> {
        auto r = std::move(r_);
        try {
            co_await r->obtain_permit();
        } catch (semaphore_timed_out&) {
            fail_on_timeout();
            co_return;
        }

        for (auto i = 0; i < ticks; ++i) {
            try {
                watchdog_touched = true;
                co_await r->tick();
            } catch (semaphore_timed_out&) {
                fail_on_timeout();
                break;
            }
        }
        co_await r->close();
        watchdog_touched = true;
    }).get();
}
// A file_impl stub that touches no real storage: writes and plain reads report
// 0 bytes, metadata operations resolve immediately, and dma_read_bulk() hands
// back a buffer of the requested size filled with 0xff.
class dummy_file_impl : public file_impl {
    future<size_t> write_dma(uint64_t, const void*, size_t, io_intent*) override {
        return make_ready_future<size_t>(0);
    }

    future<size_t> write_dma(uint64_t, std::vector<iovec>, io_intent*) override {
        return make_ready_future<size_t>(0);
    }

    future<size_t> read_dma(uint64_t, void*, size_t, io_intent*) override {
        return make_ready_future<size_t>(0);
    }

    future<size_t> read_dma(uint64_t, std::vector<iovec>, io_intent*) override {
        return make_ready_future<size_t>(0);
    }

    future<> flush() override {
        return make_ready_future<>();
    }

    // `struct stat` is spelled out because the member function named stat
    // hides the type name inside this class.
    future<struct stat> stat() override {
        return make_ready_future<struct stat>();
    }

    future<> truncate(uint64_t) override {
        return make_ready_future<>();
    }

    future<> discard(uint64_t, uint64_t) override {
        return make_ready_future<>();
    }

    future<> allocate(uint64_t, uint64_t) override {
        return make_ready_future<>();
    }

    future<uint64_t> size() override {
        return make_ready_future<uint64_t>(0);
    }

    future<> close() override {
        return make_ready_future<>();
    }

    // Directory listing is not supported by this stub.
    subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)>) override {
        throw_with_backtrace<std::bad_function_call>();
    }

    future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t, size_t range_size, io_intent*) override {
        temporary_buffer<uint8_t> filled(range_size);
        memset(filled.get_write(), 0xff, filled.size());
        return make_ready_future<temporary_buffer<uint8_t>>(std::move(filled));
    }
};
// Checks that buffers read through a tracked file are accounted against the
// semaphore's memory, and that the memory is returned when the buffers die --
// even if they outlive the tracked file they came from.
SEASTAR_TEST_CASE(reader_restriction_file_tracking) {
    return async([&] {
        reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 100, 4 * 1024);
        auto stop_sem = deferred_stop(semaphore);
        auto permit = semaphore.obtain_permit(nullptr, get_name(), 0, db::no_timeout, {}).get();

        const auto available_memory = [&semaphore] { return semaphore.available_resources().memory; };

        {
            auto tracked_file = make_tracked_file(file(shared_ptr<file_impl>(make_shared<dummy_file_impl>())), permit);
            const auto read_1k = [&tracked_file] { return tracked_file.dma_read_bulk<char>(0, 1024).get(); };

            BOOST_REQUIRE_EQUAL(4 * 1024, available_memory());

            // Every 1KiB read takes 1KiB from the semaphore; the last one drives it negative.
            auto buf1 = read_1k();
            BOOST_REQUIRE_EQUAL(3 * 1024, available_memory());

            auto buf2 = read_1k();
            BOOST_REQUIRE_EQUAL(2 * 1024, available_memory());

            auto buf3 = read_1k();
            BOOST_REQUIRE_EQUAL(1 * 1024, available_memory());

            auto buf4 = read_1k();
            BOOST_REQUIRE_EQUAL(0 * 1024, available_memory());

            auto buf5 = read_1k();
            BOOST_REQUIRE_EQUAL(-1 * 1024, available_memory());

            // Reassign buf1, should still have the same amount of units.
            buf1 = read_1k();
            BOOST_REQUIRE_EQUAL(-1 * 1024, available_memory());

            // Move buf1 to the heap, so that we can safely destroy it
            auto buf1_on_heap = std::make_unique<temporary_buffer<char>>(std::move(buf1));
            BOOST_REQUIRE_EQUAL(-1 * 1024, available_memory());

            buf1_on_heap.reset();
            BOOST_REQUIRE_EQUAL(0 * 1024, available_memory());

            // Move tracked_file to the heap, so that we can safely destroy it.
            auto file_on_heap = std::make_unique<file>(std::move(tracked_file));
            file_on_heap.reset();

            // Move buf4 to the heap, so that we can safely destroy it
            auto buf4_on_heap = std::make_unique<temporary_buffer<char>>(std::move(buf4));
            BOOST_REQUIRE_EQUAL(0 * 1024, available_memory());

            // Releasing buffers that outlived the tracked-file they
            // originated from should succeed.
            buf4_on_heap.reset();
            BOOST_REQUIRE_EQUAL(1 * 1024, available_memory());
        }

        // All units should have been deposited back.
        REQUIRE_EVENTUALLY_EQUAL<ssize_t>([&] { return semaphore.available_resources().memory; }, 4 * 1024);
    });
}
// Checks that permits queued behind an admitted one fail with
// semaphore_timed_out once their (very short) timeout passes.
SEASTAR_TEST_CASE(reader_concurrency_semaphore_timeout) {
    return async([&] () {
        reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 2, replica::new_reader_base_cost);
        auto stop_sem = deferred_stop(semaphore);

        {
            using timeout_duration = db::timeout_clock::time_point::duration;
            auto timeout = db::timeout_clock::now() + std::chrono::duration_cast<timeout_duration>(std::chrono::milliseconds{1});

            // permit1 is admitted right away; permit2 and permit3 are expected to queue behind it.
            reader_permit_opt permit1 = semaphore.obtain_permit(nullptr, "permit1", replica::new_reader_base_cost, timeout, {}).get();

            auto permit2_fut = semaphore.obtain_permit(nullptr, "permit2", replica::new_reader_base_cost, timeout, {});
            auto permit3_fut = semaphore.obtain_permit(nullptr, "permit3", replica::new_reader_base_cost, timeout, {});

            BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 2);

            const auto both_failed = eventually_true([&] { return permit2_fut.failed() && permit3_fut.failed(); });
            BOOST_CHECK(both_failed);

            if (!both_failed) {
                // We need special cleanup when the test failed to avoid invalid
                // memory access.
                permit1 = {};

                BOOST_CHECK(eventually_true([&] { return permit2_fut.available(); }));
                {
                    auto admitted = permit2_fut.get();
                }

                BOOST_CHECK(eventually_true([&] { return permit3_fut.available(); }));
                {
                    auto admitted = permit3_fut.get();
                }
            } else {
                BOOST_CHECK_THROW(std::rethrow_exception(permit2_fut.get_exception()), semaphore_timed_out);
                BOOST_CHECK_THROW(std::rethrow_exception(permit3_fut.get_exception()), semaphore_timed_out);
            }
        }

        // All units should have been deposited back.
        REQUIRE_EVENTUALLY_EQUAL<ssize_t>([&] { return semaphore.available_resources().memory; }, replica::new_reader_base_cost);
    });
}
// Checks that a permit queued on count resources is aborted (semaphore_aborted)
// when admission runs for it, and that the shed-due-to-overload counter grows.
// The setup avoids any dependency on real-time sleeps or scheduler timing.
SEASTAR_THREAD_TEST_CASE(reader_concurrency_semaphore_abort) {
// Chosen so that the preemptive-abort threshold is immediate: the queued
// permit qualifies for abort as soon as admission is attempted.
// NOTE(review): the exact meaning of the factor is defined by
// reader_concurrency_semaphore -- confirm there.
const auto preemptive_abort_factor = 1.0f;
// count=1 forces the second permit to queue; memory covers two base costs so
// the queued permit never has to wait for memory.
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 1, 2 * replica::new_reader_base_cost,
100, utils::updateable_value(std::numeric_limits<uint32_t>::max()), utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value<uint32_t>(1), utils::updateable_value<float>(preemptive_abort_factor));
auto stop_sem = deferred_stop(semaphore);
{
BOOST_REQUIRE(semaphore.get_stats().total_reads_shed_due_to_overload == 0);
// Long but finite timeout: the permit must not time out before admission runs.
auto timeout = db::timeout_clock::now() + 60min;
reader_permit_opt permit1 = semaphore.obtain_permit(nullptr, "permit1", replica::new_reader_base_cost, timeout, {}).get();
auto permit2_fut = semaphore.obtain_permit(nullptr, "permit2", replica::new_reader_base_cost, timeout, {});
// permit2 has to be queued for admission, on count only: preemptive abort
// is intentionally not applied to permits waiting for memory.
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_admission, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_memory, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_queued_because_count_resources, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_queued_because_memory_resources, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, replica::new_reader_base_cost);
// Releasing permit1 frees the count resource, which lets admission run for permit2.
permit1 = {};
BOOST_REQUIRE(eventually_true([&] { return permit2_fut.available(); }));
BOOST_REQUIRE_THROW(permit2_fut.get(), semaphore_aborted);
BOOST_CHECK(semaphore.get_stats().total_reads_shed_due_to_overload > 0);
}
// All units should have been deposited back.
REQUIRE_EVENTUALLY_EQUAL<ssize_t>([&] { return semaphore.available_resources().memory; }, 2 * replica::new_reader_base_cost);
}
// Checks that once two permits are waiting, a further obtain_permit() call
// fails with std::runtime_error instead of being queued.
SEASTAR_TEST_CASE(reader_concurrency_semaphore_max_queue_length) {
    return async([&] () {
        reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 1, replica::new_reader_base_cost, 2);
        auto stop_sem = deferred_stop(semaphore);

        {
            // Takes the only count resource, so everything after it has to wait.
            reader_permit_opt permit1 = semaphore.obtain_permit(nullptr, "permit1", replica::new_reader_base_cost, db::no_timeout, {}).get();

            auto permit2_fut = semaphore.obtain_permit(nullptr, "permit2", replica::new_reader_base_cost, db::no_timeout, {});
            auto permit3_fut = semaphore.obtain_permit(nullptr, "permit3", replica::new_reader_base_cost, db::no_timeout, {});
            BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 2);

            // The queue should now be full.
            auto permit4_fut = semaphore.obtain_permit(nullptr, "permit4", replica::new_reader_base_cost, db::no_timeout, {});
            BOOST_REQUIRE_THROW(permit4_fut.get(), std::runtime_error);

            // Let the two queued permits through, one after the other.
            permit1 = {};
            {
                auto admitted = permit2_fut.get();
            }
            {
                auto admitted = permit3_fut.get();
            }
        }

        REQUIRE_EVENTUALLY_EQUAL<ssize_t>([&] { return semaphore.available_resources().memory; }, replica::new_reader_base_cost);
    });
}
// Populates a semaphore with a random mix of permits (tracking-only, queued,
// inactive, need-cpu, plain resource-consuming) across several tables, then
// dumps diagnostics with and without a line limit. The test only requires
// that dumping works and that no obtain_permit() call resolves with an
// exception other than a timeout.
SEASTAR_THREAD_TEST_CASE(reader_concurrency_semaphore_dump_reader_diganostics) {
    const auto initial_resources = reader_concurrency_semaphore::resources{10, 32 * 1024};
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
    auto stop_sem = deferred_stop(semaphore);

    const auto nr_tables = tests::random::get_int<unsigned>(2, 4);
    std::vector<schema_ptr> schemas;
    for (unsigned i = 0; i < nr_tables; ++i) {
        schemas.emplace_back(schema_builder("ks", fmt::format("tbl{}", i))
                .with_column("pk", int32_type, column_kind::partition_key)
                .with_column("v", int32_type, column_kind::regular_column).build());
    }
    // Permits created without a schema are covered too.
    schemas.emplace_back(nullptr);

    const std::vector<std::string> tracing_permit_op_names{"push-view-updates-1", "push-view-updates-2", "multishard-mutation-query"};
    const std::vector<std::string> regular_permit_op_names{"data-query", "mutation-query", "shard-reader"};

    // Everything that keeps one permit (and whatever it holds) alive.
    struct permit_data {
        std::optional<future<reader_permit>> permit_fut;
        reader_permit_opt permit;
        std::optional<reader_permit::resource_units> resources;
        reader_concurrency_semaphore::inactive_read_handle irh;
        std::optional<reader_permit::need_cpu_guard> need_cpu_guard;

        void reset() {
            permit_fut.reset();
            permit = {};
            resources.reset();
            irh = {};
            need_cpu_guard.reset();
        }
    };

    // A deque, so that references returned by emplace_back() stay valid as it grows.
    std::deque<permit_data> permits;

    for (auto& schema : schemas) {
        const auto nr_permits = tests::random::get_int<unsigned>(2, 32);
        for (unsigned i = 0; i < nr_permits; ++i) {
            auto& permit = permits.emplace_back();
            if (!tests::random::get_int<unsigned>(0, 4)) {
                permit.permit = semaphore.make_tracking_only_permit(
                        schema,
                        tracing_permit_op_names.at(tests::random::get_int<unsigned>(0, tracing_permit_op_names.size() - 1)),
                        db::no_timeout,
                        {});
                permit.resources = permit.permit->consume_resources(reader_resources(tests::random::get_int<unsigned>(0, 1), tests::random::get_int<unsigned>(1024, 16 * 1024 * 1024)));
            } else {
                // Keep timeout_seconds > 0: with an immediate timeout, permits could be
                // rejected during admission, which would make the test flaky.
                const auto timeout_seconds = tests::random::get_int<unsigned>(1, 4);

                permit.permit_fut = semaphore.obtain_permit(
                        schema,
                        regular_permit_op_names.at(tests::random::get_int<unsigned>(0, regular_permit_op_names.size() - 1)),
                        1024,
                        db::timeout_clock::now() + std::chrono::seconds(timeout_seconds),
                        {});

                // Still queued: leave the future in place, it is resolved during cleanup below.
                if (!permit.permit_fut->available()) {
                    continue;
                }

                // Admitted right away: take the permit out of the (ready) future.
                permit.permit = permit.permit_fut->get();
                permit.permit_fut.reset();

                switch (tests::random::get_int<unsigned>(0, 4)) {
                    case 0:
                        permit.resources = permit.permit->consume_memory(tests::random::get_int<unsigned>(1024, 2048));
                        permit.irh = semaphore.register_inactive_read(make_empty_mutation_reader(schema, *permit.permit));
                        break;
                    case 1:
                        permit.need_cpu_guard.emplace(*permit.permit);
                        break;
                    default:
                        permit.resources = permit.permit->consume_resources(reader_resources(tests::random::get_int<unsigned>(0, 1), tests::random::get_int<unsigned>(1024, 16 * 1024 * 1024)));
                }
            }
        }
    }

    testlog.info("With max-lines=4: {}", semaphore.dump_diagnostics(4));
    testlog.info("With no max-lines: {}", semaphore.dump_diagnostics(0));

    // Resolve the still-pending futures and release everything, in creation
    // order. Only the last unexpected exception is kept for reporting.
    std::exception_ptr ex;
    for (auto& permit : permits) {
        try {
            if (permit.permit_fut) {
                permit.permit_fut->get();
            }
        } catch (const timed_out_error&) {
            // Ignore the timeouts
        } catch (...) {
            ex = std::current_exception();
        }
        permit.reset();
    }
    if (ex) {
        BOOST_FAIL(fmt::format("obtain_permit() resolved with unexpected exception {}", ex));
    }
}
// Exercises stop() on a semaphore that was never used, on one without any
// permits, and on one with a live permit, in which case stop() must not
// resolve until the permit is gone.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_waits_on_permits) {
    using semaphore_type = reader_concurrency_semaphore;

    BOOST_TEST_MESSAGE("unused");
    {
        // Destroyed without stop(): the checks for stop() should not be triggered.
        semaphore_type unused_sem(semaphore_type::no_limits{}, get_name(), semaphore_type::register_metrics::no);
    }

    BOOST_TEST_MESSAGE("0 permits");
    {
        semaphore_type idle_sem(semaphore_type::no_limits{}, get_name(), semaphore_type::register_metrics::no);
        // Test will fail by timing out.
        idle_sem.stop().get();
    }

    BOOST_TEST_MESSAGE("1 permit");
    {
        auto sem = std::make_unique<semaphore_type>(semaphore_type::no_limits{}, get_name(), semaphore_type::register_metrics::no);
        auto live_permit = std::make_unique<reader_permit>(sem->make_tracking_only_permit(nullptr, "permit1", db::no_timeout, {}));

        // The continuation takes ownership of the semaphore, so the semaphore
        // is destroyed once stop() resolves.
        // Test will fail via use-after-free
        auto stopped = sem->stop().then([sem = std::move(sem)] { });

        yield().get();
        BOOST_REQUIRE(!stopped.available());

        live_permit.reset();

        // Test will fail by timing out.
        stopped.get();
    }
}
} // reader_concurrency_semaphore_test namespace
// Tries to obtain a permit from `semaphore` and fails the test if the outcome
// (admitted or not) differs from `expected_can_admit`.
//
// `description` names the scenario in log and failure messages; `sl` is the
// location reported for it and defaults to the call site.
static void require_can_admit(schema_ptr schema, reader_concurrency_semaphore& semaphore, bool expected_can_admit, const char* description,
        std::source_location sl = std::source_location::current()) {
    testlog.trace("Running admission scenario {}, with expected_can_admit={}, available resources on the semaphore: {}", description,
            expected_can_admit, semaphore.available_resources());

    const auto stats_before = semaphore.get_stats();

    // The timeout is "now": a permit that is not admitted is expected to
    // resolve with semaphore_timed_out (verified below).
    auto admit_fut = semaphore.obtain_permit(schema, "require_can_admit", 1024, db::timeout_clock::now(), {});
    admit_fut.wait();

    const bool can_admit = !admit_fut.failed();
    if (!can_admit) {
        // Make sure we have a timeout exception, not something else
        BOOST_REQUIRE_THROW(std::rethrow_exception(admit_fut.get_exception()), semaphore_timed_out);
    } else {
        admit_fut.ignore_ready_future();
    }

    const auto stats_after = semaphore.get_stats();
    BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted + static_cast<uint64_t>(can_admit));
    // Deliberately not checking `reads_enqueued_for_admission`, a read can be enqueued temporarily during the admission process.

    if (can_admit != expected_can_admit) {
        BOOST_FAIL(fmt::format("admission scenario '{}' with expected_can_admit={} failed at {}:{}\ndiagnostics: {}", description,
                expected_can_admit, sl.file_name(), sl.line(), semaphore.dump_diagnostics()));
    } else {
        testlog.trace("admission scenario '{}' with expected_can_admit={} passed at {}:{}", description, expected_can_admit, sl.file_name(),
                sl.line());
    }
}
namespace reader_concurrency_semaphore_test {
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_admission) {
simple_schema s;
const auto schema = s.schema();
const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
auto stop_sem = deferred_stop(semaphore);
auto require_can_admit = [&] (bool expected_can_admit, const char* description,
std::source_location sl = std::source_location::current()) {
::require_can_admit(schema, semaphore, expected_can_admit, description, sl);
};
require_can_admit(true, "semaphore in initial state");
// resources and waitlist
{
reader_permit_opt permit = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get();
require_can_admit(true, "enough resources");
const auto stats_before = semaphore.get_stats();
auto enqueued_permit_fut = semaphore.obtain_permit(schema, get_name(), 2 * 1024, db::no_timeout, {});
{
const auto stats_after = semaphore.get_stats();
BOOST_REQUIRE(!enqueued_permit_fut.available());
BOOST_REQUIRE_EQUAL(stats_after.reads_enqueued_for_admission, stats_before.reads_enqueued_for_admission + 1);
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
}
BOOST_REQUIRE(semaphore.available_resources().count >= 1);
BOOST_REQUIRE(semaphore.available_resources().memory >= 1024);
require_can_admit(false, "enough resources but waitlist not empty");
permit = {};
reader_permit _(enqueued_permit_fut.get());
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
// need_cpu and awaits
{
auto permit = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get();
require_can_admit(true, "!need_cpu");
{
reader_permit::need_cpu_guard ncpu_guard{permit};
require_can_admit(false, "need_cpu > awaits");
{
reader_permit::awaits_guard awaits_guard{permit};
require_can_admit(true, "need_cpu == awaits");
}
require_can_admit(false, "need_cpu > awaits");
}
require_can_admit(true, "!need_cpu");
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
// forward progress -- resources
{
auto sponge_permit = semaphore.make_tracking_only_permit(nullptr, "sponge", db::no_timeout, {});
sponge_permit.consume_resources(reader_resources::with_memory(semaphore.available_resources().memory));
require_can_admit(true, "semaphore with no memory but all count available");
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
// forward progress -- readmission
{
auto permit = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get();
auto irh = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), permit));
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
BOOST_REQUIRE(!irh);
reader_permit::need_cpu_guard _{permit};
const auto stats_before = semaphore.get_stats();
auto wait_fut = make_ready_future<>();
if (permit.needs_readmission()) {
wait_fut = permit.wait_readmission();
}
wait_fut.wait();
BOOST_REQUIRE(!wait_fut.failed());
const auto stats_after = semaphore.get_stats();
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted + 1);
BOOST_REQUIRE_EQUAL(stats_after.reads_enqueued_for_admission, stats_before.reads_enqueued_for_admission);
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
// inactive readers
{
auto permit = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get();
require_can_admit(true, "!need_cpu");
{
auto irh = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), permit));
require_can_admit(true, "inactive");
reader_permit::need_cpu_guard ncpu_guard{permit};
require_can_admit(true, "inactive (need_cpu)");
{
auto rd = semaphore.unregister_inactive_read(std::move(irh));
rd->close().get();
}
require_can_admit(false, "need_cpu > awaits");
irh = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), permit));
require_can_admit(true, "inactive (need_cpu)");
}
require_can_admit(true, "!need_cpu");
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
// evicting inactive readers for admission
{
auto permit1 = semaphore.obtain_permit(schema, get_name(), 1024, db::timeout_clock::now(), {}).get();
auto irh1 = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), permit1));
auto permit2 = semaphore.obtain_permit(schema, get_name(), 1024, db::timeout_clock::now(), {}).get();
auto irh2 = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), permit2));
BOOST_REQUIRE(eventually_true([&] { return !irh1 || !irh2; }));
require_can_admit(true, "evictable reads");
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
auto check_admitting_enqueued_read = [&] (auto pre_admission_hook, auto post_enqueue_hook) {
auto cookie1 = pre_admission_hook();
require_can_admit(false, "admission awaits");
const auto stats_before = semaphore.get_stats();
auto permit2_fut = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {});
const auto stats_after = semaphore.get_stats();
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted);
BOOST_REQUIRE_EQUAL(stats_after.reads_enqueued_for_admission, stats_before.reads_enqueued_for_admission + 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
[[maybe_unused]] auto guard = post_enqueue_hook(cookie1);
if (!eventually_true([&] { return permit2_fut.available(); })) {
semaphore.broken();
permit2_fut.wait();
permit2_fut.ignore_ready_future();
BOOST_FAIL("Enqueued permit didn't get admitted as expected");
}
};
// admitting enqueued reads -- permit owning resources destroyed
{
check_admitting_enqueued_read(
[&] {
return reader_permit_opt(semaphore.obtain_permit(schema, get_name(), 2 * 1024, db::no_timeout, {}).get());
},
[] (reader_permit_opt& permit1) {
permit1 = {};
return 0;
}
);
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
// admitting enqueued reads -- permit owning resources becomes inactive
{
check_admitting_enqueued_read(
[&] {
return reader_permit_opt(semaphore.obtain_permit(schema, get_name(), 2 * 1024, db::no_timeout, {}).get());
},
[&] (reader_permit_opt& permit1) {
return semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), *permit1));
}
);
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
// admitting enqueued reads -- permit becomes active
{
check_admitting_enqueued_read(
[&] {
auto permit = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get();
require_can_admit(true, "enough resources");
return std::pair(permit, std::optional<reader_permit::need_cpu_guard>{permit});
}, [&] (std::pair<reader_permit, std::optional<reader_permit::need_cpu_guard>>& permit_and_need_cpu_guard) {
permit_and_need_cpu_guard.second.reset();
return 0;
}
);
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
// admitting enqueued reads -- permit becomes awaits
{
check_admitting_enqueued_read(
[&] {
auto permit = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get();
require_can_admit(true, "enough resources");
return std::pair(permit, reader_permit::need_cpu_guard{permit});
}, [&] (std::pair<reader_permit, reader_permit::need_cpu_guard>& permit_and_need_cpu_guard) {
return reader_permit::awaits_guard{permit_and_need_cpu_guard.first};
}
);
}
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
require_can_admit(true, "semaphore in initial state");
}
// Checks how need_cpu_guard / awaits_guard instances taken on a single permit
// show up in the semaphore's current_permits, need_cpu_permits and
// awaits_permits counters, for a few fixed guard orderings plus a random mix.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_need_cpu_awaits) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
    auto stop_sem = deferred_stop(semaphore);

    // Asserts the three permit-related counters in one go.
    const auto require_permit_stats = [&semaphore] (int expected_current, int expected_need_cpu, int expected_awaits) {
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().current_permits, expected_current);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().need_cpu_permits, expected_need_cpu);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().awaits_permits, expected_awaits);
    };

    require_permit_stats(0, 0, 0);

    auto permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();

    for (int scenario_id = 0; scenario_id != 5; ++scenario_id) {
        testlog.info("Running scenario {}", scenario_id);

        std::vector<reader_permit::need_cpu_guard> need_cpu_guards;
        std::vector<reader_permit::awaits_guard> awaits_guards;

        switch (scenario_id) {
            case 0:
                // need_cpu only
                need_cpu_guards.emplace_back(permit);
                require_permit_stats(1, 1, 0);
                break;
            case 1:
                // need_cpu first, then awaits
                need_cpu_guards.emplace_back(permit);
                awaits_guards.emplace_back(permit);
                require_permit_stats(1, 1, 1);
                break;
            case 2:
                // awaits only: not counted without need_cpu
                awaits_guards.emplace_back(permit);
                require_permit_stats(1, 0, 0);
                break;
            case 3:
                // awaits first, then need_cpu
                awaits_guards.emplace_back(permit);
                need_cpu_guards.emplace_back(permit);
                require_permit_stats(1, 1, 1);
                break;
            default: {
                // Random mix of guards, no intermediate assertions.
                const unsigned guard_count = tests::random::get_int<unsigned>(3, 100);
                for (unsigned n = 0; n != guard_count; ++n) {
                    if (tests::random::get_bool()) {
                        need_cpu_guards.emplace_back(permit);
                    } else {
                        awaits_guards.emplace_back(permit);
                    }
                }
                break;
            }
        }

        // Release guards in random order until one of the two kinds runs out;
        // the loop condition guarantees both vectors are non-empty here, so
        // either pop is valid. Whatever is left is released on scope exit.
        while (!(need_cpu_guards.empty() || awaits_guards.empty())) {
            if (tests::random::get_bool()) {
                need_cpu_guards.pop_back();
                if (need_cpu_guards.empty()) {
                    BOOST_REQUIRE_EQUAL(semaphore.get_stats().need_cpu_permits, 0);
                }
            } else {
                awaits_guards.pop_back();
                if (awaits_guards.empty()) {
                    BOOST_REQUIRE_EQUAL(semaphore.get_stats().awaits_permits, 0);
                }
            }
        }
    }
}
// Registers inactive reads (some with a partition range, some without) for
// several random schemas, then evicts them table by table -- optionally
// restricted to a random range -- and checks that only the expected reads
// were evicted.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_evict_inactive_reads_for_table) {
    auto spec = tests::make_random_schema_specification(get_name());

    std::list<tests::random_schema> schemas;

    // An inactive read handle together with the (optional) range it was registered with.
    struct inactive_read {
        reader_concurrency_semaphore::inactive_read_handle handle;
        std::optional<dht::partition_range> range;

        explicit inactive_read(std::optional<dht::partition_range> range = {})
            : range(std::move(range)) {
        }
        inactive_read(reader_concurrency_semaphore::inactive_read_handle handle, std::optional<dht::partition_range> range = {})
            : handle(std::move(handle)), range(std::move(range)) {
        }
        // True while the read is still registered (not evicted).
        operator bool() const {
            return bool(handle);
        }
    };

    std::unordered_map<tests::random_schema*, std::list<inactive_read>> schema_handles;
    for (unsigned seed = 0; seed < 4; ++seed) {
        auto& rs = schemas.emplace_back(tests::random_schema(seed, *spec));
        schema_handles.try_emplace(&rs);
    }

    auto make_random_range = [] (tests::random_schema& s) {
        auto keys = s.make_pkeys(2);
        return interval<tests::data_model::mutation_description::key>::make({keys[0]}, {keys[1]}).transform([&s] (const tests::data_model::mutation_description::key& k) -> dht::ring_position {
            return dht::decorate_key(*s.schema(), partition_key::from_exploded(*s.schema(), k));
        });
    };

    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name(), reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

    // Builds an empty reader with a fresh tracking-only permit for the given schema.
    auto make_reader = [&] (tests::random_schema& rs) {
        return make_empty_mutation_reader(rs.schema(), semaphore.make_tracking_only_permit(rs.schema(), get_name(), db::no_timeout, {}));
    };
    const auto still_registered = [] (const inactive_read& ir) { return bool(ir); };

    for (auto& rs : schemas) {
        auto& reads = schema_handles[&rs];
        // Ten reads with a range; the range lives in the list node, so its address is stable.
        for (int n = 0; n != 10; ++n) {
            auto& entry = reads.emplace_back(make_random_range(rs));
            entry.handle = semaphore.register_inactive_read(make_reader(rs), &*entry.range);
        }
        // Four reads without a range.
        for (int n = 0; n != 4; ++n) {
            reads.emplace_back(semaphore.register_inactive_read(make_reader(rs)));
        }
    }

    for (auto& rs : schemas) {
        BOOST_REQUIRE(std::ranges::all_of(schema_handles[&rs], still_registered));
    }

    for (auto& rs : schemas) {
        auto& reads = schema_handles[&rs];
        BOOST_REQUIRE(std::ranges::all_of(reads, still_registered));

        std::optional<dht::partition_range> evict_range;
        if (tests::random::get_bool()) {
            evict_range.emplace(make_random_range(rs));
        }
        semaphore.evict_inactive_reads_for_table(rs.schema()->id(), evict_range ? &*evict_range : nullptr).get();

        for (const auto& [owner, owned_reads] : schema_handles) {
            if (owner != &rs) {
                // Reads of other tables must be untouched.
                if (!owned_reads.empty()) {
                    BOOST_REQUIRE(std::ranges::all_of(owned_reads, still_registered));
                }
                continue;
            }
            // A survivor of the evicted table must have a range that does not
            // overlap the eviction range (which therefore must exist).
            for (const auto& ir : owned_reads) {
                if (!ir) {
                    continue;
                }
                BOOST_REQUIRE(ir.range);
                BOOST_REQUIRE(evict_range);
                BOOST_REQUIRE(!ir.range->overlaps(*evict_range, dht::ring_position_comparator(*rs.schema())));
            }
        }
        reads.clear();
    }
}
// Reproduces https://github.com/scylladb/scylladb/issues/11770
//
// Scenario: all count resources are taken by one inactive read (p1) and one
// read that holds need_cpu (p2), while a third permit (p3) waits for
// admission. The test checks that p3 is admitted -- by evicting p1 -- once p2
// is additionally marked as awaiting.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_evict_inactive_reads_when_all_is_awaits) {
simple_schema ss;
const auto& s = ss.schema();
const auto initial_resources = reader_concurrency_semaphore::resources{2, 32 * 1024};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
auto stop_sem = deferred_stop(semaphore);
// Helper modelling a read that is run via with_ready_permit(): it holds a
// need_cpu guard for its whole lifetime, can toggle an awaits guard on
// demand, and exposes promises to observe its start and to let it finish.
class read {
reader_permit _permit;
promise<> _read_started_pr;
future<> _read_started_fut;
promise<> _read_done_pr;
reader_permit::need_cpu_guard _ncpu_guard;
std::optional<reader_permit::awaits_guard> _awaits_guard;
public:
explicit read(reader_permit p) : _permit(std::move(p)), _read_started_fut(_read_started_pr.get_future()), _ncpu_guard(_permit) { }
// Resolves once the read function has started executing. Moves the stored future out, so call at most once.
future<> wait_read_started() { return std::move(_read_started_fut); }
// Lets the read function returned by get_read_func() complete.
void set_read_done() { _read_done_pr.set_value(); }
void mark_as_awaits() { _awaits_guard.emplace(_permit); }
void mark_as_not_awaits() { _awaits_guard.reset(); }
// Returns the function to execute with the ready permit: it signals that
// the read started and then waits until set_read_done() is called.
reader_concurrency_semaphore::read_func get_read_func() {
return [this] (reader_permit permit) -> future<> {
_read_started_pr.set_value();
co_await _read_done_pr.get_future();
};
}
};
auto p1 = semaphore.obtain_permit(s, get_name(), 1024, db::no_timeout, {}).get();
auto irh1 = semaphore.register_inactive_read(make_empty_mutation_reader(ss.schema(), p1));
auto p2 = semaphore.obtain_permit(s, get_name(), 1024, db::no_timeout, {}).get();
read rd2(p2);
auto fut2 = semaphore.with_ready_permit(p2, rd2.get_read_func());
// At this point we expect to have:
// * 1 inactive read (not evicted)
// * 1 need_cpu (but not awaiting) read on the ready list
// * 1 waiter
// * no more count resources left
auto p3_fut = semaphore.obtain_permit(s, get_name(), 1024, db::no_timeout, {});
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 2); // (waiters includes _ready_list entries)
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_admission, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().need_cpu_permits, 0); // permit loses need_cpu status while waiting for execution
BOOST_REQUIRE_EQUAL(semaphore.get_stats().awaits_permits, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE(irh1);
// Start the read emptying the ready list, this should not be enough to admit p3
rd2.wait_read_started().get();
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().need_cpu_permits, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().awaits_permits, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE(irh1);
// Marking p2 as awaits should eventually allow p3 to be admitted by evicting p1
rd2.mark_as_awaits();
REQUIRE_EVENTUALLY_EQUAL<uint64_t>([&] { return semaphore.get_stats().waiters; }, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().need_cpu_permits, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().awaits_permits, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE(!irh1);
// Collect the admitted permit, then let the p2 read finish so the test can tear down.
p3_fut.get();
rd2.mark_as_not_awaits();
rd2.set_read_done();
fut2.get();
}
// Checks that set_resources() updates both the initial and the available
// resources of the semaphore (available may go negative while permits are
// outstanding), and that raising the resources admits a queued permit.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_set_resources) {
    const auto initial_resources = reader_concurrency_semaphore::resources{4, 4 * 1024};
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
    auto stop_sem = deferred_stop(semaphore);

    // Two admitted permits, each holding 1 count and 1024 bytes of memory.
    auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
    auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(2, 2 * 1024));
    BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(4, 4 * 1024));

    // Each step resizes the semaphore and states what should remain available
    // given the two permits above.
    struct resize_step {
        reader_resources new_total;
        reader_resources expected_available;
    };
    const resize_step steps[] = {
        {reader_resources(8, 8 * 1024), reader_resources(6, 6 * 1024)},
        {reader_resources(2, 2 * 1024), reader_resources(0, 0)},
        {reader_resources(3, 128), reader_resources(1, 128 - 2 * 1024)},
        {reader_resources(1, 3 * 1024), reader_resources(-1, 1024)},
    };
    for (const auto& step : steps) {
        semaphore.set_resources(step.new_total);
        BOOST_REQUIRE_EQUAL(semaphore.available_resources(), step.expected_available);
        BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), step.new_total);
    }

    // With count at -1 a new permit has to queue...
    auto permit3_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {});
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_admission, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);

    // ...and gets admitted once the resources are raised again.
    semaphore.set_resources({4, 4 * 1024});
    REQUIRE_EVENTUALLY_EQUAL<uint64_t>([&] { return semaphore.get_stats().waiters; }, 0);
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(1, 1024));
    BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(4, 4 * 1024));
    permit3_fut.get();
}
} // namespace reader_concurrency_semaphore_test
// Exercises reader_concurrency_semaphore_group: scheduling groups are randomly
// added, removed and re-weighted, and after every change the memory assigned
// to each group's semaphore is checked to be proportional to its shares.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_group) {
    const auto initial_resources = reader_resources{100, 100 * 1024};
    auto serialize_multiplier = utils::updateable_value_source<uint32_t>(2);
    auto kill_multiplier = utils::updateable_value_source<uint32_t>(3);
    auto cpu_concurrency = utils::updateable_value_source<uint32_t>(1);
    auto preemptive_abort_factor = utils::updateable_value_source<float>(0.0f);

    reader_concurrency_semaphore_group sem_group(initial_resources.memory, initial_resources.count, 1000,
            utils::updateable_value(serialize_multiplier),
            utils::updateable_value(kill_multiplier),
            utils::updateable_value(cpu_concurrency),
            utils::updateable_value(preemptive_abort_factor));
    auto stop_sem = deferred_stop(sem_group);

    // Scheduling groups removed from the semaphore group; reused (renamed) by add_sg.
    circular_buffer<scheduling_group> recycle_bin;

    const auto initial_shares = 1000;

    // Test-side record of the shares last assigned to each scheduling group.
    struct scheduling_group_with_shares {
        scheduling_group sg;
        size_t shares;

        scheduling_group_with_shares(scheduling_group sg, size_t shares) : sg(sg), shares(shares) { }
    };
    std::vector<scheduling_group_with_shares> scheduling_groups;

    const auto max_sched_groups = 8;

    // Verifies that every semaphore has the full count and a memory amount
    // proportional to its shares, and that the memory adds up exactly.
    auto check_sem_group = [&] {
        const auto total_shares = std::ranges::fold_left(scheduling_groups
                | std::views::transform([] (const scheduling_group_with_shares& sgs) { return sgs.shares; }), size_t(0), std::plus{});
        ssize_t total_memory = 0;
        sem_group.foreach_semaphore([&] (scheduling_group sg, reader_concurrency_semaphore& sem) {
            const auto res = sem.available_resources();
            BOOST_CHECK_EQUAL(res.count, initial_resources.count); // currently count is not partitioned among the semaphores
            auto it = std::find_if(scheduling_groups.begin(), scheduling_groups.end(), [sg] (const scheduling_group_with_shares& sgs) { return sgs.sg == sg; });
            BOOST_REQUIRE(it != scheduling_groups.end());
            const auto shares = it->shares;
            const ssize_t expected_memory = std::floor((double(shares) / double(total_shares)) * initial_resources.memory);
            const auto memory_diff = std::abs(res.memory - expected_memory);
            testlog.trace("{}: {}/{} (shares) -> {}/{} (memory) | res.memory: {}", sg.name(), shares, total_shares, expected_memory, initial_resources.memory, res.memory);
            BOOST_CHECK_LE(memory_diff, scheduling_groups.size()); // due to integer division, we allow for ceil/floor (off-by-one), the remainder being added to any semaphore
            total_memory += res.memory;
        });
        BOOST_CHECK_EQUAL(total_memory, initial_resources.memory); // no off-by-one allowed on the total
    };

    // Adds a scheduling group (new or recycled) with initial_shares.
    // Returns false, without doing anything, once max_sched_groups is reached.
    auto add_sg = [&, sgi = 0] () mutable {
        if (scheduling_groups.size() >= max_sched_groups) {
            return false;
        }
        testlog.debug("create sg{}", sgi);
        scheduling_group sg;
        const auto sg_name = format("sg{}", sgi++);
        if (recycle_bin.empty()) {
            sg = create_scheduling_group(sg_name, initial_shares).get();
        } else {
            sg = recycle_bin.front();
            recycle_bin.pop_front();
            rename_scheduling_group(sg, sg_name).get();
        }
        scheduling_groups.emplace_back(sg, initial_shares);
        sem_group.add_or_update(sg, initial_shares);
        sem_group.wait_adjust_complete().get();
        return true;
    };

    // Fill up to the maximum number of groups, checking after each addition.
    while (add_sg()) {
        check_sem_group();
    }

    for (size_t i = 0; i < 32; ++i) {
        testlog.debug("iteration {}", i);
        // The operations below act on the last element; shuffling picks a random victim.
        std::shuffle(scheduling_groups.begin(), scheduling_groups.end(), tests::random::gen());
        switch (tests::random::get_int<uint8_t>(0, 3)) {
            case 0: // add
            {
                testlog.debug("maybe add sg");
                if (add_sg()) {
                    break;
                }
                // Already at the maximum: remove one instead.
                [[fallthrough]];
            }
            case 1: //remove
            {
                const auto& sgs = scheduling_groups.back();
                testlog.debug("maybe remove {}", sgs.sg.name());
                if (scheduling_groups.size() > 1) {
                    testlog.debug("remove {}", sgs.sg.name());
                    sem_group.remove(sgs.sg).get();
                    recycle_bin.push_back(sgs.sg);
                    scheduling_groups.pop_back();
                    break;
                }
                // Never remove the last group: update it instead.
                [[fallthrough]];
            }
            default: //update
            {
                auto& sgs = scheduling_groups.back();
                const auto new_shares = tests::random::get_int<size_t>(100, 1000);
                // Log before overwriting, so that the message shows the old value.
                testlog.debug("update {}: {}->{}", sgs.sg.name(), sgs.shares, new_shares);
                sgs.shares = new_shares;
                sem_group.add_or_update(sgs.sg, new_shares);
                sem_group.wait_adjust_complete().get();
                break;
            }
        }
        check_sem_group();
    }
}
namespace {
// Simulates a read against a reader_concurrency_semaphore as a small state
// machine: it obtains a permit, then repeatedly requests and releases memory
// through that permit. Each call to operator() advances the state machine by
// one step; it never waits, it only polls the futures it holds.
class allocating_reader {
// Memory amount passed to obtain_permit() when requesting admission.
static constexpr size_t admission_cost = 1024;
// Size of each individual request_memory() call.
static constexpr size_t buf_size = 1024;
// Number of request rounds after which the reader finishes.
static constexpr size_t read_iterations = 4;
public:
enum class state {
wait_for_admission,
request_memory,
wait_for_memory,
release_memory,
done,
};
// Returns the name of the given state, for logging. Aborts on an unknown value.
const char* to_string(state s) {
switch (s) {
case state::wait_for_admission: return "state::wait_for_admission";
case state::request_memory: return "state::request_memory";
case state::wait_for_memory: return "state::wait_for_memory";
case state::release_memory: return "state::release_memory";
case state::done: return "state::done";
}
std::abort();
};
private:
reader_concurrency_semaphore& _sem;
state _state = state::wait_for_admission;
// Continuation of the admission request; awaited in close().
std::optional<future<>> _admission_fut;
// Set once admission succeeded, reset when the reader is done or closed.
std::optional<reader_permit> _permit;
// Memory units already granted and not yet released.
std::list<reader_permit::resource_units> _current_resource_units;
// Memory requests that were issued but not yet collected.
std::list<future<reader_permit::resource_units>> _pending_resource_units;
// Number of request rounds started so far (forced to read_iterations when a request fails with bad_alloc).
unsigned _read_count = 0;
// Cleared only if obtaining the permit failed.
bool _success = true;
public:
// Starts the admission request. The continuation captures `this`, so the
// object must stay at a stable address until close() has completed.
explicit allocating_reader(reader_concurrency_semaphore& sem) : _sem(sem) {
testlog.debug("[{}] allocating_reader created", fmt::ptr(this));
_admission_fut = sem.obtain_permit(nullptr, "reader", admission_cost, db::no_timeout, {}).then_wrapped([this] (future<reader_permit>&& permit_fut) {
try {
_permit = std::move(permit_fut.get());
_state = state::request_memory;
} catch (...) {
// Admission failed: finish right away and flag the failure.
_state = state::done;
_success = false;
}
});
}
~allocating_reader() { }
// Advances the state machine by one step.
void operator()() {
testlog.debug("[{}|p:0x{:x}] allocating_reader(): _state={}, _permit.state={}, _permit.resources={}, _sem.resources={}",
fmt::ptr(this),
_permit ? _permit->id() : 0,
to_string(_state),
_permit ? format("{}", _permit->get_state()) : "N/A",
_permit ? _permit->consumed_resources() : reader_resources{},
_sem.consumed_resources());
switch (_state) {
case state::wait_for_admission:
// Nothing to do, the admission continuation moves us out of this state.
break;
case state::request_memory:
{
// The first round issues a single request, later rounds a random number (1-8) of them.
size_t n = 0;
if (!_read_count) {
n = 1;
} else {
n = tests::random::get_int(1, 8);
}
++_read_count;
try {
for (size_t i = 0; i < n; ++i) {
_pending_resource_units.emplace_back(_permit->request_memory(buf_size));
}
} catch (std::bad_alloc&) {
testlog.debug("[{}|p:{}] read killed", fmt::ptr(this), _permit ? _permit->id() : 0);
// Make this the last round: release_memory moves to done once _read_count == read_iterations.
_read_count = read_iterations;
}
_state = state::wait_for_memory;
break;
}
case state::wait_for_memory:
// Collect every request that has resolved; unresolved ones stay pending for the next step.
for (auto it = _pending_resource_units.begin(); it != _pending_resource_units.end();) {
if (it->available()) {
try {
_current_resource_units.push_back(it->get());
} catch (std::bad_alloc&) {
testlog.debug("[{}|p:{}] read killed", fmt::ptr(this), _permit ? _permit->id() : 0);
_read_count = read_iterations;
}
it = _pending_resource_units.erase(it);
} else {
++it;
}
}
if (_pending_resource_units.empty()) {
_state = state::release_memory;
}
break;
case state::release_memory:
// Releases one unit per step. Once all are released: finish if the
// round limit was reached, or randomly with 1-in-8 probability;
// otherwise start another round of requests.
if (_current_resource_units.empty()) {
if (_read_count == read_iterations) {
_state = state::done;
} else if (!tests::random::get_int(0, 7)) {
_state = state::done;
} else {
_state = state::request_memory;
}
} else {
_current_resource_units.pop_front();
}
break;
case state::done:
_permit.reset();
break;
}
}
bool done() const { return _state == state::done; }
// False if obtaining the permit failed.
bool success() const { return _success; }
// Resources currently consumed through the permit; empty if there is no permit.
reader_resources resources() const { return _permit ? _permit->consumed_resources() : reader_resources{}; }
// Waits for the admission continuation and for all pending memory requests
// (ignoring their failures), then drops the held units and the permit.
future<> close() {
if (_admission_fut) {
co_await std::move(_admission_fut).value();
}
co_await coroutine::parallel_for_each(_pending_resource_units, [] (future<reader_permit::resource_units>& fut) {
return std::move(fut).then_wrapped([] (future<reader_permit::resource_units>&& fut) {
try {
fut.get();
} catch (...) {
}
});
});
_current_resource_units.clear();
_permit.reset();
}
};
} //anonymous namespace
namespace reader_concurrency_semaphore_test {
// Check that the memory consumption limiting mechanism doesn't leak any
// resources or cause any internal inconsistencies in the semaphore.
//
// A handful of allocating_reader instances are stepped round-robin; after
// every step the resources accounted by the semaphore are compared with the
// sum of the resources held by its permits.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_memory_limit_no_leaks) {
    const auto initial_resources = reader_concurrency_semaphore::resources{4, 4 * 1024};
    const auto serialize_multiplier = 2;
    const auto kill_multiplier = 3;
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory, 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier));
    auto stop_sem = deferred_stop(semaphore);

    const size_t reader_count_target = 6;
    const size_t iteration_limit = 1000;

    // Adds up the resources consumed by every permit of the semaphore.
    const auto sum_permit_resources = [&semaphore] {
        reader_resources total;
        semaphore.foreach_permit([&total] (const reader_permit& p) { total += p.consumed_resources(); });
        return total;
    };

    std::list<allocating_reader> readers;
    size_t i = 0;
    sstring error;

    for (bool done = false; !done; ) {
        testlog.debug("iteration {}", i);

        // Step every reader once; a mismatch here is only logged, the
        // check at the end of the iteration is the one that fails the test.
        for (auto& rd : readers) {
            rd();
            const auto permit_total = sum_permit_resources();
            if (semaphore.consumed_resources() != permit_total) {
                testlog.error("resource mismatch: semaphore.consumed_resources() ({}) != sum of resources in permits ({})", semaphore.consumed_resources(), permit_total);
            }
        }

        // Ramp up by one reader per iteration until the target is reached.
        if (readers.size() < reader_count_target) {
            readers.emplace_back(semaphore);
        }

        done = std::ranges::all_of(readers, &allocating_reader::done);

        testlog.debug("{}", semaphore.dump_diagnostics());

        const auto permit_total = sum_permit_resources();
        const auto kill_limit = semaphore.initial_resources().memory * kill_multiplier;

        if (semaphore.consumed_resources().memory >= kill_limit) {
            error = format("kill limit failed: semaphore.consumed_resources() ({}) >= kill limit ({})", semaphore.consumed_resources().memory, kill_limit);
        } else if (semaphore.consumed_resources() != permit_total) {
            error = format("resource mismatch: semaphore.consumed_resources() ({}) != sum of resources in permits ({})", semaphore.consumed_resources(), permit_total);
        } else if (i >= iteration_limit) {
            error = format("test failed to finish in {} iterations", iteration_limit);
        }

        if (!error.empty()) {
            testlog.error("stopping test at iteration {}: {}", i, error);
            done = true;
        } else {
            ++i;
        }

        seastar::thread::yield();
    }

    testlog.info("{}", semaphore.dump_diagnostics());

    // Close all readers before reporting, so nothing is left pending on the semaphore.
    parallel_for_each(readers.begin(), readers.end(), [] (allocating_reader& rd) {
        return rd.close();
    }).get();

    if (!error.empty()) {
        BOOST_FAIL(error);
    }

    const bool all_ok = std::ranges::all_of(readers, &allocating_reader::success);
    BOOST_REQUIRE(all_ok);
}
// Result of create_memory_limit_table(): the schema of the created table,
// plus the keys and value of the row that was written into each of its sstables.
struct memory_limit_table {
schema_ptr schema;
// Temporary directory created by create_memory_limit_table(); owned here so it lives as long as this struct.
tmpdir sst_dir;
// Partition key of the written row.
partition_key pk;
// Clustering key of the written row.
clustering_key ck;
// Content of the row's "value" column.
sstring value;
};
// Creates table ks.tbl (with compaction disabled via NullCompactionStrategy)
// and fills it with `target_num_sstables` sstables, each containing the same
// single row with a 256KiB value. Returns the schema together with the keys
// and the value of that row.
memory_limit_table create_memory_limit_table(cql_test_env& env, uint64_t target_num_sstables) {
    auto& node_db = env.local_db();

    sstring value(256 * 1024, '0');

    env.execute_cql("CREATE TABLE ks.tbl (pk int, ck int, value text, primary key (pk, ck)) WITH compaction = {'class': 'NullCompactionStrategy'};").get();
    BOOST_REQUIRE(env.local_db().has_schema("ks", "tbl"));

    auto& cf = node_db.find_column_family("ks", "tbl");
    auto tbl_schema = cf.schema();
    auto& sstables_mgr = cf.get_sstables_manager();
    auto& read_sem = node_db.get_reader_concurrency_semaphore();

    auto decorated_pk = tests::generate_partition_key(tbl_schema);
    auto ckey = tests::generate_clustering_key(tbl_schema);

    mutation row_mutation(tbl_schema, decorated_pk);
    row_mutation.set_clustered_cell(ckey, to_bytes("value"), data_value(value), 0);

    auto sst_tmpdir = tmpdir();

    // Writes one sstable holding `row_mutation` and attaches it to the table.
    const auto write_one_sstable = [&] {
        auto sst = cf.make_sstable();
        auto writer_cfg = sstables_mgr.configure_writer("test");
        sst->write_components(
                make_mutation_reader_from_mutations(tbl_schema, read_sem.make_tracking_only_permit(tbl_schema, "test", db::no_timeout, {}), row_mutation, tbl_schema->full_slice()),
                1,
                tbl_schema,
                writer_cfg,
                encoding_stats{}).get();
        sst->open_data().get();
        cf.add_sstable_and_update_cache(std::move(sst)).get();
    };

    const auto sstable_write_concurrency = 16;
    // Shared by all writers; a slot is claimed (incremented) before the write starts.
    uint64_t claimed_sstables = 0;

    parallel_for_each(std::views::iota(0, sstable_write_concurrency), [&] (int) {
        return seastar::async([&] {
            while (claimed_sstables != target_num_sstables) {
                ++claimed_sstables;
                write_one_sstable();
            }
        });
    }).get();

    return {tbl_schema, std::move(sst_tmpdir), std::move(decorated_pk.key()), std::move(ckey), std::move(value)};
}
#ifndef DEBUG
// Total memory (in bytes) that the memory-limit tests compare
// memory::stats().total_memory() against; only defined in non-DEBUG builds.
constexpr uint64_t target_memory = uint64_t(1) << 28; // 256MB
#endif
// Check that the memory consumption limiting mechanism of the semaphore does
// prevent OOM crashes.
// The test fails by OOM crashing.
// This test should be run with 256MB of memory; in non-DEBUG builds it is
// skipped (with a message on stderr) when started with any other amount.
SEASTAR_TEST_CASE(test_reader_concurrency_semaphore_memory_limit_no_oom) {
#ifndef DEBUG
    if (memory::stats().total_memory() != target_memory) {
        std::cerr << "Test " << get_name() << " should be run with 256M of memory, make sure you invoke with -m256M" << std::endl;
        return make_ready_future<>();
    }
#endif

    auto cfg = make_shared<db::config>();

    // Disable the cache altogether, we want all reads to go to disk.
    cfg->enable_cache(false);
    cfg->enable_commitlog(false);
    cfg->reader_concurrency_semaphore_serialize_limit_multiplier.set(2, utils::config_file::config_source::CommandLine);
    cfg->reader_concurrency_semaphore_kill_limit_multiplier.set(4, utils::config_file::config_source::CommandLine);

    return do_with_cql_env_thread([] (cql_test_env& env) {
        auto tbl = create_memory_limit_table(env, 256);

        // Fewer concurrent reads in debug builds.
#ifndef DEBUG
        const auto num_reads = 128;
#else
        const auto num_reads = 16;
#endif

        auto read_id = env.prepare("SELECT value FROM ks.tbl WHERE pk = ? AND ck = ?").get();

        // Issue all reads concurrently, each selecting the row written by create_memory_limit_table().
        parallel_for_each(std::views::iota(0, num_reads), [&] (int) {
            return env.execute_prepared(read_id, {cql3::raw_value::make_value(tbl.pk.explode().front()), cql3::raw_value::make_value(tbl.ck.explode().front())}).then_wrapped(
                    [&] (future<shared_ptr<cql_transport::messages::result_message>> result_fut) {
                if (!result_fut.failed()) {
                    assert_that(result_fut.get()).is_rows().with_rows_ignore_order({ {serialized(tbl.value)} });
                } else {
                    // We expect failed, OOM-killed reads here.
                    // No way to verify why they failed so we swallow all failures.
                    result_fut.ignore_ready_future();
                }
            });
        }).get();

        return make_ready_future<>();
    }, std::move(cfg));
}
// Check that the memory consumption limiting mechanism of the semaphore does
// prevent reads exhausting memory to the extent that they start to fail due to
// bad alloc (but not necessarily crash the node).
// Instead the limiting mechanism engages and kills reads before they get to that
// point. From the outside, a read failing due to OOM and a read killed by the
// limiting mechanism look the same. To differentiate, the test checks that all
// failures were caused by the limiting mechanism.
// This test should be run with 256M memory.
SEASTAR_TEST_CASE(test_reader_concurrency_semaphore_memory_limit_engages) {
#ifndef DEBUG
if (memory::stats().total_memory() != target_memory) {
std::cerr << "Test " << get_name() << " should be run with 256M of memory, make sure you invoke with -m256M" << std::endl;
return make_ready_future<>();
}
#endif
auto db_cfg_ptr = make_shared<db::config>();
auto& db_cfg = *db_cfg_ptr;
// Disable the cache altogether, we want all reads to go to disk.
db_cfg.enable_cache(false);
db_cfg.enable_commitlog(false);
db_cfg.reader_concurrency_semaphore_serialize_limit_multiplier.set(2, utils::config_file::config_source::CommandLine);
db_cfg.reader_concurrency_semaphore_kill_limit_multiplier.set(4, utils::config_file::config_source::CommandLine);
return do_with_cql_env_thread([] (cql_test_env& env) {
auto tbl = create_memory_limit_table(env, 54);
auto& db = env.local_db();
auto& semaphore = db.get_reader_concurrency_semaphore();
const auto num_reads = 128;
auto read_id = env.prepare("SELECT value FROM ks.tbl WHERE pk = ? AND ck = ?").get();
// We first check that the test params are not too strict and a single
// read can finish successfully.
try {
auto msg = env.execute_prepared(read_id, {cql3::raw_value::make_value(tbl.pk.explode().front()), cql3::raw_value::make_value(tbl.ck.explode().front())}).get();
assert_that(msg).is_rows().with_rows_ignore_order({ {serialized(tbl.value)} });
} catch (...) {
BOOST_FAIL(fmt::format("canary read failed with: {}", std::current_exception()));
}
uint64_t successful_reads = 0;
uint64_t failed_reads = 0;
parallel_for_each(std::views::iota(0, num_reads), [&] (int i) {
return env.execute_prepared(read_id, {cql3::raw_value::make_value(tbl.pk.explode().front()), cql3::raw_value::make_value(tbl.ck.explode().front())}).then_wrapped(
[&] (future<shared_ptr<cql_transport::messages::result_message>> fut) {
if (fut.failed()) {
// We expect failed, OOM-killed reads here.
// No way to verify why they failed so we swallow all failures.
fut.ignore_ready_future();
++failed_reads;
return;
}
assert_that(fut.get()).is_rows().with_rows_ignore_order({ {serialized(tbl.value)} });
++successful_reads;
});
}).get();
testlog.info("total reads: {} ({} successful, {} failed)", num_reads, successful_reads, failed_reads);
testlog.info("{}", semaphore.dump_diagnostics());
// There should be both successful and failed reads.
// If there is only one or the other, the test is not testing anything.
// We also check that the memory limiting mechanism of the semaphore was engaged.
// The test is meaningless without it.
// In the slow debug builds we never reach the kill limit for some reason.
#ifndef DEBUG
BOOST_REQUIRE_GE(failed_reads, 1);
#endif
BOOST_REQUIRE_GE(successful_reads, 1);
// Almost each failed read should have been in the memory queue at one point.
BOOST_REQUIRE_GE(semaphore.get_stats().reads_enqueued_for_memory, semaphore.get_stats().total_reads_killed_due_to_kill_limit);
// All failures must be caused by the kill limit triggering.
BOOST_REQUIRE_EQUAL(semaphore.get_stats().total_reads_killed_due_to_kill_limit, failed_reads);
return make_ready_future<>();
}, std::move(db_cfg_ptr));
}
// Check that request_memory() leaves the permit-state statistics of the
// semaphore (current_permits, need_cpu_permits, awaits_permits) unchanged,
// both for a request that is served without being queued and for one that is
// queued for memory and only completes after another permit releases memory.
// The check is repeated for a permit without guards, with a need_cpu_guard,
// and with both a need_cpu_guard and an awaits_guard.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_request_memory_preserves_state) {
const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
const auto serialize_multiplier = 2;
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count,
initial_resources.memory, 100, utils::updateable_value<uint32_t>(serialize_multiplier));
auto stop_sem = deferred_stop(semaphore);
// Helper permit, used in do_check() to make the checked permit's memory request queue up.
auto sponge_permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
// Expected value of the reads_enqueued_for_memory stat; bumped once per do_check() call.
uint64_t reads_enqueued_for_memory = 0;
// Runs the memory request sequence on `permit` and verifies that the stats
// match `need_cpu` and `awaits` before, between and after the requests.
// `sl` identifies the call site in the log.
auto do_check = [&] (reader_permit& permit, uint64_t need_cpu, uint64_t awaits, std::source_location sl) {
testlog.info("do_check() {}:{}", sl.file_name(), sl.line());
BOOST_REQUIRE_EQUAL(semaphore.get_stats().current_permits, 2);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().need_cpu_permits, need_cpu);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().awaits_permits, awaits);
// First request: completes without the sponge permit releasing anything.
auto units1 = permit.request_memory(1024).get();
BOOST_REQUIRE_EQUAL(semaphore.get_stats().current_permits, 2);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().need_cpu_permits, need_cpu);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().awaits_permits, awaits);
auto sponge_units = sponge_permit.request_memory(8 * 1024).get();
// sponge permit is now the blessed one
// Second request: must be queued for memory ...
auto units2_fut = permit.request_memory(1024);
BOOST_REQUIRE(!units2_fut.available());
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_memory, ++reads_enqueued_for_memory);
// ... and completes once the sponge permit gives its memory back.
sponge_units.reset_to_zero();
auto units2 = units2_fut.get();
BOOST_REQUIRE_EQUAL(semaphore.get_stats().current_permits, 2);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().need_cpu_permits, need_cpu);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().awaits_permits, awaits);
};
// active
{
auto permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
do_check(permit, 0, 0, std::source_location::current());
}
// need_cpu
{
auto permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
reader_permit::need_cpu_guard ncpu_guard{permit};
do_check(permit, 1, 0, std::source_location::current());
}
// awaits
{
auto permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
reader_permit::need_cpu_guard ncpu_guard{permit};
reader_permit::awaits_guard awaits_guard{permit};
do_check(permit, 1, 1, std::source_location::current());
}
}
// Check that a permit holding the blessed status loses it when its read is
// registered as inactive.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_blessed_read_goes_inactive) {
const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
const auto serialize_multiplier = 2;
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count,
initial_resources.memory, 100, utils::updateable_value<uint32_t>(serialize_multiplier));
auto stop_sem = deferred_stop(semaphore);
simple_schema ss;
auto s = ss.schema();
auto permit = semaphore.obtain_permit(s, get_name(), 1024, db::no_timeout, {}).get();
// Keeps the granted memory units alive until they are dropped explicitly below.
std::vector<reader_permit::resource_units> permit_res;
permit_res.emplace_back(permit.request_memory(1024).get());
permit_res.emplace_back(permit.request_memory(1024).get());
BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), reader_resources(1, 3 * 1024));
// No permit is blessed at this point.
BOOST_REQUIRE_EQUAL(semaphore.get_blessed_permit(), 0);
// permit is the blessed one
permit_res.emplace_back(permit.request_memory(1024).get());
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_memory, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_blessed_permit(), permit.id());
// register the blessed permit (permit) as inactive
permit_res.clear();
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit));
// Upon being registered as inactive, the permit should lose the blessed status
BOOST_REQUIRE_EQUAL(semaphore.get_blessed_permit(), 0);
}
// Verify that `stop()` evicts every registered inactive read.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_with_inactive_reads) {
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name(), reader_concurrency_semaphore::register_metrics::no);

    simple_schema schema_builder;
    auto schema = schema_builder.schema();

    // Admit a single permit and park its (empty) reader as an inactive read.
    auto permit_opt = reader_permit_opt(semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get());
    auto inactive_handle = semaphore.register_inactive_read(make_empty_mutation_reader(schema, *permit_opt));

    BOOST_REQUIRE(inactive_handle);
    BOOST_REQUIRE_EQUAL(permit_opt->get_state(), reader_permit::state::inactive);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);

    // From here on only BOOST_CHECK_* is used: an exception thrown while the
    // stop future is pending (not waited for) causes a segfault.
    auto stop_fut = semaphore.stop();
    BOOST_CHECK(!stop_fut.available());
    BOOST_CHECK(eventually_true([&] { return !semaphore.get_stats().inactive_reads; }));
    BOOST_CHECK(!inactive_handle);
    BOOST_CHECK_EQUAL(permit_opt->get_state(), reader_permit::state::evicted);

    // stop() waits for all permits to go away, so the permit has to be
    // released before the stop future can be waited on.
    permit_opt = {};
    stop_fut.get();
}
// Check that registering a read as inactive while its permit is queued waiting
// for memory evicts the permit right away and fails the pending memory request
// with std::bad_alloc. Afterwards the permit must be able to get readmitted and
// request memory successfully (reproducer for #13539).
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_permit_waiting_for_memory_goes_inactive) {
const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
const auto serialize_multiplier = 2;
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count,
initial_resources.memory, 100, utils::updateable_value<uint32_t>(serialize_multiplier));
auto stop_sem = deferred_stop(semaphore);
auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
// Holds all memory units obtained below until they are released together.
std::vector<reader_permit::resource_units> res;
res.emplace_back(permit1.consume_memory(2048));
res.emplace_back(permit2.consume_memory(2048));
// permit1's request is served without queueing and makes it the blessed permit.
res.emplace_back(permit1.request_memory(1024).get());
BOOST_REQUIRE_EQUAL(semaphore.get_blessed_permit(), permit1.id());
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_memory, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 0);
// permit2's request has to be queued for memory.
auto res_fut = permit2.request_memory(1024);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_memory, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
simple_schema ss;
auto s = ss.schema();
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit2));
// permit2 should have been evicted, its memory requests killed with std::bad_alloc
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 1);
BOOST_REQUIRE_EQUAL(permit2.get_state(), reader_permit::state::evicted);
BOOST_REQUIRE_THROW(res_fut.get(), std::bad_alloc);
res.clear();
// Reproduce #13539: successful request for memory, should not include
// amounts of failed request in the past
BOOST_REQUIRE(permit2.needs_readmission());
permit2.wait_readmission().get();
permit2.request_memory(1024).get();
}
// Check that inactive reads are not needlessly evicted when admission is not
// blocked on resources.
// This test covers all the cases where eviction should **not** happen.
// In every scenario the inactive read handle must stay valid and the
// inactive_reads stat must stay at 1.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_no_unnecessary_evicting) {
const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory, 100);
auto stop_sem = deferred_stop(semaphore);
simple_schema ss;
auto s = ss.schema();
// permit1 is shared by the scenarios below.
auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
// There are available resources
{
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 3 * 1024);
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
// Re-applying the (unchanged) resources must not evict the inactive read either.
semaphore.set_resources(initial_resources);
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
}
// Count resources are on the limit but no one wants more
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
// Re-applying the (unchanged) resources must not evict the inactive read either.
semaphore.set_resources(initial_resources);
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
}
// Memory resources are on the limit but no one wants more
{
auto units = permit1.consume_memory(3 * 1024);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
}
// Up the resource count, we need more permits to check the rest of the scenarios
semaphore.set_resources({4, 4 * 1024});
// There are waiters but they are not blocked on resources
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
auto permit3 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
// The guards are held in optionals so that they can be released individually.
std::optional<reader_permit::need_cpu_guard> ncpu_guard1{permit1};
std::optional<reader_permit::need_cpu_guard> ncpu_guard2{permit2};
// permit4 is queued because of the need_cpu permits (see the stat checked below).
auto permit4_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {});
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_queued_because_need_cpu_permits, 1);
// First check the register path.
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit3));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE_EQUAL(permit3.get_state(), reader_permit::state::inactive);
// Now check the callback admission path (admission check on resources being freed).
ncpu_guard2.reset();
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE_EQUAL(permit3.get_state(), reader_permit::state::inactive);
}
}
// Check that inactive reads are evicted when they are blocking admission
// Each scenario registers permit1's read as inactive and expects it to be
// evicted, either by a new admission attempt or by an already queued waiter.
// Between scenarios permit1 is readmitted so that it can be reused.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_necessary_evicting) {
const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory, 100);
auto stop_sem = deferred_stop(semaphore);
simple_schema ss;
auto s = ss.schema();
// Expected value of the permit_based_evictions stat; bumped once per scenario.
uint64_t evicted_reads = 0;
auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
// No count resources - obtaining new permit
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
// Obtaining another permit must evict the inactive read.
auto new_permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
}
// permit1 was evicted: readmit it for the next scenario.
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No count resources - waiter
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);
auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {});
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
// With a waiter already queued, the read is evicted upon registration.
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
new_permit_fut.get();
}
// permit1 was evicted: readmit it for the next scenario.
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No memory resources
{
auto units = permit1.consume_memory(3 * 1024);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
// Obtaining another permit must evict the inactive read.
auto new_permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
}
// permit1 was evicted: readmit it for the next scenario.
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No memory resources - waiter
{
auto units = permit1.consume_memory(3 * 1024);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);
auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {});
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
// With a waiter already queued, the read is evicted upon registration.
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
new_permit_fut.get();
}
// permit1 was evicted: readmit it for the next scenario.
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No count resources - waiter blocked on something else too
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);
std::optional<reader_permit::need_cpu_guard> ncpu_guard{permit2};
auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {});
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
// While the need_cpu guard is held, the inactive read is not evicted.
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
// Releasing the guard must eventually lead to the eviction.
ncpu_guard.reset();
REQUIRE_EVENTUALLY_EQUAL<bool>([&] { return bool(handle); }, false);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
new_permit_fut.get();
}
// permit1 was evicted: readmit it for the next scenario.
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No memory resources - waiter blocked on something else too
{
// One extra count resource, so that only memory is exhausted in this scenario.
semaphore.set_resources({initial_resources.count + 1, initial_resources.memory});
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
auto units = permit1.consume_memory(2 * 1024);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);
std::optional<reader_permit::need_cpu_guard> ncpu_guard{permit2};
auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {});
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
// While the need_cpu guard is held, the inactive read is not evicted.
auto handle = semaphore.register_inactive_read(make_empty_mutation_reader(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
// Releasing the guard must eventually lead to the eviction.
ncpu_guard.reset();
REQUIRE_EVENTUALLY_EQUAL<bool>([&] { return bool(handle); }, false);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
new_permit_fut.get();
// Restore the original resources.
semaphore.set_resources(initial_resources);
}
}
// Check that a waiter permit which was queued due to the _ready_list not being
// empty, will be executed right after the previous read in _ready_list is
// executed, even if said read doesn't trigger admission checks via releasing
// resources.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_execution_stage_wakeup) {
const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory, 100);
auto stop_sem = deferred_stop(semaphore);
auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted_immediately, 1);
// Set by the function passed to with_ready_permit() once it actually runs.
bool func_called = false;
auto func_fut = semaphore.with_ready_permit(permit1, [&] (reader_permit permit) {
func_called = true;
return sleep(std::chrono::milliseconds(1));
});
// permit1 should be on the ready list, not executed yet
BOOST_REQUIRE(!func_called);
// The permit waiting to be executed is counted as a waiter.
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);
// trying to obtain a second permit should block on the _ready_list
auto permit2_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {});
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_queued_because_ready_list, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 2);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted, 1);
// After func runs, the _ready_list becomes empty and the waiting permit should be admitted
func_fut.get();
BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted, 2);
permit2_fut.get();
}
// Check that updating the live-updateable count value is reflected by
// initial_resources(), while the memory part stays as configured.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_live_update_count) {
    const uint32_t memory_limit = 4 * 1024;
    const auto unlimited_multiplier = std::numeric_limits<uint32_t>::max();

    utils::updateable_value_source<int> count_source{1};

    reader_concurrency_semaphore semaphore(
            utils::updateable_value(count_source),
            memory_limit,
            get_name(),
            100,
            utils::updateable_value<uint32_t>(unlimited_multiplier), // serialize multiplier
            utils::updateable_value<uint32_t>(unlimited_multiplier), // kill multiplier
            utils::updateable_value<uint32_t>(1),                    // cpu concurrency
            utils::updateable_value<float>(0.0f),                    // preemptive abort factor
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

    // The semaphore's initial resources must always match the current value of the source.
    const auto require_resources_follow_count = [&] {
        BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(count_source(), memory_limit));
    };

    require_resources_follow_count();
    count_source.set(10);
    require_resources_follow_count();
}
// Check that changing the live-updateable cpu_concurrency value changes
// whether a new read can be admitted while permits need the CPU.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_live_update_cpu_concurrency) {
    simple_schema ss;
    const auto schema = ss.schema();

    const int32_t count_limit = 4;
    const uint32_t memory_limit = 4 * 1024;
    const auto unlimited_multiplier = std::numeric_limits<uint32_t>::max();
    utils::updateable_value_source<uint32_t> cpu_concurrency_source{2};

    reader_concurrency_semaphore semaphore(
            utils::updateable_value<int>(count_limit),
            memory_limit,
            get_name(),
            100,
            utils::updateable_value<uint32_t>(unlimited_multiplier), // serialize multiplier
            utils::updateable_value<uint32_t>(unlimited_multiplier), // kill multiplier
            utils::updateable_value(cpu_concurrency_source),
            utils::updateable_value<float>(0.0f),                    // preemptive abort factor
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

    // Thin wrapper around the file-level require_can_admit() which binds the
    // schema and the semaphore; the defaulted source location keeps reporting
    // the line of the call site.
    auto expect_admission = [&] (bool expected_can_admit, const char* description,
            std::source_location sl = std::source_location::current()) {
        ::require_can_admit(schema, semaphore, expected_can_admit, description, sl);
    };

    auto first_permit = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get();
    expect_admission(true, "!need_cpu");

    {
        // One permit needs the CPU: below the concurrency of 2.
        reader_permit::need_cpu_guard first_guard{first_permit};
        expect_admission(true, "need_cpu < cpu_concurrency");

        auto second_permit = semaphore.obtain_permit(schema, get_name(), 1024, db::no_timeout, {}).get();
        // no change
        expect_admission(true, "need_cpu < cpu_concurrency");

        {
            // Two permits need the CPU: the concurrency of 2 is reached.
            reader_permit::need_cpu_guard second_guard{second_permit};
            expect_admission(false, "need_cpu == cpu_concurrency");

            // Raising and lowering the concurrency live flips the result back and forth.
            cpu_concurrency_source.set(3);
            expect_admission(true, "after set(3): need_cpu < cpu_concurrency");

            cpu_concurrency_source.set(2);
            expect_admission(false, "after set(2): need_cpu == cpu_concurrency");
        }

        expect_admission(true, "need_cpu < cpu_concurrency");
    }

    expect_admission(true, "!need_cpu");
}
/// Check that a permit rejected because the wait queue is overloaded is
/// cleaned up properly and does not linger around.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_wait_queue_overload_cleanup) {
    simple_schema ss;
    const auto schema = ss.schema();
    const std::string test_name = get_name();

    reader_concurrency_semaphore semaphore(
            utils::updateable_value<int>(1),
            1024,
            test_name + " semaphore",
            1,
            utils::updateable_value<uint32_t>(2),
            utils::updateable_value<uint32_t>(4),
            utils::updateable_value<uint32_t>(1),
            utils::updateable_value<float>(0.0f),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

    // Exactly one read is admitted (immediately) throughout the test; only the
    // number of waiters, shed reads and live permits changes between the steps.
    const auto require_stats = [&] (uint64_t expected_waiters, uint64_t expected_shed, uint64_t expected_permits) {
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted, 1);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted_immediately, 1);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, expected_waiters);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().total_reads_shed_due_to_overload, expected_shed);
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().current_permits, expected_permits);
    };

    // Step 1: the first permit is admitted right away.
    reader_permit_opt admitted_permit = semaphore.obtain_permit(schema, test_name, 1024, db::no_timeout, {}).get();
    require_stats(0, 0, 1);

    // Step 2: the second permit has to wait.
    auto queued_permit_fut = semaphore.obtain_permit(schema, test_name, 1024, db::no_timeout, {});
    require_stats(1, 0, 2);

    // Step 3: the third permit is rejected, its function must never be invoked.
    {
        reader_permit_opt rejected_permit_holder;
        auto rejected_fut = semaphore.with_permit(schema, test_name.c_str(), 1024, db::no_timeout, {}, rejected_permit_holder, [] (reader_permit) {
            BOOST_FAIL("unexpected call to with permit lambda");
            return make_ready_future<>();
        });
        BOOST_REQUIRE(rejected_fut.failed());
        BOOST_CHECK_THROW(rejected_fut.get(), std::runtime_error);
    }

    // The permit count is the critical check in this test: it shows that the
    // rejected permit was destroyed and has not become a zombie permit due to
    // incomplete cleanup.
    require_stats(1, 1, 2);

    admitted_permit = {};
    queued_permit_fut.get();
}
// Check that attempting to abort an already aborted permit is handled correctly.
//
// Scenario: permit2 gets queued on memory and then times out (first abort).
// While it is still queued, its read is registered as inactive, which triggers
// a second abort attempt. The pending memory request must still fail with the
// time-out exception of the first abort.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_double_permit_abort) {
    simple_schema s;
    const auto schema = s.schema();
    const std::string test_name = get_name();

    reader_concurrency_semaphore semaphore(
            utils::updateable_value<int>(2),
            2048,
            test_name + " semaphore",
            std::numeric_limits<size_t>::max(),
            utils::updateable_value<uint32_t>(2),
            utils::updateable_value<uint32_t>(400),
            utils::updateable_value<uint32_t>(2),
            utils::updateable_value<float>(0.0f),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

    reader_permit_opt permit1 = semaphore.obtain_permit(schema, test_name, 1024, db::no_timeout, {}).get();
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted_immediately, 1);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 0);

    reader_permit_opt permit2 = semaphore.obtain_permit(schema, test_name, 1024, db::no_timeout, {}).get();
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted, 2);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_admitted_immediately, 2);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 0);

    // Exhaust all memory until serialize-limit triggers and make sure permit1 is
    // the blessed one.
    auto res1 = permit1->consume_memory(2048);
    auto requested_memory1_fut = permit1->request_memory(1024);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_memory, 0);
    auto requested_memory1 = requested_memory1_fut.get();

    // Requesting memory for permit2 will queue the permit for memory.
    auto requested_memory2_fut = permit2->request_memory(1024);
    BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued_for_memory, 1);

    // Move the timeout to the current time and wait for the permit to time out.
    permit2->set_timeout(db::timeout_clock::now());
    // The outcome of the wait is asserted explicitly: if the permit never times
    // out, the test has to fail right here rather than carry on and test
    // something other than a double abort.
    const bool timed_out = eventually_true([&] {
        auto ex = permit2->get_abort_exception();
        if (!ex) {
            return false;
        }
        try {
            std::rethrow_exception(ex);
        } catch (named_semaphore_timed_out&) {
            return true;
        } catch (...) {
            BOOST_FAIL(format("unexpected exception while waiting for permit to time out: {}", std::current_exception()));
            return true;
        }
    });
    BOOST_REQUIRE(timed_out);

    // Attempting to register a read which is queued on memory as inactive, will
    // trigger an attempt to abort the read.
    // This is where the double-abort happens.
    auto irh = semaphore.register_inactive_read(make_empty_mutation_reader(s.schema(), *permit2));
    BOOST_REQUIRE_THROW(requested_memory2_fut.get(), named_semaphore_timed_out);
}
// Check that a permit which was aborted preemptively during admission can
// afterwards be passed to register_inactive_read(): the registration must
// yield an empty handle.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_abort_preemptively_aborted_permit) {
simple_schema s;
const auto schema = s.schema();
const std::string test_name = get_name();
const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
const auto serialize_multiplier = 2;
// Ensure permits are shed immediately during admission.
const auto preemptive_abort_factor = 1.0f;
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count,
initial_resources.memory, 100, utils::updateable_value<uint32_t>(serialize_multiplier),
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value<uint32_t>(1), utils::updateable_value<float>(preemptive_abort_factor));
auto stop_sem = deferred_stop(semaphore);
// Set a ridiculously long timeout to ensure permit will not be rejected due to timeout
auto timeout = db::timeout_clock::now() + 60min;
auto permit1 = semaphore.obtain_permit(schema, test_name, 1024, db::no_timeout, {}).get();
// NOTE(review): 2024 (rather than 2048) looks like it might be a typo - confirm.
// Presumably this takes enough memory to keep permit2 from being admitted
// until the units are released below - verify against the semaphore.
auto units1 = permit1.request_memory(2024).get();
// Keeps permit2 accessible after its admission has failed.
reader_permit_opt permit2_holder;
auto permit2_fut = semaphore.with_permit(schema, test_name.c_str(), 1024, timeout, {}, permit2_holder, [] (reader_permit) {
BOOST_FAIL("unexpected call to with permit lambda");
return make_ready_future<>();
});
// Triggers maybe_admit_waiters()
units1.reset_to_zero();
// permit2 must be shed (aborted) instead of being admitted.
BOOST_REQUIRE(eventually_true([&] { return permit2_fut.failed(); }));
BOOST_REQUIRE_THROW(permit2_fut.get(), named_semaphore_aborted);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().total_reads_shed_due_to_overload, 1);
// Registering the aborted permit's read as inactive must give back an empty handle.
auto irh = semaphore.register_inactive_read(make_empty_mutation_reader(schema, *permit2_holder));
BOOST_CHECK(!irh);
}
/// Test that a single permit is always admitted while no count resources are
/// in use, no matter how much memory is available.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_always_admit_one_permit) {
    simple_schema ss;
    const auto schema = ss.schema();
    const std::string test_name = get_name();

    reader_concurrency_semaphore semaphore(
            utils::updateable_value<int>(2),
            2048,
            test_name + " semaphore",
            std::numeric_limits<size_t>::max(),
            utils::updateable_value<uint32_t>(200),
            utils::updateable_value<uint32_t>(400),
            utils::updateable_value<uint32_t>(1),
            utils::updateable_value<float>(0.0f),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

    // Scenario 1: all memory is used by a tracking-only permit (which does not
    // consume count resources).
    {
        auto tracking_permit = semaphore.make_tracking_only_permit(schema, test_name, db::no_timeout, {});
        auto consumed_memory = tracking_permit.consume_memory(4096);

        require_can_admit(schema, semaphore, true, "all memory used, but one permit should always be admitted");
    }

    // Scenario 2: all memory is used by an evicted permit (whose count
    // resource was recouped).
    {
        auto admitted_permit = semaphore.obtain_permit(schema, test_name, 1024, db::no_timeout, {}).get();
        auto consumed_memory = admitted_permit.consume_memory(4096);

        // While the permit is still admitted, nothing else can get in.
        require_can_admit(schema, semaphore, false, "all memory used, cannot admit");

        // Registering the read as inactive is expected to evict it on the spot.
        auto inactive_handle = semaphore.register_inactive_read(make_empty_mutation_reader(ss.schema(), admitted_permit));
        BOOST_REQUIRE(!inactive_handle);

        require_can_admit(schema, semaphore, true, "all memory used, but one permit should always be admitted");
    }
}
// Check release_base_resources(): it hands the permit's base resources back to
// the semaphore while base_resources() keeps reporting them, and it is a no-op
// on a permit whose base resources were already released by eviction.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_release_base_resources) {
    simple_schema ss;
    const auto schema = ss.schema();
    const std::string test_name = get_name();

    const auto total_resources = reader_resources{2, 2048};

    reader_concurrency_semaphore semaphore(
            utils::updateable_value<int>(total_resources.count),
            total_resources.memory,
            test_name + " semaphore",
            std::numeric_limits<size_t>::max(),
            utils::updateable_value<uint32_t>(200),
            utils::updateable_value<uint32_t>(400),
            utils::updateable_value<uint32_t>(1),
            utils::updateable_value<float>(0.0f),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

    const auto expected_base_resources = reader_resources{1, 1024};
    const auto expected_available_resources = total_resources - expected_base_resources;
    const auto expected_consumed_resources = expected_base_resources;

    // State expected once the base resources are no longer consumed: the
    // permit still reports its base resources, but neither the permit nor the
    // semaphore has anything consumed.
    const auto require_nothing_consumed = [&] (reader_permit& permit) {
        BOOST_REQUIRE_EQUAL(permit.base_resources(), expected_base_resources);
        BOOST_REQUIRE_EQUAL(permit.consumed_resources(), reader_resources{});
        BOOST_REQUIRE_EQUAL(semaphore.available_resources(), total_resources);
        BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), reader_resources{});
    };

    // Case 1: explicit release on an admitted permit.
    {
        auto permit = semaphore.obtain_permit(ss.schema(), get_name(), 1024, db::no_timeout, {}).get();

        BOOST_REQUIRE_EQUAL(permit.base_resources(), expected_base_resources);
        BOOST_REQUIRE_EQUAL(permit.consumed_resources(), permit.base_resources());
        BOOST_REQUIRE_EQUAL(semaphore.available_resources(), expected_available_resources);
        BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), expected_consumed_resources);

        permit.release_base_resources();

        // The base resources themselves are unchanged, they are just not consumed.
        require_nothing_consumed(permit);
    }

    // Case 2: release on a permit which was evicted beforehand.
    {
        auto permit = semaphore.obtain_permit(ss.schema(), get_name(), 1024, db::no_timeout, {}).get();

        auto inactive_handle = semaphore.register_inactive_read(make_empty_mutation_reader(ss.schema(), permit));
        BOOST_REQUIRE(inactive_handle);
        BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
        BOOST_REQUIRE(!inactive_handle);

        require_nothing_consumed(permit);

        // Should be a no-op.
        permit.release_base_resources();

        // The base resources themselves are unchanged, they are just not consumed.
        require_nothing_consumed(permit);
    }
}
// Reproducer for https://scylladb.atlassian.net/browse/SCYLLADB-1016
//
// A permit whose memory request had to wait must not carry the stale requested
// amount over into its next memory request; otherwise the extra bytes are
// consumed from the semaphore without being tracked by any resource_units.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_preemptive_abort_requested_memory_leak) {
    const ssize_t semaphore_memory = 1024;
    const uint32_t serialize_multiplier = 2;

    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(),
            2, // count
            semaphore_memory,
            100, // max queue length
            utils::updateable_value(serialize_multiplier),
            utils::updateable_value(std::numeric_limits<uint32_t>::max()), // kill limit multiplier
            utils::updateable_value<uint32_t>(1), // cpu concurrency
            utils::updateable_value<float>(1.0f)); // preemptive abort factor
    auto stop_sem = deferred_stop(semaphore);

    // Two admitted permits, each taking half of the semaphore's memory. Only
    // the second one has a (finite) timeout.
    auto first_permit = semaphore.obtain_permit(nullptr, "permit1", semaphore_memory/2, db::no_timeout, {}).get();
    reader_permit_opt second_permit = semaphore.obtain_permit(nullptr, "permit2", semaphore_memory/2, db::timeout_clock::now() + 60s, {}).get();

    // The first permit takes memory * serialize-limit-multiplier bytes. After
    // that, the second permit's request must not be granted immediately.
    auto first_units = first_permit.request_memory(semaphore_memory * serialize_multiplier).get();
    auto blocked_request = second_permit->request_memory(1024);
    BOOST_REQUIRE(!blocked_request.available());

    // Triggers maybe_admit_waiters()
    first_units.reset_to_zero();

    // Resolve the pending request so that the 1024 bytes consumed by
    // on_granted_memory() are accounted for. In debug mode the .then()
    // continuation that builds the resource_units may be deferred due to
    // yielding, hence the explicit .get(): it makes sure the resource_units
    // exists and can be destroyed properly.
    {
        auto granted = blocked_request.get();
    }

    // With the bug present, on_granted_memory() consumes the stale
    // _requested_memory (1024) on top of the 512 requested here, while the
    // resource_units only tracks 512 - the difference leaks.
    {
        auto granted = second_permit->request_memory(512).get();
    }

    // Shouldn't fail if SCYLLADB-1016 is fixed.
    second_permit = {};
}
// Checks that signal() notices and repairs a "negative leak".
// If a bug makes the available resources exceed the initial resources after
// signal(), the semaphore is expected to report it through
// on_internal_error_noexcept and to clamp _resources back to
// _initial_resources, so that consumed_resources() never becomes negative.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_signal_detects_negative_resource_leak) {
    const auto initial_resources = reader_resources{2, 2048};
    const auto no_resources = reader_resources{};

    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(),
            initial_resources.count, initial_resources.memory);
    auto stop_sem = deferred_stop(semaphore);

    // Sanity check: a fresh semaphore has everything available.
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
    BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), no_resources);

    // Simulate the negative leak by signalling more resources than were ever
    // consumed - what a double-return or an inflated signal() amount would
    // look like.
    const auto over_returned = reader_resources{1, 512};
    {
        // signal() calls on_internal_error_noexcept, which would abort in
        // test mode; suppress the abort for the duration of this scope only.
        seastar::testing::scoped_no_abort_on_internal_error suppress_abort;
        reader_concurrency_semaphore_tester::signal(semaphore, over_returned);
    }

    // The over-return must have been detected and the available resources
    // clamped back to the initial amount.
    BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
    BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), no_resources);
}
// Closes the Boost.Test suite that contains the test cases above.
BOOST_AUTO_TEST_SUITE_END()