/* * Copyright (C) 2018 ScyllaDB */ /* * This file is part of Scylla. * * Scylla is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Scylla is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Scylla. If not, see . */ #include #include #include #include #include "reader_concurrency_semaphore.hh" #include "utils/exceptions.hh" #include "schema.hh" #include "utils/human_readable.hh" #include "flat_mutation_reader.hh" logger rcslog("reader_concurrency_semaphore"); reader_permit::resource_units::resource_units(reader_permit permit, reader_resources res) noexcept : _permit(std::move(permit)), _resources(res) { _permit.consume(res); } reader_permit::resource_units::resource_units(resource_units&& o) noexcept : _permit(std::move(o._permit)) , _resources(std::exchange(o._resources, {})) { } reader_permit::resource_units::~resource_units() { if (_resources) { reset(); } } reader_permit::resource_units& reader_permit::resource_units::operator=(resource_units&& o) noexcept { if (&o == this) { return *this; } reset(); _permit = std::move(o._permit); _resources = std::exchange(o._resources, {}); return *this; } void reader_permit::resource_units::add(resource_units&& o) { assert(_permit == o._permit); _resources += std::exchange(o._resources, {}); } void reader_permit::resource_units::reset(reader_resources res) { _permit.consume(res); if (_resources) { _permit.signal(_resources); } _resources = res; } class reader_permit::impl : public boost::intrusive::list_base_hook> { reader_concurrency_semaphore& _semaphore; const schema* _schema; sstring _op_name; std::string_view _op_name_view; reader_resources _resources; reader_permit::state _state = reader_permit::state::registered; public: struct value_tag {}; impl(reader_concurrency_semaphore& semaphore, const schema* const schema, const std::string_view& op_name) : _semaphore(semaphore) , _schema(schema) , _op_name_view(op_name) { } impl(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name) : _semaphore(semaphore) , _schema(schema) , _op_name(std::move(op_name)) , _op_name_view(_op_name) { } ~impl() { if (_resources) { on_internal_error_noexcept(rcslog, format("reader_permit::impl::~impl(): permit {}.{}:{} detected a leak of {{count={}, memory={}}} resources", _schema ? _schema->ks_name() : "*", _schema ? _schema->cf_name() : "*", _op_name_view, _resources.count, _resources.memory)); } } reader_concurrency_semaphore& semaphore() { return _semaphore; } const ::schema* get_schema() const { return _schema; } std::string_view get_op_name() const { return _op_name_view; } reader_permit::state get_state() const { return _state; } void on_waiting() { _state = reader_permit::state::waiting; } void on_admission() { _state = reader_permit::state::admitted; _semaphore.consume(_resources); } void consume(reader_resources res) { _resources += res; if (_state == reader_permit::state::admitted) { _semaphore.consume(res); } } void signal(reader_resources res) { _resources -= res; if (_state == reader_permit::state::admitted) { _semaphore.signal(res); } } reader_resources resources() const { return _resources; } }; struct reader_concurrency_semaphore::permit_list { using list_type = boost::intrusive::list>; list_type permits; }; reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name) : _impl(::seastar::make_shared(semaphore, schema, op_name)) { semaphore._permit_list->permits.push_back(*_impl); } reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name) : _impl(::seastar::make_shared(semaphore, schema, std::move(op_name))) { semaphore._permit_list->permits.push_back(*_impl); } void reader_permit::on_waiting() { _impl->on_waiting(); } void reader_permit::on_admission() { _impl->on_admission(); } reader_permit::~reader_permit() { } reader_concurrency_semaphore& reader_permit::semaphore() { return _impl->semaphore(); } future reader_permit::wait_admission(size_t memory, db::timeout_clock::time_point timeout) { return _impl->semaphore().do_wait_admission(*this, memory, timeout); } void reader_permit::consume(reader_resources res) { _impl->consume(res); } void reader_permit::signal(reader_resources res) { _impl->signal(res); } reader_permit::resource_units reader_permit::consume_memory(size_t memory) { return consume_resources(reader_resources{0, ssize_t(memory)}); } reader_permit::resource_units reader_permit::consume_resources(reader_resources res) { return resource_units(*this, res); } reader_resources reader_permit::consumed_resources() const { return _impl->resources(); } std::ostream& operator<<(std::ostream& os, reader_permit::state s) { switch (s) { case reader_permit::state::registered: os << "registered"; break; case reader_permit::state::waiting: os << "waiting"; break; case reader_permit::state::admitted: os << "admitted"; break; } return os; } namespace { struct permit_stats { uint64_t memory = 0; uint64_t count = 0; void add(uint64_t m) { memory += m; ++count; } permit_stats& operator+=(const permit_stats& o) { memory += o.memory; count += o.count; return *this; } }; using permit_group_key = std::tuple; struct permit_group_key_hash { size_t operator()(const permit_group_key& k) const { using underlying_type = std::underlying_type_t; return std::hash()(reinterpret_cast(std::get<0>(k))) ^ std::hash()(std::get<1>(k)) ^ std::hash()(static_cast(std::get<2>(k))); } }; using permit_groups = std::unordered_map; static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state, bool sort_by_memory) { struct permit_summary { const schema* s; std::string_view op_name; uint64_t memory; uint64_t count; }; std::vector permit_summaries; for (const auto& [k, v] : permits) { const auto& [s, op_name, k_state] = k; if (k_state == state) { permit_summaries.emplace_back(permit_summary{s, op_name, v.memory, v.count}); } } std::ranges::sort(permit_summaries, [sort_by_memory] (const permit_summary& a, const permit_summary& b) { if (sort_by_memory) { return a.memory < b.memory; } else { return a.count < b.count; } }); permit_stats total; auto print_line = [&os, sort_by_memory] (auto col1, auto col2, auto col3) { if (sort_by_memory) { fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3); } else { fmt::print(os, "{}\t{}\t{}\n", col1, col2, col3); } }; fmt::print(os, "Permits with state {}, sorted by {}\n", state, sort_by_memory ? "memory" : "count"); print_line("count", "memory", "name"); for (const auto& summary : permit_summaries) { total.count += summary.count; total.memory += summary.memory; print_line(summary.count, utils::to_hr_size(summary.memory), fmt::format("{}.{}:{}", summary.s ? summary.s->ks_name() : "*", summary.s ? summary.s->cf_name() : "*", summary.op_name)); } fmt::print(os, "\n"); print_line(total.count, utils::to_hr_size(total.memory), "total"); return total; } static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_concurrency_semaphore& semaphore, const reader_concurrency_semaphore::permit_list& list, std::string_view problem) { permit_groups permits; for (const auto& permit : list.permits) { permits[permit_group_key(permit.get_schema(), permit.get_op_name(), permit.get_state())].add(permit.resources().memory); } permit_stats total; fmt::print(os, "Semaphore {}: {}, dumping permit diagnostics:\n", semaphore.name(), problem); total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::admitted, true); fmt::print(os, "\n"); total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting, false); fmt::print(os, "\n"); total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::registered, false); fmt::print(os, "\n"); fmt::print(os, "Total: permits: {}, memory: {}\n", total.count, utils::to_hr_size(total.memory)); } static void maybe_dump_reader_permit_diagnostics(const reader_concurrency_semaphore& semaphore, const reader_concurrency_semaphore::permit_list& list, std::string_view problem) { static thread_local logger::rate_limit rate_limit(std::chrono::seconds(30)); rcslog.log(log_level::info, rate_limit, "{}", value_of([&] { std::ostringstream os; do_dump_reader_permit_diagnostics(os, semaphore, list, problem); return os.str(); })); } } // anonymous namespace void reader_concurrency_semaphore::expiry_handler::operator()(entry& e) noexcept { e.pr.set_exception(named_semaphore_timed_out(_semaphore._name)); maybe_dump_reader_permit_diagnostics(_semaphore, *_semaphore._permit_list, "timed out"); } reader_concurrency_semaphore::inactive_read::~inactive_read() { } void reader_concurrency_semaphore::signal(const resources& r) noexcept { _resources += r; while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) { auto& x = _wait_list.front(); try { x.permit.on_admission(); x.pr.set_value(reader_permit::resource_units(std::move(x.permit), x.res)); } catch (...) { x.pr.set_exception(std::current_exception()); } _wait_list.pop_front(); } } reader_concurrency_semaphore::reader_concurrency_semaphore(int count, ssize_t memory, sstring name, size_t max_queue_length, std::function prethrow_action) : _initial_resources(count, memory) , _resources(count, memory) , _wait_list(expiry_handler(*this)) , _name(std::move(name)) , _max_queue_length(max_queue_length) , _prethrow_action(std::move(prethrow_action)) , _permit_list(std::make_unique()) {} reader_concurrency_semaphore::reader_concurrency_semaphore(no_limits, sstring name) : reader_concurrency_semaphore( std::numeric_limits::max(), std::numeric_limits::max(), std::move(name)) {} reader_concurrency_semaphore::~reader_concurrency_semaphore() { broken(std::make_exception_ptr(broken_semaphore{})); } reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(flat_mutation_reader reader) noexcept { // Implies _inactive_reads.empty(), we don't queue new readers before // evicting all inactive reads. if (_wait_list.empty()) { try { auto irp = std::make_unique(std::move(reader)); auto& ir = *irp; _inactive_reads.push_back(ir); ++_stats.inactive_reads; return inactive_read_handle(*this, std::move(irp)); } catch (...) { // It is okay to swallow the exception since // we're allowed to drop the reader upon registration // due to lack of resources. Returning an empty // i_r_h here rather than throwing simplifies the caller's // error handling. rcslog.warn("Registering inactive read failed: {}. Ignored as if it was evicted.", std::current_exception()); } } else { ++_stats.permit_based_evictions; } return inactive_read_handle(); } void reader_concurrency_semaphore::set_notify_handler(inactive_read_handle& irh, eviction_notify_handler&& notify_handler, std::optional ttl_opt) { auto& ir = *irh._irp; ir.notify_handler = std::move(notify_handler); if (ttl_opt) { ir.ttl_timer.set_callback([this, &ir] { evict(ir, evict_reason::time); }); ir.ttl_timer.arm(lowres_clock::now() + *ttl_opt); } } flat_mutation_reader_opt reader_concurrency_semaphore::unregister_inactive_read(inactive_read_handle irh) { if (!irh) { return {}; } if (irh._sem != this) { on_internal_error(rcslog, fmt::format( "reader_concurrency_semaphore::unregister_inactive_read(): " "attempted to unregister an inactive read with a handle belonging to another semaphore: " "this is {} (0x{:x}) but the handle belongs to {} (0x{:x})", name(), reinterpret_cast(this), irh._sem->name(), reinterpret_cast(irh._sem))); } --_stats.inactive_reads; auto irp = std::move(irh._irp); irp->unlink(); return std::move(irp->reader); } bool reader_concurrency_semaphore::try_evict_one_inactive_read(evict_reason reason) { if (_inactive_reads.empty()) { return false; } evict(_inactive_reads.front(), reason); return true; } void reader_concurrency_semaphore::evict(inactive_read& ir, evict_reason reason) { auto reader = std::move(ir.reader); ir.unlink(); if (auto notify_handler = std::move(ir.notify_handler)) { notify_handler(reason); // The notify_handler may destroy the inactive_read. // Do not use it after this point! } switch (reason) { case evict_reason::permit: ++_stats.permit_based_evictions; break; case evict_reason::time: ++_stats.time_based_evictions; break; case evict_reason::manual: break; } --_stats.inactive_reads; } bool reader_concurrency_semaphore::has_available_units(const resources& r) const { return bool(_resources) && _resources >= r; } bool reader_concurrency_semaphore::may_proceed(const resources& r) const { // Special case: when there is no active reader (based on count) admit one // regardless of availability of memory. return _wait_list.empty() && (has_available_units(r) || _resources.count == _initial_resources.count); } future reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory, db::timeout_clock::time_point timeout) { if (_wait_list.size() >= _max_queue_length) { _stats.total_reads_shed_due_to_overload++; if (_prethrow_action) { _prethrow_action(); } maybe_dump_reader_permit_diagnostics(*this, *_permit_list, "wait queue overloaded"); return make_exception_future( std::make_exception_ptr(std::runtime_error( format("{}: restricted mutation reader queue overload", _name)))); } auto r = resources(1, static_cast(memory)); while (!may_proceed(r)) { if (!try_evict_one_inactive_read(evict_reason::permit)) { break; } } if (may_proceed(r)) { permit.on_admission(); return make_ready_future(reader_permit::resource_units(std::move(permit), r)); } promise pr; auto fut = pr.get_future(); permit.on_waiting(); _wait_list.push_back(entry(std::move(pr), std::move(permit), r), timeout); return fut; } reader_permit reader_concurrency_semaphore::make_permit(const schema* const schema, const char* const op_name) { return reader_permit(*this, schema, std::string_view(op_name)); } reader_permit reader_concurrency_semaphore::make_permit(const schema* const schema, sstring&& op_name) { return reader_permit(*this, schema, std::move(op_name)); } void reader_concurrency_semaphore::broken(std::exception_ptr ex) { while (!_wait_list.empty()) { _wait_list.front().pr.set_exception(std::make_exception_ptr(broken_semaphore{})); _wait_list.pop_front(); } } // A file that tracks the memory usage of buffers resulting from read // operations. class tracking_file_impl : public file_impl { file _tracked_file; reader_permit _permit; public: tracking_file_impl(file file, reader_permit permit) : _tracked_file(std::move(file)) , _permit(std::move(permit)) { _memory_dma_alignment = _tracked_file.memory_dma_alignment(); _disk_read_dma_alignment = _tracked_file.disk_read_dma_alignment(); _disk_write_dma_alignment = _tracked_file.disk_write_dma_alignment(); } tracking_file_impl(const tracking_file_impl&) = delete; tracking_file_impl& operator=(const tracking_file_impl&) = delete; tracking_file_impl(tracking_file_impl&&) = default; tracking_file_impl& operator=(tracking_file_impl&&) = default; virtual future write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override { return get_file_impl(_tracked_file)->write_dma(pos, buffer, len, pc); } virtual future write_dma(uint64_t pos, std::vector iov, const io_priority_class& pc) override { return get_file_impl(_tracked_file)->write_dma(pos, std::move(iov), pc); } virtual future read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override { return get_file_impl(_tracked_file)->read_dma(pos, buffer, len, pc); } virtual future read_dma(uint64_t pos, std::vector iov, const io_priority_class& pc) override { return get_file_impl(_tracked_file)->read_dma(pos, iov, pc); } virtual future<> flush(void) override { return get_file_impl(_tracked_file)->flush(); } virtual future stat(void) override { return get_file_impl(_tracked_file)->stat(); } virtual future<> truncate(uint64_t length) override { return get_file_impl(_tracked_file)->truncate(length); } virtual future<> discard(uint64_t offset, uint64_t length) override { return get_file_impl(_tracked_file)->discard(offset, length); } virtual future<> allocate(uint64_t position, uint64_t length) override { return get_file_impl(_tracked_file)->allocate(position, length); } virtual future size(void) override { return get_file_impl(_tracked_file)->size(); } virtual future<> close() override { return get_file_impl(_tracked_file)->close(); } virtual std::unique_ptr dup() override { return get_file_impl(_tracked_file)->dup(); } virtual subscription list_directory(std::function (directory_entry de)> next) override { return get_file_impl(_tracked_file)->list_directory(std::move(next)); } virtual future> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override { return get_file_impl(_tracked_file)->dma_read_bulk(offset, range_size, pc).then([this, units = _permit.consume_memory(range_size)] (temporary_buffer buf) { return make_ready_future>(make_tracked_temporary_buffer(std::move(buf), _permit)); }); } }; file make_tracked_file(file f, reader_permit p) { return file(make_shared(f, std::move(p))); }