The wasmtime runtime allocates memory for the executable code of the WASM programs using mmap and not the seastar allocator. As a result, the memory that Scylla actually uses becomes not only the memory preallocated for the seastar allocator but the sum of that and the memory allocated for executable codes by the WASM runtime. To keep limiting the memory used by Scylla, we measure how much memory do the WASM programs use and if they use too much, compiled WASM UDFs (modules) that are currently not in use are evicted to make room. To evict a module it is required to evict all instances of this module (the underlying implementation of modules and instances uses shared pointers to the executable code). For this reason, we add reference counts to modules. Each instance using a module is a reference. When an instance is destroyed, a reference is removed. If all references to a module are removed, the executable code for this module is deallocated. The eviction of a module is actually acheved by eviction of all its references. When we want to free memory for a new module we repeatedly evict instances from the wasm_instance_cache using its LRU strategy until some module loses all its instances. This process may not succeed if the instances currently in use (so not in the cache) use too much memory - in this case the query also fails. Otherwise the new module is added to the tracking system. This strategy may evict some instances unnecessarily, but evicting modules should not happen frequently, and any more efficient solution requires an even bigger intervention into the code.
302 lines
10 KiB
C++
302 lines
10 KiB
C++
/*
|
|
* Copyright (C) 2022-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#include "lang/wasm_instance_cache.hh"
|
|
#include "lang/wasm.hh"
|
|
#include "seastar/core/metrics.hh"
|
|
#include "seastar/core/scheduling.hh"
|
|
#include <exception>
|
|
#include <seastar/core/units.hh>
|
|
#include <seastar/core/shared_mutex.hh>
|
|
#include <seastar/util/defer.hh>
|
|
#include <unistd.h>
|
|
|
|
namespace wasm {
|
|
|
|
static size_t compiled_size(const wasmtime::Module& module) noexcept {
|
|
// Round up the exact size to the nearest page size.
|
|
auto page_size = getpagesize();
|
|
return (module.raw_size() + (page_size - 1)) & (~(page_size - 1));
|
|
}
|
|
|
|
static size_t wasm_stack_size() noexcept {
|
|
// Wasm stack contains 2 stacks - one for wasm functions and one for
|
|
// host functions, both of which are 128KB - and a guard page.
|
|
return 256 * KB + getpagesize();
|
|
}
|
|
|
|
module_handle::module_handle(wasmtime::Module& module, instance_cache& cache, wasmtime::Engine& engine)
|
|
: _module(module)
|
|
, _cache(cache)
|
|
{
|
|
_cache.track_module_ref(_module, engine);
|
|
}
|
|
|
|
module_handle::module_handle(const module_handle& mh) noexcept
|
|
: _module(mh._module)
|
|
, _cache(mh._cache)
|
|
{
|
|
_module.add_user();
|
|
}
|
|
|
|
module_handle::~module_handle() noexcept {
|
|
_cache.remove_module_ref(_module);
|
|
}
|
|
|
|
static constexpr size_t WASM_PAGE_SIZE = 64 * KB;
|
|
|
|
instance_cache::stats& instance_cache::shard_stats() {
|
|
return _stats;
|
|
}
|
|
|
|
void instance_cache::setup_metrics() {
|
|
namespace sm = seastar::metrics;
|
|
_metrics.add_group("user_functions", {
|
|
sm::make_counter("cache_hits", wasm::instance_cache::shard_stats().cache_hits,
|
|
sm::description("The number of user defined function cache hits")),
|
|
sm::make_counter("cache_misses", wasm::instance_cache::shard_stats().cache_misses,
|
|
sm::description("The number of user defined functions loaded")),
|
|
sm::make_counter("cache_blocks", wasm::instance_cache::shard_stats().cache_blocks,
|
|
sm::description("The number of times a user defined function waited for an instance")),
|
|
sm::make_gauge("cache_instace_count_any", [this] { return _cache.size(); },
|
|
sm::description("The total number of cached wasm instances, instances in use and empty instances")),
|
|
sm::make_gauge("cache_total_size", [this] { return _total_size; },
|
|
sm::description("The total size of instances stored in the user defined function cache")),
|
|
});
|
|
}
|
|
|
|
instance_cache::instance_cache(size_t size, size_t instance_size, seastar::lowres_clock::duration timer_period)
|
|
: _timer([this] { return on_timer(); })
|
|
, _timer_period(timer_period)
|
|
, _max_size(size)
|
|
, _max_instance_size(instance_size)
|
|
{
|
|
setup_metrics();
|
|
_timer.arm_periodic(_timer_period);
|
|
}
|
|
|
|
wasm_instance instance_cache::load(wasm::context& ctx) {
|
|
auto mh = module_handle(**ctx.module, *this, ctx.engine_ptr);
|
|
auto store = wasmtime::create_store(ctx.engine_ptr, ctx.total_fuel, ctx.yield_fuel);
|
|
auto instance = wasmtime::create_instance(ctx.engine_ptr, **ctx.module, *store);
|
|
auto func = wasmtime::create_func(*instance, *store, ctx.function_name);
|
|
auto memory = wasmtime::get_memory(*instance, *store);
|
|
return wasm_instance{
|
|
.store = std::move(store),
|
|
.instance = std::move(instance),
|
|
.func = std::move(func),
|
|
.memory = std::move(memory),
|
|
.mh = std::move(mh)
|
|
};
|
|
}
|
|
|
|
// lru must not be empty, and its elements must refer to entries in _cache
|
|
void instance_cache::evict_lru() noexcept {
|
|
auto& entry = _lru.front();
|
|
_total_size -= entry.instance_size;
|
|
entry.cache_entry->instance = std::nullopt;
|
|
entry.cache_entry->it = _lru.end();
|
|
_lru.pop_front();
|
|
}
|
|
|
|
void instance_cache::on_timer() noexcept {
|
|
auto now = seastar::lowres_clock::now();
|
|
while (!_lru.empty() && _lru.front().timestamp + _timer_period < now) {
|
|
evict_lru();
|
|
}
|
|
}
|
|
|
|
static uint32_t get_instance_size(wasm_instance& instance) {
|
|
// reserve 1 wasm page for instance data other than the wasm memory
|
|
return WASM_PAGE_SIZE * (1 + instance.memory->size(*instance.store));
|
|
}
|
|
|
|
seastar::future<instance_cache::value_type> instance_cache::get(const db::functions::function_name& name, const std::vector<data_type>& arg_types, wasm::context& ctx) {
|
|
auto [it, end_it] = _cache.equal_range(name);
|
|
while (it != end_it) {
|
|
if (it->second->scheduling_group == seastar::current_scheduling_group() && it->second->arg_types == arg_types) {
|
|
break;
|
|
}
|
|
++it;
|
|
}
|
|
|
|
if (it == end_it) {
|
|
it = _cache.emplace(name, make_lw_shared<cache_entry_type>(cache_entry_type{
|
|
.scheduling_group = seastar::current_scheduling_group(),
|
|
.arg_types = arg_types,
|
|
.mutex = seastar::shared_mutex(),
|
|
.instance = std::nullopt,
|
|
.it = _lru.end(),
|
|
.module = *ctx.module.value(),
|
|
}));
|
|
}
|
|
auto& entry = it->second;
|
|
auto f = entry->mutex.lock();
|
|
if (!f.available()) {
|
|
++shard_stats().cache_blocks;
|
|
}
|
|
return f.then([this, entry, &ctx] {
|
|
// When the instance leaves the cache, it should be ready to be used. For
|
|
// that, we need to make sure that there is enough free memory for the
|
|
// wasm runtime stack, that is allocated at the start of the UDF execution,
|
|
// and which is not allocated using seastar allocator, but using mmap.
|
|
reserve_wasm_stack();
|
|
if (!entry->instance) {
|
|
++shard_stats().cache_misses;
|
|
try {
|
|
entry->instance.emplace(load(ctx));
|
|
} catch (...) {
|
|
// We couldn't actually use the compiled module, so we need to remove
|
|
// the reference to it.
|
|
std::exception_ptr ex = std::current_exception();
|
|
return make_exception_future<instance_cache::value_type>(std::move(ex));
|
|
}
|
|
} else {
|
|
// because we don't want to remove an instance after it starts being used,
|
|
// and also because we can't track its size efficiently, we remove it from
|
|
// lru and subtract its size from the total size until it is no longer used
|
|
++shard_stats().cache_hits;
|
|
_total_size -= entry->it->instance_size;
|
|
_lru.erase(entry->it);
|
|
entry->it = _lru.end();
|
|
}
|
|
return make_ready_future<instance_cache::value_type>(entry);
|
|
});
|
|
}
|
|
|
|
void instance_cache::recycle(instance_cache::value_type val) noexcept {
|
|
// While the instance is in cache, it is not used and no stack is allocated for it.
|
|
free_wasm_stack();
|
|
val->mutex.unlock();
|
|
size_t size;
|
|
try {
|
|
if (!val->instance) {
|
|
return;
|
|
}
|
|
size = get_instance_size(val->instance.value());
|
|
if (size > _max_instance_size) {
|
|
val->instance = std::nullopt;
|
|
return;
|
|
}
|
|
} catch (...) {
|
|
// we can't get the instance size, so we can't recycle it
|
|
val->instance = std::nullopt;
|
|
return;
|
|
}
|
|
while (_total_size + size > _max_size) {
|
|
// make space for the recycled instance if needed. we won't
|
|
// remove the instance itself because it was not in the lru
|
|
evict_lru();
|
|
}
|
|
try {
|
|
// new instance_size is set here
|
|
_lru.push_back({val, seastar::lowres_clock::now(), size});
|
|
val->it = --_lru.end();
|
|
_total_size += val->it->instance_size;
|
|
} catch (...) {
|
|
// we can't add the instance to the lru, so we can't recycle it
|
|
val->instance = std::nullopt;
|
|
}
|
|
}
|
|
|
|
void instance_cache::remove(const db::functions::function_name& name, const std::vector<data_type>& arg_types) noexcept {
|
|
auto [it,end_it] = _cache.equal_range(name);
|
|
while (it != end_it) {
|
|
auto& entry_ptr = it->second;
|
|
if (entry_ptr->arg_types == arg_types) {
|
|
if (entry_ptr->it != _lru.end()) {
|
|
_total_size -= entry_ptr->it->instance_size;
|
|
_lru.erase(entry_ptr->it);
|
|
}
|
|
it = _cache.erase(it);
|
|
} else {
|
|
++it;
|
|
}
|
|
}
|
|
}
|
|
|
|
void instance_cache::track_module_ref(wasmtime::Module& module, wasmtime::Engine& engine) {
|
|
if (!module.is_compiled()) {
|
|
size_t module_size = compiled_size(module);
|
|
while (_compiled_size + module_size > _max_compiled_size && !_lru.empty()) {
|
|
evict_modules();
|
|
}
|
|
if (_compiled_size + module_size > _max_compiled_size) {
|
|
throw wasm::exception("No memory left for the compiled WASM function");
|
|
}
|
|
module.compile(engine);
|
|
_compiled_size += module_size;
|
|
}
|
|
module.add_user();
|
|
}
|
|
|
|
void instance_cache::remove_module_ref(wasmtime::Module& module) noexcept {
|
|
module.remove_user();
|
|
if (module.user_count() == 0) {
|
|
module.release();
|
|
_compiled_size -= compiled_size(module);
|
|
}
|
|
}
|
|
|
|
void instance_cache::reserve_wasm_stack() {
|
|
size_t stack_size = wasm_stack_size();
|
|
while (!_lru.empty() && _compiled_size + stack_size > _max_compiled_size) {
|
|
evict_modules();
|
|
}
|
|
if (_compiled_size + stack_size > _max_compiled_size) {
|
|
throw wasm::exception("No memory left to execute the WASM function");
|
|
} else {
|
|
_compiled_size += stack_size;
|
|
}
|
|
}
|
|
|
|
void instance_cache::free_wasm_stack() noexcept {
|
|
_compiled_size -= wasm_stack_size();
|
|
}
|
|
|
|
void instance_cache::evict_modules() noexcept {
|
|
size_t prev_size = _compiled_size;
|
|
while (!_lru.empty() && _compiled_size == prev_size) {
|
|
evict_lru();
|
|
}
|
|
}
|
|
|
|
size_t instance_cache::size() const {
|
|
return _cache.size();
|
|
}
|
|
|
|
size_t instance_cache::max_size() const {
|
|
return _max_size;
|
|
}
|
|
|
|
size_t instance_cache::memory_footprint() const {
|
|
return _total_size;
|
|
}
|
|
|
|
future<> instance_cache::stop() {
|
|
_timer.cancel();
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
}
|
|
|
|
namespace std {
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, const seastar::scheduling_group& sg) {
|
|
return out << sg.name();
|
|
}
|
|
|
|
template <>
|
|
struct equal_to<seastar::scheduling_group> {
|
|
bool operator()(seastar::scheduling_group& sg1, seastar::scheduling_group& sg2) const noexcept {
|
|
return sg1 == sg2;
|
|
}
|
|
};
|
|
|
|
}
|