Currently eviction is performed until occupancy of the whole region drops below the 85% threshold. This may take a while if region had high occupancy and is large. We could improve the situation by only evicting until occupancy of the sparsest segment drops below the threshold, as is done by this change. I tested this using a c-s read workload in which the condition triggers in the cache region, with 1G per shard: lsa-timing - Reclamation cycle took 12.934 us. lsa-timing - Reclamation cycle took 47.771 us. lsa-timing - Reclamation cycle took 125.946 us. lsa-timing - Reclamation cycle took 144356 us. lsa-timing - Reclamation cycle took 655.765 us. lsa-timing - Reclamation cycle took 693.418 us. lsa-timing - Reclamation cycle took 509.869 us. lsa-timing - Reclamation cycle took 1139.15 us. The 144ms pause is when large eviction is necessary. The change improves worst case latency. Reclamation time statistics over 30 second period after cache fills up, in microseconds: Before: avg = 1524.283148 stdev = 11021.021118 min = 12.934000 max = 144356.000000 sum = 257603.852000 samples = 169 After: avg = 1317.362414 stdev = 1913.542802 min = 263.935000 max = 19244.600000 sum = 175209.201000 samples = 133 Refs #1634. Message-Id: <1484730859-11969-1-git-send-email-tgrabiec@scylladb.com>
1040 lines
38 KiB
C++
1040 lines
38 KiB
C++
/*
|
|
* Copyright (C) 2015 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
|
|
#include <boost/test/unit_test.hpp>
#include <boost/intrusive/parent_from_member.hpp>

#include <algorithm>
#include <chrono>
#include <deque>
#include <random>

#include <seastar/core/thread.hh>
#include <seastar/core/timer.hh>
#include <seastar/core/sleep.hh>
#include <seastar/tests/test-utils.hh>

#include "utils/logalloc.hh"
#include "utils/managed_ref.hh"
#include "utils/managed_bytes.hh"
#include "log.hh"

#include "disk-error-handler.hh"
|
|
|
|
// Error-injection signals required by disk-error-handler.hh. These tests
// never touch the disk, but the symbols must be defined on every shard for
// the translation unit to link.
thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;
|
|
|
|
[[gnu::unused]]
|
|
static auto x = [] {
|
|
logging::logger_registry().set_all_loggers_level(logging::log_level::debug);
|
|
return 0;
|
|
}();
|
|
|
|
using namespace logalloc;
|
|
|
|
// Verifies basic compaction: allocation and freeing must not invalidate
// managed references, and after freeing 1/3 of objects at random positions
// reclaim() must recover at least the requested amount via compaction.
SEASTAR_TEST_CASE(test_compaction) {
    return seastar::async([] {
        region reg;

        with_allocator(reg.allocator(), [&reg] {
            std::vector<managed_ref<int>> _allocated;

            // Allocate several segments
            auto reclaim_counter_1 = reg.reclaim_counter();

            for (int i = 0; i < 32 * 1024 * 4; i++) {
                _allocated.push_back(make_managed<int>());
            }

            // Allocation should not invalidate references
            BOOST_REQUIRE_EQUAL(reg.reclaim_counter(), reclaim_counter_1);

            shard_tracker().reclaim_all_free_segments();

            // Free 1/3 randomly. std::shuffle() is used because
            // std::random_shuffle() is deprecated since C++14 and removed
            // in C++17.
            std::shuffle(_allocated.begin(), _allocated.end(),
                         std::default_random_engine(std::random_device{}()));

            auto it = _allocated.begin();
            size_t nr_freed = _allocated.size() / 3;
            for (size_t i = 0; i < nr_freed; ++i) {
                *it++ = {};
            }

            // Freeing should not invalidate references
            BOOST_REQUIRE_EQUAL(reg.reclaim_counter(), reclaim_counter_1);

            // Try to reclaim
            size_t target = sizeof(managed<int>) * nr_freed;
            BOOST_REQUIRE(shard_tracker().reclaim(target) >= target);

            // There must have been some compaction during such reclaim
            BOOST_REQUIRE(reg.reclaim_counter() != reclaim_counter_1);
        });
    });
}
|
|
|
|
|
|
// Verifies that the shard-wide tracker can reclaim from whichever region
// has free space: nothing is reclaimable while both regions are full, and
// reclaim succeeds once either region has 60% of its objects freed.
SEASTAR_TEST_CASE(test_compaction_with_multiple_regions) {
    return seastar::async([] {
        region reg1;
        region reg2;

        std::vector<managed_ref<int>> allocated1;
        std::vector<managed_ref<int>> allocated2;

        int count = 32 * 1024 * 4 * 2;

        with_allocator(reg1.allocator(), [&] {
            for (int i = 0; i < count; i++) {
                allocated1.push_back(make_managed<int>());
            }
        });

        with_allocator(reg2.allocator(), [&] {
            for (int i = 0; i < count; i++) {
                allocated2.push_back(make_managed<int>());
            }
        });

        size_t quarter = shard_tracker().region_occupancy().total_space() / 4;

        shard_tracker().reclaim_all_free_segments();

        // Can't reclaim anything yet
        BOOST_REQUIRE(shard_tracker().reclaim(quarter) == 0);

        // Free 60% from the second pool.
        //
        // Shuffle, so that we don't free whole segments back to the pool
        // and there's nothing to reclaim. std::shuffle() replaces
        // std::random_shuffle(), which is deprecated since C++14 and
        // removed in C++17.
        std::shuffle(allocated2.begin(), allocated2.end(),
                     std::default_random_engine(std::random_device{}()));

        with_allocator(reg2.allocator(), [&] {
            auto it = allocated2.begin();
            for (size_t i = 0; i < (count * 0.6); ++i) {
                *it++ = {};
            }
        });

        BOOST_REQUIRE(shard_tracker().reclaim(quarter) >= quarter);
        BOOST_REQUIRE(shard_tracker().reclaim(quarter) < quarter);

        // Free 60% from the first pool
        std::shuffle(allocated1.begin(), allocated1.end(),
                     std::default_random_engine(std::random_device{}()));

        with_allocator(reg1.allocator(), [&] {
            auto it = allocated1.begin();
            for (size_t i = 0; i < (count * 0.6); ++i) {
                *it++ = {};
            }
        });

        BOOST_REQUIRE(shard_tracker().reclaim(quarter) >= quarter);
        BOOST_REQUIRE(shard_tracker().reclaim(quarter) < quarter);

        // Release the remaining objects under their owning allocators.
        with_allocator(reg2.allocator(), [&] () mutable {
            allocated2.clear();
        });

        with_allocator(reg1.allocator(), [&] () mutable {
            allocated1.clear();
        });
    });
}
|
|
|
|
// Checks that objects of different types co-located in one region are each
// relocated with their own move constructor and destroyed with their own
// destructor during a full compaction, and that payload bytes survive.
SEASTAR_TEST_CASE(test_mixed_type_compaction) {
    return seastar::async([] {
        static bool a_moved = false;
        static bool b_moved = false;
        static bool c_moved = false;

        static bool a_destroyed = false;
        static bool b_destroyed = false;
        static bool c_destroyed = false;

        struct A {
            uint8_t v = 0xca;
            A() = default;
            A(A&&) noexcept {
                a_moved = true;
            }
            ~A() {
                BOOST_REQUIRE(v == 0xca);
                a_destroyed = true;
            }
        };
        struct B {
            uint16_t v = 0xcafe;
            B() = default;
            B(B&&) noexcept {
                b_moved = true;
            }
            ~B() {
                BOOST_REQUIRE(v == 0xcafe);
                b_destroyed = true;
            }
        };
        struct C {
            uint64_t v = 0xcafebabe;
            C() = default;
            C(C&&) noexcept {
                c_moved = true;
            }
            ~C() {
                BOOST_REQUIRE(v == 0xcafebabe);
                c_destroyed = true;
            }
        };

        region reg;
        with_allocator(reg.allocator(), [&] {
            {
                std::vector<int*> junk;

                // Interleave A/B/C with disposable ints; freeing the ints
                // later leaves holes, forcing compaction to move A/B/C.
                auto alloc_junk = [&] {
                    for (int i = 0; i < 10; i++) {
                        junk.push_back(reg.allocator().construct<int>(i));
                    }
                };

                auto p1 = make_managed<A>();
                alloc_junk();
                auto p2 = make_managed<B>();
                alloc_junk();
                auto p3 = make_managed<C>();

                for (auto&& p : junk) {
                    reg.allocator().destroy(p);
                }

                reg.full_compaction();

                // Each object must have been relocated via its move ctor...
                BOOST_REQUIRE(a_moved);
                BOOST_REQUIRE(b_moved);
                BOOST_REQUIRE(c_moved);

                // ...which implies the moved-from instances were destroyed.
                BOOST_REQUIRE(a_destroyed);
                BOOST_REQUIRE(b_destroyed);
                BOOST_REQUIRE(c_destroyed);

                a_destroyed = false;
                b_destroyed = false;
                c_destroyed = false;
            }

            // Leaving the scope destroys the surviving objects themselves.
            BOOST_REQUIRE(a_destroyed);
            BOOST_REQUIRE(b_destroyed);
            BOOST_REQUIRE(c_destroyed);
        });
    });
}
|
|
|
|
// Verifies that a managed_bytes blob keeps its contents when the owning
// region undergoes a full compaction.
SEASTAR_TEST_CASE(test_blob) {
    return seastar::async([] {
        region reg;
        with_allocator(reg.allocator(), [&] {
            auto original = bytes("123456");
            managed_bytes blob(original);

            BOOST_REQUIRE(bytes_view(blob) == original);

            // Compaction may relocate the blob; its bytes must be intact.
            reg.full_compaction();

            BOOST_REQUIRE(bytes_view(blob) == original);
        });
    });
}
|
|
|
|
// Verifies region merging: merging an empty region is legal, and objects
// allocated in one region can be freed through the region that absorbed it.
SEASTAR_TEST_CASE(test_merging) {
    return seastar::async([] {
        region reg1;
        region reg2;

        // Merging an empty region must be a harmless no-op.
        reg1.merge(reg2);

        managed_ref<int> single;
        with_allocator(reg1.allocator(), [&] {
            single = make_managed<int>();
        });

        // Ownership of `single` moves to reg2...
        reg2.merge(reg1);

        // ...so it must be freed under reg2's allocator.
        with_allocator(reg2.allocator(), [&] {
            single = {};
        });

        // Same exercise with a large batch of objects.
        std::vector<managed_ref<int>> batch;
        with_allocator(reg1.allocator(), [&] {
            for (int n = 0; n < 10000; ++n) {
                batch.emplace_back(make_managed<int>());
            }
        });

        reg2.merge(reg1);

        with_allocator(reg2.allocator(), [&] {
            batch.clear();
        });
    });
}
|
|
|
|
#ifndef DEFAULT_ALLOCATOR
|
|
// Verifies that while a reclaim_lock is held, neither compaction nor
// eviction may run in the region — allocation fails with bad_alloc rather
// than reclaiming.
SEASTAR_TEST_CASE(test_region_lock) {
    return seastar::async([] {
        region reg;
        with_allocator(reg.allocator(), [&] {
            std::deque<managed_bytes> refs;

            for (int i = 0; i < 1024 * 10; ++i) {
                refs.push_back(managed_bytes(managed_bytes::initialized_later(), 1024));
            }

            // Evict 30% so that region is compactible, but do it randomly so that
            // segments are not released into the standard allocator without compaction.
            //
            // std::shuffle() replaces std::random_shuffle(), which is
            // deprecated since C++14 and removed in C++17. The eviction
            // count is computed before the loop: the old bound
            // `i < refs.size() * 0.3` re-evaluated the shrinking size each
            // iteration and so only evicted ~23% instead of 30%.
            std::shuffle(refs.begin(), refs.end(),
                         std::default_random_engine(std::random_device{}()));
            const size_t to_evict = refs.size() * 0.3;
            for (size_t i = 0; i < to_evict; ++i) {
                refs.pop_back();
            }

            reg.make_evictable([&refs] {
                if (refs.empty()) {
                    return memory::reclaiming_result::reclaimed_nothing;
                }
                refs.pop_back();
                return memory::reclaiming_result::reclaimed_something;
            });

            std::deque<bytes> objects;

            auto counter = reg.reclaim_counter();

            // Verify that with compaction lock we rather run out of memory
            // than compact it
            {
                BOOST_REQUIRE(reg.reclaiming_enabled());

                logalloc::reclaim_lock _(reg);

                BOOST_REQUIRE(!reg.reclaiming_enabled());
                auto used_before = reg.occupancy().used_space();

                try {
                    while (true) {
                        objects.push_back(bytes(bytes::initialized_later(), 1024*1024));
                    }
                } catch (const std::bad_alloc&) {
                    // expected
                }

                // No compaction happened while the lock was held...
                BOOST_REQUIRE(reg.reclaim_counter() == counter);
                BOOST_REQUIRE(reg.occupancy().used_space() == used_before); // eviction is also disabled
            }

            BOOST_REQUIRE(reg.reclaiming_enabled());
        });
    });
}
|
|
|
|
// Verifies that a large standard-allocator allocation succeeds under
// memory pressure by evicting from an evictable LSA region.
SEASTAR_TEST_CASE(test_large_allocation) {
    return seastar::async([] {
        logalloc::region r_evictable;
        logalloc::region r_non_evictable;

        static constexpr unsigned element_size = 16 * 1024;

        std::deque<managed_bytes> evictable;
        std::deque<managed_bytes> non_evictable;

        // Interleave allocations in both regions until memory runs out.
        try {
            while (true) {
                with_allocator(r_evictable.allocator(), [&] {
                    evictable.push_back(bytes(bytes::initialized_later(),element_size));
                });
                with_allocator(r_non_evictable.allocator(), [&] {
                    non_evictable.push_back(bytes(bytes::initialized_later(),element_size));
                });
            }
        } catch (const std::bad_alloc&) {
            // expected
        }

        // std::shuffle() replaces std::random_shuffle(), which is
        // deprecated since C++14 and removed in C++17.
        std::shuffle(evictable.begin(), evictable.end(),
                     std::default_random_engine(std::random_device{}()));
        r_evictable.make_evictable([&] {
            return with_allocator(r_evictable.allocator(), [&] {
                if (evictable.empty()) {
                    return memory::reclaiming_result::reclaimed_nothing;
                }
                evictable.pop_front();
                return memory::reclaiming_result::reclaimed_something;
            });
        });

        auto clear_all = [&] {
            with_allocator(r_non_evictable.allocator(), [&] {
                non_evictable.clear();
            });
            with_allocator(r_evictable.allocator(), [&] {
                evictable.clear();
            });
        };

        // A large allocation (3/4 of the evictable data) should succeed by
        // triggering eviction.
        try {
            auto ptr = std::make_unique<char[]>(evictable.size() * element_size / 4 * 3);
        } catch (const std::bad_alloc&) {
            // This shouldn't have happened, but clear remaining lsa data
            // properly so that humans see bad_alloc instead of some confusing
            // assertion failure caused by destroying evictable and
            // non_evictable without with_allocator().
            clear_all();
            throw;
        }

        clear_all();
    });
}
|
|
#endif
|
|
|
|
// Exercises region_group accounting: memory_used() of a group must track
// the total_space() of its member regions and of its child groups through
// allocation, merging, and region destruction. Group layout:
//   all -> one_and_two -> {one, two};  all -> three;  just_four -> four;
//   five belongs to no group.
SEASTAR_TEST_CASE(test_region_groups) {
    return seastar::async([] {
        logalloc::region_group just_four;
        logalloc::region_group all;
        logalloc::region_group one_and_two(&all);

        auto one = std::make_unique<logalloc::region>(one_and_two);
        auto two = std::make_unique<logalloc::region>(one_and_two);
        auto three = std::make_unique<logalloc::region>(all);
        auto four = std::make_unique<logalloc::region>(just_four);
        auto five = std::make_unique<logalloc::region>();

        // Allocates `n` managed ints inside `r`, keeping the refs alive in
        // `objs` so the memory stays accounted.
        auto fill = [](logalloc::region& r, std::vector<managed_ref<int>>& objs, size_t n) {
            with_allocator(r.allocator(), [&] {
                for (size_t i = 0; i < n; i++) {
                    objs.emplace_back(make_managed<int>());
                }
            });
        };

        constexpr size_t one_count = 1024 * 1024;
        std::vector<managed_ref<int>> one_objs;
        fill(*one, one_objs, one_count);
        BOOST_REQUIRE_GE(ssize_t(one->occupancy().used_space()), ssize_t(one_count * sizeof(int)));
        BOOST_REQUIRE_GE(ssize_t(one->occupancy().total_space()), ssize_t(one->occupancy().used_space()));
        BOOST_REQUIRE_EQUAL(one_and_two.memory_used(), one->occupancy().total_space());
        BOOST_REQUIRE_EQUAL(all.memory_used(), one->occupancy().total_space());

        constexpr size_t two_count = 512 * 1024;
        std::vector<managed_ref<int>> two_objs;
        fill(*two, two_objs, two_count);
        BOOST_REQUIRE_GE(ssize_t(two->occupancy().used_space()), ssize_t(two_count * sizeof(int)));
        BOOST_REQUIRE_GE(ssize_t(two->occupancy().total_space()), ssize_t(two->occupancy().used_space()));
        BOOST_REQUIRE_EQUAL(one_and_two.memory_used(), one->occupancy().total_space() + two->occupancy().total_space());
        BOOST_REQUIRE_EQUAL(all.memory_used(), one_and_two.memory_used());

        constexpr size_t three_count = 2048 * 1024;
        std::vector<managed_ref<int>> three_objs;
        fill(*three, three_objs, three_count);
        BOOST_REQUIRE_GE(ssize_t(three->occupancy().used_space()), ssize_t(three_count * sizeof(int)));
        BOOST_REQUIRE_GE(ssize_t(three->occupancy().total_space()), ssize_t(three->occupancy().used_space()));
        BOOST_REQUIRE_EQUAL(all.memory_used(), one_and_two.memory_used() + three->occupancy().total_space());

        constexpr size_t four_count = 256 * 1024;
        std::vector<managed_ref<int>> four_objs;
        fill(*four, four_objs, four_count);
        BOOST_REQUIRE_GE(ssize_t(four->occupancy().used_space()), ssize_t(four_count * sizeof(int)));
        BOOST_REQUIRE_GE(ssize_t(four->occupancy().total_space()), ssize_t(four->occupancy().used_space()));
        BOOST_REQUIRE_EQUAL(just_four.memory_used(), four->occupancy().total_space());

        // A groupless region must not show up in any group's accounting.
        with_allocator(five->allocator(), [] {
            std::vector<managed_ref<int>> five_objs;
            for (size_t i = 0; i < 16 * 1024; i++) {
                five_objs.emplace_back(make_managed<int>());
            }
        });

        // Merging moves four's memory into three (and thus into `all`),
        // leaving just_four empty.
        three->merge(*four);
        BOOST_REQUIRE_GE(ssize_t(three->occupancy().used_space()), ssize_t((three_count + four_count)* sizeof(int)));
        BOOST_REQUIRE_GE(ssize_t(three->occupancy().total_space()), ssize_t(three->occupancy().used_space()));
        BOOST_REQUIRE_EQUAL(all.memory_used(), one_and_two.memory_used() + three->occupancy().total_space());
        BOOST_REQUIRE_EQUAL(just_four.memory_used(), 0);

        // Merging a groupless (empty-by-now) region must not disturb totals.
        three->merge(*five);
        BOOST_REQUIRE_GE(ssize_t(three->occupancy().used_space()), ssize_t((three_count + four_count)* sizeof(int)));
        BOOST_REQUIRE_GE(ssize_t(three->occupancy().total_space()), ssize_t(three->occupancy().used_space()));
        BOOST_REQUIRE_EQUAL(all.memory_used(), one_and_two.memory_used() + three->occupancy().total_space());

        // Destroying a region must drop it from its group's accounting.
        with_allocator(two->allocator(), [&] {
            two_objs.clear();
        });
        two.reset();
        BOOST_REQUIRE_EQUAL(one_and_two.memory_used(), one->occupancy().total_space());
        BOOST_REQUIRE_EQUAL(all.memory_used(), one_and_two.memory_used() + three->occupancy().total_space());

        with_allocator(one->allocator(), [&] {
            one_objs.clear();
        });
        one.reset();
        BOOST_REQUIRE_EQUAL(one_and_two.memory_used(), 0);
        BOOST_REQUIRE_EQUAL(all.memory_used(), three->occupancy().total_space());

        with_allocator(three->allocator(), [&] {
            three_objs.clear();
            four_objs.clear();
        });
        three.reset();
        four.reset();
        five.reset();
        BOOST_REQUIRE_EQUAL(all.memory_used(), 0);
    });
}
|
|
|
|
using namespace std::chrono_literals;
|
|
|
|
template <typename FutureType>
|
|
inline void quiesce(FutureType&& fut) {
|
|
// Unfortunately seastar::thread::yield is not enough here, because the process of releasing
|
|
// a request may be broken into many continuations. While we could just yield many times, the
|
|
// exact amount needed to guarantee execution would be dependent on the internals of the
|
|
// implementation, we want to avoid that.
|
|
timer<> tmr;
|
|
tmr.set_callback([] { BOOST_FAIL("The future we were waiting for took too long to get ready"); });
|
|
tmr.arm(2s);
|
|
fut.get();
|
|
tmr.cancel();
|
|
}
|
|
|
|
// Simple RAII structure that wraps around a region_group
|
|
// Not using defer because we usually employ many region groups
|
|
struct test_region_group: public logalloc::region_group {
|
|
test_region_group(region_group* parent, region_group_reclaimer& reclaimer) : logalloc::region_group(parent, reclaimer) {}
|
|
test_region_group(region_group_reclaimer& reclaimer) : logalloc::region_group(nullptr, reclaimer) {}
|
|
|
|
~test_region_group() {
|
|
shutdown().get();
|
|
}
|
|
};
|
|
|
|
struct test_region: public logalloc::region {
|
|
test_region(test_region_group& rg) : logalloc::region(rg) {}
|
|
~test_region() {
|
|
clear();
|
|
}
|
|
|
|
void clear() {
|
|
with_allocator(allocator(), [this] {
|
|
std::vector<managed_bytes>().swap(_alloc);
|
|
std::vector<managed_ref<uint64_t>>().swap(_alloc_simple);
|
|
});
|
|
}
|
|
void alloc(size_t size = logalloc::segment_size) {
|
|
with_allocator(allocator(), [this, size] {
|
|
_alloc.push_back(bytes(bytes::initialized_later(), size));
|
|
});
|
|
}
|
|
|
|
void alloc_small(size_t nr = 1) {
|
|
with_allocator(allocator(), [this] {
|
|
_alloc_simple.emplace_back(make_managed<uint64_t>());
|
|
});
|
|
}
|
|
private:
|
|
std::vector<managed_bytes> _alloc;
|
|
// For small objects we don't want to get caught in basic_sstring's internal buffer. We know
|
|
// which size we need to allocate to avoid that, but that's technically internal representation.
|
|
// Better to use integers if we want something small.
|
|
std::vector<managed_ref<uint64_t>> _alloc_simple;
|
|
};
|
|
|
|
// Verifies single-group throttling: run_when_memory_available() executes
// immediately while the group is at or below its threshold and queues once
// the threshold is exceeded, resuming when memory is freed.
SEASTAR_TEST_CASE(test_region_groups_basic_throttling) {
    return seastar::async([] {
        region_group_reclaimer simple_reclaimer(logalloc::segment_size);

        // singleton hierarchy, only one segment allowed
        test_region_group simple(simple_reclaimer);
        auto simple_region = std::make_unique<test_region>(simple);

        // Expectation: after first allocation region will have one segment,
        // memory_used() == throttle_threshold and we are good to go, future
        // is ready immediately.
        //
        // The allocation of the first element won't change the memory usage inside
        // the group and we'll be okay to do that a second time.
        auto fut = simple.run_when_memory_available([&simple_region] { simple_region->alloc_small(); });
        BOOST_REQUIRE_EQUAL(fut.available(), true);
        BOOST_REQUIRE_EQUAL(simple.memory_used(), logalloc::segment_size);

        // Second small allocation fits into the same segment: usage unchanged.
        fut = simple.run_when_memory_available([&simple_region] { simple_region->alloc_small(); });
        BOOST_REQUIRE_EQUAL(fut.available(), true);
        BOOST_REQUIRE_EQUAL(simple.memory_used(), logalloc::segment_size);

        auto big_region = std::make_unique<test_region>(simple);
        // Allocate a big chunk, that will certainly get us over the threshold
        big_region->alloc();

        // We should not be permitted to go forward with a new allocation now...
        fut = simple.run_when_memory_available([&simple_region] { simple_region->alloc_small(); });
        BOOST_REQUIRE_EQUAL(fut.available(), false);
        BOOST_REQUIRE_GT(simple.memory_used(), logalloc::segment_size);

        // But when we remove the big bytes allocator from the region, then we should.
        // Internally, we can't guarantee that just freeing the object will give the segment back,
        // that's up to the internal policies. So to make sure we need to remove the whole region.
        big_region.reset();

        quiesce(std::move(fut));
    });
}
|
|
|
|
// Verifies that a child group's allocations count against the parent's
// limit: once the child consumes the whole parent allowance, requests at
// the parent level block until the child frees memory.
SEASTAR_TEST_CASE(test_region_groups_linear_hierarchy_throttling_child_alloc) {
    return seastar::async([] {
        region_group_reclaimer parent_reclaimer(2 * logalloc::segment_size);
        region_group_reclaimer child_reclaimer(logalloc::segment_size);

        test_region_group parent(parent_reclaimer);
        test_region_group child(&parent, child_reclaimer);

        auto child_region = std::make_unique<test_region>(child);
        auto parent_region = std::make_unique<test_region>(parent);

        // One segment in the child is visible in the parent's accounting.
        child_region->alloc();
        BOOST_REQUIRE_GE(parent.memory_used(), logalloc::segment_size);

        // Parent is at half its limit, so this request runs immediately.
        auto fut = parent.run_when_memory_available([&parent_region] { parent_region->alloc_small(); });
        BOOST_REQUIRE_EQUAL(fut.available(), true);
        BOOST_REQUIRE_GE(parent.memory_used(), 2 * logalloc::segment_size);

        // This time child will use all parent's memory. Note that because the child's memory limit
        // is lower than the parent's, for that to happen we need to allocate directly.
        child_region->alloc();
        BOOST_REQUIRE_GE(child.memory_used(), 2 * logalloc::segment_size);

        // Parent is now saturated by the child: this request must queue.
        fut = parent.run_when_memory_available([&parent_region] { parent_region->alloc_small(); });
        BOOST_REQUIRE_EQUAL(fut.available(), false);
        BOOST_REQUIRE_GE(parent.memory_used(), 2 * logalloc::segment_size);

        // Destroying the child region releases its memory, which must
        // unblock the queued parent-level request.
        child_region.reset();
        quiesce(std::move(fut));
    });
}
|
|
|
|
// Verifies the converse direction: memory pressure in the parent group
// blocks requests submitted at the child level until the parent frees up.
SEASTAR_TEST_CASE(test_region_groups_linear_hierarchy_throttling_parent_alloc) {
    return seastar::async([] {
        region_group_reclaimer simple_reclaimer(logalloc::segment_size);

        test_region_group parent(simple_reclaimer);
        test_region_group child(&parent, simple_reclaimer);

        auto parent_region = std::make_unique<test_region>(parent);

        // Saturate the parent.
        parent_region->alloc();
        BOOST_REQUIRE_GE(parent.memory_used(), logalloc::segment_size);

        // A child-level request must block on the parent's pressure.
        auto fut = child.run_when_memory_available([] {});
        BOOST_REQUIRE_EQUAL(fut.available(), false);

        // Freeing the parent region unblocks the child request.
        parent_region.reset();
        quiesce(std::move(fut));
    });
}
|
|
|
|
// Requests queued for later execution by run_when_memory_available() must
// execute in FIFO order once memory becomes available.
SEASTAR_TEST_CASE(test_region_groups_fifo_order) {
    return seastar::async([] {
        region_group_reclaimer simple_reclaimer(logalloc::segment_size);

        test_region_group rg(simple_reclaimer);

        auto blocker = std::make_unique<test_region>(rg);

        // Fill the group so that every subsequent request queues.
        blocker->alloc();
        BOOST_REQUIRE_GE(rg.memory_used(), logalloc::segment_size);

        auto exec_cnt = make_lw_shared<int>(0);
        std::vector<future<>> pending;

        for (auto index = 0; index < 100; ++index) {
            auto fut = rg.run_when_memory_available([exec_cnt, index] {
                // Each request must observe exactly its queue position.
                BOOST_REQUIRE_EQUAL(index, (*exec_cnt)++);
            });
            BOOST_REQUIRE_EQUAL(fut.available(), false);
            pending.push_back(std::move(fut));
        }

        // Unblock the group and let the queue drain in order.
        blocker.reset();
        quiesce(when_all(pending.begin(), pending.end()));
    });
}
|
|
|
|
SEASTAR_TEST_CASE(test_region_groups_linear_hierarchy_throttling_moving_restriction) {
    // Hierarchy here is A -> B -> C.
    // We will fill B causing an execution in C to fail. We then fill A and free B.
    //
    // C should still be blocked.
    return seastar::async([] {
        region_group_reclaimer simple_reclaimer(logalloc::segment_size);

        test_region_group root(simple_reclaimer);
        test_region_group inner(&root, simple_reclaimer);
        test_region_group child(&inner, simple_reclaimer);

        auto inner_region = std::make_unique<test_region>(inner);
        auto root_region = std::make_unique<test_region>(root);

        // fill the inner node. Try allocating at child level. Should not be allowed.
        circular_buffer<managed_bytes> big_alloc;
        with_allocator(inner_region->allocator(), [&big_alloc] {
            big_alloc.push_back(bytes(bytes::initialized_later(), logalloc::segment_size));
        });
        BOOST_REQUIRE_GE(inner.memory_used(), logalloc::segment_size);

        auto fut = child.run_when_memory_available([] {});
        BOOST_REQUIRE_EQUAL(fut.available(), false);

        // Now fill the root...
        with_allocator(root_region->allocator(), [&big_alloc] {
            big_alloc.push_back(bytes(bytes::initialized_later(), logalloc::segment_size));
        });
        BOOST_REQUIRE_GE(root.memory_used(), logalloc::segment_size);

        // And free the inner node. We will verify that
        // 1) the notifications that the inner node sent the child when it was freed won't
        //    erroneously cause it to execute
        // 2) the child is still able to receive notifications from the root
        with_allocator(inner_region->allocator(), [&big_alloc] {
            big_alloc.pop_front();
        });
        inner_region.reset();

        // Verifying (1)
        // Can't quiesce because we don't want to wait on the futures.
        // Fix: sleep() returns a future; without .get() the future was
        // discarded and no actual waiting happened, so the check below
        // could run before any erroneous wakeup had a chance to fire.
        sleep(10ms).get();
        BOOST_REQUIRE_EQUAL(fut.available(), false);

        // Verifying (2)
        with_allocator(root_region->allocator(), [&big_alloc] {
            big_alloc.pop_front();
        });
        root_region.reset();
        quiesce(std::move(fut));
    });
}
|
|
|
|
// Three leaf groups under one parent: pressure at the parent blocks all
// leaves, and freeing only one leaf is not enough — all leaves' futures
// resolve only when enough total memory is released.
SEASTAR_TEST_CASE(test_region_groups_tree_hierarchy_throttling_leaf_alloc) {
    return seastar::async([] {
        // A leaf region group with its own reclaimer and a replaceable region.
        class leaf {
            region_group_reclaimer _leaf_reclaimer;
            test_region_group _rg;
            std::unique_ptr<test_region> _region;
        public:
            leaf(test_region_group& parent)
                : _leaf_reclaimer(logalloc::segment_size)
                , _rg(&parent, _leaf_reclaimer)
                , _region(std::make_unique<test_region>(_rg))
            {}

            void alloc(size_t size) {
                _region->alloc(size);
            }

            future<> try_alloc(size_t size) {
                return _rg.run_when_memory_available([this, size] {
                    alloc(size);
                });
            }
            // Replaces the region with a fresh one, freeing its allocations.
            void reset() {
                _region.reset(new test_region(_rg));
            }
        };

        region_group_reclaimer simple_reclaimer(logalloc::segment_size);
        test_region_group parent(simple_reclaimer);

        leaf first_leaf(parent);
        leaf second_leaf(parent);
        leaf third_leaf(parent);

        first_leaf.alloc(logalloc::segment_size);
        second_leaf.alloc(logalloc::segment_size);
        third_leaf.alloc(logalloc::segment_size);

        auto fut_1 = first_leaf.try_alloc(sizeof(uint64_t));
        auto fut_2 = second_leaf.try_alloc(sizeof(uint64_t));
        auto fut_3 = third_leaf.try_alloc(sizeof(uint64_t));

        BOOST_REQUIRE_EQUAL(fut_1.available() || fut_2.available() || fut_3.available(), false);

        // Total memory is still 2 * segment_size, can't proceed
        first_leaf.reset();
        // Can't quiesce because we don't want to wait on the futures.
        // Fix: sleep() returns a future; without .get() the future was
        // discarded and the 10ms delay never actually elapsed.
        sleep(10ms).get();

        BOOST_REQUIRE_EQUAL(fut_1.available() || fut_2.available() || fut_3.available(), false);

        // Now all futures should resolve.
        first_leaf.reset();
        second_leaf.reset();
        third_leaf.reset();
        quiesce(when_all(std::move(fut_1), std::move(fut_2), std::move(fut_3)));
    });
}
|
|
|
|
// Helper for all async reclaim tests.
|
|
// Helper for all async reclaim tests: a region holding a single blob of a
// known size that can be "evicted" (fully freed and replaced with a fresh
// region) exactly once.
class test_async_reclaim_region {
    logalloc::region _region;
    std::vector<managed_bytes> _alloc;
    size_t _alloc_size;
    // Make sure we don't reclaim the same region more than once. It is supposed to be empty
    // after the first reclaim
    int _reclaim_counter = 0;
    region_group& _rg;
public:
    test_async_reclaim_region(region_group& rg, size_t alloc_size)
            : _region(rg)
            , _alloc_size(alloc_size)
            , _rg(rg)
    {
        with_allocator(_region.allocator(), [this] {
            _alloc.push_back(bytes(bytes::initialized_later(), this->_alloc_size));
        });

    }

    ~test_async_reclaim_region() {
        // Free under the owning allocator; swap-with-empty releases capacity.
        with_allocator(_region.allocator(), [this] {
            std::vector<managed_bytes>().swap(_alloc);
        });
    }

    // Frees the blob and swaps in a fresh region bound to the same group;
    // returns the number of bytes released. Asserts on a second call.
    size_t evict() {
        BOOST_REQUIRE_EQUAL(_reclaim_counter++, 0);
        with_allocator(_region.allocator(), [this] {
            std::vector<managed_bytes>().swap(_alloc);
        });
        _region = logalloc::region(_rg);
        return this->_alloc_size;
    }
    // Recovers the owning helper from a pointer to its embedded _region
    // member (used by reclaimers that only see the region).
    static test_async_reclaim_region& from_region(region* region_ptr) {
        auto aptr = boost::intrusive::get_parent_from_member(region_ptr, &test_async_reclaim_region::_region);
        return *aptr;
    }
};
|
|
|
|
// A region_group_reclaimer that, while under pressure, evicts the largest
// region in its group and records each reclaimed size. Child reclaimers
// report into the root's accumulator so tests can assert a global order.
class test_reclaimer: public region_group_reclaimer {
    // NOTE(review): _threshold and _shutdown appear unused in this class —
    // candidates for removal (the base class holds the actual threshold).
    size_t _threshold;
    test_reclaimer *_result_accumulator;
    region_group _rg;
    std::vector<size_t> _reclaim_sizes;
    bool _shutdown = false;
public:
    // Invoked by the region_group when usage exceeds the threshold; evicts
    // largest-region-first until pressure is relieved.
    virtual void start_reclaiming() override {
        while (this->under_pressure()) {
            size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
            _result_accumulator->_reclaim_sizes.push_back(reclaimed);
        }
    }

    ~test_reclaimer() {
        _rg.shutdown().get();
    }

    // Sizes reclaimed so far, in eviction order.
    std::vector<size_t>& reclaim_sizes() {
        return _reclaim_sizes;
    }

    region_group& rg() {
        return _rg;
    }

    // Root reclaimer: accumulates results into itself.
    test_reclaimer(size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(this), _rg(*this) {}
    // Child reclaimer: reports reclaimed sizes to the parent's accumulator.
    test_reclaimer(test_reclaimer& parent, size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(&parent), _rg(&parent._rg, *this) {}
};
|
|
|
|
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
    return seastar::async([] {
        // allocate a single region to exhaustion, and make sure active reclaim is activated.
        test_reclaimer simple(logalloc::segment_size);
        test_async_reclaim_region simple_region(simple.rg(), logalloc::segment_size);

        // Can't run this function until we have reclaimed something
        auto fut = simple.rg().run_when_memory_available([] {});

        // Initially not available
        BOOST_REQUIRE_EQUAL(fut.available(), false);
        quiesce(std::move(fut));

        // Exactly one eviction should have been performed.
        BOOST_REQUIRE_EQUAL(simple.reclaim_sizes().size(), 1);
    });
}
|
|
|
|
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_worst_offender) {
    return seastar::async([] {
        // allocate three regions with three different sizes (segment boundary must be used due to
        // LSA granularity).
        //
        // The function can only be executed when all three are freed - which exercises continuous
        // reclaim, but they must be freed in descending order of their sizes
        test_reclaimer simple(logalloc::segment_size);

        test_async_reclaim_region small_region(simple.rg(), logalloc::segment_size);
        test_async_reclaim_region medium_region(simple.rg(), 2 * logalloc::segment_size);
        test_async_reclaim_region big_region(simple.rg(), 3 * logalloc::segment_size);

        // Can't run this function until we have reclaimed
        auto fut = simple.rg().run_when_memory_available([&simple] {
            // All three regions must already be evicted by the time we run.
            BOOST_REQUIRE_EQUAL(simple.reclaim_sizes().size(), 3);
        });

        // Initially not available
        BOOST_REQUIRE_EQUAL(fut.available(), false);
        quiesce(std::move(fut));

        // Test if the ordering is the one we have expected: biggest region
        // first (worst offender), smallest last.
        BOOST_REQUIRE_EQUAL(simple.reclaim_sizes()[2], logalloc::segment_size);
        BOOST_REQUIRE_EQUAL(simple.reclaim_sizes()[1], 2 * logalloc::segment_size);
        BOOST_REQUIRE_EQUAL(simple.reclaim_sizes()[0], 3 * logalloc::segment_size);
    });
}
|
|
|
|
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_leaf_offender) {
    return seastar::async([] {
        // allocate a parent region group (A) with two leaf region groups (B and C), so that B has
        // the largest size, then A, then C. Make sure that the freeing happens in descending order.
        // of their sizes regardless of the topology
        test_reclaimer root(logalloc::segment_size);
        test_reclaimer large_leaf(root, logalloc::segment_size);
        test_reclaimer small_leaf(root, logalloc::segment_size);

        test_async_reclaim_region small_region(small_leaf.rg(), logalloc::segment_size);
        test_async_reclaim_region medium_region(root.rg(), 2 * logalloc::segment_size);
        test_async_reclaim_region big_region(large_leaf.rg(), 3 * logalloc::segment_size);

        // Can't run this function until we have reclaimed. Try at the root, and we'll make sure
        // that the leaves are forced correctly.
        auto fut = root.rg().run_when_memory_available([&root] {
            // Child reclaimers report into the root's accumulator, so all
            // three evictions must be visible here.
            BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 3);
        });

        // Initially not available
        BOOST_REQUIRE_EQUAL(fut.available(), false);
        quiesce(std::move(fut));

        // Test if the ordering is the one we have expected: sizes strictly
        // descending regardless of where in the tree each region lives.
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[2], logalloc::segment_size);
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[1], 2 * logalloc::segment_size);
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[0], 3 * logalloc::segment_size);
    });
}
|
|
|
|
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_ancestor_block) {
    return seastar::async([] {
        // Build a parent region group with a single leaf group, and verify
        // that active reclaim still works when the blockage is at an
        // ancestor rather than at the group we queue the request on.
        test_reclaimer root(logalloc::segment_size);
        test_reclaimer leaf(root, logalloc::segment_size);

        test_async_reclaim_region root_region(root.rg(), logalloc::segment_size);

        // Queue at the leaf; the reclamation must come from the root.
        auto ready = leaf.rg().run_when_memory_available([&root] {
            BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 1);
        });

        // Not ready before any reclamation has taken place.
        BOOST_REQUIRE_EQUAL(ready.available(), false);
        quiesce(std::move(ready));

        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[0], logalloc::segment_size);
    });
}
|
|
|
|
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_big_region_goes_first) {
    return seastar::async([] {
        // Parent region group (A) with one leaf group (B). B's aggregate
        // usage is higher, but it is spread over multiple smaller regions,
        // so reclamation should still start with A's single big region.
        test_reclaimer root(logalloc::segment_size);
        test_reclaimer leaf(root, logalloc::segment_size);

        test_async_reclaim_region root_region(root.rg(), 4 * logalloc::segment_size);
        test_async_reclaim_region big_leaf_region(leaf.rg(), 3 * logalloc::segment_size);
        test_async_reclaim_region small_leaf_region(leaf.rg(), 2 * logalloc::segment_size);

        auto ready = root.rg().run_when_memory_available([&root] {
            BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 3);
        });

        // Not ready before any reclamation has taken place.
        BOOST_REQUIRE_EQUAL(ready.available(), false);
        quiesce(std::move(ready));

        // Regions must have been reclaimed from largest to smallest.
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[2], 2 * logalloc::segment_size);
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[1], 3 * logalloc::segment_size);
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[0], 4 * logalloc::segment_size);
    });
}
|
|
|
|
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_reclaim) {
    return seastar::async([] {
        // Parent region group (A) with a leaf group (B) that goes over its
        // limit. Requests are queued on both A and B; B's region eviction
        // function must not be invoked more than once as a result. Note that
        // test_async_reclaim_region already asserts against double calls, so
        // all this test needs to do is create the situation in which a
        // double call would otherwise happen.
        test_reclaimer root(logalloc::segment_size);
        test_reclaimer leaf(root, logalloc::segment_size);

        test_async_reclaim_region leaf_region(leaf.rg(), logalloc::segment_size);

        auto root_ready = root.rg().run_when_memory_available([&root] {
            BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 1);
        });

        auto leaf_ready = leaf.rg().run_when_memory_available([&root] {
            BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 1);
        });

        // Neither future is ready before any reclamation has taken place.
        BOOST_REQUIRE_EQUAL(root_ready.available(), false);
        BOOST_REQUIRE_EQUAL(leaf_ready.available(), false);
        quiesce(std::move(root_ready));
        quiesce(std::move(leaf_ready));

        // Exactly one reclamation, of exactly one segment.
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 1);
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[0], logalloc::segment_size);
    });
}
|