Files
scylladb/test/boost/sstable_compaction_test.cc
Pavel Emelyanov 9066224cf4 table: Don't export compaction manager reference
There's a public call on replica::table to get back the compaction
manager reference. It's not needed, actually. The users of the call are
distributed loader which already has database at hand, and a test that
creates its own instance of compaction manager for its testing tables
and thus also has it available.

tests: unit(dev)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20220406171351.3050-1-xemul@scylladb.com>
2022-04-07 09:27:45 +03:00

5018 lines
243 KiB
C++

/*
* Copyright (C) 2021-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <seastar/core/sstring.hh>
#include <seastar/core/future-util.hh>
#include <seastar/core/align.hh>
#include <seastar/core/aligned_buffer.hh>
#include <seastar/util/closeable.hh>
#include <seastar/core/coroutine.hh>
#include "sstables/sstables.hh"
#include "sstables/key.hh"
#include "sstables/compress.hh"
#include "compaction/compaction.hh"
#include <seastar/testing/test_case.hh>
#include <seastar/testing/thread_test_case.hh>
#include "schema.hh"
#include "schema_builder.hh"
#include "replica/database.hh"
#include "compaction/leveled_manifest.hh"
#include "sstables/metadata_collector.hh"
#include "sstables/sstable_writer.hh"
#include <memory>
#include "test/boost/sstable_test.hh"
#include <seastar/core/seastar.hh>
#include <seastar/core/do_with.hh>
#include "compaction/compaction_manager.hh"
#include "test/lib/tmpdir.hh"
#include "dht/i_partitioner.hh"
#include "dht/murmur3_partitioner.hh"
#include "range.hh"
#include "partition_slice_builder.hh"
#include "compaction/compaction_strategy_impl.hh"
#include "compaction/date_tiered_compaction_strategy.hh"
#include "compaction/time_window_compaction_strategy.hh"
#include "test/lib/mutation_assertions.hh"
#include "counters.hh"
#include "cell_locking.hh"
#include "test/lib/simple_schema.hh"
#include "replica/memtable-sstable.hh"
#include "test/lib/index_reader_assertions.hh"
#include "test/lib/flat_mutation_reader_assertions.hh"
#include "test/lib/make_random_string.hh"
#include "test/lib/sstable_run_based_compaction_strategy_for_tests.hh"
#include "compatible_ring_position.hh"
#include "mutation_compactor.hh"
#include "service/priority_manager.hh"
#include "db/config.hh"
#include "mutation_writer/partition_based_splitting_writer.hh"
#include "compaction/table_state.hh"
#include "mutation_rebuilder.hh"
#include <stdio.h>
#include <ftw.h>
#include <unistd.h>
#include <boost/range/algorithm/find_if.hpp>
#include <boost/algorithm/cxx11/all_of.hpp>
#include <boost/algorithm/cxx11/is_sorted.hpp>
#include <boost/icl/interval_map.hpp>
#include "test/lib/test_services.hh"
#include "test/lib/cql_test_env.hh"
#include "test/lib/reader_concurrency_semaphore.hh"
#include "test/lib/sstable_utils.hh"
#include "test/lib/random_utils.hh"
#include "readers/from_mutations_v2.hh"
#include "readers/from_fragments_v2.hh"
#include "readers/combined.hh"
namespace fs = std::filesystem;
using namespace sstables;
static const sstring some_keyspace("ks");
static const sstring some_column_family("cf");
// Builds a live atomic cell of type `dt` carrying `value`, written with timestamp 0.
// When `ttl` is non-zero the cell is created with an expiry: `expiration` and
// `ttl` are both fed to gc_clock::duration. When `ttl` is zero, `expiration`
// is not used.
atomic_cell make_atomic_cell(data_type dt, bytes_view value, uint32_t ttl = 0, uint32_t expiration = 0) {
    if (!ttl) {
        return atomic_cell::make_live(*dt, 0, value);
    }
    const auto expires_at = gc_clock::time_point(gc_clock::duration(expiration));
    return atomic_cell::make_live(*dt, 0, value, expires_at, gc_clock::duration(ttl));
}
//////////////////////////////// Test basic compaction support
// open_sstable() opens an already existing sstable, read-only (sstables are
// immutable, so there is no way to open one for writing).
// Opening has to touch the disk and may therefore block, hence the future.
// The future resolves to a shared sstable, i.e. a reference-counted handle
// that can be passed around freely until nobody needs it anymore.
static future<sstables::shared_sstable> open_sstable(test_env& env, schema_ptr schema, sstring dir, unsigned long generation) {
    auto pending_sst = env.reusable_sst(std::move(schema), dir, generation);
    return pending_sst;
}
// open_sstables() opens every generation listed in `generations` from `dir`
// and, once all of them are open, resolves to a vector holding the handles.
// The result vector is owned by do_with() so that the continuations below can
// safely refer to it by reference.
static future<std::vector<sstables::shared_sstable>> open_sstables(test_env& env, schema_ptr s, sstring dir, std::vector<unsigned long> generations) {
    using sstable_vector = std::vector<sstables::shared_sstable>;
    return do_with(sstable_vector(),
            [&env, dir = std::move(dir), generations = std::move(generations), s] (auto& opened) mutable {
        return parallel_for_each(generations, [&env, &opened, &dir, s] (unsigned long gen) {
            return open_sstable(env, s, dir, gen).then([&opened] (sstables::shared_sstable sst) {
                opened.push_back(std::move(sst));
            });
        }).then([&opened] {
            // Hand the collected handles over to the caller.
            return std::move(opened);
        });
    });
}
// Creates a mutation reader over the whole of `sst` (full partition range,
// full slice of schema `s`); the reader keeps all the required objects alive.
static flat_mutation_reader sstable_reader(shared_sstable sst, schema_ptr s, reader_permit permit) {
    auto source = sst->as_mutation_source();
    return source.make_reader(s, std::move(permit), query::full_partition_range, s->full_slice());
}
// table_state implementation used by the tests in this file. Table-related
// queries are forwarded to the wrapped column_family_for_tests, while reader
// permits and the sstable writer configuration are taken from the test_env.
class table_state_for_test : public table_state {
    column_family_for_tests& _table;
    test_env& _test_env;
    // Nothing in this class ever adds to this vector.
    std::vector<sstables::shared_sstable> _compacted_undeleted;
public:
    explicit table_state_for_test(column_family_for_tests& t, test_env& env)
        : _table(t)
        , _test_env(env)
    { }

    const schema_ptr& schema() const noexcept override {
        return _table->schema();
    }

    unsigned min_compaction_threshold() const noexcept override {
        return _table.schema()->min_compaction_threshold();
    }

    // The minimum threshold is always enforced in tests.
    bool compaction_enforce_min_threshold() const noexcept override {
        return true;
    }

    const sstables::sstable_set& get_sstable_set() const override {
        return _table->get_sstable_set();
    }

    std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point query_time) const override {
        return sstables::get_fully_expired_sstables(_table->as_table_state(), sstables, query_time);
    }

    const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept override {
        return _compacted_undeleted;
    }

    sstables::compaction_strategy& get_compaction_strategy() const noexcept override {
        return _table->get_compaction_strategy();
    }

    reader_permit make_compaction_reader_permit() const override {
        return _test_env.make_reader_permit();
    }

    sstables::sstable_writer_config configure_writer(sstring origin) const override {
        return _test_env.manager().configure_writer(std::move(origin));
    }

    api::timestamp_type min_memtable_timestamp() const override {
        return _table->min_memtable_timestamp();
    }

    // Compaction history is not recorded by this test double; the call is a no-op.
    future<> update_compaction_history(utils::UUID compaction_id, sstring ks_name, sstring cf_name, std::chrono::milliseconds ended_at, int64_t bytes_in, int64_t bytes_out) override {
        return make_ready_future<>();
    }
};
// Factory returning a table_state_for_test (bound to `t` and `env`) behind the
// table_state interface.
static std::unique_ptr<table_state> make_table_state_for_test(column_family_for_tests& t, test_env& env) {
    std::unique_ptr<table_state> state = std::make_unique<table_state_for_test>(t, env);
    return state;
}
// strategy_control stub whose answer to has_ongoing_compaction() is fixed at
// construction time, regardless of which table is asked about.
class strategy_control_for_test : public strategy_control {
    bool _ongoing;
public:
    explicit strategy_control_for_test(bool has_ongoing_compaction) noexcept
        : _ongoing(has_ongoing_compaction)
    { }

    bool has_ongoing_compaction(table_state& table_s) const noexcept override {
        return _ongoing;
    }
};
// Factory returning a strategy_control_for_test behind the strategy_control
// interface; `has_ongoing_compaction` is the value the stub will report.
static std::unique_ptr<strategy_control> make_strategy_control_for_test(bool has_ongoing_compaction) {
    std::unique_ptr<strategy_control> control = std::make_unique<strategy_control_for_test>(has_ongoing_compaction);
    return control;
}
// Writes four single-row sstables into a size-tiered table, triggers a
// compaction through the compaction manager and checks that exactly one task
// completes without errors, leaving a single sstable behind.
SEASTAR_TEST_CASE(compaction_manager_basic_test) {
    return test_env::do_with_async([] (test_env& env) {
        BOOST_REQUIRE(smp::count == 1);
        auto s = make_shared_schema({}, some_keyspace, some_column_family,
            {{"p1", utf8_type}}, {{"c1", utf8_type}}, {{"r1", int32_type}}, {}, utf8_type);

        auto cm = make_lw_shared<compaction_manager>();
        cm->enable();
        // Make sure the manager is stopped when leaving this scope.
        auto stop_cm = defer([&cm] {
            cm->stop().get();
        });

        auto tmp = tmpdir();
        replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
        cfg.datadir = tmp.path().string();
        cfg.enable_commitlog = false;
        cfg.enable_incremental_backups = false;
        auto cl_stats = make_lw_shared<cell_locker_stats>();
        auto tracker = make_lw_shared<cache_tracker>();
        auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, *cl_stats, *tracker);
        cf->start();
        cf->mark_ready_for_writes();
        cf->set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);

        // create 4 sstables of similar size to be compacted later on.
        const std::vector<unsigned long> generations = {1, 2, 3, 4};
        const column_definition& r1_col = *s->get_column_definition("r1");
        for (auto gen : generations) {
            auto mt = make_lw_shared<replica::memtable>(s);

            sstring pk_text = "key" + to_sstring(gen);
            auto key = partition_key::from_exploded(*s, {to_bytes(pk_text)});
            auto c_key = clustering_key::from_exploded(*s, {to_bytes("abc")});

            mutation m(s, key);
            m.set_clustered_cell(c_key, r1_col, make_atomic_cell(int32_type, int32_type->decompose(1)));
            mt->apply(std::move(m));

            auto sst = env.make_sstable(s, tmp.path().string(), column_family_test::calculate_generation_for_new_table(*cf), sstables::get_highest_sstable_version(), big);
            write_memtable_to_sstable_for_test(*mt, sst).get();
            sst->load().get();
            column_family_test(cf).add_sstable(sst);
        }
        BOOST_REQUIRE(cf->sstables_count() == generations.size());

        cf->trigger_compaction();
        BOOST_REQUIRE(cm->get_stats().pending_tasks == 1 || cm->get_stats().active_tasks == 1);

        // wait for submitted job to finish: poll every 100ms for as long as
        // the manager still reports a pending or an active task.
        while (cm->get_stats().pending_tasks != 0 || cm->get_stats().active_tasks != 0) {
            sleep(std::chrono::milliseconds(100)).get();
        }

        BOOST_REQUIRE(cm->get_stats().completed_tasks == 1);
        BOOST_REQUIRE(cm->get_stats().errors == 0);
        // expect sstables of cf to be compacted.
        BOOST_REQUIRE(cf->sstables_count() == 1);

        cf->stop().get();
    });
}
// Compacts the three pre-built sstables (generations 1, 2 and 3) stored under
// test/resource/sstables/compaction into a new sstable written to a temporary
// directory with generation 17, then reads that sstable back and verifies the
// four partitions it is expected to contain, in the order they are read.
SEASTAR_TEST_CASE(compact) {
return sstables::test_env::do_with([] (sstables::test_env& env) {
BOOST_REQUIRE(smp::count == 1);
// Generation given to the first (and only expected) output sstable.
constexpr int generation = 17;
// The "compaction" sstable was created with the following schema:
// CREATE TABLE compaction (
// name text,
// age int,
// height int,
// PRIMARY KEY (name)
//);
auto builder = schema_builder("tests", "compaction")
.with_column("name", utf8_type, column_kind::partition_key)
.with_column("age", int32_type)
.with_column("height", int32_type);
builder.set_comment("Example table for compaction");
// NOTE(review): presumably the maximal gc grace period is what keeps the
// tombstones asserted on below from being purged by the compaction — confirm.
builder.set_gc_grace_seconds(std::numeric_limits<int32_t>::max());
auto s = builder.build();
auto cm = make_lw_shared<compaction_manager>();
auto cl_stats = make_lw_shared<cell_locker_stats>();
auto tracker = make_lw_shared<cache_tracker>();
auto cf = make_lw_shared<replica::column_family>(s, column_family_test_config(env.manager(), env.semaphore()), replica::column_family::no_commitlog(), *cm, *cl_stats, *tracker);
cf->mark_ready_for_writes();
// The `env` parameter of the lambda below shadows the outer `env`; everything
// inside uses the one supplied by do_with_tmp_directory().
return test_setup::do_with_tmp_directory([s, generation, cf, cm] (test_env& env, sstring tmpdir_path) {
return open_sstables(env, s, "test/resource/sstables/compaction", {1,2,3}).then([&env, tmpdir_path, s, cf, cm, generation] (auto sstables) {
// Output sstable factory: every call consumes the next generation number,
// starting from `generation`.
auto new_sstable = [&env, gen = make_lw_shared<unsigned>(generation), s, tmpdir_path] {
return env.make_sstable(s, tmpdir_path,
(*gen)++, sstables::get_highest_sstable_version(), sstables::sstable::format_types::big);
};
return compact_sstables(*cm, sstables::compaction_descriptor(std::move(sstables), default_priority_class()), *cf, new_sstable).then([&env, s, generation, cf, cm, tmpdir_path] (auto) {
// Verify that the compacted sstable has the right content. We expect to see:
// name | age | height
// -------+-----+--------
// jerry | 40 | 170
// tom | 20 | 180
// john | 20 | deleted
// nadav - deleted partition
return open_sstable(env, s, tmpdir_path, generation).then([&env, s] (shared_sstable sst) {
auto reader = make_lw_shared<flat_mutation_reader>(sstable_reader(sst, s, env.make_reader_permit())); // reader holds sst and s alive.
// Each continuation below checks one partition and then requests the next one;
// all of them capture `reader` so it stays alive for the whole chain.
return read_mutation_from_flat_mutation_reader(*reader).then([reader, s] (mutation_opt m) {
// Partition "jerry": a single live row with age=40 and height=170.
BOOST_REQUIRE(m);
BOOST_REQUIRE(m->key().equal(*s, partition_key::from_singular(*s, data_value(sstring("jerry")))));
BOOST_REQUIRE(!m->partition().partition_tombstone());
auto rows = m->partition().clustered_rows();
BOOST_REQUIRE(rows.calculate_size() == 1);
auto &row = rows.begin()->row();
BOOST_REQUIRE(!row.deleted_at());
auto &cells = row.cells();
auto& cdef1 = *s->get_column_definition("age");
auto& cdef2 = *s->get_column_definition("height");
BOOST_REQUIRE(cells.cell_at(cdef1.id).as_atomic_cell(cdef1).value() == managed_bytes({0,0,0,40}));
// 170 is explicitly narrowed to int8_t for the byte-wise comparison.
BOOST_REQUIRE(cells.cell_at(cdef2.id).as_atomic_cell(cdef2).value() == managed_bytes({0,0,0,(int8_t)170}));
return read_mutation_from_flat_mutation_reader(*reader);
}).then([reader, s] (mutation_opt m) {
// Partition "tom": a single live row with age=20 and height=180.
BOOST_REQUIRE(m);
BOOST_REQUIRE(m->key().equal(*s, partition_key::from_singular(*s, data_value(sstring("tom")))));
BOOST_REQUIRE(!m->partition().partition_tombstone());
auto rows = m->partition().clustered_rows();
BOOST_REQUIRE(rows.calculate_size() == 1);
auto &row = rows.begin()->row();
BOOST_REQUIRE(!row.deleted_at());
auto &cells = row.cells();
auto& cdef1 = *s->get_column_definition("age");
auto& cdef2 = *s->get_column_definition("height");
BOOST_REQUIRE(cells.cell_at(cdef1.id).as_atomic_cell(cdef1).value() == managed_bytes({0,0,0,20}));
// 180 is explicitly narrowed to int8_t for the byte-wise comparison.
BOOST_REQUIRE(cells.cell_at(cdef2.id).as_atomic_cell(cdef2).value() == managed_bytes({0,0,0,(int8_t)180}));
return read_mutation_from_flat_mutation_reader(*reader);
}).then([reader, s] (mutation_opt m) {
// Partition "john": a single live row with age=20 and no "height" cell at all.
BOOST_REQUIRE(m);
BOOST_REQUIRE(m->key().equal(*s, partition_key::from_singular(*s, data_value(sstring("john")))));
BOOST_REQUIRE(!m->partition().partition_tombstone());
auto rows = m->partition().clustered_rows();
BOOST_REQUIRE(rows.calculate_size() == 1);
auto &row = rows.begin()->row();
BOOST_REQUIRE(!row.deleted_at());
auto &cells = row.cells();
auto& cdef1 = *s->get_column_definition("age");
auto& cdef2 = *s->get_column_definition("height");
BOOST_REQUIRE(cells.cell_at(cdef1.id).as_atomic_cell(cdef1).value() == managed_bytes({0,0,0,20}));
BOOST_REQUIRE(cells.find_cell(cdef2.id) == nullptr);
return read_mutation_from_flat_mutation_reader(*reader);
}).then([reader, s] (mutation_opt m) {
// Partition "nadav": only a partition tombstone, no rows.
BOOST_REQUIRE(m);
BOOST_REQUIRE(m->key().equal(*s, partition_key::from_singular(*s, data_value(sstring("nadav")))));
BOOST_REQUIRE(m->partition().partition_tombstone());
auto rows = m->partition().clustered_rows();
BOOST_REQUIRE(rows.calculate_size() == 0);
return read_mutation_from_flat_mutation_reader(*reader);
}).then([reader] (mutation_opt m) {
// Nothing else may follow the four partitions above.
BOOST_REQUIRE(!m);
}).finally([reader] {
// The reader is closed whether or not the checks above passed.
return reader->close();
});
});
});
});
// cl_stats and tracker were handed to `cf` by reference; the captures below
// only keep them alive until the whole chain has finished.
}).finally([cl_stats, tracker] { });
});
// verify that the compacted sstable look like
}
// Copies every sstable currently held by `cf` into a vector that can be fed
// to leveled_manifest::create() as the candidate list.
static std::vector<sstables::shared_sstable> get_candidates_for_leveled_strategy(replica::column_family& cf) {
    std::vector<sstables::shared_sstable> candidates;
    candidates.reserve(cf.sstables_count());

    auto current = cf.get_sstables();
    for (auto& sst : *current) {
        candidates.push_back(sst);
    }
    return candidates;
}
// Compacts the sstables identified by `generations_to_compact` (located in
// `tmpdir_path`) and resolves to the generations of the sstables produced by
// the compaction. More than one output is only relevant for the leveled strategy.
//
// When `create_sstables` is true, one sstable per input generation is written
// first, each holding a single row whose value is `min_sstable_size` bytes long;
// otherwise the existing sstables of those generations are opened instead.
// Output sstables are numbered starting from `new_generation`.
// `strategy` selects how candidates are picked: size_tiered or leveled; any
// other value makes the returned future fail with std::runtime_error.
static future<std::vector<unsigned long>> compact_sstables(test_env& env, sstring tmpdir_path, std::vector<unsigned long> generations_to_compact,
unsigned long new_generation, bool create_sstables, uint64_t min_sstable_size, compaction_strategy_type strategy) {
BOOST_REQUIRE(smp::count == 1);
schema_builder builder(make_shared_schema({}, some_keyspace, some_column_family,
{{"p1", utf8_type}}, {{"c1", utf8_type}}, {{"r1", utf8_type}}, {}, utf8_type));
builder.set_compressor_params(compression_parameters::no_compression());
builder.set_min_compaction_threshold(4);
auto s = builder.build(schema_builder::compact_storage::no);
column_family_for_tests cf(env.manager(), s);
// State shared between the continuations below lives behind shared pointers.
auto generations = make_lw_shared<std::vector<unsigned long>>(std::move(generations_to_compact));
auto sstables = make_lw_shared<std::vector<sstables::shared_sstable>>();
// Generations handed out by the new_sstable factory; this is the final result.
auto created = make_lw_shared<std::vector<unsigned long>>();
auto f = make_ready_future<>();
// Step 1: fill `sstables` with the compaction input, either opened or freshly written.
return f.then([&env, generations, sstables, s, create_sstables, min_sstable_size, tmpdir_path] () mutable {
if (!create_sstables) {
return open_sstables(env, s, tmpdir_path, *generations).then([sstables] (auto opened_sstables) mutable {
for (auto& sst : opened_sstables) {
sstables->push_back(sst);
}
return make_ready_future<>();
});
}
// `generations` is also captured by the per-element lambda, which keeps the
// vector being iterated alive for the duration of the loop.
return do_for_each(*generations, [&env, generations, sstables, s, min_sstable_size, tmpdir_path] (unsigned long generation) {
auto mt = make_lw_shared<replica::memtable>(s);
const column_definition& r1_col = *s->get_column_definition("r1");
sstring k = "key" + to_sstring(generation);
auto key = partition_key::from_exploded(*s, {to_bytes(k)});
auto c_key = clustering_key::from_exploded(*s, {to_bytes("abc")});
mutation m(s, key);
// The cell value is min_sstable_size 'a' characters.
m.set_clustered_cell(c_key, r1_col, make_atomic_cell(utf8_type, bytes(min_sstable_size, 'a')));
mt->apply(std::move(m));
auto sst = env.make_sstable(s, tmpdir_path, generation, sstables::get_highest_sstable_version(), big);
return write_memtable_to_sstable_for_test(*mt, sst).then([mt, sst, s, sstables] {
return sst->load().then([sst, sstables] {
sstables->push_back(sst);
return make_ready_future<>();
});
});
});
// Step 2: pick the candidates according to `strategy` and run the compaction.
}).then([&env, cf, sstables, new_generation, generations, strategy, created, min_sstable_size, s, tmpdir_path] () mutable {
auto generation = make_lw_shared<unsigned long>(new_generation);
// Output sstable factory: records each generation it hands out in `created`.
auto new_sstable = [&env, generation, created, s, tmpdir_path] {
auto gen = (*generation)++;
created->push_back(gen);
return env.make_sstable(s, tmpdir_path,
gen, sstables::get_highest_sstable_version(), sstables::sstable::format_types::big);
};
// We must have opened at least all original candidates.
BOOST_REQUIRE(generations->size() == sstables->size());
if (strategy == compaction_strategy_type::size_tiered) {
// Calling function that will return a list of sstables to compact based on size-tiered strategy.
int min_threshold = cf->schema()->min_compaction_threshold();
int max_threshold = cf->schema()->max_compaction_threshold();
auto sstables_to_compact = sstables::size_tiered_compaction_strategy::most_interesting_bucket(*sstables, min_threshold, max_threshold);
// We do expect that all candidates were selected for compaction (in this case).
BOOST_REQUIRE(sstables_to_compact.size() == sstables->size());
return compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(std::move(sstables_to_compact),
default_priority_class()), *cf, new_sstable).then([generation] (auto) {});
} else if (strategy == compaction_strategy_type::leveled) {
// Every input is expected to be a level-0 sstable of at least min_sstable_size bytes.
for (auto& sst : *sstables) {
BOOST_REQUIRE(sst->get_sstable_level() == 0);
BOOST_REQUIRE(sst->data_size() >= min_sstable_size);
column_family_test(cf).add_sstable(sst);
}
auto candidates = get_candidates_for_leveled_strategy(*cf);
sstables::size_tiered_compaction_strategy_options stcs_options;
auto table_s = make_table_state_for_test(cf, env);
leveled_manifest manifest = leveled_manifest::create(*table_s, candidates, 1, stcs_options);
std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
auto candidate = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
// All inputs are expected to be promoted to level 1 with a 1MiB sstable size limit.
BOOST_REQUIRE(candidate.sstables.size() == sstables->size());
BOOST_REQUIRE(candidate.level == 1);
BOOST_REQUIRE(candidate.max_sstable_bytes == 1024*1024);
return compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(std::move(candidate.sstables),
default_priority_class(), candidate.level, 1024*1024), *cf, new_sstable).then([generation] (auto) {});
} else {
throw std::runtime_error("unexpected strategy");
}
// NOTE(review): unreachable — every branch above either returns or throws.
return make_ready_future<>();
// Step 3: hand the list of generated sstables over to the caller.
}).then([cf, created] {
return std::move(*created);
// The table is stopped whether or not the steps above succeeded.
}).finally([cf] () mutable {
return cf.stop_and_keep_alive();
});
}
// Size-tiered convenience overload: compacts `generations_to_compact` using a
// minimum sstable size of 50 and requires that the compaction produced exactly
// one sstable, whose generation is `new_generation`.
static future<> compact_sstables(test_env& env, sstring tmpdir_path, std::vector<unsigned long> generations_to_compact, unsigned long new_generation, bool create_sstables = true) {
    constexpr uint64_t min_sstable_size = 50;
    auto compacted = compact_sstables(env, tmpdir_path, std::move(generations_to_compact), new_generation,
            create_sstables, min_sstable_size, compaction_strategy_type::size_tiered);
    return compacted.then([new_generation] (std::vector<unsigned long> created) {
        // size tiered compaction will output at most one sstable, let's assert that.
        BOOST_REQUIRE(created.size() == 1);
        BOOST_REQUIRE(created[0] == new_generation);
    });
}
// Opens the sstable `generation` in `tmpdir_path` and checks that it contains
// exactly one partition per entry of `compacted_generations`, with partition
// keys "key<N>" for each listed generation N.
static future<> check_compacted_sstables(test_env& env, sstring tmpdir_path, unsigned long generation, std::vector<unsigned long> compacted_generations) {
    auto s = make_shared_schema({}, some_keyspace, some_column_family,
        {{"p1", utf8_type}}, {{"c1", utf8_type}}, {{"r1", utf8_type}}, {}, utf8_type);
    auto expected_gens = make_lw_shared<std::vector<unsigned long>>(std::move(compacted_generations));

    return open_sstable(env, s, tmpdir_path, generation).then([&env, s, expected_gens] (shared_sstable sst) {
        auto reader = sstable_reader(sst, s, env.make_reader_permit()); // reader holds sst and s alive.
        auto found_keys = make_lw_shared<std::vector<partition_key>>();

        return with_closeable(std::move(reader), [expected_gens, s, found_keys] (flat_mutation_reader& rd) {
            // Read one mutation per expected generation and remember its key.
            return do_for_each(*expected_gens, [&rd, found_keys] (unsigned long gen) mutable {
                return read_mutation_from_flat_mutation_reader(rd).then([gen, found_keys] (mutation_opt m) {
                    BOOST_REQUIRE(m);
                    found_keys->push_back(m->key());
                });
            }).then([s, found_keys, expected_gens] {
                // keys from the compacted sstable aren't ordered lexicographically,
                // thus we must read all keys into a vector, sort the vector
                // lexicographically, then proceed with the comparison.
                std::sort(found_keys->begin(), found_keys->end(), partition_key::less_compare(*s));
                BOOST_REQUIRE(found_keys->size() == expected_gens->size());

                for (size_t idx = 0; idx < found_keys->size(); ++idx) {
                    sstring original_k = "key" + to_sstring((*expected_gens)[idx]);
                    BOOST_REQUIRE((*found_keys)[idx].equal(*s, partition_key::from_singular(*s, data_value(original_k))));
                }
                return make_ready_future<>();
            });
        });
    });
}
SEASTAR_TEST_CASE(compact_02) {
    // NOTE: generations 18 to 38 are used here.
    // This tests size-tiered compaction strategy by creating 4 sstables of
    // similar size and compacting them to create a new tier.
    // That is done 4 times, which leaves 4 compacted sstables of similar
    // size. Those 4 are then compacted together, and the result is checked
    // to contain all partition keys.
    // Automatic compaction isn't tested here; what is tested is the strategy
    // algorithm that selects candidates for compaction.
    return test_setup::do_with_tmp_directory([] (test_env& env, sstring tmpdir_path) {
        // Each round compacts 4 sstables into 1 using size-tiered strategy to
        // select sstables, and then checks that the output holds all keys of
        // its inputs. E.g.: generations 18, 19, 20 and 21 go into generation 22.
        return compact_sstables(env, tmpdir_path, { 18, 19, 20, 21 }, 22).then([&env, tmpdir_path] {
            return check_compacted_sstables(env, tmpdir_path, 22, { 18, 19, 20, 21 });
        }).then([&env, tmpdir_path] {
            return compact_sstables(env, tmpdir_path, { 23, 24, 25, 26 }, 27);
        }).then([&env, tmpdir_path] {
            return check_compacted_sstables(env, tmpdir_path, 27, { 23, 24, 25, 26 });
        }).then([&env, tmpdir_path] {
            return compact_sstables(env, tmpdir_path, { 28, 29, 30, 31 }, 32);
        }).then([&env, tmpdir_path] {
            return check_compacted_sstables(env, tmpdir_path, 32, { 28, 29, 30, 31 });
        }).then([&env, tmpdir_path] {
            return compact_sstables(env, tmpdir_path, { 33, 34, 35, 36 }, 37);
        }).then([&env, tmpdir_path] {
            return check_compacted_sstables(env, tmpdir_path, 37, { 33, 34, 35, 36 });
        }).then([&env, tmpdir_path] {
            // In this step, we compact the 4 already compacted sstables
            // (opened from disk rather than created).
            return compact_sstables(env, tmpdir_path, { 22, 27, 32, 37 }, 38, false);
        }).then([&env, tmpdir_path] {
            // Check that the compacted sstable contains all keys.
            return check_compacted_sstables(env, tmpdir_path, 38,
                { 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 33, 34, 35, 36 });
        });
    });
}
// Leveled compaction strategy tests

// Registers with `cf` an sstable object of generation `gen` whose size, level,
// max timestamp and first/last keys are set to the given values, and asserts
// that the sstable reports those values back.
static void add_sstable_for_leveled_test(test_env& env, lw_shared_ptr<replica::column_family> cf, int64_t gen, uint64_t fake_data_size,
        uint32_t sstable_level, sstring first_key, sstring last_key, int64_t max_timestamp = 0) {
    auto fake_sst = env.make_sstable(cf->schema(), "", gen, la, big);
    sstables::test(fake_sst).set_values_for_leveled_strategy(fake_data_size, sstable_level, max_timestamp,
            std::move(first_key), std::move(last_key));

    // Sanity-check that the values took effect.
    assert(fake_sst->data_size() == fake_data_size);
    assert(fake_sst->get_sstable_level() == sstable_level);
    assert(fake_sst->get_stats_metadata().max_timestamp == max_timestamp);
    assert(fake_sst->generation() == gen);

    column_family_test(cf).add_sstable(fake_sst);
}
// Creates an sstable object of generation `gen` with the given first/last keys
// and stats metadata, adds it to `cf` and returns it.
static shared_sstable add_sstable_for_overlapping_test(test_env& env, lw_shared_ptr<replica::column_family> cf, int64_t gen, sstring first_key, sstring last_key, stats_metadata stats = {}) {
    auto fake_sst = env.make_sstable(cf->schema(), "", gen, la, big);
    sstables::test(fake_sst).set_values(std::move(first_key), std::move(last_key), std::move(stats));
    column_family_test(cf).add_sstable(fake_sst);
    return fake_sst;
}
// Creates (without adding it to any table) an sstable object of generation
// `gen` at `level` covering [first_key, last_key]; data size and max timestamp
// are both set to 0.
static shared_sstable sstable_for_overlapping_test(test_env& env, const schema_ptr& schema, int64_t gen, sstring first_key, sstring last_key, uint32_t level = 0) {
    auto fake_sst = env.make_sstable(schema, "", gen, la, big);
    sstables::test(fake_sst).set_values_for_leveled_strategy(0, level, 0, std::move(first_key), std::move(last_key));
    return fake_sst;
}
// Takes two key ranges, [a,b] and [c,d], and
// returns true if their token ranges overlap.
static bool key_range_overlaps(column_family_for_tests& cf, sstring a, sstring b, sstring c, sstring d) {
    const dht::i_partitioner& partitioner = cf->schema()->get_partitioner();
    const dht::sharder& sharder = cf->schema()->get_sharder();

    auto first_range = create_token_range_from_keys(sharder, partitioner, a, b);
    auto second_range = create_token_range_from_keys(sharder, partitioner, c, d);
    return first_range.overlaps(second_range, dht::token_comparator());
}
// Looks up the sstable with the given generation among the sstables of `cf`.
// The sstable must exist; this is enforced with assert().
static shared_sstable get_sstable(const lw_shared_ptr<replica::column_family>& cf, int64_t generation) {
    auto all = cf->get_sstables();
    auto has_generation = [generation] (const shared_sstable& sst) {
        return generation == sst->generation();
    };
    auto pos = boost::range::find_if(*all, has_generation);
    assert(pos != all->end());
    assert((*pos)->generation() == generation);
    return *pos;
}
static bool sstable_overlaps(const lw_shared_ptr<replica::column_family>& cf, int64_t gen1, int64_t gen2) {
auto candidate1 = get_sstable(cf, gen1);
auto range1 = range<dht::token>::make(candidate1->get_first_decorated_key()._token, candidate1->get_last_decorated_key()._token);
auto candidate2 = get_sstable(cf, gen2);
auto range2 = range<dht::token>::make(candidate2->get_first_decorated_key()._token, candidate2->get_last_decorated_key()._token);
return range1.overlaps(range2, dht::token_comparator());
}
// Two overlapping level-0 sstables: the manifest is expected to select both of
// them as candidates for a compaction into level 1.
SEASTAR_TEST_CASE(leveled_01) {
    BOOST_REQUIRE_EQUAL(smp::count, 1);
    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        auto keys = token_generation_for_current_shard(50);
        auto min_key = keys[0].first;
        auto second_key = keys[1].first;
        auto max_key = keys[keys.size()-1].first;

        auto max_sstable_size_in_mb = 1;
        auto max_sstable_size = max_sstable_size_in_mb*1024*1024;

        // Creating two sstables whose key ranges overlap.
        add_sstable_for_leveled_test(env, cf, /*gen*/1, max_sstable_size, /*level*/0, min_key, max_key);
        BOOST_REQUIRE(cf->get_sstables()->size() == 1);
        add_sstable_for_leveled_test(env, cf, /*gen*/2, max_sstable_size, /*level*/0, second_key, max_key);
        BOOST_REQUIRE(cf->get_sstables()->size() == 2);

        BOOST_REQUIRE(key_range_overlaps(cf, min_key, max_key, second_key, max_key) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 1, 2) == true);

        auto candidates = get_candidates_for_leveled_strategy(*cf);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);
        leveled_manifest manifest = leveled_manifest::create(*table_s, candidates, max_sstable_size_in_mb, stcs_options);
        BOOST_REQUIRE(manifest.get_level_size(0) == 2);

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        auto candidate = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
        BOOST_REQUIRE(candidate.sstables.size() == 2);
        BOOST_REQUIRE(candidate.level == 1);

        // Every expected generation must show up exactly once, still at level 0.
        std::set<unsigned long> remaining = { 1, 2 };
        for (auto& sst : candidate.sstables) {
            BOOST_REQUIRE(remaining.contains(sst->generation()));
            remaining.erase(sst->generation());
            BOOST_REQUIRE(sst->get_sstable_level() == 0);
        }
        BOOST_REQUIRE(remaining.empty());

        return cf.stop_and_keep_alive();
    });
}
// Three level-0 sstables, of which only generations 1 and 2 overlap: the
// manifest is still expected to select all three for a compaction into level 1.
SEASTAR_TEST_CASE(leveled_02) {
    BOOST_REQUIRE_EQUAL(smp::count, 1);
    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        auto keys = token_generation_for_current_shard(50);
        auto min_key = keys[0].first;
        auto key10 = keys[10].first;
        auto key20 = keys[20].first;
        auto key30 = keys[30].first;
        auto max_key = keys[keys.size()-1].first;

        auto max_sstable_size_in_mb = 1;
        auto max_sstable_size = max_sstable_size_in_mb*1024*1024;

        // Generation 1 will overlap only with generation 2.
        // Remember that for level0, leveled strategy prefers choosing older sstables as candidates.
        add_sstable_for_leveled_test(env, cf, /*gen*/1, max_sstable_size, /*level*/0, min_key, key10);
        BOOST_REQUIRE(cf->get_sstables()->size() == 1);
        add_sstable_for_leveled_test(env, cf, /*gen*/2, max_sstable_size, /*level*/0, min_key, key20);
        BOOST_REQUIRE(cf->get_sstables()->size() == 2);
        add_sstable_for_leveled_test(env, cf, /*gen*/3, max_sstable_size, /*level*/0, key30, max_key);
        BOOST_REQUIRE(cf->get_sstables()->size() == 3);

        // Overlap expectations, first on raw key ranges...
        BOOST_REQUIRE(key_range_overlaps(cf, min_key, key10, min_key, key20) == true);
        BOOST_REQUIRE(key_range_overlaps(cf, min_key, key20, key30, max_key) == false);
        BOOST_REQUIRE(key_range_overlaps(cf, min_key, key10, key30, max_key) == false);
        // ...and then on the sstables registered above.
        BOOST_REQUIRE(sstable_overlaps(cf, 1, 2) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 2, 1) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 1, 3) == false);
        BOOST_REQUIRE(sstable_overlaps(cf, 2, 3) == false);

        auto candidates = get_candidates_for_leveled_strategy(*cf);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);
        leveled_manifest manifest = leveled_manifest::create(*table_s, candidates, max_sstable_size_in_mb, stcs_options);
        BOOST_REQUIRE(manifest.get_level_size(0) == 3);

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        auto candidate = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
        BOOST_REQUIRE(candidate.sstables.size() == 3);
        BOOST_REQUIRE(candidate.level == 1);

        // Every expected generation must show up exactly once, still at level 0.
        std::set<unsigned long> remaining = { 1, 2, 3 };
        for (auto& sst : candidate.sstables) {
            BOOST_REQUIRE(remaining.contains(sst->generation()));
            remaining.erase(sst->generation());
            BOOST_REQUIRE(sst->get_sstable_level() == 0);
        }
        BOOST_REQUIRE(remaining.empty());

        return cf.stop_and_keep_alive();
    });
}
SEASTAR_TEST_CASE(leveled_03) {
    // Two overlapping level-0 sstables, one level-1 sstable that overlaps both
    // of them and one level-1 sstable that overlaps nothing. The manifest is
    // expected to pick generations 1, 2 (L0) and 3 (L1) for a compaction that
    // targets level 1, leaving generation 4 alone.
    BOOST_REQUIRE_EQUAL(smp::count, 1);

    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        auto keys = token_generation_for_current_shard(50);
        const auto& first_key = keys.front().first;
        const auto& last_key = keys.back().first;
        const auto& key10 = keys[10].first;
        const auto& key20 = keys[20].first;
        const auto& key30 = keys[30].first;
        const auto& key40 = keys[40].first;
        const auto one_mb = 1024*1024;

        // Level 0: generations 1 and 2 both start at the first key, so they overlap.
        add_sstable_for_leveled_test(env, cf, /*gen*/1, /*data_size*/one_mb, /*level*/0, first_key, key10);
        add_sstable_for_leveled_test(env, cf, /*gen*/2, /*data_size*/one_mb, /*level*/0, first_key, key20);
        // Level 1: generation 3 covers the range of both level-0 sstables.
        add_sstable_for_leveled_test(env, cf, /*gen*/3, /*data_size*/one_mb, /*level*/1, first_key, key30);
        // Level 1: generation 4 starts after every other sstable ends.
        add_sstable_for_leveled_test(env, cf, /*gen*/4, /*data_size*/one_mb, /*level*/1, key40, last_key);
        BOOST_REQUIRE(cf->get_sstables()->size() == 4);

        // Sanity-check the overlap helpers on the raw key ranges...
        BOOST_REQUIRE(key_range_overlaps(cf, first_key, key10, first_key, key20) == true);
        BOOST_REQUIRE(key_range_overlaps(cf, first_key, key10, first_key, key30) == true);
        BOOST_REQUIRE(key_range_overlaps(cf, first_key, key20, first_key, key30) == true);
        BOOST_REQUIRE(key_range_overlaps(cf, first_key, key10, key40, last_key) == false);
        BOOST_REQUIRE(key_range_overlaps(cf, first_key, key30, key40, last_key) == false);
        // ...and on the sstables themselves, addressed by generation.
        BOOST_REQUIRE(sstable_overlaps(cf, 1, 2) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 1, 3) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 2, 3) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 1, 4) == false);
        BOOST_REQUIRE(sstable_overlaps(cf, 2, 4) == false);
        BOOST_REQUIRE(sstable_overlaps(cf, 3, 4) == false);

        const int max_sstable_size_in_mb = 1;
        auto all_sstables = get_candidates_for_leveled_strategy(*cf);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);
        leveled_manifest manifest = leveled_manifest::create(*table_s, all_sstables, max_sstable_size_in_mb, stcs_options);
        BOOST_REQUIRE(manifest.get_level_size(0) == 2);
        BOOST_REQUIRE(manifest.get_level_size(1) == 2);

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        auto job = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
        BOOST_REQUIRE(job.sstables.size() == 3);
        BOOST_REQUIRE(job.level == 1);

        // Every selected sstable must match one expected (generation, level)
        // entry, and every expected entry must be consumed exactly once.
        using gen_level = std::pair<unsigned long, uint32_t>;
        std::set<gen_level> expected = { {1,0}, {2,0}, {3,1} };
        for (auto& selected : job.sstables) {
            gen_level entry(selected->generation(), selected->get_sstable_level());
            auto found = expected.find(entry);
            BOOST_REQUIRE(found != expected.end());
            BOOST_REQUIRE(selected->get_sstable_level() == found->second);
            expected.erase(found);
        }
        BOOST_REQUIRE(expected.empty());

        return cf.stop_and_keep_alive();
    });
}
SEASTAR_TEST_CASE(leveled_04) {
    // Level 1 is filled beyond its byte budget while level 2 stays within its
    // own. The manifest is expected to compact one level-1 sstable together
    // with the overlapping level-2 sstable, targeting level 2.
    BOOST_REQUIRE_EQUAL(smp::count, 1);

    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        auto keys = token_generation_for_current_shard(50);
        const auto& first_key = keys.front().first;
        const auto& last_key = keys.back().first;

        const int max_sstable_size_in_mb = 1;
        const int max_sstable_size_in_bytes = max_sstable_size_in_mb*1024*1024;

        // A single level-0 sstable spanning the whole key range.
        add_sstable_for_leveled_test(env, cf, /*gen*/1, /*data_size*/max_sstable_size_in_bytes, /*level*/0, first_key, last_key);

        // Two big level-1 sstables, each as large as the whole level is allowed
        // to be, so that level 1 needs compaction.
        // NOTE: SSTables in level1 cannot overlap.
        auto l1_budget = leveled_manifest::max_bytes_for_level(1, max_sstable_size_in_bytes);
        add_sstable_for_leveled_test(env, cf, /*gen*/2, /*data_size*/l1_budget, /*level*/1, first_key, keys[25].first);
        add_sstable_for_leveled_test(env, cf, /*gen*/3, /*data_size*/l1_budget, /*level*/1, keys[26].first, last_key);

        // A level-2 sstable overlapping both level-1 sstables, so that a
        // level-1 compaction has to pull it in.
        add_sstable_for_leveled_test(env, cf, /*gen*/4, /*data_size*/max_sstable_size_in_bytes, /*level*/2, first_key, last_key);

        BOOST_REQUIRE(cf->get_sstables()->size() == 4);
        BOOST_REQUIRE(key_range_overlaps(cf, first_key, last_key, first_key, last_key) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 1, 2) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 1, 3) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 2, 3) == false);
        BOOST_REQUIRE(sstable_overlaps(cf, 3, 4) == true);
        BOOST_REQUIRE(sstable_overlaps(cf, 2, 4) == true);

        auto all_sstables = get_candidates_for_leveled_strategy(*cf);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);
        leveled_manifest manifest = leveled_manifest::create(*table_s, all_sstables, max_sstable_size_in_mb, stcs_options);
        BOOST_REQUIRE(manifest.get_level_size(0) == 1);
        BOOST_REQUIRE(manifest.get_level_size(1) == 2);
        BOOST_REQUIRE(manifest.get_level_size(2) == 1);

        // Score = bytes stored in a level / bytes allowed for that level; it is
        // what decides which level gets compacted.
        const auto l1_bytes = static_cast<double>(manifest.get_total_bytes(manifest.get_level(1)));
        const auto l1_limit = static_cast<double>(manifest.max_bytes_for_level(1));
        auto level1_score = l1_bytes / l1_limit;
        BOOST_REQUIRE(level1_score > 1.001);
        const auto l2_bytes = static_cast<double>(manifest.get_total_bytes(manifest.get_level(2)));
        const auto l2_limit = static_cast<double>(manifest.max_bytes_for_level(2));
        auto level2_score = l2_bytes / l2_limit;
        BOOST_REQUIRE(level2_score < 1.001);

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        auto job = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
        BOOST_REQUIRE(job.sstables.size() == 2);
        BOOST_REQUIRE(job.level == 2);

        // Exactly one sstable from level 1 and one from level 2 must be selected.
        std::set<unsigned long> pending_levels = { 1, 2 };
        for (auto& selected : job.sstables) {
            BOOST_REQUIRE(pending_levels.contains(selected->get_sstable_level()));
            pending_levels.erase(selected->get_sstable_level());
        }
        BOOST_REQUIRE(pending_levels.empty());

        return cf.stop_and_keep_alive();
    });
}
SEASTAR_TEST_CASE(leveled_05) {
    // Runs the compaction code with the leveled strategy on two level-0
    // sstables and checks the generations and sizes of the output.
    // NOTE: Generations from 48 to 51 are used here.
    return test_setup::do_with_tmp_directory([] (test_env& env, sstring tmpdir_path) {
        // Inputs are generations 48 and 49; the first output generation is 50.
        return compact_sstables(env, tmpdir_path, { 48, 49 }, 50, true, 1024*1024, compaction_strategy_type::leveled).then([tmpdir_path] (auto generations) {
            BOOST_REQUIRE(generations.size() == 2);
            BOOST_REQUIRE(generations[0] == 50);
            BOOST_REQUIRE(generations[1] == 51);

            // Everything the deferred body needs is captured by value. There is
            // deliberately no by-reference capture-default: this lambda runs
            // after the enclosing continuation has returned.
            return seastar::async([generations = std::move(generations), tmpdir_path] {
                for (auto gen : generations) {
                    auto fname = sstable::filename(tmpdir_path, "ks", "cf", sstables::get_highest_sstable_version(), gen, big, component_type::Data);
                    // Each output Data component must be at least 1MB in size.
                    BOOST_REQUIRE(file_size(fname).get0() >= 1024*1024);
                }
            });
        });
    });
}
SEASTAR_TEST_CASE(leveled_06) {
    // A lone, oversized level-1 sstable must be selected for compaction into
    // an (empty) level 2.
    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        const int max_sstable_size_in_mb = 1;
        const int max_sstable_size_in_bytes = max_sstable_size_in_mb*1024*1024;
        auto l1_budget = leveled_manifest::max_bytes_for_level(1, max_sstable_size_in_bytes);

        // Fake sstable twice as large as the level-1 budget; it is the one
        // expected to move to L2.
        add_sstable_for_leveled_test(env, cf, /*gen*/1, /*data_size*/l1_budget*2, /*level*/1, "a", "a");
        BOOST_REQUIRE(cf->get_sstables()->size() == 1);

        auto all_sstables = get_candidates_for_leveled_strategy(*cf);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);
        leveled_manifest manifest = leveled_manifest::create(*table_s, all_sstables, max_sstable_size_in_mb, stcs_options);
        BOOST_REQUIRE(manifest.get_level_size(0) == 0);
        BOOST_REQUIRE(manifest.get_level_size(1) == 1);
        BOOST_REQUIRE(manifest.get_level_size(2) == 0);

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        auto job = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
        BOOST_REQUIRE(job.level == 2);
        BOOST_REQUIRE(job.sstables.size() == 1);

        // The single selected sstable is the level-1, generation-1 one.
        auto& selected = job.sstables[0];
        BOOST_REQUIRE(selected->get_sstable_level() == 1);
        BOOST_REQUIRE(selected->generation() == 1);

        return cf.stop_and_keep_alive();
    });
}
SEASTAR_TEST_CASE(leveled_07) {
    // With twice MAX_COMPACTING_L0 sstables in level 0, the manifest must
    // select only MAX_COMPACTING_L0 of them, preferring the oldest ones.
    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        // The loop index doubles as the generation and as the max timestamp,
        // so a lower generation means an older sstable.
        const auto total_l0 = leveled_manifest::MAX_COMPACTING_L0*2;
        for (auto idx = 0; idx < total_l0; ++idx) {
            add_sstable_for_leveled_test(env, cf, idx, 1024*1024, /*level*/0, "a", "a", idx /* max timestamp */);
        }

        auto all_sstables = get_candidates_for_leveled_strategy(*cf);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);
        leveled_manifest manifest = leveled_manifest::create(*table_s, all_sstables, 1, stcs_options);

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        auto job = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
        BOOST_REQUIRE(job.level == 1);
        BOOST_REQUIRE(job.sstables.size() == leveled_manifest::MAX_COMPACTING_L0);

        // Only sstables from the older half (timestamps below
        // MAX_COMPACTING_L0) may have been selected.
        for (auto& selected : job.sstables) {
            BOOST_REQUIRE(selected->get_stats_metadata().max_timestamp < leveled_manifest::MAX_COMPACTING_L0);
        }

        return cf.stop_and_keep_alive();
    });
}
SEASTAR_TEST_CASE(leveled_invariant_fix) {
    // Level 1 is populated with single-key sstables plus one wide sstable that
    // overlaps all of them except generation 0. The manifest is expected to
    // select every level-1 sstable but generation 0, targeting level 1.
    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        auto sstable_count = cf.schema()->max_compaction_threshold();
        auto keys = token_generation_for_current_shard(sstable_count);
        const auto& first_key = keys.front().first;
        const auto& last_key = keys.back().first;
        const auto sstable_bytes = 1024*1024;

        // Generation 0 holds only the first key and overlaps nothing else, so
        // the strategy is expected to leave it out.
        add_sstable_for_leveled_test(env, cf, 0, sstable_bytes, /*level*/1, first_key, first_key);
        // One single-key sstable per remaining key (the last key excluded).
        for (auto gen = 1; gen < sstable_count-1; ++gen) {
            const auto& key = keys[gen].first;
            add_sstable_for_leveled_test(env, cf, gen, sstable_bytes, /*level*/1, key, key);
        }
        // Wide level-1 sstable overlapping every sstable added by the loop above.
        add_sstable_for_leveled_test(env, cf, sstable_count, sstable_bytes, 1, keys[1].first, last_key);

        auto all_sstables = get_candidates_for_leveled_strategy(*cf);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);
        leveled_manifest manifest = leveled_manifest::create(*table_s, all_sstables, 1, stcs_options);

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        auto job = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
        BOOST_REQUIRE(job.level == 1);
        BOOST_REQUIRE(job.sstables.size() == size_t(sstable_count-1));
        // Generation 0 must not be part of the job.
        BOOST_REQUIRE(boost::algorithm::all_of(job.sstables, [] (auto& selected) {
            return selected->generation() != 0;
        }));

        return cf.stop_and_keep_alive();
    });
}
SEASTAR_TEST_CASE(leveled_stcs_on_L0) {
    // When level 0 is not worth promoting, the manifest is expected to return
    // a level-0 job made of the level-0 sstables only. With too few
    // candidates it must return an empty level-0 job.
    return test_env::do_with([] (test_env& env) {
        schema_builder builder(make_shared_schema({}, some_keyspace, some_column_family,
            {{"p1", utf8_type}}, {}, {}, {}, utf8_type));
        builder.set_min_compaction_threshold(4);
        auto s = builder.build(schema_builder::compact_storage::no);
        column_family_for_tests cf(env.manager(), s);

        auto keys = token_generation_for_current_shard(1);
        const auto& only_key = keys[0].first;
        const int sstable_max_size_in_mb = 1;
        const auto sstable_max_size_in_bytes = sstable_max_size_in_mb*1024*1024;
        auto l0_count = s->min_compaction_threshold();
        // Keep the whole of level 0 below one full sstable: we don't want
        // level 0 to be worth promoting.
        auto l0_sstable_bytes = sstable_max_size_in_bytes/(l0_count+1);

        // Generation 0 lives in level 1; generations 1..l0_count live in level 0.
        add_sstable_for_leveled_test(env, cf, 0, sstable_max_size_in_bytes, /*level*/1, only_key, only_key);
        for (auto gen = 1; gen <= l0_count; ++gen) {
            add_sstable_for_leveled_test(env, cf, gen, l0_sstable_bytes, /*level*/0, only_key, only_key);
        }

        auto all_sstables = get_candidates_for_leveled_strategy(*cf);
        BOOST_REQUIRE(all_sstables.size() == size_t(l0_count+1));
        BOOST_REQUIRE(cf->get_sstables()->size() == size_t(l0_count+1));

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);

        {
            // All candidates present: the job holds every level-0 sstable and
            // never the level-1 one (generation 0).
            leveled_manifest manifest = leveled_manifest::create(*table_s, all_sstables, sstable_max_size_in_mb, stcs_options);
            BOOST_REQUIRE(!manifest.worth_promoting_L0_candidates(manifest.get_level(0)));
            auto job = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
            BOOST_REQUIRE(job.level == 0);
            BOOST_REQUIRE(job.sstables.size() == size_t(l0_count));
            BOOST_REQUIRE(boost::algorithm::all_of(job.sstables, [] (auto& selected) {
                return selected->generation() != 0;
            }));
        }
        {
            // Only two candidates left: nothing is selected.
            all_sstables.resize(2);
            leveled_manifest manifest = leveled_manifest::create(*table_s, all_sstables, sstable_max_size_in_mb, stcs_options);
            auto job = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
            BOOST_REQUIRE(job.level == 0);
            BOOST_REQUIRE(job.sstables.empty());
        }

        return cf.stop_and_keep_alive();
    });
}
SEASTAR_TEST_CASE(overlapping_starved_sstables_test) {
    // A level-1 compaction that also pulls in a level-2 sstable is expected to
    // additionally include a "starved" level-3 sstable, giving three sstables
    // targeted at level 2.
    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        auto keys = token_generation_for_current_shard(5);
        const auto& first_key = keys[0].first;
        const int max_sstable_size_in_mb = 1;
        const int max_sstable_size_in_bytes = max_sstable_size_in_mb*1024*1024;

        // we compact 2 sstables: 0->2 in L1 and 0->1 in L2, and rely on strategy
        // to bring a sstable from level 3 that theoretically wasn't compacted
        // for many rounds and won't introduce an overlap.
        auto l1_budget = leveled_manifest::max_bytes_for_level(1, max_sstable_size_in_bytes);
        add_sstable_for_leveled_test(env, cf, /*gen*/1, l1_budget*1.1, /*level*/1, first_key, keys[2].first);
        add_sstable_for_leveled_test(env, cf, /*gen*/2, max_sstable_size_in_bytes, /*level*/2, first_key, keys[1].first);
        add_sstable_for_leveled_test(env, cf, /*gen*/3, max_sstable_size_in_bytes, /*level*/3, first_key, keys[1].first);

        std::vector<std::optional<dht::decorated_key>> last_compacted_keys(leveled_manifest::MAX_LEVELS);
        std::vector<int> compaction_counter(leveled_manifest::MAX_LEVELS);
        // Push level 3's counter past the limit so that the strategy treats it
        // as not having been compacted for many rounds.
        compaction_counter[3] = leveled_manifest::NO_COMPACTION_LIMIT+1;

        auto all_sstables = get_candidates_for_leveled_strategy(*cf);
        sstables::size_tiered_compaction_strategy_options stcs_options;
        auto table_s = make_table_state_for_test(cf, env);
        leveled_manifest manifest = leveled_manifest::create(*table_s, all_sstables, max_sstable_size_in_mb, stcs_options);

        auto job = manifest.get_compaction_candidates(last_compacted_keys, compaction_counter);
        BOOST_REQUIRE(job.level == 2);
        BOOST_REQUIRE(job.sstables.size() == 3);

        return cf.stop_and_keep_alive();
    });
}
SEASTAR_TEST_CASE(check_overlapping) {
    // leveled_manifest::overlapping() must return, out of the non-compacting
    // sstables, only those whose key range intersects the compacting set.
    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());

        auto keys = token_generation_for_current_shard(4);
        const auto& first_key = keys.front().first;
        const auto& last_key = keys.back().first;

        // Generations 1 and 2 cover the low end of the key range, generation 3
        // the high end only, and generation 4 the whole range.
        auto low_a = add_sstable_for_overlapping_test(env, cf, /*gen*/1, first_key, keys[1].first);
        auto low_b = add_sstable_for_overlapping_test(env, cf, /*gen*/2, first_key, keys[2].first);
        auto high = add_sstable_for_overlapping_test(env, cf, /*gen*/3, keys[3].first, last_key);
        auto full = add_sstable_for_overlapping_test(env, cf, /*gen*/4, first_key, last_key);
        BOOST_REQUIRE(cf->get_sstables()->size() == 4);

        std::vector<shared_sstable> compacting = { low_a, low_b };
        std::vector<shared_sstable> uncompacting = { high, full };

        // Only generation 4 intersects the compacting set.
        auto overlapped = leveled_manifest::overlapping(*cf.schema(), compacting, uncompacting);
        BOOST_REQUIRE(overlapped.size() == 1);
        BOOST_REQUIRE(overlapped.front()->generation() == 4);

        return cf.stop_and_keep_alive();
    });
}
// Exercises purging of tombstones and expired cells during compaction, for
// several combinations of sstables being compacted vs. merely present in the
// table. Each scenario builds its sstables, jumps the clocks forward by `ttl`
// seconds, compacts, and checks the content of the compaction output.
SEASTAR_TEST_CASE(tombstone_purge_test) {
BOOST_REQUIRE(smp::count == 1);
return test_env::do_with_async([] (test_env& env) {
cell_locker_stats cl_stats;
// In a column family with gc_grace_seconds set to 0, check that a tombstone
// is purged after compaction.
auto builder = schema_builder("tests", "tombstone_purge")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type);
builder.set_gc_grace_seconds(0);
auto s = builder.build();
auto tmp = tmpdir();
// Creates a new sstable object in the temporary directory, using consecutive
// generations starting at 1.
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
};
// Builds a fresh table holding every sstable in `all`, compacts only the
// sstables in `to_compact` and returns the sstables produced by the compaction.
auto compact = [&, s] (std::vector<shared_sstable> all, std::vector<shared_sstable> to_compact) -> std::vector<shared_sstable> {
column_family_for_tests cf(env.manager(), s);
auto stop_cf = deferred_stop(cf);
for (auto&& sst : all) {
column_family_test(cf).add_sstable(sst);
}
return compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(to_compact, default_priority_class()), *cf, sst_gen).get0().new_sstables;
};
// Hands out strictly increasing write timestamps (1, 2, 3, ...), so that a
// mutation created later always has a higher timestamp than an earlier one.
auto next_timestamp = [] {
static thread_local api::timestamp_type next = 1;
return next++;
};
// Live write of value=1 to the given partition, stamped with the next timestamp.
auto make_insert = [&] (partition_key key) {
mutation m(s, key);
m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), next_timestamp());
return m;
};
// Write of value=1 with a TTL of `ttl` gc_clock ticks. Note that the write
// timestamp comes from gc_clock::now() rather than from next_timestamp().
auto make_expiring = [&] (partition_key key, int ttl) {
mutation m(s, key);
m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)),
gc_clock::now().time_since_epoch().count(), gc_clock::duration(ttl));
return m;
};
// Partition tombstone stamped with the next timestamp and a deletion time of now.
auto make_delete = [&] (partition_key key) {
mutation m(s, key);
tombstone tomb(next_timestamp(), gc_clock::now());
m.partition().apply(tomb);
return m;
};
// Asserts that `sst` holds exactly one partition (`key`) with exactly one
// clustered row whose single "value" cell is dead, followed by end of stream.
auto assert_that_produces_dead_cell = [&] (auto& sst, partition_key& key) {
auto reader = make_lw_shared<flat_mutation_reader>(sstable_reader(sst, s, env.make_reader_permit()));
read_mutation_from_flat_mutation_reader(*reader).then([reader, s, &key] (mutation_opt m) {
BOOST_REQUIRE(m);
BOOST_REQUIRE(m->key().equal(*s, key));
auto rows = m->partition().clustered_rows();
BOOST_REQUIRE_EQUAL(rows.calculate_size(), 1);
auto& row = rows.begin()->row();
auto& cells = row.cells();
BOOST_REQUIRE_EQUAL(cells.size(), 1);
auto& cdef = *s->get_column_definition("value");
BOOST_REQUIRE(!cells.cell_at(cdef.id).as_atomic_cell(cdef).is_live());
return (*reader)();
}).then([reader, s] (mutation_fragment_opt m) {
BOOST_REQUIRE(!m);
}).finally([reader] {
return reader->close();
}).get();
};
auto alpha = partition_key::from_exploded(*s, {to_bytes("alpha")});
auto beta = partition_key::from_exploded(*s, {to_bytes("beta")});
auto ttl = 10;
{
// alpha is inserted and then deleted in a different sstable; both sstables
// are compacted together. Only the beta insert is expected in the output.
auto mut1 = make_insert(alpha);
auto mut2 = make_insert(beta);
auto mut3 = make_delete(alpha);
std::vector<shared_sstable> sstables = {
make_sstable_containing(sst_gen, {mut1, mut2}),
make_sstable_containing(sst_gen, {mut3})
};
forward_jump_clocks(std::chrono::seconds(ttl));
auto result = compact(sstables, sstables);
BOOST_REQUIRE_EQUAL(1, result.size());
assert_that(sstable_reader(result[0], s, env.make_reader_permit()))
.produces(mut2)
.produces_end_of_stream();
}
{
// Only sst2 (insert + delete of alpha) is compacted while sst1, holding an
// older insert of alpha, stays in the table. The tombstone (mut3) is expected
// to survive in the output.
auto mut1 = make_insert(alpha);
auto mut2 = make_insert(alpha);
auto mut3 = make_delete(alpha);
auto sst1 = make_sstable_containing(sst_gen, {mut1});
auto sst2 = make_sstable_containing(sst_gen, {mut2, mut3});
forward_jump_clocks(std::chrono::seconds(ttl));
auto result = compact({sst1, sst2}, {sst2});
BOOST_REQUIRE_EQUAL(1, result.size());
assert_that(sstable_reader(result[0], s, env.make_reader_permit()))
.produces(mut3)
.produces_end_of_stream();
}
{
// Only sst1 is compacted; the non-compacted sst2 holds an insert of alpha
// created after the tombstone. Only the beta insert is expected in the
// output, i.e. the alpha tombstone is gone.
auto mut1 = make_insert(alpha);
auto mut2 = make_delete(alpha);
auto mut3 = make_insert(beta);
auto mut4 = make_insert(alpha);
auto sst1 = make_sstable_containing(sst_gen, {mut1, mut2, mut3});
auto sst2 = make_sstable_containing(sst_gen, {mut4});
forward_jump_clocks(std::chrono::seconds(ttl));
auto result = compact({sst1, sst2}, {sst1});
BOOST_REQUIRE_EQUAL(1, result.size());
assert_that(sstable_reader(result[0], s, env.make_reader_permit()))
.produces(mut3)
.produces_end_of_stream();
}
{
// Same as above, except that the non-compacted sst2 holds a later insert of
// beta instead of alpha. The expected output is again the beta insert from
// sst1 alone.
auto mut1 = make_insert(alpha);
auto mut2 = make_delete(alpha);
auto mut3 = make_insert(beta);
auto mut4 = make_insert(beta);
auto sst1 = make_sstable_containing(sst_gen, {mut1, mut2, mut3});
auto sst2 = make_sstable_containing(sst_gen, {mut4});
forward_jump_clocks(std::chrono::seconds(ttl));
auto result = compact({sst1, sst2}, {sst1});
BOOST_REQUIRE_EQUAL(1, result.size());
assert_that(sstable_reader(result[0], s, env.make_reader_permit()))
.produces(mut3)
.produces_end_of_stream();
}
{
// check that expired cell will not be purged if it will resurrect overwritten data.
auto mut1 = make_insert(alpha);
auto mut2 = make_expiring(alpha, ttl);
auto sst1 = make_sstable_containing(sst_gen, {mut1});
auto sst2 = make_sstable_containing(sst_gen, {mut2});
forward_jump_clocks(std::chrono::seconds(ttl));
// Compacting sst2 alone must keep the (now dead) cell around...
auto result = compact({sst1, sst2}, {sst2});
BOOST_REQUIRE_EQUAL(1, result.size());
assert_that_produces_dead_cell(result[0], alpha);
// ...whereas compacting both sstables together leaves no output at all.
result = compact({sst1, sst2}, {sst1, sst2});
BOOST_REQUIRE_EQUAL(0, result.size());
}
{
// The expired cell belongs to beta while the non-compacted sst1 only holds
// alpha: compacting sst2 alone is expected to produce no output.
auto mut1 = make_insert(alpha);
auto mut2 = make_expiring(beta, ttl);
auto sst1 = make_sstable_containing(sst_gen, {mut1});
auto sst2 = make_sstable_containing(sst_gen, {mut2});
forward_jump_clocks(std::chrono::seconds(ttl));
auto result = compact({sst1, sst2}, {sst2});
BOOST_REQUIRE_EQUAL(0, result.size());
}
{
// Both sstables are compacted: alpha has an insert plus an expired cell, beta
// a plain insert. Only the beta insert is expected in the output.
auto mut1 = make_insert(alpha);
auto mut2 = make_expiring(alpha, ttl);
auto mut3 = make_insert(beta);
auto sst1 = make_sstable_containing(sst_gen, {mut1});
auto sst2 = make_sstable_containing(sst_gen, {mut2, mut3});
forward_jump_clocks(std::chrono::seconds(ttl));
auto result = compact({sst1, sst2}, {sst1, sst2});
BOOST_REQUIRE_EQUAL(1, result.size());
assert_that(sstable_reader(result[0], s, env.make_reader_permit()))
.produces(mut3)
.produces_end_of_stream();
}
});
}
// Writes a memtable holding a single partition to an sstable (generation 51),
// compacts that one sstable into a new one (generation 52) and verifies that
// the rewritten sstable contains exactly that partition.
SEASTAR_TEST_CASE(sstable_rewrite) {
BOOST_REQUIRE(smp::count == 1);
return test_setup::do_with_tmp_directory([] (test_env& env, sstring tmpdir_path) {
auto s = make_shared_schema({}, some_keyspace, some_column_family,
{{"p1", utf8_type}}, {{"c1", utf8_type}}, {{"r1", utf8_type}}, {}, utf8_type);
auto mt = make_lw_shared<replica::memtable>(s);
const column_definition& r1_col = *s->get_column_definition("r1");
auto key_for_this_shard = token_generation_for_current_shard(1);
// Applies to the memtable a mutation setting r1="a" in row "c1" of the given partition.
auto apply_key = [mt, s, &r1_col] (sstring key_to_write) {
auto key = partition_key::from_exploded(*s, {to_bytes(key_to_write)});
auto c_key = clustering_key::from_exploded(*s, {to_bytes("c1")});
mutation m(s, key);
m.set_clustered_cell(c_key, r1_col, make_atomic_cell(utf8_type, bytes("a")));
mt->apply(std::move(m));
};
apply_key(key_for_this_shard[0].first);
auto sst = env.make_sstable(s, tmpdir_path, 51, sstables::get_highest_sstable_version(), big);
// Flush the memtable to generation 51, then load that sstable back from disk.
return write_memtable_to_sstable_for_test(*mt, sst).then([&env, s, sst, tmpdir_path] {
return env.reusable_sst(s, tmpdir_path, 51);
}).then([&env, s, key = key_for_this_shard[0].first, tmpdir_path] (auto sstp) mutable {
// Every sstable handed out by `creator` is recorded in `new_tables` so the
// compaction output can be inspected afterwards.
auto new_tables = make_lw_shared<std::vector<sstables::shared_sstable>>();
auto creator = [&env, new_tables, s, tmpdir_path] {
auto sst = env.make_sstable(s, tmpdir_path, 52, sstables::get_highest_sstable_version(), big);
new_tables->emplace_back(sst);
return sst;
};
column_family_for_tests cf(env.manager(), s);
std::vector<shared_sstable> sstables;
sstables.push_back(std::move(sstp));
return compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(std::move(sstables), default_priority_class()), *cf, creator).then([&env, s, key, new_tables] (auto) {
// Exactly one output sstable, with generation 52.
BOOST_REQUIRE(new_tables->size() == 1);
auto newsst = (*new_tables)[0];
BOOST_REQUIRE(newsst->generation() == 52);
auto reader = make_lw_shared<flat_mutation_reader>(sstable_reader(newsst, s, env.make_reader_permit()));
// The first fragment must be the partition start of the key written above;
// after skipping to the next partition the reader must be at end of stream.
return (*reader)().then([s, reader, key] (mutation_fragment_opt m) {
BOOST_REQUIRE(m);
BOOST_REQUIRE(m->is_partition_start());
auto pkey = partition_key::from_exploded(*s, {to_bytes(key)});
BOOST_REQUIRE(m->as_partition_start().key().key().equal(*s, pkey));
return reader->next_partition();
}).then([reader] {
return (*reader)();
}).then([reader] (mutation_fragment_opt m) {
BOOST_REQUIRE(!m);
}).finally([reader] {
return reader->close();
});
// The table is stopped whether or not the checks above succeeded.
}).finally([cf] () mutable { return cf.stop_and_keep_alive(); });
// NOTE(review): the empty continuation below presumably exists only to keep
// sst, mt and s alive until the whole chain has completed - confirm.
}).then([sst, mt, s] {});
});
}
SEASTAR_TEST_CASE(test_sstable_max_local_deletion_time_2) {
    // Create sstable A with 5x column with TTL 100 and 1x column with TTL 1000
    // Create sstable B with tombstone for column in sstable A with TTL 1000.
    // Compact them and expect that maximum deletion time is that of column with TTL 100.
    // The scenario is repeated for every writable sstable version.
    return test_setup::do_with_tmp_directory([] (test_env& env, sstring tmpdir_path) {
        return seastar::async([&env, tmpdir_path] {
            for (auto version : writable_sstable_versions) {
                schema_builder builder(some_keyspace, some_column_family);
                builder.with_column("p1", utf8_type, column_kind::partition_key);
                builder.with_column("c1", utf8_type, column_kind::clustering_key);
                builder.with_column("r1", utf8_type);
                schema_ptr s = builder.build(schema_builder::compact_storage::no);
                column_family_for_tests cf(env.manager(), s);
                auto close_cf = deferred_stop(cf);
                auto mt = make_lw_shared<replica::memtable>(s);
                auto now = gc_clock::now();
                int32_t last_expiry = 0;

                // Adds to `m` an expiring cell in row `column_name`, applies `m`
                // to the current memtable and remembers the cell's expiry in
                // `last_expiry`.
                auto add_row = [&now, &mt, &s, &last_expiry](mutation &m, bytes column_name, uint32_t ttl) {
                    auto c_key = clustering_key::from_exploded(*s, {column_name});
                    last_expiry = (now + gc_clock::duration(ttl)).time_since_epoch().count();
                    m.set_clustered_cell(c_key, *s->get_column_definition("r1"),
                        make_atomic_cell(utf8_type, bytes(""), ttl, last_expiry));
                    // `m` is owned by the caller, which keeps adding rows to it
                    // and re-applying it, so it must not be moved from here.
                    mt->apply(m);
                };

                // Flushes `mt` to an sstable with generation `gen` and loads
                // that sstable back from disk.
                auto get_usable_sst = [&env, s, tmpdir_path, version](replica::memtable &mt, int64_t gen) -> future<sstable_ptr> {
                    auto sst = env.make_sstable(s, tmpdir_path, gen, version, big);
                    return write_memtable_to_sstable_for_test(mt, sst).then([&env, sst, gen, s, tmpdir_path, version] {
                        return env.reusable_sst(s, tmpdir_path, gen, version);
                    });
                };

                // sstable A (generation 54): five TTL-100 cells and one
                // TTL-1000 cell. The TTL-1000 cell is added last, so
                // `last_expiry` ends up holding its expiry.
                mutation m(s, partition_key::from_exploded(*s, {to_bytes("deletetest")}));
                for (auto i = 0; i < 5; i++) {
                    add_row(m, to_bytes("deletecolumn" + to_sstring(i)), 100);
                }
                add_row(m, to_bytes("todelete"), 1000);
                auto sst1 = get_usable_sst(*mt, 54).get0();
                BOOST_REQUIRE(last_expiry == sst1->get_stats_metadata().max_local_deletion_time);

                // sstable B (generation 55): a row tombstone for the row that
                // holds the TTL-1000 cell, with deletion time `now`.
                mt = make_lw_shared<replica::memtable>(s);
                m = mutation(s, partition_key::from_exploded(*s, {to_bytes("deletetest")}));
                tombstone tomb(api::new_timestamp(), now);
                m.partition().apply_delete(*s, clustering_key::from_exploded(*s, {to_bytes("todelete")}), tomb);
                mt->apply(std::move(m));
                auto sst2 = get_usable_sst(*mt, 55).get0();
                BOOST_REQUIRE(now.time_since_epoch().count() == sst2->get_stats_metadata().max_local_deletion_time);

                // Compact A and B together; output generations start at 56.
                auto creator = [&env, s, tmpdir_path, version, gen = make_lw_shared<unsigned>(56)] { return env.make_sstable(s, tmpdir_path, (*gen)++, version, big); };
                auto info = compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor({sst1, sst2}, default_priority_class()), *cf, creator).get0();
                BOOST_REQUIRE(info.new_sstables.size() == 1);
                // The largest deletion time left in the output must be the
                // expiry of the TTL-100 cells.
                BOOST_REQUIRE(((now + gc_clock::duration(100)).time_since_epoch().count()) ==
                    info.new_sstables.front()->get_stats_metadata().max_local_deletion_time);
            }
        });
    });
}
// Returns a value-initialized stats_metadata in which only the timestamp range
// and the maximum local deletion time are filled in from the arguments.
static stats_metadata build_stats(int64_t min_timestamp, int64_t max_timestamp, int32_t max_local_deletion_time) {
    stats_metadata result = {};
    result.max_local_deletion_time = max_local_deletion_time;
    result.max_timestamp = max_timestamp;
    result.min_timestamp = min_timestamp;
    return result;
}
SEASTAR_TEST_CASE(get_fully_expired_sstables_test) {
    // Checks which of the compacting sstables get_fully_expired_sstables()
    // reports, for two different timestamp layouts of the sstables in the table.
    return test_env::do_with_async([] (test_env& env) {
        auto keys = token_generation_for_current_shard(4);
        const auto& first_key = keys.front().first;
        const auto& last_key = keys.back().first;

        // Reference points in time, expressed as gc_clock tick counts.
        auto t0 = gc_clock::from_time_t(1).time_since_epoch().count();
        auto t1 = gc_clock::from_time_t(10).time_since_epoch().count();
        auto t2 = gc_clock::from_time_t(15).time_since_epoch().count();
        auto t3 = gc_clock::from_time_t(20).time_since_epoch().count();
        auto t4 = gc_clock::from_time_t(30).time_since_epoch().count();
        // Deletion time used for the sstables that are not meant to expire.
        const auto no_deletion = std::numeric_limits<int32_t>::max();

        {
            // Generation 2 overlaps generation 1 and starts at the same
            // minimum timestamp (t0): nothing is reported as fully expired.
            column_family_for_tests cf(env.manager());
            auto close_cf = deferred_stop(cf);
            auto gen1 = add_sstable_for_overlapping_test(env, cf, /*gen*/1, first_key, keys[1].first, build_stats(t0, t1, t1));
            auto gen2 = add_sstable_for_overlapping_test(env, cf, /*gen*/2, first_key, keys[2].first, build_stats(t0, t1, no_deletion));
            auto gen3 = add_sstable_for_overlapping_test(env, cf, /*gen*/3, first_key, last_key, build_stats(t3, t4, no_deletion));
            std::vector<sstables::shared_sstable> compacting = { gen1, gen2 };
            auto expired = get_fully_expired_sstables(cf->as_table_state(), compacting, /*gc before*/gc_clock::from_time_t(15) + cf->schema()->gc_grace_seconds());
            BOOST_REQUIRE(expired.size() == 0);
        }
        {
            // Every other sstable now starts (t2, t3) after generation 1 ends
            // (t1): generation 1 alone is reported as fully expired.
            column_family_for_tests cf(env.manager());
            auto close_cf = deferred_stop(cf);
            auto gen1 = add_sstable_for_overlapping_test(env, cf, /*gen*/1, first_key, keys[1].first, build_stats(t0, t1, t1));
            auto gen2 = add_sstable_for_overlapping_test(env, cf, /*gen*/2, first_key, keys[2].first, build_stats(t2, t3, no_deletion));
            auto gen3 = add_sstable_for_overlapping_test(env, cf, /*gen*/3, first_key, last_key, build_stats(t3, t4, no_deletion));
            std::vector<sstables::shared_sstable> compacting = { gen1, gen2 };
            auto expired = get_fully_expired_sstables(cf->as_table_state(), compacting, /*gc before*/gc_clock::from_time_t(25) + cf->schema()->gc_grace_seconds());
            BOOST_REQUIRE(expired.size() == 1);
            auto only_expired = *expired.begin();
            BOOST_REQUIRE(only_expired->generation() == 1);
        }
    });
}
SEASTAR_TEST_CASE(compaction_with_fully_expired_table) {
    // An sstable holding nothing but a row tombstone deleted an hour ago, in a
    // table with gc_grace_seconds == 0, must be reported as fully expired, and
    // compacting it must produce no output.
    return test_env::do_with_async([] (test_env& env) {
        auto builder = schema_builder("la", "cf")
            .with_column("pk", utf8_type, column_kind::partition_key)
            .with_column("ck1", utf8_type, column_kind::clustering_key)
            .with_column("r1", int32_type);
        builder.set_gc_grace_seconds(0);
        auto s = builder.build();

        auto tmp = tmpdir();
        // Hands out sstables in the temporary directory, with generations
        // 1, 2, 3, ...
        auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
            return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
        };

        // A memtable containing a single row tombstone whose deletion time
        // lies 3600 seconds in the past.
        auto pkey = partition_key::from_exploded(*s, {to_bytes("key1")});
        auto ckey = clustering_key_prefix::from_exploded(*s, {to_bytes("c1")});
        auto mt = make_lw_shared<replica::memtable>(s);
        mutation m(s, pkey);
        tombstone tomb(api::new_timestamp(), gc_clock::now() - std::chrono::seconds(3600));
        m.partition().apply_delete(*s, ckey, tomb);
        mt->apply(std::move(m));

        // Flush it as generation 1 and load the sstable back from disk.
        auto sst = sst_gen();
        write_memtable_to_sstable_for_test(*mt, sst).get();
        sst = env.reusable_sst(s, tmp.path().string(), 1).get0();

        column_family_for_tests cf(env.manager(), s);
        auto close_cf = deferred_stop(cf);

        auto ssts = std::vector<shared_sstable>{ sst };
        auto expired = get_fully_expired_sstables(cf->as_table_state(), ssts, gc_clock::now());
        BOOST_REQUIRE(expired.size() == 1);
        auto only_expired = *expired.begin();
        BOOST_REQUIRE(only_expired->generation() == 1);

        // Compacting the expired sstable yields no new sstables and no data.
        auto outcome = compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(ssts, default_priority_class()), *cf, sst_gen).get0();
        BOOST_REQUIRE(outcome.new_sstables.empty());
        BOOST_REQUIRE(outcome.end_size == 0);
    });
}
SEASTAR_TEST_CASE(basic_date_tiered_strategy_test) {
    // min_threshold sstables are stamped "now" and one more is stamped an hour
    // earlier. The date-tiered manifest is expected to select the former and
    // leave the older one out.
    return test_env::do_with([] (test_env& env) {
        schema_builder builder(make_shared_schema({}, some_keyspace, some_column_family,
            {{"p1", utf8_type}}, {}, {}, {}, utf8_type));
        builder.set_min_compaction_threshold(4);
        auto s = builder.build(schema_builder::compact_storage::no);
        column_family_for_tests cf(env.manager(), s);

        std::vector<sstables::shared_sstable> candidates;
        int min_threshold = cf->schema()->min_compaction_threshold();

        // db_clock tick counts scaled by 1000 to obtain the write timestamps.
        auto now = db_clock::now();
        auto an_hour_ago = now - std::chrono::seconds(3600);
        int64_t ts_now = now.time_since_epoch().count() * 1000;
        int64_t ts_an_hour_ago = an_hour_ago.time_since_epoch().count() * 1000;
        const auto no_deletion = std::numeric_limits<int32_t>::max();

        // Generations 1..min_threshold all carry the "now" timestamp.
        for (auto gen = 1; gen <= min_threshold; ++gen) {
            auto recent = add_sstable_for_overlapping_test(env, cf, /*gen*/gen, "a", "a",
                build_stats(ts_now, ts_now, no_deletion));
            candidates.push_back(recent);
        }
        // add sstable that belong to a different time tier.
        auto older = add_sstable_for_overlapping_test(env, cf, /*gen*/min_threshold + 1, "a", "a",
            build_stats(ts_an_hour_ago, ts_an_hour_ago, no_deletion));
        candidates.push_back(older);

        auto gc_before = gc_clock::now() - cf->schema()->gc_grace_seconds();
        std::map<sstring, sstring> options;
        date_tiered_manifest manifest(options);
        auto table_s = make_table_state_for_test(cf, env);
        auto selected = manifest.get_next_sstables(*table_s, candidates, gc_before);

        // Four sstables are chosen and none of them is the older one.
        BOOST_REQUIRE(selected.size() == 4);
        for (auto& chosen : selected) {
            BOOST_REQUIRE(chosen->generation() != (min_threshold + 1));
        }

        return cf.stop_and_keep_alive();
    });
}
// Date-tiered selection with an explicit 1-hour base window and deterministic
// timestamps: min_threshold sstables at the window start plus one 30 minutes
// later must be selected together, while one 4000 seconds later must not.
SEASTAR_TEST_CASE(date_tiered_strategy_test_2) {
    return test_env::do_with([] (test_env& env) {
        schema_builder builder(make_shared_schema({}, some_keyspace, some_column_family,
            {{"p1", utf8_type}}, {}, {}, {}, utf8_type));
        builder.set_min_compaction_threshold(4);
        auto s = builder.build(schema_builder::compact_storage::no);
        column_family_for_tests cf(env.manager(), s);

        // Fri, 01 Jan 2016 00:00:00 GMT.
        const auto window_start = db_clock::from_time_t(1451606400);
        // Converts a clock time point into a timestamp in microseconds.
        const auto to_timestamp = [] (auto tp) -> int64_t {
            return tp.time_since_epoch().count() * 1000;
        };
        const auto no_local_deletion = std::numeric_limits<int32_t>::max();
        const int min_threshold = cf->schema()->min_compaction_threshold();

        std::vector<sstables::shared_sstable> candidates;

        // Enough sstables in the same time window to satisfy min threshold.
        const int64_t ts_start = to_timestamp(window_start);
        for (int gen = 1; gen <= min_threshold; ++gen) {
            candidates.push_back(add_sstable_for_overlapping_test(env, cf, gen, "a", "a",
                build_stats(ts_start, ts_start, no_local_deletion)));
        }

        // Still inside the same 1-hour window.
        const int in_window_gen = min_threshold + 1;
        const int64_t ts_in_window = to_timestamp(window_start + std::chrono::seconds(1800));
        candidates.push_back(add_sstable_for_overlapping_test(env, cf, in_window_gen, "a", "a",
            build_stats(ts_in_window, ts_in_window, no_local_deletion)));

        // Outside of the window above.
        const int out_of_window_gen = min_threshold + 2;
        const int64_t ts_out_of_window = to_timestamp(window_start + std::chrono::seconds(4000));
        candidates.push_back(add_sstable_for_overlapping_test(env, cf, out_of_window_gen, "a", "a",
            build_stats(ts_out_of_window, ts_out_of_window, no_local_deletion)));

        std::map<sstring, sstring> options;
        // Use a 1-hour time window.
        options.emplace(sstring("base_time_seconds"), sstring("3600"));
        date_tiered_manifest manifest(options);
        auto gc_before = gc_clock::time_point(std::chrono::seconds(0)); // disable gc before.
        auto table_s = make_table_state_for_test(cf, env);
        const auto selected = manifest.get_next_sstables(*table_s, candidates, gc_before);

        std::unordered_set<int64_t> selected_gens;
        for (const auto& picked : selected) {
            selected_gens.insert(picked->generation());
        }
        BOOST_REQUIRE(selected.size() == size_t(min_threshold + 1));
        BOOST_REQUIRE(selected_gens.contains(in_window_gen));
        BOOST_REQUIRE(!selected_gens.contains(out_of_window_gen));
        return cf.stop_and_keep_alive();
    });
}
// Checks time_window_compaction_strategy::get_window_lower_bound() for several
// window sizes. The instants below are written in milliseconds since the epoch
// and converted to microseconds before being handed to the strategy.
SEASTAR_TEST_CASE(time_window_strategy_time_window_tests) {
    using namespace std::chrono;

    const auto to_timestamp = [] (milliseconds since_epoch) -> api::timestamp_type {
        return duration_cast<microseconds>(since_epoch).count();
    };
    const auto lower_bound = [] (seconds window_size, api::timestamp_type ts) {
        return time_window_compaction_strategy::get_window_lower_bound(window_size, ts);
    };

    const api::timestamp_type tstamp1 = to_timestamp(milliseconds(1451001601000L));  // 2015-12-25 @ 00:00:01
    const api::timestamp_type tstamp2 = to_timestamp(milliseconds(1451088001000L));  // 2015-12-26 @ 00:00:01
    const api::timestamp_type low_hour = to_timestamp(milliseconds(1451001600000L)); // 2015-12-25 @ 00:00:00

    // A 1 hour window rounds down to the beginning of the hour.
    BOOST_REQUIRE(lower_bound(duration_cast<seconds>(hours(1)), tstamp1) == low_hour);

    // A 1 minute window rounds down to the beginning of the minute, which for
    // tstamp1 is the same instant as the beginning of the hour.
    BOOST_REQUIRE(lower_bound(duration_cast<seconds>(minutes(1)), tstamp1) == low_hour);

    // A 1 day window rounds down to the beginning of the day, again 00:00:00.
    BOOST_REQUIRE(lower_bound(duration_cast<seconds>(hours(24)), tstamp1) == low_hour);

    // The 2 day window of 2015-12-25 + 2015-12-26 rounds down to the beginning of 2015-12-25.
    BOOST_REQUIRE(lower_bound(duration_cast<seconds>(hours(24*2)), tstamp2) == low_hour);

    return make_ready_future<>();
}
// Verifies that get_buckets() honours the configured timestamp resolution: the
// same instant expressed in milliseconds (with MILLISECONDS resolution) and in
// microseconds (with MICROSECONDS resolution) must map to the same window.
SEASTAR_TEST_CASE(time_window_strategy_ts_resolution_check) {
    return test_env::do_with([] (test_env& env) {
        const auto ts_in_ms = std::chrono::milliseconds(1451001601000L); // 2015-12-25 @ 00:00:01
        const auto ts_in_us = std::chrono::duration_cast<std::chrono::microseconds>(ts_in_ms);

        auto s = schema_builder("tests", "time_window_strategy")
            .with_column("id", utf8_type, column_kind::partition_key)
            .with_column("value", int32_type).build();

        // Builds a fake sstable whose min/max timestamps equal raw_timestamp and
        // requires that its bucket bound matches the window of ts_in_us.
        const auto check_resolution = [&] (const char* resolution, int64_t raw_timestamp) {
            std::map<sstring, sstring> opts = { { time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY, resolution }, };
            time_window_compaction_strategy_options options(opts);

            auto sst = env.make_sstable(s, "", 1, la, big);
            sstables::test(sst).set_values("key1", "key1", build_stats(raw_timestamp, raw_timestamp, std::numeric_limits<int32_t>::max()));

            auto ret = time_window_compaction_strategy::get_buckets({ sst }, options);
            auto expected = time_window_compaction_strategy::get_window_lower_bound(options.get_sstable_window_size(), ts_in_us.count());
            BOOST_REQUIRE(ret.second == expected);
        };

        check_resolution("MILLISECONDS", ts_in_ms.count());
        check_resolution("MICROSECONDS", ts_in_us.count());

        return make_ready_future<>();
    });
}
// Exercises time_window_compaction_strategy::newest_bucket():
//  - a bucket with fewer sstables than the min threshold is rejected,
//  - a bucket with more sstables than the min threshold is accepted,
//  - an over-full bucket is trimmed to the max threshold.
SEASTAR_TEST_CASE(time_window_strategy_correctness_test) {
using namespace std::chrono;
return test_env::do_with_async([] (test_env& env) {
auto s = schema_builder("tests", "time_window_strategy")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type).build();
auto tmp = tmpdir();
// Hands out sstables with increasing generations, starting at 1.
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
};
// Builds a mutation writing a single "value" cell with write timestamp t.
auto make_insert = [&] (partition_key key, api::timestamp_type t) {
mutation m(s, key);
m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), t);
return m;
};
// tstamp is "now"; tstamp2 is two hours earlier. Both are only used to pick bucket bounds.
api::timestamp_type tstamp = api::timestamp_clock::now().time_since_epoch().count();
api::timestamp_type tstamp2 = tstamp - duration_cast<microseconds>(seconds(2L * 3600L)).count();
std::vector<shared_sstable> sstables;
// create the first 3 of 5 single-partition sstables (cell timestamps 0..2)
for (api::timestamp_type t = 0; t < 3; t++) {
auto key = partition_key::from_exploded(*s, {to_bytes("key" + to_sstring(t))});
auto mut = make_insert(std::move(key), t);
sstables.push_back(make_sstable_containing(sst_gen, {std::move(mut)}));
}
// create the remaining 2 sstables (cell timestamps 3 and 4); further down
// they are filed under the window of tstamp2
for (api::timestamp_type t = 3; t < 5; t++) {
auto key = partition_key::from_exploded(*s, {to_bytes("key" + to_sstring(t))});
auto mut = make_insert(std::move(key), t);
sstables.push_back(make_sstable_containing(sst_gen, {std::move(mut)}));
}
std::map<sstring, sstring> options;
time_window_compaction_strategy twcs(options);
std::map<api::timestamp_type, std::vector<shared_sstable>> buckets;
column_family_for_tests cf(env.manager(), s);
auto close_cf = deferred_stop(cf);
auto table_s = make_table_state_for_test(cf, env);
auto control = make_strategy_control_for_test(false);
// We'll put 3 sstables into the newest bucket
for (api::timestamp_type i = 0; i < 3; i++) {
auto bound = time_window_compaction_strategy::get_window_lower_bound(duration_cast<seconds>(hours(1)), tstamp);
buckets[bound].push_back(sstables[i]);
}
auto now = api::timestamp_clock::now().time_since_epoch().count();
auto new_bucket = twcs.newest_bucket(*table_s, *control, buckets, 4, 32,
time_window_compaction_strategy::get_window_lower_bound(duration_cast<seconds>(hours(1)), now));
// incoming bucket should not be accepted when it has below the min threshold SSTables
BOOST_REQUIRE(new_bucket.empty());
now = api::timestamp_clock::now().time_since_epoch().count();
new_bucket = twcs.newest_bucket(*table_s, *control, buckets, 2, 32,
time_window_compaction_strategy::get_window_lower_bound(duration_cast<seconds>(hours(1)), now));
// incoming bucket should be accepted when it is larger than the min threshold SSTables
BOOST_REQUIRE(!new_bucket.empty());
// And 2 into the second bucket (the window of tstamp2, i.e. 2 hours back)
for (api::timestamp_type i = 3; i < 5; i++) {
auto bound = time_window_compaction_strategy::get_window_lower_bound(duration_cast<seconds>(hours(1)), tstamp2);
buckets[bound].push_back(sstables[i]);
}
// "an sstable with a single value should have equal min/max timestamps"
for (auto& sst : sstables) {
BOOST_REQUIRE(sst->get_stats_metadata().min_timestamp == sst->get_stats_metadata().max_timestamp);
}
// Test trim: add sstables 5..39, each built from r mutations of the same
// key, all written with timestamp tstamp + r.
auto num_sstables = 40;
for (int r = 5; r < num_sstables; r++) {
auto key = partition_key::from_exploded(*s, {to_bytes("key" + to_sstring(r))});
std::vector<mutation> mutations;
for (int i = 0 ; i < r ; i++) {
mutations.push_back(make_insert(key, tstamp + r));
}
sstables.push_back(make_sstable_containing(sst_gen, std::move(mutations)));
}
// Overfill the buckets now. Note that `buckets` is not cleared: all 40
// sstables are added on top of the existing entries, this time bucketed by
// their own max_timestamp.
for (int i = 0 ; i < 40; i++) {
auto bound = time_window_compaction_strategy::get_window_lower_bound(duration_cast<seconds>(hours(1)),
sstables[i]->get_stats_metadata().max_timestamp);
buckets[bound].push_back(sstables[i]);
}
now = api::timestamp_clock::now().time_since_epoch().count();
new_bucket = twcs.newest_bucket(*table_s, *control, buckets, 4, 32,
time_window_compaction_strategy::get_window_lower_bound(duration_cast<seconds>(hours(1)), now));
// new bucket should be trimmed to max threshold of 32
BOOST_REQUIRE(new_bucket.size() == size_t(32));
});
}
// Check that TWCS will only perform size-tiered on the current window and also
// the past windows that were already previously compacted into a single SSTable.
//
// The test keeps its own map of window lower bound -> sstables (`buckets`) and
// repeatedly asks newest_bucket() what it would compact, passing the window
// considered "current" as the last argument (`now`).
SEASTAR_TEST_CASE(time_window_strategy_size_tiered_behavior_correctness) {
using namespace std::chrono;
return test_env::do_with_async([] (test_env& env) {
auto s = schema_builder("tests", "time_window_strategy")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type).build();
auto tmp = tmpdir();
// Hands out sstables with increasing generations, starting at 1.
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
};
// Builds a mutation writing a single "value" cell with write timestamp t.
auto make_insert = [&] (partition_key key, api::timestamp_type t) {
mutation m(s, key);
m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), t);
return m;
};
std::map<sstring, sstring> options;
time_window_compaction_strategy twcs(options);
std::map<api::timestamp_type, std::vector<shared_sstable>> buckets; // windows
int min_threshold = 4;
int max_threshold = 32;
auto window_size = duration_cast<seconds>(hours(1));
// Writes a one-partition sstable (key "key<ts>", cell timestamp ts) and
// files it under the window that contains window_ts. The bucket is chosen
// from window_ts, not from the cell timestamp.
auto add_new_sstable_to_bucket = [&] (api::timestamp_type ts, api::timestamp_type window_ts) {
auto key = partition_key::from_exploded(*s, {to_bytes("key" + to_sstring(ts))});
auto mut = make_insert(std::move(key), ts);
auto sst = make_sstable_containing(sst_gen, {std::move(mut)});
auto bound = time_window_compaction_strategy::get_window_lower_bound(window_size, window_ts);
buckets[bound].push_back(std::move(sst));
};
column_family_for_tests cf(env.manager(), s);
auto close_cf = deferred_stop(cf);
// Compacts every sstable of the window containing window_ts, requires a
// single output sstable and makes it the window's only content.
auto major_compact_bucket = [&] (api::timestamp_type window_ts) {
auto bound = time_window_compaction_strategy::get_window_lower_bound(window_size, window_ts);
auto ret = compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(std::move(buckets[bound]), default_priority_class()), *cf, sst_gen).get0();
BOOST_REQUIRE(ret.new_sstables.size() == 1);
buckets[bound] = std::move(ret.new_sstables);
};
// The "past" window lies two hours before the current one.
api::timestamp_type current_window_ts = api::timestamp_clock::now().time_since_epoch().count();
api::timestamp_type past_window_ts = current_window_ts - duration_cast<microseconds>(seconds(2L * 3600L)).count();
// create 1 sstable into past time window and let the strategy know about it
add_new_sstable_to_bucket(0, past_window_ts);
// at this point the past window is treated as the current one
auto now = time_window_compaction_strategy::get_window_lower_bound(window_size, past_window_ts);
auto table_s = make_table_state_for_test(cf, env);
auto control = make_strategy_control_for_test(false);
// past window cannot be compacted because it has a single SSTable
BOOST_REQUIRE(twcs.newest_bucket(*table_s, *control, buckets, min_threshold, max_threshold, now).size() == 0);
// create min_threshold-1 sstables into current time window
for (api::timestamp_type t = 0; t < min_threshold - 1; t++) {
add_new_sstable_to_bucket(t, current_window_ts);
}
// add 1 sstable into past window.
add_new_sstable_to_bucket(1, past_window_ts);
// from now on the real current window is the active one
now = time_window_compaction_strategy::get_window_lower_bound(window_size, current_window_ts);
// past window can now be compacted into a single SSTable because it was the previous current (active) window.
// current window cannot be compacted because it has less than min_threshold SSTables
BOOST_REQUIRE(twcs.newest_bucket(*table_s, *control, buckets, min_threshold, max_threshold, now).size() == 2);
major_compact_bucket(past_window_ts);
// now past window cannot be compacted again, because it was already compacted into a single SSTable, now it switches to STCS mode.
BOOST_REQUIRE(twcs.newest_bucket(*table_s, *control, buckets, min_threshold, max_threshold, now).size() == 0);
// make past window contain more than min_threshold similar-sized SSTables, allowing it to be compacted again.
for (api::timestamp_type t = 1; t < min_threshold; t++) {
add_new_sstable_to_bucket(t, past_window_ts);
}
// now past window can be compacted again because it switched to STCS mode and has more than min_threshold SSTables.
BOOST_REQUIRE(twcs.newest_bucket(*table_s, *control, buckets, min_threshold, max_threshold, now).size() == size_t(min_threshold));
});
}
static void check_min_max_column_names(const sstable_ptr& sst, std::vector<bytes> min_components, std::vector<bytes> max_components) {
const auto& st = sst->get_stats_metadata();
BOOST_TEST_MESSAGE(fmt::format("min {}/{} max {}/{}", st.min_column_names.elements.size(), min_components.size(), st.max_column_names.elements.size(), max_components.size()));
BOOST_REQUIRE(st.min_column_names.elements.size() == min_components.size());
for (auto i = 0U; i < st.min_column_names.elements.size(); i++) {
BOOST_REQUIRE(min_components[i] == st.min_column_names.elements[i].value);
}
BOOST_REQUIRE(st.max_column_names.elements.size() == max_components.size());
for (auto i = 0U; i < st.max_column_names.elements.size(); i++) {
BOOST_REQUIRE(max_components[i] == st.max_column_names.elements[i].value);
}
}
// For every writable sstable version: writes two sstables with disjoint
// clustering key ranges, checks the min/max column names recorded for each,
// then compacts them and checks that the output covers the union of both.
SEASTAR_TEST_CASE(min_max_clustering_key_test_2) {
    return test_env::do_with_async([] (test_env& env) {
        for (const auto version : writable_sstable_versions) {
            auto s = schema_builder("ks", "cf")
                .with_column("pk", utf8_type, column_kind::partition_key)
                .with_column("ck1", utf8_type, column_kind::clustering_key)
                .with_column("r1", int32_type)
                .build();
            column_family_for_tests cf(env.manager(), s);
            auto close_cf = deferred_stop(cf);
            auto tmp = tmpdir();
            auto mt = make_lw_shared<replica::memtable>(s);
            const column_definition& r1_col = *s->get_column_definition("r1");

            // Builds partition "key<pk>" with rows "<pk>ck<i>" for i in [ck_begin, ck_end).
            const auto make_partition = [&] (int pk, int ck_begin, int ck_end) {
                mutation m(s, partition_key::from_exploded(*s, {to_bytes("key" + to_sstring(pk))}));
                for (int i = ck_begin; i < ck_end; ++i) {
                    auto c_key = clustering_key::from_exploded(*s, {to_bytes(to_sstring(pk) + "ck" + to_sstring(i))});
                    m.set_clustered_cell(c_key, r1_col, make_atomic_cell(int32_type, int32_type->decompose(1)));
                }
                return m;
            };
            // Writes the current memtable as the given generation and loads it back from disk.
            const auto flush_as = [&] (int generation) {
                auto written = env.make_sstable(s, tmp.path().string(), generation, version, big);
                write_memtable_to_sstable_for_test(*mt, written).get();
                return env.reusable_sst(s, tmp.path().string(), generation, version).get0();
            };

            // First sstable: partitions key0..key7, rows <j>ck100..<j>ck149 each.
            for (int j = 0; j < 8; ++j) {
                mt->apply(make_partition(j, 100, 150));
            }
            auto sst = flush_as(1);
            check_min_max_column_names(sst, {"0ck100"}, {"7ck149"});

            // Second sstable: a single partition key9, rows 9ck101..9ck298.
            mt = make_lw_shared<replica::memtable>(s);
            mt->apply(make_partition(9, 101, 299));
            auto sst2 = flush_as(2);
            check_min_max_column_names(sst2, {"9ck101"}, {"9ck298"});

            auto creator = [&env, s, &tmp, version] { return env.make_sstable(s, tmp.path().string(), 3, version, big); };
            auto info = compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor({sst, sst2}, default_priority_class()), *cf, creator).get0();
            BOOST_REQUIRE(info.new_sstables.size() == 1);
            check_min_max_column_names(info.new_sstables.front(), {"0ck100"}, {"9ck298"});
        }
    });
}
// Offers the size-tiered strategy max_threshold + 1 equally sized sstables and
// checks that the resulting compaction job is capped at max_threshold inputs.
SEASTAR_TEST_CASE(size_tiered_beyond_max_threshold_test) {
    return test_env::do_with([] (test_env& env) {
        column_family_for_tests cf(env.manager());
        auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered, cf.schema()->compaction_strategy_options());

        const int max_threshold = cf->schema()->max_compaction_threshold();
        std::vector<sstables::shared_sstable> candidates;
        candidates.reserve(max_threshold+1);
        // Generations 0..max_threshold, every sstable with a data file size of 1.
        for (int gen = 0; gen <= max_threshold; ++gen) {
            auto fake_sst = env.make_sstable(cf.schema(), "", gen, la, big);
            sstables::test(fake_sst).set_data_file_size(1);
            candidates.push_back(std::move(fake_sst));
        }

        auto table_s = make_table_state_for_test(cf, env);
        auto strategy_c = make_strategy_control_for_test(false);
        auto job = cs.get_sstables_for_compaction(*table_s, *strategy_c, std::move(candidates));
        BOOST_REQUIRE(job.sstables.size() == size_t(max_threshold));
        return cf.stop_and_keep_alive();
    });
}
// Writes an sstable in which roughly a third of the cells are expired TTL'd
// cells and checks that
//  - the droppable-tombstone ratio estimate (per sstable and per run) is close
//    to that fraction,
//  - compacting the sstable removes the expired data, and
//  - the size-tiered, leveled and date-tiered strategies pick (or refuse to
//    pick) the sstable for tombstone compaction depending on their options.
SEASTAR_TEST_CASE(sstable_expired_data_ratio) {
return test_env::do_with_async([] (test_env& env) {
auto tmp = tmpdir();
auto s = make_shared_schema({}, some_keyspace, some_column_family,
{{"p1", utf8_type}}, {{"c1", utf8_type}}, {{"r1", utf8_type}}, {}, utf8_type);
auto mt = make_lw_shared<replica::memtable>(s);
// fraction of the keys that are written already expired
static constexpr float expired = 0.33;
// we want number of expired keys to be ~ 1.5*sstables::TOMBSTONE_HISTOGRAM_BIN_SIZE so as to
// test ability of histogram to return a good estimation after merging keys.
static int total_keys = std::ceil(sstables::TOMBSTONE_HISTOGRAM_BIN_SIZE/expired)*1.5;
// Applies to the memtable a one-row partition keyed by k whose "r1" cell
// carries the given ttl and expiration time.
auto insert_key = [&] (bytes k, uint32_t ttl, uint32_t expiration_time) {
auto key = partition_key::from_exploded(*s, {k});
mutation m(s, key);
auto c_key = clustering_key::from_exploded(*s, {to_bytes("c1")});
m.set_clustered_cell(c_key, *s->get_column_definition("r1"), make_atomic_cell(utf8_type, bytes("a"), ttl, expiration_time));
mt->apply(std::move(m));
};
auto expired_keys = total_keys*expired;
auto now = gc_clock::now();
// expired cells: expiration time at least 2*DEFAULT_GC_GRACE_SECONDS in the past
for (auto i = 0; i < expired_keys; i++) {
// generate expiration time at different time points or only a few entries would be created in histogram
auto expiration_time = (now - gc_clock::duration(DEFAULT_GC_GRACE_SECONDS*2+i)).time_since_epoch().count();
insert_key(to_bytes("expired_key" + to_sstring(i)), 1, expiration_time);
}
// remaining cells: all share one expiration time, 3600 clock ticks in the future
auto remaining = total_keys-expired_keys;
auto expiration_time = (now + gc_clock::duration(3600)).time_since_epoch().count();
for (auto i = 0; i < remaining; i++) {
insert_key(to_bytes("key" + to_sstring(i)), 3600, expiration_time);
}
auto sst = env.make_sstable(s, tmp.path().string(), 1, sstables::get_highest_sstable_version(), big);
write_memtable_to_sstable_for_test(*mt, sst).get();
sst = env.reusable_sst(s, tmp.path().string(), 1).get0();
const auto& stats = sst->get_stats_metadata();
// the drop-time histogram must have been filled up to its bin limit
BOOST_REQUIRE(stats.estimated_tombstone_drop_time.bin.size() == sstables::TOMBSTONE_HISTOGRAM_BIN_SIZE);
auto gc_before = gc_clock::now() - s->gc_grace_seconds();
auto uncompacted_size = sst->data_size();
// the estimated droppable ratio must be within 0.1 of the fraction of expired keys
BOOST_REQUIRE(std::fabs(sst->estimate_droppable_tombstone_ratio(gc_before) - expired) <= 0.1);
// the same must hold for a run consisting of just this sstable
sstable_run run;
run.insert(sst);
BOOST_REQUIRE(std::fabs(run.estimate_droppable_tombstone_ratio(gc_before) - expired) <= 0.1);
column_family_for_tests cf(env.manager(), s);
auto close_cf = deferred_stop(cf);
// output sstables get generations starting at 2 (generation 1 is the input)
auto creator = [&, gen = make_lw_shared<unsigned>(2)] {
auto sst = env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
return sst;
};
auto info = compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor({ sst }, default_priority_class()), *cf, creator).get0();
BOOST_REQUIRE(info.new_sstables.size() == 1);
// after compaction nothing droppable is left and the data shrank by about the expired fraction (5% tolerance)
BOOST_REQUIRE(info.new_sstables.front()->estimate_droppable_tombstone_ratio(gc_before) == 0.0f);
BOOST_REQUIRE_CLOSE(info.new_sstables.front()->data_size(), uncompacted_size*(1-expired), 5);
// The checks below run against the original, uncompacted sstable.
std::map<sstring, sstring> options;
options.emplace("tombstone_threshold", "0.3f");
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered, options);
// that's needed because sstable with expired data should be old enough.
sstables::test(sst).set_data_file_write_time(db_clock::time_point::min());
auto table_s = make_table_state_for_test(cf, env);
auto strategy_c = make_strategy_control_for_test(false);
// size-tiered with threshold 0.3: the sstable is picked on its own
auto descriptor = cs.get_sstables_for_compaction(*table_s, *strategy_c, { sst });
BOOST_REQUIRE(descriptor.sstables.size() == 1);
BOOST_REQUIRE(descriptor.sstables.front() == sst);
// leveled with the same options, sstable placed at level 1
cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::leveled, options);
sst->set_sstable_level(1);
descriptor = cs.get_sstables_for_compaction(*table_s, *strategy_c, { sst });
BOOST_REQUIRE(descriptor.sstables.size() == 1);
BOOST_REQUIRE(descriptor.sstables.front() == sst);
// make sure sstable picked for tombstone compaction removal won't be promoted or demoted.
BOOST_REQUIRE(descriptor.sstables.front()->get_sstable_level() == 1U);
// check tombstone compaction is disabled by default for DTCS
cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::date_tiered, {});
descriptor = cs.get_sstables_for_compaction(*table_s, *strategy_c, { sst });
BOOST_REQUIRE(descriptor.sstables.size() == 0);
// ...and enabled once tombstone_threshold is given explicitly
cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::date_tiered, options);
descriptor = cs.get_sstables_for_compaction(*table_s, *strategy_c, { sst });
BOOST_REQUIRE(descriptor.sstables.size() == 1);
BOOST_REQUIRE(descriptor.sstables.front() == sst);
// sstable with droppable ratio of 0.3 won't be included due to threshold
{
std::map<sstring, sstring> options;
options.emplace("tombstone_threshold", "0.5f");
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered, options);
auto descriptor = cs.get_sstables_for_compaction(*table_s, *strategy_c, { sst });
BOOST_REQUIRE(descriptor.sstables.size() == 0);
}
// sstable which was recently created won't be included due to min interval
{
std::map<sstring, sstring> options;
options.emplace("tombstone_compaction_interval", "3600");
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered, options);
// pretend the data file was written just now
sstables::test(sst).set_data_file_write_time(db_clock::now());
auto descriptor = cs.get_sstables_for_compaction(*table_s, *strategy_c, { sst });
BOOST_REQUIRE(descriptor.sstables.size() == 0);
}
});
}
// Compacts level-1 sstables of a leveled-strategy table with a maximum output
// sstable size of 0 and checks that every input partition shows up exactly once,
// in token order, one partition per output sstable. Three input layouts are
// covered: disjoint sstables, sstables sharing boundary partitions, and
// sstables with a token gap between them.
SEASTAR_TEST_CASE(compaction_correctness_with_partitioned_sstable_set) {
    return test_env::do_with_async([] (test_env& env) {
        auto builder = schema_builder("tests", "tombstone_purge")
            .with_column("id", utf8_type, column_kind::partition_key)
            .with_column("value", int32_type);
        builder.set_gc_grace_seconds(0);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::leveled);
        auto s = builder.build();

        auto tmp = tmpdir();
        // Hands out sstables with increasing generations, starting at 1.
        auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
            auto sst = env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
            return sst;
        };

        // Compacts the given sstables on a fresh table and returns the outputs.
        auto compact = [&, s] (std::vector<shared_sstable> all) -> std::vector<shared_sstable> {
            // NEEDED for partitioned_sstable_set to actually have an effect
            std::for_each(all.begin(), all.end(), [] (auto& sst) { sst->set_sstable_level(1); });
            column_family_for_tests cf(env.manager(), s);
            auto close_cf = deferred_stop(cf);
            return compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(std::move(all), default_priority_class(), 0, 0 /*std::numeric_limits<uint64_t>::max()*/),
                *cf, sst_gen).get0().new_sstables;
        };

        // Builds a single-cell mutation for a (key, token) pair and verifies
        // that the key really maps to the expected token.
        auto make_insert = [&] (auto p) {
            auto key = partition_key::from_exploded(*s, {to_bytes(p.first)});
            mutation m(s, key);
            m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), 1 /* ts */);
            BOOST_REQUIRE(m.decorated_key().token() == p.second);
            return m;
        };

        // Requires that result holds exactly one sstable per expected mutation
        // and that the i-th sstable contains the i-th mutation and nothing else.
        auto require_one_partition_per_sstable = [&] (std::vector<shared_sstable> result, const std::vector<mutation>& expected) {
            BOOST_REQUIRE_EQUAL(expected.size(), result.size());
            for (size_t i = 0; i < expected.size(); ++i) {
                assert_that(sstable_reader(result[i], s, env.make_reader_permit()))
                    .produces(expected[i])
                    .produces_end_of_stream();
            }
        };

        auto tokens = token_generation_for_current_shard(4);
        auto mut1 = make_insert(tokens[0]);
        auto mut2 = make_insert(tokens[1]);
        auto mut3 = make_insert(tokens[2]);
        auto mut4 = make_insert(tokens[3]);

        {
            // disjoint input sstables
            std::vector<shared_sstable> input = {
                make_sstable_containing(sst_gen, {mut1, mut2}),
                make_sstable_containing(sst_gen, {mut3, mut4})
            };
            require_one_partition_per_sstable(compact(std::move(input)), {mut1, mut2, mut3, mut4});
        }

        {
            // with partitioned_sstable_set having an interval with exclusive lower boundary, example:
            // [mut1, mut2]
            // (mut2, mut3]
            std::vector<shared_sstable> input = {
                make_sstable_containing(sst_gen, {mut1, mut2}),
                make_sstable_containing(sst_gen, {mut2, mut3}),
                make_sstable_containing(sst_gen, {mut3, mut4})
            };
            require_one_partition_per_sstable(compact(std::move(input)), {mut1, mut2, mut3, mut4});
        }

        {
            // with gap between tables
            // NOTE(review): mut4 is listed twice for the second sstable; the
            // expectation below contains it once.
            std::vector<shared_sstable> input = {
                make_sstable_containing(sst_gen, {mut1, mut2}),
                make_sstable_containing(sst_gen, {mut4, mut4})
            };
            require_one_partition_per_sstable(compact(std::move(input)), {mut1, mut2, mut4});
        }
    });
}
// Runs a cleanup compaction over an sstable that holds only keys local to this
// node and checks that a single sstable comes out, that its estimated key count
// stays within one index interval of the input's partition count, and that the
// run identifier is preserved.
SEASTAR_TEST_CASE(sstable_cleanup_correctness_test) {
    return do_with_cql_env([] (auto& e) {
        return test_env::do_with_async([&db = e.local_db()] (test_env& env) {
            cell_locker_stats cl_stats;
            const auto ks_name = "ks"; // single_node_cql_env::ks_name
            auto s = schema_builder(ks_name, "correcness_test")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("value", int32_type).build();

            auto tmp = tmpdir();
            // Hands out sstables with increasing generations, starting at 1.
            auto sst_gen = [&env, s, &tmp, next_gen = make_lw_shared<unsigned>(1)] () mutable {
                const auto generation = (*next_gen)++;
                return env.make_sstable(s, tmp.path().string(), generation, sstables::get_highest_sstable_version(), big);
            };

            // One single-cell partition (timestamp 0) per local key.
            const auto total_partitions = 10000U;
            auto local_keys = make_local_keys(total_partitions, s);
            std::vector<mutation> mutations;
            for (auto i = 0U; i < total_partitions; i++) {
                mutation m(s, partition_key::from_deeply_exploded(*s, { local_keys.at(i) }));
                m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), api::timestamp_type(0));
                mutations.push_back(std::move(m));
            }
            auto sst = make_sstable_containing(sst_gen, mutations);
            auto run_identifier = sst->run_identifier();

            // The table uses the cql env database's compaction manager and cache tracker.
            auto cf = make_lw_shared<replica::column_family>(s, column_family_test_config(env.manager(), env.semaphore()), replica::column_family::no_commitlog(),
                db.get_compaction_manager(), cl_stats, db.row_cache_tracker());
            cf->mark_ready_for_writes();
            cf->start();

            dht::token_range_vector local_ranges = db.get_keyspace_local_ranges(ks_name);
            auto descriptor = sstables::compaction_descriptor({std::move(sst)}, default_priority_class(), compaction_descriptor::default_level,
                compaction_descriptor::default_max_sstable_bytes, run_identifier, compaction_type_options::make_cleanup(std::move(local_ranges)));
            auto ret = compact_sstables(db.get_compaction_manager(), std::move(descriptor), *cf, sst_gen).get0();

            BOOST_REQUIRE(ret.new_sstables.size() == 1);
            const auto& cleaned = ret.new_sstables.front();
            BOOST_REQUIRE(cleaned->get_estimated_key_count() >= total_partitions);
            BOOST_REQUIRE((cleaned->get_estimated_key_count() - total_partitions) <= uint64_t(s->min_index_interval()));
            BOOST_REQUIRE(cleaned->run_identifier() == run_identifier);
        });
    });
}
// Writes a deliberately corrupt sstable through `sst`'s writer.
//
// Three partitions are written, using local keys 1, 0 and 2 in that order; the
// second one is flagged as corrupt (presumably because key 0 sorts before
// key 1 -- confirm against make_local_keys()). Every partition consists of a
// static row and 10 clustering rows, with one extra out-of-order clustering row
// (key i - 2) injected right after row rows_count / 2.
//
// Every fragment handed to the sstable writer is also
//  - appended to the returned vector, and
//  - passed to `write_to_secondary` together with a flag telling whether the
//    fragment is considered corrupt (always true for the injected row).
//
// \param env                provides the writer configuration
// \param sst                sstable to write into
// \param permit             reader permit used to build the mutation fragments
// \param write_to_secondary callback receiving a copy of each written fragment
// \return all fragments in the order they were written to the sstable
std::vector<mutation_fragment_v2> write_corrupt_sstable(test_env& env, sstable& sst, reader_permit permit,
        std::function<void(mutation_fragment_v2&&, bool)> write_to_secondary) {
    auto schema = sst.get_schema();
    std::vector<mutation_fragment_v2> corrupt_fragments;

    const auto ts = api::timestamp_type{1};

    auto local_keys = make_local_keys(3, schema);

    auto config = env.manager().configure_writer();
    config.validation_level = mutation_fragment_stream_validation_level::partition_region; // this test violates key order on purpose
    auto writer = sst.get_writer(*schema, local_keys.size(), config, encoding_stats{});

    // Static row with the first static column set to 1.
    auto make_static_row = [&, schema, ts] {
        auto r = row{};
        const auto& cdef = schema->static_column_at(0);
        auto ac = atomic_cell::make_live(*cdef.type, ts, cdef.type->decompose(data_value(1)));
        r.apply(cdef, atomic_cell_or_collection{std::move(ac)});
        return static_row(*schema, std::move(r));
    };

    // Clustering row keyed by i with the first regular column set to 1.
    auto make_clustering_row = [&, schema, ts] (unsigned i) {
        auto r = row{};
        const auto& cdef = schema->regular_column_at(0);
        auto ac = atomic_cell::make_live(*cdef.type, ts, cdef.type->decompose(data_value(1)));
        r.apply(cdef, atomic_cell_or_collection{std::move(ac)});
        return clustering_row(clustering_key::from_single_value(*schema, int32_type->decompose(data_value(int(i)))), {}, {}, std::move(r));
    };

    // Writes one whole partition for local key number pk. Each fragment goes
    // to the secondary consumer, to corrupt_fragments and to the writer.
    auto write_partition = [&, schema] (int pk, bool is_corrupt) {
        auto pkey = partition_key::from_deeply_exploded(*schema, { local_keys.at(pk) });
        auto dkey = dht::decorate_key(*schema, pkey);

        testlog.trace("Writing partition {}", pkey.with_schema(*schema));

        write_to_secondary(mutation_fragment_v2(*schema, permit, partition_start(dkey, {})), is_corrupt);
        corrupt_fragments.emplace_back(*schema, permit, partition_start(dkey, {}));
        writer.consume_new_partition(dkey);

        {
            auto sr = make_static_row();

            testlog.trace("Writing row {}", sr.position());

            write_to_secondary(mutation_fragment_v2(*schema, permit, static_row(*schema, sr)), is_corrupt);
            corrupt_fragments.emplace_back(*schema, permit, static_row(*schema, sr));
            writer.consume(std::move(sr));
        }

        const unsigned rows_count = 10;
        for (unsigned i = 0; i < rows_count; ++i) {
            auto cr = make_clustering_row(i);

            testlog.trace("Writing row {}", cr.position());

            write_to_secondary(mutation_fragment_v2(*schema, permit, clustering_row(*schema, cr)), is_corrupt);
            corrupt_fragments.emplace_back(*schema, permit, clustering_row(*schema, cr));
            writer.consume(clustering_row(*schema, cr));

            // inject an out-of-order row (a repeat of row i - 2) in the middle of the partition
            if (i == (rows_count / 2)) {
                auto bad_cr = make_clustering_row(i - 2);
                testlog.trace("Writing out-of-order row {}", bad_cr.position());
                // The secondary consumer gets the very row that is written to
                // the sstable, always flagged as corrupt.
                write_to_secondary(mutation_fragment_v2(*schema, permit, clustering_row(*schema, bad_cr)), true);
                corrupt_fragments.emplace_back(*schema, permit, clustering_row(*schema, bad_cr));
                writer.consume(std::move(bad_cr));
            }
        }

        testlog.trace("Writing partition_end");

        write_to_secondary(mutation_fragment_v2(*schema, permit, partition_end{}), is_corrupt);
        corrupt_fragments.emplace_back(*schema, permit, partition_end{});
        writer.consume_end_of_partition();
    };

    write_partition(1, false);
    write_partition(0, true);
    write_partition(2, false);

    testlog.info("Writing done");
    writer.consume_end_of_stream();

    return corrupt_fragments;
}
// Scrub in validate mode: a corrupt sstable must be quarantined and removed from the
// compaction strategy's sstable set, while its content is left untouched.
SEASTAR_TEST_CASE(sstable_scrub_validate_mode_test) {
    cql_test_config test_cfg;

    auto& db_cfg = *test_cfg.db_config;

    // Disable cache to filter out its possible "corrections" to the corrupt sstable.
    db_cfg.enable_cache(false);
    db_cfg.enable_commitlog(false);

    return do_with_cql_env([this] (cql_test_env& cql_env) -> future<> {
        return test_env::do_with_async([this, &cql_env] (test_env& env) {
            cell_locker_stats cl_stats;

            auto& db = cql_env.local_db();
            auto& compaction_manager = db.get_compaction_manager();

            auto schema = schema_builder("ks", get_name())
                    .with_column("pk", utf8_type, column_kind::partition_key)
                    .with_column("ck", int32_type, column_kind::clustering_key)
                    .with_column("s", int32_type, column_kind::static_column)
                    .with_column("v", int32_type).build();
            auto permit = env.make_reader_permit();

            auto tmp = tmpdir();
            auto sst_gen = [&env, schema, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
                return env.make_sstable(schema, tmp.path().string(), (*gen)++);
            };

            auto sst = sst_gen();

            testlog.info("Writing sstable {}", sst->get_filename());

            // This test never compares against "scrubbed" content, so the secondary
            // consumer has nothing to collect.
            const auto corrupt_fragments = write_corrupt_sstable(env, *sst, permit, [] (mutation_fragment_v2&&, bool) { });

            sst->load().get();

            testlog.info("Loaded sstable {}", sst->get_filename());

            auto cfg = column_family_test_config(env.manager(), env.semaphore());
            cfg.datadir = tmp.path().string();
            auto table = make_lw_shared<replica::column_family>(schema, cfg, replica::column_family::no_commitlog(),
                    db.get_compaction_manager(), cl_stats, db.row_cache_tracker());
            // Stop the table when leaving the scope, also if an assertion below fails.
            auto stop_table = defer([table] {
                table->stop().get();
            });
            table->mark_ready_for_writes();
            table->start();

            table->add_sstable_and_update_cache(sst).get();

            BOOST_REQUIRE(table->in_strategy_sstables().size() == 1);
            BOOST_REQUIRE(table->in_strategy_sstables().front() == sst);

            // Reads `sst` back and checks that it produces exactly `mfs`.
            auto verify_fragments = [&] (sstables::shared_sstable sst, const std::vector<mutation_fragment_v2>& mfs) {
                auto r = assert_that(sst->as_mutation_source().make_reader_v2(schema, env.make_reader_permit()));
                for (const auto& mf : mfs) {
                    testlog.trace("Expecting {}", mutation_fragment_v2::printer(*schema, mf));
                    r.produces(*schema, mf);
                }
                r.produces_end_of_stream();
            };

            testlog.info("Verifying written data...");

            // Make sure we wrote what we thought we wrote.
            verify_fragments(sst, corrupt_fragments);

            testlog.info("Validate");

            // After validating the corrupt sstable we expect it to be quarantined and
            // dropped from the strategy's sstable set, with its content unchanged.
            sstables::compaction_type_options::scrub opts = {
                .operation_mode = sstables::compaction_type_options::scrub::mode::validate,
            };
            compaction_manager.perform_sstable_scrub(table.get(), opts).get();

            BOOST_REQUIRE(sst->is_quarantined());
            BOOST_REQUIRE(table->in_strategy_sstables().empty());
            verify_fragments(sst, corrupt_fragments);
        });
    }, test_cfg);
}
// Feeds hand-crafted fragment streams to scrub_validate_mode_validate_reader() and
// checks its verdict: one valid stream, followed by several kinds of invalid ones.
SEASTAR_THREAD_TEST_CASE(scrub_validate_mode_validate_reader_test) {
    auto schema = schema_builder("ks", get_name())
            .with_column("pk", utf8_type, column_kind::partition_key)
            .with_column("ck", int32_type, column_kind::clustering_key)
            .with_column("s", int32_type, column_kind::static_column)
            .with_column("v", int32_type).build();
    tests::reader_concurrency_semaphore_wrapper semaphore;
    auto permit = semaphore.make_permit();

    const auto ts = api::timestamp_type{1};
    auto local_keys = make_local_keys(5, schema);

    auto make_partition_start = [&, schema] (unsigned pk) {
        auto pkey = partition_key::from_deeply_exploded(*schema, { local_keys.at(pk) });
        auto dkey = dht::decorate_key(*schema, pkey);
        return mutation_fragment_v2(*schema, permit, partition_start(std::move(dkey), {}));
    };

    auto make_partition_end = [&, schema] {
        return mutation_fragment_v2(*schema, permit, partition_end());
    };

    auto make_static_row = [&, schema, ts] {
        auto r = row{};
        auto cdef = schema->static_column_at(0);
        auto ac = atomic_cell::make_live(*cdef.type, ts, cdef.type->decompose(data_value(1)));
        r.apply(cdef, atomic_cell_or_collection{std::move(ac)});
        return mutation_fragment_v2(*schema, permit, static_row(*schema, std::move(r)));
    };

    auto make_clustering_row = [&, schema, ts] (unsigned i) {
        auto r = row{};
        auto cdef = schema->regular_column_at(0);
        auto ac = atomic_cell::make_live(*cdef.type, ts, cdef.type->decompose(data_value(1)));
        r.apply(cdef, atomic_cell_or_collection{std::move(ac)});
        return mutation_fragment_v2(*schema, permit,
                clustering_row(clustering_key::from_single_value(*schema, int32_type->decompose(data_value(int(i)))), {}, {}, std::move(r)));
    };

    auto info = make_lw_shared<compaction_data>();

    // Runs the validating reader over `frags` and returns its verdict.
    // Every scenario below builds its own deque, so no scenario depends on the
    // state of a container that was moved from by a previous one.
    auto validate = [&] (std::deque<mutation_fragment_v2> frags) {
        return scrub_validate_mode_validate_reader(make_flat_mutation_reader_from_fragments(schema, permit, std::move(frags)), *info).get();
    };

    BOOST_TEST_MESSAGE("valid");
    {
        std::deque<mutation_fragment_v2> frags;
        frags.emplace_back(make_partition_start(0));
        frags.emplace_back(make_static_row());
        frags.emplace_back(make_clustering_row(0));
        frags.emplace_back(make_clustering_row(1));
        frags.emplace_back(make_partition_end());
        frags.emplace_back(make_partition_start(2));
        frags.emplace_back(make_partition_end());

        const auto valid = validate(std::move(frags));
        BOOST_REQUIRE(valid);
    }

    BOOST_TEST_MESSAGE("out-of-order clustering row");
    {
        std::deque<mutation_fragment_v2> frags;
        frags.emplace_back(make_partition_start(0));
        frags.emplace_back(make_clustering_row(1));
        frags.emplace_back(make_clustering_row(0));
        frags.emplace_back(make_partition_end());

        const auto valid = validate(std::move(frags));
        BOOST_REQUIRE(!valid);
    }

    BOOST_TEST_MESSAGE("out-of-order static row");
    {
        std::deque<mutation_fragment_v2> frags;
        frags.emplace_back(make_partition_start(0));
        frags.emplace_back(make_clustering_row(0));
        frags.emplace_back(make_static_row());
        frags.emplace_back(make_partition_end());

        const auto valid = validate(std::move(frags));
        BOOST_REQUIRE(!valid);
    }

    BOOST_TEST_MESSAGE("out-of-order partition start");
    {
        std::deque<mutation_fragment_v2> frags;
        frags.emplace_back(make_partition_start(0));
        frags.emplace_back(make_clustering_row(1));
        frags.emplace_back(make_partition_start(2));
        frags.emplace_back(make_partition_end());

        const auto valid = validate(std::move(frags));
        BOOST_REQUIRE(!valid);
    }

    BOOST_TEST_MESSAGE("out-of-order partition");
    {
        std::deque<mutation_fragment_v2> frags;
        frags.emplace_back(make_partition_start(2));
        frags.emplace_back(make_clustering_row(0));
        frags.emplace_back(make_partition_end());
        frags.emplace_back(make_partition_start(0));
        frags.emplace_back(make_partition_end());

        const auto valid = validate(std::move(frags));
        BOOST_REQUIRE(!valid);
    }

    BOOST_TEST_MESSAGE("missing end-of-partition at EOS");
    {
        std::deque<mutation_fragment_v2> frags;
        frags.emplace_back(make_partition_start(0));
        frags.emplace_back(make_clustering_row(0));

        const auto valid = validate(std::move(frags));
        BOOST_REQUIRE(!valid);
    }
}
// Scrub in skip mode: after an abort-mode scrub leaves the corrupt sstable as is,
// a skip-mode scrub must replace it with an sstable that contains only the
// fragments which were not flagged as corrupt when writing.
SEASTAR_TEST_CASE(sstable_scrub_skip_mode_test) {
    cql_test_config test_cfg;

    auto& db_cfg = *test_cfg.db_config;

    // Disable cache to filter out its possible "corrections" to the corrupt sstable.
    db_cfg.enable_cache(false);
    db_cfg.enable_commitlog(false);

    return do_with_cql_env([this] (cql_test_env& cql_env) -> future<> {
        return test_env::do_with_async([this, &cql_env] (test_env& env) {
            cell_locker_stats cl_stats;

            auto& db = cql_env.local_db();
            auto& compaction_manager = db.get_compaction_manager();

            auto schema = schema_builder("ks", get_name())
                    .with_column("pk", utf8_type, column_kind::partition_key)
                    .with_column("ck", int32_type, column_kind::clustering_key)
                    .with_column("s", int32_type, column_kind::static_column)
                    .with_column("v", int32_type).build();
            auto permit = env.make_reader_permit();

            auto tmp = tmpdir();
            auto sst_gen = [&env, schema, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
                return env.make_sstable(schema, tmp.path().string(), (*gen)++);
            };

            // The fragments we expect to survive the skip-mode scrub.
            std::vector<mutation_fragment_v2> scrubbed_fragments;
            auto sst = sst_gen();

            // Announce the write before it happens, like the sibling scrub tests do.
            testlog.info("Writing sstable {}", sst->get_filename());

            const auto corrupt_fragments = write_corrupt_sstable(env, *sst, permit, [&] (mutation_fragment_v2&& mf, bool is_corrupt) {
                if (!is_corrupt) {
                    scrubbed_fragments.emplace_back(std::move(mf));
                }
            });

            sst->load().get();

            testlog.info("Loaded sstable {}", sst->get_filename());

            auto cfg = column_family_test_config(env.manager(), env.semaphore());
            cfg.datadir = tmp.path().string();
            auto table = make_lw_shared<replica::column_family>(schema, cfg, replica::column_family::no_commitlog(),
                    db.get_compaction_manager(), cl_stats, db.row_cache_tracker());
            // Stop the table when leaving the scope, also if an assertion below fails.
            auto stop_table = defer([table] {
                table->stop().get();
            });
            table->mark_ready_for_writes();
            table->start();

            table->add_sstable_and_update_cache(sst).get();

            BOOST_REQUIRE(table->in_strategy_sstables().size() == 1);
            BOOST_REQUIRE(table->in_strategy_sstables().front() == sst);

            // Reads `sst` back and checks that it produces exactly `mfs`.
            auto verify_fragments = [&] (sstables::shared_sstable sst, const std::vector<mutation_fragment_v2>& mfs) {
                auto r = assert_that(sst->as_mutation_source().make_reader_v2(schema, permit));
                for (const auto& mf : mfs) {
                    testlog.trace("Expecting {}", mutation_fragment_v2::printer(*schema, mf));
                    r.produces(*schema, mf);
                }
                r.produces_end_of_stream();
            };

            testlog.info("Verifying written data...");

            // Make sure we wrote what we thought we wrote.
            verify_fragments(sst, corrupt_fragments);

            testlog.info("Scrub in abort mode");

            // We expect the scrub with mode=scrub::mode::abort to stop on the first invalid fragment.
            sstables::compaction_type_options::scrub opts = {};
            opts.operation_mode = sstables::compaction_type_options::scrub::mode::abort;
            compaction_manager.perform_sstable_scrub(table.get(), opts).get();

            BOOST_REQUIRE(table->in_strategy_sstables().size() == 1);
            verify_fragments(sst, corrupt_fragments);

            testlog.info("Scrub in skip mode");

            // We expect the scrub with mode=scrub::mode::skip to get rid of all invalid data.
            opts.operation_mode = sstables::compaction_type_options::scrub::mode::skip;
            compaction_manager.perform_sstable_scrub(table.get(), opts).get();

            BOOST_REQUIRE(table->in_strategy_sstables().size() == 1);
            BOOST_REQUIRE(table->in_strategy_sstables().front() != sst);
            verify_fragments(table->in_strategy_sstables().front(), scrubbed_fragments);
        });
    }, test_cfg);
}
// Scrub in segregate mode: an abort-mode scrub must leave the corrupt sstable alone,
// after which a segregate-mode scrub must split it into several sstables whose
// combined content equals the written data merged into a memtable.
SEASTAR_TEST_CASE(sstable_scrub_segregate_mode_test) {
    using scrub_options = sstables::compaction_type_options::scrub;

    cql_test_config test_cfg;

    // Disable cache to filter out its possible "corrections" to the corrupt sstable.
    auto& db_cfg = *test_cfg.db_config;
    db_cfg.enable_cache(false);
    db_cfg.enable_commitlog(false);

    return do_with_cql_env([this] (cql_test_env& cql_env) -> future<> {
        return test_env::do_with_async([this, &cql_env] (test_env& env) {
            cell_locker_stats cl_stats;

            auto& db = cql_env.local_db();
            auto& compaction_manager = db.get_compaction_manager();

            auto builder = schema_builder("ks", get_name());
            builder.with_column("pk", utf8_type, column_kind::partition_key);
            builder.with_column("ck", int32_type, column_kind::clustering_key);
            builder.with_column("s", int32_type, column_kind::static_column);
            builder.with_column("v", int32_type);
            auto schema = builder.build();

            auto permit = env.make_reader_permit();

            auto dir = tmpdir();
            auto make_sst = [&env, schema, &dir, next_gen = make_lw_shared<unsigned>(1)] () mutable {
                return env.make_sstable(schema, dir.path().string(), (*next_gen)++);
            };

            // Collects everything that is written, merged per partition; this is
            // what the segregated sstables are compared against at the end.
            auto scrubbed_mt = make_lw_shared<replica::memtable>(schema);
            auto sst = make_sst();

            testlog.info("Writing sstable {}", sst->get_filename());

            const auto corrupt_fragments = write_corrupt_sstable(env, *sst, permit,
                    [&, rebuilder = mutation_rebuilder_v2(schema)] (mutation_fragment_v2&& fragment, bool) mutable {
                if (!fragment.is_end_of_partition()) {
                    std::move(fragment).consume(rebuilder);
                    return;
                }
                scrubbed_mt->apply(*std::move(rebuilder).consume_end_of_stream());
            });

            sst->load().get();

            testlog.info("Loaded sstable {}", sst->get_filename());

            auto table_cfg = column_family_test_config(env.manager(), env.semaphore());
            table_cfg.datadir = dir.path().string();
            auto table = make_lw_shared<replica::column_family>(schema, table_cfg, replica::column_family::no_commitlog(),
                    db.get_compaction_manager(), cl_stats, db.row_cache_tracker());
            auto stop_table = defer([table] {
                table->stop().get();
            });
            table->mark_ready_for_writes();
            table->start();

            table->add_sstable_and_update_cache(sst).get();

            BOOST_REQUIRE(table->in_strategy_sstables().size() == 1);
            BOOST_REQUIRE(table->in_strategy_sstables().front() == sst);

            // Reads `target` back and requires it to produce exactly `expected`.
            auto verify_fragments = [&] (sstables::shared_sstable target, const std::vector<mutation_fragment_v2>& expected) {
                auto reader = assert_that(target->as_mutation_source().make_reader_v2(schema, env.make_reader_permit()));
                for (const auto& fragment : expected) {
                    testlog.trace("Expecting {}", mutation_fragment_v2::printer(*schema, fragment));
                    reader.produces(*schema, fragment);
                }
                reader.produces_end_of_stream();
            };

            testlog.info("Verifying written data...");

            // Sanity check: the sstable contains exactly what was written.
            verify_fragments(sst, corrupt_fragments);

            testlog.info("Scrub in abort mode");

            // Abort mode is expected to stop at the first invalid fragment,
            // leaving the original sstable in place.
            scrub_options scrub_opts = {};
            scrub_opts.operation_mode = scrub_options::mode::abort;
            compaction_manager.perform_sstable_scrub(table.get(), scrub_opts).get();

            BOOST_REQUIRE(table->in_strategy_sstables().size() == 1);
            verify_fragments(sst, corrupt_fragments);

            testlog.info("Scrub in segregate mode");

            // Segregate mode is expected to fix all out-of-order data.
            scrub_opts.operation_mode = scrub_options::mode::segregate;
            compaction_manager.perform_sstable_scrub(table.get(), scrub_opts).get();

            testlog.info("Scrub resulted in {} sstables", table->in_strategy_sstables().size());
            BOOST_REQUIRE(table->in_strategy_sstables().size() > 1);

            {
                // The table as a whole has to produce the same stream as the memtable.
                auto actual = assert_that(table->as_mutation_source().make_reader_v2(schema, env.make_reader_permit()));
                auto expected = scrubbed_mt->as_data_source().make_reader_v2(schema, env.make_reader_permit());
                auto close_expected = deferred_close(expected);

                while (auto fragment_opt = expected().get()) {
                    testlog.trace("Expecting {}", mutation_fragment_v2::printer(*schema, *fragment_opt));
                    actual.produces(*schema, *fragment_opt);
                }
                actual.produces_end_of_stream();
            }
        });
    }, test_cfg);
}
// Checks how a segregate-mode scrub treats a quarantined sstable for each
// scrub::quarantine_mode. For every mode a fresh cql environment is set up, a corrupt
// sstable is written and then quarantined by a validate-mode scrub, and finally a
// segregate-mode scrub is run with the quarantine mode under test.
SEASTAR_TEST_CASE(sstable_scrub_quarantine_mode_test) {
cql_test_config test_cfg;
auto& db_cfg = *test_cfg.db_config;
// Disable cache to filter out its possible "corrections" to the corrupt sstable.
db_cfg.enable_cache(false);
db_cfg.enable_commitlog(false);
// The quarantine modes exercised by the test, one full run per mode.
constexpr std::array<sstables::compaction_type_options::scrub::quarantine_mode, 3> quarantine_modes = {
sstables::compaction_type_options::scrub::quarantine_mode::include,
sstables::compaction_type_options::scrub::quarantine_mode::exclude,
sstables::compaction_type_options::scrub::quarantine_mode::only,
};
for (auto qmode : quarantine_modes) {
// The runs are sequential: each environment is awaited before the next one starts.
co_await do_with_cql_env([this, qmode] (cql_test_env& cql_env) {
return test_env::do_with_async([this, qmode, &cql_env] (test_env& env) {
cell_locker_stats cl_stats;
auto& db = cql_env.local_db();
auto& compaction_manager = db.get_compaction_manager();
auto schema = schema_builder("ks", get_name())
.with_column("pk", utf8_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("s", int32_type, column_kind::static_column)
.with_column("v", int32_type).build();
auto permit = env.make_reader_permit();
auto tmp = tmpdir();
auto sst_gen = [&env, schema, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
return env.make_sstable(schema, tmp.path().string(), (*gen)++);
};
// Receives all written fragments, merged per partition; used below as the
// expected content after a successful segregate-mode scrub.
auto scrubbed_mt = make_lw_shared<replica::memtable>(schema);
auto sst = sst_gen();
testlog.info("Writing sstable {}", sst->get_filename());
const auto corrupt_fragments = write_corrupt_sstable(env, *sst, permit, [&, mut_builder = mutation_rebuilder_v2(schema)] (mutation_fragment_v2&& mf, bool) mutable {
if (mf.is_end_of_partition()) {
scrubbed_mt->apply(*std::move(mut_builder).consume_end_of_stream());
} else {
std::move(mf).consume(mut_builder);
}
});
sst->load().get();
testlog.info("Loaded sstable {}", sst->get_filename());
auto cfg = column_family_test_config(env.manager(), env.semaphore());
cfg.datadir = tmp.path().string();
auto table = make_lw_shared<replica::column_family>(schema, cfg, replica::column_family::no_commitlog(),
db.get_compaction_manager(), cl_stats, db.row_cache_tracker());
// Stop the table when leaving the scope, also if an assertion below fails.
auto stop_table = defer([table] {
table->stop().get();
});
table->mark_ready_for_writes();
table->start();
table->add_sstable_and_update_cache(sst).get();
BOOST_REQUIRE(table->in_strategy_sstables().size() == 1);
BOOST_REQUIRE(table->in_strategy_sstables().front() == sst);
// Reads `sst` back and checks that it produces exactly `mfs`.
auto verify_fragments = [&] (sstables::shared_sstable sst, const std::vector<mutation_fragment_v2>& mfs) {
auto r = assert_that(sst->as_mutation_source().make_reader_v2(schema, env.make_reader_permit()));
for (const auto& mf : mfs) {
testlog.trace("Expecting {}", mutation_fragment_v2::printer(*schema, mf));
r.produces(*schema, mf);
}
r.produces_end_of_stream();
};
testlog.info("Verifying written data...");
// Make sure we wrote what we thought we wrote.
verify_fragments(sst, corrupt_fragments);
testlog.info("Scrub in validate mode");
// We expect the scrub with mode=scrub::mode::validate to quarantine the sstable.
sstables::compaction_type_options::scrub opts = {};
opts.operation_mode = sstables::compaction_type_options::scrub::mode::validate;
compaction_manager.perform_sstable_scrub(table.get(), opts).get();
BOOST_REQUIRE(table->in_strategy_sstables().empty());
BOOST_REQUIRE(sst->is_quarantined());
verify_fragments(sst, corrupt_fragments);
testlog.info("Scrub in segregate mode with quarantine_mode {}", qmode);
// We expect the scrub with mode=scrub::mode::segregate to fix all out-of-order data.
opts.operation_mode = sstables::compaction_type_options::scrub::mode::segregate;
opts.quarantine_operation_mode = qmode;
compaction_manager.perform_sstable_scrub(table.get(), opts).get();
switch (qmode) {
case sstables::compaction_type_options::scrub::quarantine_mode::include:
case sstables::compaction_type_options::scrub::quarantine_mode::only:
// The sstable should be found and scrubbed when scrub::quarantine_mode is scrub::quarantine_mode::{include,only}
testlog.info("Scrub resulted in {} sstables", table->in_strategy_sstables().size());
BOOST_REQUIRE(table->in_strategy_sstables().size() > 1);
{
// The table as a whole has to produce the same stream as the memtable built while writing.
auto sst_reader = assert_that(table->as_mutation_source().make_reader(schema, env.make_reader_permit()));
auto mt_reader = scrubbed_mt->as_data_source().make_reader(schema, env.make_reader_permit());
auto mt_reader_close = deferred_close(mt_reader);
while (auto mf_opt = mt_reader().get()) {
testlog.trace("Expecting {}", mutation_fragment::printer(*schema, *mf_opt));
sst_reader.produces(*schema, *mf_opt);
}
sst_reader.produces_end_of_stream();
}
break;
case sstables::compaction_type_options::scrub::quarantine_mode::exclude:
// The sstable should not be found when scrub::quarantine_mode is scrub::quarantine_mode::exclude
// It therefore stays quarantined, with its corrupt content unchanged.
BOOST_REQUIRE(table->in_strategy_sstables().empty());
BOOST_REQUIRE(sst->is_quarantined());
verify_fragments(sst, corrupt_fragments);
break;
}
});
}, test_cfg);
}
}
// Exercises the scrubbing reader (in segregate mode) together with
// segregate_by_partition(), the way scrub compaction in segregate mode combines them.
//
// A random, possibly disordered fragment stream is generated (partitions and rows may
// repeat, partition ends may be doubled or missing), pushed through the stack, and the
// resulting streams are checked individually and re-combined.
SEASTAR_THREAD_TEST_CASE(test_scrub_segregate_stack) {
    simple_schema ss;
    auto schema = ss.schema();
    tests::reader_concurrency_semaphore_wrapper semaphore;
    auto permit = semaphore.make_permit();

    // What a partition is expected to contain once everything is merged back.
    struct expected_rows_type {
        using expected_clustering_rows_type = std::set<clustering_key, clustering_key::less_compare>;

        bool has_static_row = false;
        expected_clustering_rows_type clustering_rows;

        explicit expected_rows_type(const ::schema& s) : clustering_rows(s) { }
    };
    using expected_partitions_type = std::map<dht::decorated_key, expected_rows_type, dht::decorated_key::less_comparator>;
    expected_partitions_type expected_partitions{dht::decorated_key::less_comparator(schema)};

    std::deque<mutation_fragment_v2> all_fragments;
    size_t double_partition_end = 0;
    size_t missing_partition_end = 0;

    for (uint32_t partition_idx = 0; partition_idx < 10; ++partition_idx) {
        auto dk = ss.make_pkey(tests::random::get_int<uint32_t>(0, 8));
        auto pos = expected_partitions.find(dk);
        const bool is_new_partition = pos == expected_partitions.end();

        testlog.trace("Generating data for {} partition {}", is_new_partition ? "new" : "existing", dk);

        if (is_new_partition) {
            pos = expected_partitions.emplace(dk, expected_rows_type(*schema)).first;
        }

        all_fragments.emplace_back(*schema, permit, partition_start(dk, {}));

        auto& expected_rows = pos->second;

        for (uint32_t row_idx = 0; row_idx < 10; ++row_idx) {
            // A zero roll yields a static row, anything else a clustering row.
            const auto is_clustering_row = tests::random::get_int<unsigned>(0, 8);
            if (!is_clustering_row) {
                testlog.trace("Generating static row");

                all_fragments.emplace_back(*schema, permit, ss.make_static_row_v2(permit, "sv"));
                expected_rows.has_static_row = true;
                continue;
            }

            auto ck = ss.make_ckey(tests::random::get_int<uint32_t>(0, 8));
            testlog.trace("Generating clustering row {}", ck);

            all_fragments.emplace_back(*schema, permit, ss.make_row_v2(permit, ck, "cv"));
            expected_rows.clustering_rows.insert(ck);
        }

        // Mostly a regular partition end, sometimes a doubled or a missing one.
        const auto partition_end_roll = tests::random::get_int(0, 100);
        if (partition_end_roll >= 90) {
            testlog.trace("Not generating partition end");
            ++missing_partition_end;
        } else if (partition_end_roll >= 80) {
            testlog.trace("Generating double partition end");
            ++double_partition_end;
            all_fragments.emplace_back(*schema, permit, partition_end());
            all_fragments.emplace_back(*schema, permit, partition_end());
        } else {
            testlog.trace("Generating partition end");
            all_fragments.emplace_back(*schema, permit, partition_end());
        }
    }

    {
        size_t rows = 0;
        for (const auto& [pkey, content] : expected_partitions) {
            rows += content.clustering_rows.size();
        }
        testlog.info("Generated {} partitions (with {} double and {} missing partition ends), {} rows and {} fragments total", expected_partitions.size(), double_partition_end, missing_partition_end, rows, all_fragments.size());
    }

    // Deep-copies a fragment stream, so that the same stream can be read several times.
    auto copy_fragments = [&schema, &semaphore] (const std::deque<mutation_fragment_v2>& frags) {
        auto copy_permit = semaphore.make_permit();
        std::deque<mutation_fragment_v2> copied_fragments;
        for (const auto& frag : frags) {
            copied_fragments.emplace_back(*schema, copy_permit, frag);
        }
        return copied_fragments;
    };

    std::list<std::deque<mutation_fragment_v2>> segregated_fragment_streams;

    // Every reader handed to the consumer is drained into a stream of its own.
    mutation_writer::segregate_by_partition(
            make_scrubbing_reader(make_flat_mutation_reader_from_fragments(schema, permit, std::move(all_fragments)), sstables::compaction_type_options::scrub::mode::segregate),
            mutation_writer::segregate_config{default_priority_class(), 100000},
            [&schema, &segregated_fragment_streams] (flat_mutation_reader_v2 rd) {
        return async([&schema, &segregated_fragment_streams, rd = std::move(rd)] () mutable {
            auto close = deferred_close(rd);
            auto& stream = segregated_fragment_streams.emplace_back();
            while (auto mf_opt = rd().get()) {
                stream.emplace_back(*schema, rd.permit(), *mf_opt);
            }
        });
    }).get();

    testlog.info("Segregation resulted in {} fragment streams", segregated_fragment_streams.size());

    // Builds one reader per segregated stream, each over a fresh copy of its fragments.
    auto make_stream_readers = [&] {
        std::vector<flat_mutation_reader_v2> readers;
        readers.reserve(segregated_fragment_streams.size());
        for (const auto& stream : segregated_fragment_streams) {
            readers.emplace_back(make_flat_mutation_reader_from_fragments(schema, permit, copy_fragments(stream)));
        }
        return readers;
    };

    testlog.info("Checking position monotonicity of segregated streams");
    {
        size_t stream_idx = 0;
        for (const auto& stream : segregated_fragment_streams) {
            testlog.debug("Checking position monotonicity of segregated stream #{}", stream_idx++);
            assert_that(make_flat_mutation_reader_from_fragments(schema, permit, copy_fragments(stream)))
                    .has_monotonic_positions();
        }
    }

    testlog.info("Checking position monotonicity of re-combined stream");
    {
        assert_that(make_combined_reader(schema, permit, make_stream_readers())).has_monotonic_positions();
    }

    testlog.info("Checking content of re-combined stream");
    {
        auto combined = assert_that(make_combined_reader(schema, permit, make_stream_readers()));
        for (const auto& [pkey, content] : expected_partitions) {
            testlog.debug("Checking content of partition {}", pkey);
            combined.produces_partition_start(pkey);
            if (content.has_static_row) {
                combined.produces_static_row();
            }
            for (const auto& ckey : content.clustering_rows) {
                combined.produces_row_with_key(ckey);
            }
            combined.produces_partition_end();
        }
        combined.produces_end_of_stream();
    }
}
// Runs the scrubbing reader in skip mode over a hand-built corrupt fragment stream
// and requires it to produce exactly the expected "scrubbed" stream: out-of-order
// fragments dropped and missing partition ends added.
SEASTAR_THREAD_TEST_CASE(sstable_scrub_reader_test) {
    auto schema = schema_builder("ks", get_name())
            .with_column("pk", utf8_type, column_kind::partition_key)
            .with_column("ck", int32_type, column_kind::clustering_key)
            .with_column("s", int32_type, column_kind::static_column)
            .with_column("v", int32_type).build();
    tests::reader_concurrency_semaphore_wrapper semaphore;
    auto permit = semaphore.make_permit();

    std::deque<mutation_fragment_v2> corrupt_fragments;   // input of the scrubbing reader
    std::deque<mutation_fragment_v2> scrubbed_fragments;  // its expected output

    const auto ts = api::timestamp_type{1};
    auto local_keys = make_local_keys(5, schema);

    auto make_partition_start = [&] (unsigned pk) {
        auto pkey = partition_key::from_deeply_exploded(*schema, { local_keys.at(pk) });
        auto dkey = dht::decorate_key(*schema, pkey);
        return mutation_fragment_v2(*schema, permit, partition_start(std::move(dkey), {}));
    };

    auto make_partition_end = [&] {
        return mutation_fragment_v2(*schema, permit, partition_end{});
    };

    auto make_static_row = [&] {
        auto cells = row{};
        auto cdef = schema->static_column_at(0);
        auto cell = atomic_cell::make_live(*cdef.type, ts, cdef.type->decompose(data_value(1)));
        cells.apply(cdef, atomic_cell_or_collection{std::move(cell)});
        return mutation_fragment_v2(*schema, permit, static_row(*schema, std::move(cells)));
    };

    auto make_clustering_row = [&] (unsigned i) {
        auto cells = row{};
        auto cdef = schema->regular_column_at(0);
        auto cell = atomic_cell::make_live(*cdef.type, ts, cdef.type->decompose(data_value(1)));
        cells.apply(cdef, atomic_cell_or_collection{std::move(cell)});
        auto ckey = clustering_key::from_single_value(*schema, int32_type->decompose(data_value(int(i))));
        return mutation_fragment_v2(*schema, permit, clustering_row(std::move(ckey), {}, {}, std::move(cells)));
    };

    // Fragment present in the input and expected to pass through the reader.
    auto keep = [&] (mutation_fragment_v2 mf) {
        corrupt_fragments.emplace_back(*schema, permit, mf);
        scrubbed_fragments.emplace_back(std::move(mf));
    };
    // Fragment present in the input and expected to be dropped by the reader.
    auto drop = [&] (mutation_fragment_v2 mf) {
        corrupt_fragments.emplace_back(*schema, permit, mf);
    };
    // Partition end absent from the input and expected to be added by the reader.
    auto expect_added_partition_end = [&] {
        scrubbed_fragments.emplace_back(*schema, permit, partition_end{});
    };

    // Partition 0
    keep(make_partition_start(0));
    keep(make_static_row());
    keep(make_clustering_row(0));
    keep(make_clustering_row(2));
    drop(make_clustering_row(1)); // out-of-order clustering key
    expect_added_partition_end(); // missing partition-end

    // Partition 2
    keep(make_partition_start(2));
    keep(make_static_row());
    keep(make_clustering_row(0));
    keep(make_clustering_row(1));
    drop(make_static_row()); // out-of-order static row
    keep(make_partition_end());

    // Partition 1 - out-of-order
    drop(make_partition_start(1));
    drop(make_static_row());
    for (unsigned ck = 0; ck < 4; ++ck) {
        drop(make_clustering_row(ck));
    }
    drop(make_partition_end());

    // Partition 3
    keep(make_partition_start(3));
    keep(make_static_row());
    for (unsigned ck = 0; ck < 4; ++ck) {
        keep(make_clustering_row(ck));
    }
    expect_added_partition_end(); // missing partition-end - at EOS

    auto scrubbing_reader = make_scrubbing_reader(
            make_flat_mutation_reader_from_fragments(schema, permit, std::move(corrupt_fragments)),
            compaction_type_options::scrub::mode::skip);
    auto r = assert_that(std::move(scrubbing_reader));
    for (const auto& expected : scrubbed_fragments) {
        testlog.info("Expecting {}", mutation_fragment_v2::printer(*schema, expected));
        r.produces(*schema, expected);
    }
    r.produces_end_of_stream();
}
// Drives the test-only run-based compaction strategy end to end: 16 single-partition
// sstables are added one at a time (a compaction is attempted after each addition),
// then one more compaction is requested over the result. The replacer passed to each
// compaction checks how and in which order input sstables are replaced.
SEASTAR_TEST_CASE(sstable_run_based_compaction_test) {
return test_env::do_with_async([] (test_env& env) {
cell_locker_stats cl_stats;
auto builder = schema_builder("tests", "sstable_run_based_compaction_test")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type);
auto s = builder.build();
auto tmp = tmpdir();
// Factory for new sstables under the temporary directory; generations are handed out sequentially, starting at 1.
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
auto sst = env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
return sst;
};
auto tracker = make_lw_shared<cache_tracker>();
column_family_for_tests cf(env.manager(), s);
auto close_cf = deferred_stop(cf);
cf->mark_ready_for_writes();
cf->start();
cf->set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
// Compacts `all` through the table's compaction manager, invoking `replacer` on sstable replacement,
// and returns the sstables produced.
// NOTE(review): the trailing `1, 0` descriptor arguments are presumably level and max sstable size — TODO confirm.
auto compact = [&, s] (std::vector<shared_sstable> all, auto replacer) -> std::vector<shared_sstable> {
return compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(std::move(all), default_priority_class(), 1, 0), *cf, sst_gen, replacer).get0().new_sstables;
};
// Builds a one-cell mutation for a (key, token) pair and checks that the key really maps to that token.
auto make_insert = [&] (auto p) {
auto key = partition_key::from_exploded(*s, {to_bytes(p.first)});
mutation m(s, key);
m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), 1 /* ts */);
BOOST_REQUIRE(m.decorated_key().token() == p.second);
return m;
};
auto tokens = token_generation_for_current_shard(16);
// The test's own view of the live sstable set; do_replace() keeps it in sync with the table.
std::unordered_set<shared_sstable> sstables;
// Keeps the on-closed handlers registered by do_incremental_replace() alive.
std::vector<utils::observer<sstable&>> observers;
sstables::sstable_run_based_compaction_strategy_for_tests cs;
// Applies a replacement to the local set (requiring that old sstables were tracked and new ones
// were not), then to the table's sstable list and to the compaction manager.
auto do_replace = [&] (const std::vector<shared_sstable>& old_sstables, const std::vector<shared_sstable>& new_sstables) {
for (auto& old_sst : old_sstables) {
BOOST_REQUIRE(sstables.contains(old_sst));
sstables.erase(old_sst);
}
for (auto& new_sst : new_sstables) {
BOOST_REQUIRE(!sstables.contains(new_sst));
sstables.insert(new_sst);
}
column_family_test(cf).rebuild_sstable_list(new_sstables, old_sstables);
cf.get_compaction_manager().propagate_replacement(&*cf, old_sstables, new_sstables);
};
// Replacement handler for the incremental case: exactly one input sstable is swapped for one output
// sstable per call. `expected_sst` and `closed_sstables_tracker` are iterators over the sorted
// generations of the compaction input and are advanced as sstables get replaced / closed.
auto do_incremental_replace = [&] (auto old_sstables, auto new_sstables, auto& expected_sst, auto& closed_sstables_tracker) {
// that's because each sstable will contain only 1 mutation.
BOOST_REQUIRE(old_sstables.size() == 1);
BOOST_REQUIRE(new_sstables.size() == 1);
// check that sstable replacement follows token order
BOOST_REQUIRE(*expected_sst == old_sstables.front()->generation());
expected_sst++;
// check that previously released sstables were already closed
if (old_sstables.front()->generation() % 4 == 0) {
// Due to performance reasons, sstables are not released immediately, but in batches.
// At the time of writing, mutation_reader_merger releases its sstable references
// in batches of 4. That's why we only perform this check every 4th sstable.
BOOST_REQUIRE(*closed_sstables_tracker == old_sstables.front()->generation());
}
do_replace(old_sstables, new_sstables);
// NOTE(review): the handler captures closed_sstables_tracker by reference and is stored in `observers`,
// which outlives do_compaction()'s local iterator — confirm the handler cannot fire after do_compaction() returns.
observers.push_back(old_sstables.front()->add_on_closed_handler([&] (sstable& sst) {
testlog.info("Closing sstable of generation {}", sst.generation());
closed_sstables_tracker++;
}));
testlog.info("Removing sstable of generation {}, refcnt: {}", old_sstables.front()->generation(), old_sstables.front().use_count());
};
// Asks the strategy for a job over the tracked sstables and runs it. Returns an empty vector,
// without checking anything, when the strategy selects nothing; otherwise requires that the job has
// `expected_input` sstables and that the compaction yields `expected_output` sstables.
auto do_compaction = [&] (size_t expected_input, size_t expected_output) -> std::vector<shared_sstable> {
auto input_ssts = std::vector<shared_sstable>(sstables.begin(), sstables.end());
auto table_s = make_table_state_for_test(cf, env);
auto strategy_c = make_strategy_control_for_test(false);
auto desc = cs.get_sstables_for_compaction(*table_s, *strategy_c, std::move(input_ssts));
// nothing to compact, move on.
if (desc.sstables.empty()) {
return {};
}
// Incremental replacement is expected as soon as two input sstables share a run identifier.
std::unordered_set<utils::UUID> run_ids;
bool incremental_enabled = std::any_of(desc.sstables.begin(), desc.sstables.end(), [&run_ids] (shared_sstable& sst) {
return !run_ids.insert(sst->run_identifier()).second;
});
BOOST_REQUIRE(desc.sstables.size() == expected_input);
// Sorted generations of the input sstables; both iterators below walk this set.
auto sstable_run = boost::copy_range<std::set<int64_t>>(desc.sstables
| boost::adaptors::transformed([] (auto& sst) { return sst->generation(); }));
auto expected_sst = sstable_run.begin();
auto closed_sstables_tracker = sstable_run.begin();
auto replacer = [&] (sstables::compaction_completion_desc desc) {
auto old_sstables = std::move(desc.old_sstables);
auto new_sstables = std::move(desc.new_sstables);
BOOST_REQUIRE(expected_sst != sstable_run.end());
if (incremental_enabled) {
do_incremental_replace(std::move(old_sstables), std::move(new_sstables), expected_sst, closed_sstables_tracker);
} else {
// Non-incremental case: a single replacement covers the whole input.
do_replace(std::move(old_sstables), std::move(new_sstables));
expected_sst = sstable_run.end();
}
};
auto result = compact(std::move(desc.sstables), replacer);
BOOST_REQUIRE_EQUAL(expected_output, result.size());
// Every input sstable must have been replaced by the time compaction returns.
BOOST_REQUIRE(expected_sst == sstable_run.end());
return result;
};
// Generate 4 sstable runs composed of 4 fragments each after 4 compactions.
// All fragments non-overlapping.
for (auto i = 0U; i < tokens.size(); i++) {
auto sst = make_sstable_containing(sst_gen, { make_insert(tokens[i]) });
sst->set_sstable_level(1);
BOOST_REQUIRE(sst->get_sstable_level() == 1);
column_family_test(cf).add_sstable(sst);
sstables.insert(std::move(sst));
do_compaction(4, 4);
}
BOOST_REQUIRE(sstables.size() == 16);
// Generate 1 sstable run from 4 sstables runs of similar size
auto result = do_compaction(16, 16);
BOOST_REQUIRE(result.size() == 16);
// Each output fragment must hold exactly the mutation written for the token at the same index.
for (auto i = 0U; i < tokens.size(); i++) {
assert_that(sstable_reader(result[i], s, env.make_reader_permit()))
.produces(make_insert(tokens[i]))
.produces_end_of_stream();
}
});
}
// Checks that the major compaction job depends on the strategy that builds it:
// for the same two candidates (levels 2 and 3), the leveled job must carry the
// level of the second sstable, while the size-tiered job must use level 0.
// Both jobs must include every candidate.
SEASTAR_TEST_CASE(compaction_strategy_aware_major_compaction_test) {
    return test_env::do_with_async([] (test_env& env) {
        cell_locker_stats cl_stats;

        auto schema_b = schema_builder("tests", "compaction_strategy_aware_major_compaction_test")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("value", int32_type);
        auto s = schema_b.build();

        auto tmp = tmpdir();
        // Sstable factory; generations are assigned sequentially, starting at 1.
        auto sst_gen = [&env, s, &tmp, next_gen = make_lw_shared<unsigned>(1)] () mutable {
            return env.make_sstable(s, tmp.path().string(), (*next_gen)++, sstables::get_highest_sstable_version(), big);
        };

        // Single live cell, written at timestamp 0, for the given partition.
        auto make_insert = [&] (partition_key key) {
            mutation m(s, key);
            const auto write_ts = api::timestamp_type(0);
            m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), write_ts);
            return m;
        };

        auto alpha = partition_key::from_exploded(*s, {to_bytes("alpha")});

        // Two sstables holding the same partition, placed at different levels.
        auto sst = make_sstable_containing(sst_gen, {make_insert(alpha)});
        sst->set_sstable_level(2);
        auto sst2 = make_sstable_containing(sst_gen, {make_insert(alpha)});
        sst2->set_sstable_level(3);

        std::vector<sstables::shared_sstable> candidates{ sst, sst2 };

        column_family_for_tests cf(env.manager());
        auto close_cf = deferred_stop(cf);
        auto table_s = make_table_state_for_test(cf, env);

        {
            // Leveled strategy.
            auto leveled = sstables::make_compaction_strategy(sstables::compaction_strategy_type::leveled, cf.schema()->compaction_strategy_options());
            auto job = leveled.get_major_compaction_job(*table_s, candidates);
            BOOST_REQUIRE(job.sstables.size() == candidates.size());
            BOOST_REQUIRE(uint32_t(job.level) == sst2->get_sstable_level());
        }

        {
            // Size-tiered strategy.
            auto size_tiered = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered, cf.schema()->compaction_strategy_options());
            auto job = size_tiered.get_major_compaction_job(*table_s, candidates);
            BOOST_REQUIRE(job.sstables.size() == candidates.size());
            BOOST_REQUIRE(job.level == 0);
        }
    });
}
// Starts a compaction under the leveled strategy, switches the table to
// time-window while that compaction is still in flight, waits for it to finish
// and then queries the backlog, both through the compaction manager and through
// the (new) strategy's backlog tracker.
SEASTAR_TEST_CASE(backlog_tracker_correctness_after_changing_compaction_strategy) {
    return test_env::do_with_async([] (test_env& env) {
        cell_locker_stats cl_stats;

        auto s = schema_builder("tests", "backlog_tracker_correctness_after_changing_compaction_strategy")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("value", int32_type)
                .build();

        auto tmp = make_lw_shared<tmpdir>();
        // Sstable factory; generations are assigned sequentially, starting at 1.
        auto sst_gen = [&env, s, tmp, next_gen = make_lw_shared<unsigned>(1)] () mutable {
            return env.make_sstable(s, tmp->path().string(), (*next_gen)++, sstables::get_highest_sstable_version(), big);
        };

        column_family_for_tests cf(env.manager(), s);
        auto close_cf = deferred_stop(cf);
        cf->set_compaction_strategy(sstables::compaction_strategy_type::leveled);

        {
            auto tokens = token_generation_for_current_shard(4);

            // One-cell mutation for a (key, token) pair; the key must hash to the given token.
            auto make_insert = [&] (auto p) {
                mutation m(s, partition_key::from_exploded(*s, {to_bytes(p.first)}));
                m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), 1 /* ts */);
                BOOST_REQUIRE(m.decorated_key().token() == p.second);
                return m;
            };

            std::vector<mutation> muts;
            muts.reserve(tokens.size());
            for (const auto& key_and_token : tokens) {
                muts.push_back(make_insert(key_and_token));
            }

            std::vector<shared_sstable> ssts = {
                make_sstable_containing(sst_gen, {muts[0], muts[1]}),
                make_sstable_containing(sst_gen, {muts[2], muts[3]})
            };

            // Adds every input sstable to the backlog tracker of whichever strategy is current.
            auto add_to_current_backlog_tracker = [&] {
                for (auto& sst : ssts) {
                    cf->get_compaction_strategy().get_backlog_tracker().replace_sstables({}, {sst});
                }
            };

            add_to_current_backlog_tracker();

            // Start compaction, then stop tracking compaction, switch to TWCS, wait for compaction to finish and check for backlog.
            // That's done to assert backlog will work for compaction that is finished and was stopped tracking.
            auto compaction_done = compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(ssts, default_priority_class()), *cf, sst_gen);

            // set_compaction_strategy() itself is responsible for transferring charges from old to new backlog tracker.
            cf->set_compaction_strategy(sstables::compaction_strategy_type::time_window);
            add_to_current_backlog_tracker();

            auto compaction_result = compaction_done.get0();
            BOOST_REQUIRE(compaction_result.new_sstables.size() == 1);
        }

        // triggers code that iterates through registered compactions.
        cf._data->cm.backlog();
        cf->get_compaction_strategy().get_backlog_tracker().backlog();
    });
}
// An sstable whose run identifier is registered with the compaction manager
// (i.e. the run is still being produced) must survive a compaction of all
// sstables of the table. Requires a single shard.
SEASTAR_TEST_CASE(partial_sstable_run_filtered_out_test) {
    BOOST_REQUIRE(smp::count == 1);
    return test_env::do_with_async([] (test_env& env) {
        auto schema_b = schema_builder("tests", "partial_sstable_run_filtered_out_test")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("value", int32_type);
        auto s = schema_b.build();

        auto tmp = tmpdir();

        // The test owns its compaction manager and stops it on scope exit.
        auto cm = make_lw_shared<compaction_manager>();
        auto stop_cm = defer([cm] {
            cm->stop().get();
        });
        cm->enable();

        replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
        cfg.datadir = tmp.path().string();
        cfg.enable_commitlog = false;
        cfg.enable_incremental_backups = false;

        auto cl_stats = make_lw_shared<cell_locker_stats>();
        auto tracker = make_lw_shared<cache_tracker>();
        auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, *cl_stats, *tracker);
        cf->start();
        cf->mark_ready_for_writes();

        // Write one sstable tagged with the run identifier that is registered further down.
        utils::UUID partial_sstable_run_identifier = utils::make_random_uuid();

        mutation mut(s, partition_key::from_exploded(*s, {to_bytes("alpha")}));
        mut.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), 0);

        sstable_writer_config sst_cfg = env.manager().configure_writer();
        sst_cfg.run_identifier = partial_sstable_run_identifier;

        auto partial_sstable_run_sst = make_sstable_easy(env, tmp.path(), make_flat_mutation_reader_from_mutations_v2(s, env.make_reader_permit(), { std::move(mut) }), sst_cfg);

        column_family_test(cf).add_sstable(partial_sstable_run_sst);
        column_family_test::update_sstables_known_generation(*cf, partial_sstable_run_sst->generation());

        // True when the table currently holds an sstable with the given generation.
        auto generation_exists = [&cf] (int64_t generation) {
            auto live_sstables = cf->get_sstables();
            return std::any_of(live_sstables->begin(), live_sstables->end(), [generation] (const shared_sstable& candidate) {
                return candidate->generation() == generation;
            });
        };

        BOOST_REQUIRE(generation_exists(partial_sstable_run_sst->generation()));

        // register partial sstable run
        auto cm_test = compaction_manager_test(*cm);
        auto compact_everything = [cf] (sstables::compaction_data&) {
            return cf->compact_all_sstables();
        };
        cm_test.run(partial_sstable_run_identifier, cf.get(), compact_everything).get();

        // make sure partial sstable run has none of its fragments compacted.
        BOOST_REQUIRE(generation_exists(partial_sstable_run_sst->generation()));
    });
}
// Make sure that a custom tombstone-gced-only writer will be fed with gc'able tombstones
// from the regular compaction's input sstables.
SEASTAR_TEST_CASE(purged_tombstone_consumer_sstable_test) {
BOOST_REQUIRE(smp::count == 1);
return test_env::do_with_async([] (test_env& env) {
cell_locker_stats cl_stats;
auto builder = schema_builder("tests", "purged_tombstone_consumer_sstable_test")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type);
builder.set_gc_grace_seconds(0);
auto s = builder.build();
auto tmp = tmpdir();
// Sstable factory; generations are assigned sequentially, starting at 1.
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
};
// Minimal compaction consumer: forwards every consumed fragment to an sstable_writer
// created for the given sstable, and opens the sstable's data once the stream ends.
class compacting_sstable_writer_test {
shared_sstable& _sst;
sstable_writer _writer;
public:
explicit compacting_sstable_writer_test(const schema_ptr& s, shared_sstable& sst, sstables_manager& manager)
: _sst(sst),
_writer(sst->get_writer(*s, 1, manager.configure_writer("test"),
encoding_stats{}, service::get_local_compaction_priority())) {}
void consume_new_partition(const dht::decorated_key& dk) { _writer.consume_new_partition(dk); }
void consume(tombstone t) { _writer.consume(t); }
// The tombstone / bool arguments of the row overloads are ignored; only the row itself is written.
stop_iteration consume(static_row&& sr, tombstone, bool) { return _writer.consume(std::move(sr)); }
stop_iteration consume(clustering_row&& cr, row_tombstone tomb, bool) { return _writer.consume(std::move(cr)); }
stop_iteration consume(range_tombstone_change&& rtc) { return _writer.consume(std::move(rtc)); }
stop_iteration consume_end_of_partition() { return _writer.consume_end_of_partition(); }
void consume_end_of_stream() { _writer.consume_end_of_stream(); _sst->open_data().get0(); }
};
// Set by compact() below; is_tombstone_purgeable dereferences it, so it must only be called after compact().
std::optional<gc_clock::time_point> gc_before;
auto max_purgeable_ts = api::max_timestamp;
// A tombstone counts as purgeable when it is non-empty, was deleted before gc_before
// and its timestamp is below max_purgeable_ts.
auto is_tombstone_purgeable = [&gc_before, max_purgeable_ts](const tombstone& t) {
bool can_gc = t.deletion_time < *gc_before;
return t && can_gc && t.timestamp < max_purgeable_ts;
};
// Reads all input sstables through a range reader and feeds the stream to compact_for_compaction_v2
// with two consumers. Returns the pair {non_purged, purged_only} of sstables written by the
// first and second consumer respectively.
auto compact = [&] (std::vector<shared_sstable> all) -> std::pair<shared_sstable, shared_sstable> {
auto max_purgeable_func = [max_purgeable_ts] (const dht::decorated_key& dk) {
return max_purgeable_ts;
};
auto non_purged = sst_gen();
auto purged_only = sst_gen();
auto cr = compacting_sstable_writer_test(s, non_purged, env.manager());
auto purged_cr = compacting_sstable_writer_test(s, purged_only, env.manager());
auto gc_now = gc_clock::now();
gc_before = gc_now - s->gc_grace_seconds();
// NOTE(review): gc_grace_seconds is not used anywhere in this lambda — looks like a leftover.
auto gc_grace_seconds = s->gc_grace_seconds();
auto cfc = compact_for_compaction_v2<compacting_sstable_writer_test, compacting_sstable_writer_test>(
*s, gc_now, max_purgeable_func, std::move(cr), std::move(purged_cr));
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered, s->compaction_strategy_options());
auto compacting = make_lw_shared<sstables::sstable_set>(cs.make_sstable_set(s));
for (auto&& sst : all) {
compacting->insert(std::move(sst));
}
auto r = compacting->make_range_sstable_reader(s,
env.make_reader_permit(),
query::full_partition_range,
s->full_slice(),
service::get_local_compaction_priority(),
nullptr,
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no);
auto close_r = deferred_close(r);
r.consume_in_thread(std::move(cfc));
return {std::move(non_purged), std::move(purged_only)};
};
// Monotonically increasing write timestamps, starting at 1.
auto next_timestamp = [] {
static thread_local api::timestamp_type next = 1;
return next++;
};
auto make_insert = [&] (partition_key key) {
mutation m(s, key);
m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), next_timestamp());
return m;
};
// Partition deletion at the current gc_clock time; also returns the tombstone for later comparison.
auto make_delete = [&] (partition_key key) -> std::pair<mutation, tombstone> {
mutation m(s, key);
tombstone tomb(next_timestamp(), gc_clock::now());
m.partition().apply(tomb);
return {m, tomb};
};
auto alpha = partition_key::from_exploded(*s, {to_bytes("alpha")});
auto beta = partition_key::from_exploded(*s, {to_bytes("beta")});
// Number of seconds the clocks are jumped forward before compacting.
auto ttl = 5;
// Requires that `sst` holds exactly one partition, `key`, with no clustered rows and with
// partition tombstone `tomb`, which must be purgeable.
auto assert_that_produces_purged_tombstone = [&] (auto& sst, partition_key& key, tombstone tomb) {
auto reader = make_lw_shared<flat_mutation_reader>(sstable_reader(sst, s, env.make_reader_permit()));
read_mutation_from_flat_mutation_reader(*reader).then([reader, s, &key, is_tombstone_purgeable, &tomb] (mutation_opt m) {
BOOST_REQUIRE(m);
BOOST_REQUIRE(m->key().equal(*s, key));
auto rows = m->partition().clustered_rows();
BOOST_REQUIRE_EQUAL(rows.calculate_size(), 0);
BOOST_REQUIRE(is_tombstone_purgeable(m->partition().partition_tombstone()));
BOOST_REQUIRE(m->partition().partition_tombstone() == tomb);
return (*reader)();
}).then([reader, s] (mutation_fragment_opt m) {
// Nothing may follow the single partition.
BOOST_REQUIRE(!m);
}).finally([reader] {
return reader->close();
}).get();
};
// gc'ed tombstone for alpha will go to gc-only consumer, whereas live data goes to regular consumer.
{
auto mut1 = make_insert(alpha);
auto mut2 = make_insert(beta);
auto [mut3, mut3_tombstone] = make_delete(alpha);
std::vector<shared_sstable> sstables = {
make_sstable_containing(sst_gen, {mut1, mut2}),
make_sstable_containing(sst_gen, {mut3})
};
forward_jump_clocks(std::chrono::seconds(ttl));
auto [non_purged, purged_only] = compact(std::move(sstables));
// Only beta's live data is expected in the regular output.
assert_that(sstable_reader(non_purged, s, env.make_reader_permit()))
.produces(mut2)
.produces_end_of_stream();
assert_that_produces_purged_tombstone(purged_only, alpha, mut3_tombstone);
}
});
}
/* Make sure data is not resurrected.
sstable 1 with key A and key B and key C
sstable 2 with expired (GC'able) tombstone for key A
use max_sstable_size = 1;
so key A and expired tombstone for key A are compacted away.
key B is written into a new sstable, and sstable 2 is removed.
Need to stop compaction at this point!!!
Result: sstable 1 is alive in the table, whereas sstable 2 is gone.
if key A can be read from table, data was resurrected.
*/
SEASTAR_TEST_CASE(incremental_compaction_data_resurrection_test) {
return test_env::do_with_async([] (test_env& env) {
cell_locker_stats cl_stats;
// In a column family with gc_grace_seconds set to 0, check that a tombstone
// is purged after compaction.
auto builder = schema_builder("tests", "incremental_compaction_data_resurrection_test")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type);
builder.set_gc_grace_seconds(0);
auto s = builder.build();
auto tmp = tmpdir();
// Sstable factory; generations are assigned sequentially, starting at 1.
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
};
// Monotonically increasing write timestamps, starting at 1.
auto next_timestamp = [] {
static thread_local api::timestamp_type next = 1;
return next++;
};
auto make_insert = [&] (partition_key key) {
mutation m(s, key);
m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), next_timestamp());
return m;
};
// Captured once so that the replacer below can recognise sstables by their max local deletion time.
auto deletion_time = gc_clock::now();
auto make_delete = [&] (partition_key key) {
mutation m(s, key);
tombstone tomb(next_timestamp(), deletion_time);
m.partition().apply(tomb);
return m;
};
auto tokens = token_generation_for_current_shard(3);
auto alpha = partition_key::from_exploded(*s, {to_bytes(tokens[0].first)});
auto beta = partition_key::from_exploded(*s, {to_bytes(tokens[1].first)});
auto gamma = partition_key::from_exploded(*s, {to_bytes(tokens[2].first)});
// Number of seconds the clocks are jumped forward to make the tombstone gc'able.
auto ttl = 5;
auto mut1 = make_insert(alpha);
auto mut2 = make_insert(beta);
auto mut3 = make_insert(gamma);
auto mut1_deletion = make_delete(alpha);
auto non_expired_sst = make_sstable_containing(sst_gen, {mut1, mut2, mut3});
auto expired_sst = make_sstable_containing(sst_gen, {mut1_deletion});
// make ssts belong to same run for compaction to enable incremental approach
utils::UUID run_id = utils::make_random_uuid();
sstables::test(non_expired_sst).set_run_identifier(run_id);
sstables::test(expired_sst).set_run_identifier(run_id);
std::vector<shared_sstable> sstables = {
non_expired_sst,
expired_sst,
};
// make mut1_deletion gc'able.
forward_jump_clocks(std::chrono::seconds(ttl));
auto cm = make_lw_shared<compaction_manager>();
replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
cfg.datadir = tmp.path().string();
cfg.enable_disk_writes = false;
cfg.enable_commitlog = false;
cfg.enable_cache = true;
cfg.enable_incremental_backups = false;
auto tracker = make_lw_shared<cache_tracker>();
auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, cl_stats, *tracker);
auto stop_cf = deferred_stop(*cf);
cf->mark_ready_for_writes();
cf->start();
cf->set_compaction_strategy(sstables::compaction_strategy_type::null);
// True when the table has the partition and it carries a partition tombstone.
auto is_partition_dead = [&s, &cf, &env] (partition_key& pkey) {
replica::column_family::const_mutation_partition_ptr mp = cf->find_partition_slow(s, env.make_reader_permit(), pkey).get0();
return mp && bool(mp->partition_tombstone());
};
// alpha is live with only the data sstable, and dead once the tombstone sstable is added.
cf->add_sstable_and_update_cache(non_expired_sst).get();
BOOST_REQUIRE(!is_partition_dead(alpha));
cf->add_sstable_and_update_cache(expired_sst).get();
BOOST_REQUIRE(is_partition_dead(alpha));
// Replacer invoked by the compaction: validates the first (early) replacement, applies it to the
// table's sstable list and then throws to abort the compaction.
auto replacer = [&] (sstables::compaction_completion_desc desc) {
auto old_sstables = std::move(desc.old_sstables);
auto new_sstables = std::move(desc.new_sstables);
// expired_sst is exhausted, and new sstable is written with mut 2.
BOOST_REQUIRE_EQUAL(old_sstables.size(), 1);
BOOST_REQUIRE(old_sstables.front() == expired_sst);
BOOST_REQUIRE_EQUAL(new_sstables.size(), 2);
for (auto& new_sstable : new_sstables) {
if (new_sstable->get_max_local_deletion_time() == deletion_time) { // Skipping GC SSTable.
continue;
}
assert_that(sstable_reader(new_sstable, s, env.make_reader_permit()))
.produces(mut2)
.produces_end_of_stream();
}
column_family_test(cf).rebuild_sstable_list(new_sstables, old_sstables);
// force compaction failure after sstable containing expired tombstone is removed from set.
throw std::runtime_error("forcing compaction failure on early replacement");
};
bool swallowed = false;
try {
// The goal is to have one sstable generated for each mutation to trigger the issue.
auto max_sstable_size = 0;
auto result = compact_sstables(*cm, sstables::compaction_descriptor(sstables, default_priority_class(), 0, max_sstable_size), *cf, sst_gen, replacer).get0().new_sstables;
BOOST_REQUIRE_EQUAL(2, result.size());
} catch (...) {
// swallow exception
// NOTE(review): catch (...) cannot tell the forced failure apart from any other exception
// raised inside the try block — confirm that this is intended.
swallowed = true;
}
// The compaction must not have completed normally.
BOOST_REQUIRE(swallowed);
// check there's no data resurrection
BOOST_REQUIRE(is_partition_dead(alpha));
});
}
SEASTAR_TEST_CASE(twcs_major_compaction_test) {
    // Tests that two mutations that were written a month apart are compacted
    // to two different SSTables, whereas two mutations that were written 1ms apart
    // are compacted to the same SSTable.
    return test_env::do_with_async([] (test_env& env) {
        cell_locker_stats cl_stats;

        auto s = schema_builder("tests", "twcs_major")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("cl", int32_type, column_kind::clustering_key)
                .with_column("value", int32_type)
                .build();

        auto tmp = tmpdir();
        // Sstable factory; generations are assigned sequentially, starting at 1.
        auto sst_gen = [&env, s, &tmp, next_gen = make_lw_shared<unsigned>(1)] () mutable {
            return env.make_sstable(s, tmp.path().string(), (*next_gen)++, sstables::get_highest_sstable_version(), big);
        };

        // Write timestamp lying `step` in the past, relative to the current timestamp clock.
        auto timestamp_before_now = [] (auto step) {
            using namespace std::chrono;
            const auto now = api::timestamp_clock::now().time_since_epoch();
            return (now - duration_cast<microseconds>(step)).count();
        };

        // One clustered row written `step` ago; every call uses a fresh clustering key.
        auto make_insert = [&] (api::timestamp_clock::duration step) {
            static thread_local int32_t value = 1;

            auto key_str = token_generation_for_current_shard(1)[0].first;
            mutation m(s, partition_key::from_exploded(*s, {to_bytes(key_str)}));

            const int32_t ck_value = value++;
            auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(ck_value)});
            m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), timestamp_before_now(step));
            return m;
        };

        // Two mutations, one of them 30 days ago. Should be split when
        // compacting
        auto mut1 = make_insert(0ms);
        auto mut2 = make_insert(720h);

        // Two mutations, close together. Should end up in the same SSTable
        auto mut3 = make_insert(0ms);
        auto mut4 = make_insert(1ms);

        auto cm = make_lw_shared<compaction_manager>();
        replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
        cfg.datadir = tmp.path().string();
        cfg.enable_disk_writes = true;
        cfg.enable_commitlog = false;
        cfg.enable_cache = false;
        cfg.enable_incremental_backups = false;

        auto tracker = make_lw_shared<cache_tracker>();
        auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, cl_stats, *tracker);
        cf->mark_ready_for_writes();
        cf->start();
        cf->set_compaction_strategy(sstables::compaction_strategy_type::time_window);

        // Close-together writes: a single output sstable is expected.
        auto original_together = make_sstable_containing(sst_gen, {mut3, mut4});
        auto ret = compact_sstables(*cm, sstables::compaction_descriptor({original_together}, default_priority_class()), *cf, sst_gen, replacer_fn_no_op()).get0();
        BOOST_REQUIRE(ret.new_sstables.size() == 1);

        // Writes a month apart: the output is expected to be split in two.
        auto original_apart = make_sstable_containing(sst_gen, {mut1, mut2});
        ret = compact_sstables(*cm, sstables::compaction_descriptor({original_apart}, default_priority_class()), *cf, sst_gen, replacer_fn_no_op()).get0();
        BOOST_REQUIRE(ret.new_sstables.size() == 2);
    });
}
// Checks the user-controlled auto-compaction switch of a table:
//  - while auto compaction is disabled, triggering/submitting a compaction must
//    not register any pending, active or completed task in the compaction manager;
//  - after re-enabling it, a triggered compaction must run and complete exactly
//    once without errors.
SEASTAR_TEST_CASE(autocompaction_control_test) {
    return test_env::do_with_async([] (test_env& env) {
        cell_locker_stats cl_stats;
        cache_tracker tracker;

        compaction_manager cm;
        auto stop_compaction_manager = deferred_stop(cm);
        cm.enable();

        auto s = schema_builder(some_keyspace, some_column_family)
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("value", int32_type)
                .build();

        auto tmp = tmpdir();
        replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
        cfg.datadir = tmp.path().string();
        cfg.enable_commitlog = false;
        cfg.enable_disk_writes = true;

        auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), cm, cl_stats, tracker);
        cf->set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
        cf->mark_ready_for_writes();

        // no compactions done yet
        auto& ss = cm.get_stats();
        BOOST_REQUIRE(cm.get_stats().pending_tasks == 0 && cm.get_stats().active_tasks == 0 && ss.completed_tasks == 0);
        // auto compaction is enabled by default
        BOOST_REQUIRE(!cf->is_auto_compaction_disabled_by_user());
        // disable auto compaction by user
        cf->disable_auto_compaction().get();
        // check it is disabled
        BOOST_REQUIRE(cf->is_auto_compaction_disabled_by_user());

        // generate a few sstables
        auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
            return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
        };
        auto make_insert = [&] (partition_key key) {
            mutation m(s, key);
            m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), 1 /* ts */);
            return m;
        };
        auto min_threshold = cf->schema()->min_compaction_threshold();
        auto tokens = token_generation_for_current_shard(1);
        // Every sstable holds the same single partition, so the key is built once.
        const auto key = partition_key::from_exploded(*s, {to_bytes(tokens[0].first)});
        for (auto i = 0; i < 2 * min_threshold; ++i) {
            auto mut = make_insert(key);
            auto sst = make_sstable_containing(sst_gen, {mut});
            // get() rather than wait(): a failure to add the sstable must fail the test
            // instead of being silently dropped.
            cf->add_sstable_and_update_cache(sst).get();
        }

        // check compaction manager does not receive background compaction submissions
        cf->start();
        auto stop_cf = deferred_stop(*cf);
        cf->trigger_compaction();
        cm.submit(cf.get());
        BOOST_REQUIRE(cm.get_stats().pending_tasks == 0 && cm.get_stats().active_tasks == 0 && ss.completed_tasks == 0);

        // enable auto compaction
        cf->enable_auto_compaction();
        // check enabled
        BOOST_REQUIRE(!cf->is_auto_compaction_disabled_by_user());
        // trigger background compaction
        cf->trigger_compaction();
        // wait until compaction finished; poll every 100ms and propagate any failure of the polling loop
        do_until([&cm] { return cm.get_stats().pending_tasks == 0 && cm.get_stats().active_tasks == 0; }, [] {
            return sleep(std::chrono::milliseconds(100));
        }).get();
        // test compaction successfully finished
        BOOST_REQUIRE(ss.errors == 0);
        BOOST_REQUIRE(ss.completed_tasks == 1);
    });
}
//
// Regression test for https://github.com/scylladb/scylla/issues/6472:
// compacts expired data spread over many time windows, so that both the
// interposer and the GC writer get enabled, and expects a single output sstable.
//
SEASTAR_TEST_CASE(test_bug_6472) {
    return test_setup::do_with_tmp_directory([] (test_env& env, sstring tmpdir_path) {
        // Time-window strategy with one-hour windows and no gc grace period.
        std::map<sstring, sstring> twcs_opts = {
            { time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS" },
            { time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1" },
        };
        auto builder = schema_builder("tests", "test_bug_6472")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("cl", int32_type, column_kind::clustering_key)
                .with_column("value", int32_type);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
        builder.set_compaction_strategy_options(std::move(twcs_opts));
        builder.set_gc_grace_seconds(0);
        auto s = builder.build();

        // Sstable factory; generations are assigned sequentially, starting at 1.
        auto sst_gen = [&env, s, tmpdir_path, next_gen = make_lw_shared<unsigned>(1)] () mutable {
            return env.make_sstable(s, tmpdir_path, (*next_gen)++, sstables::get_highest_sstable_version(), big);
        };

        // Write timestamp lying `step` in the past, relative to the current gc clock.
        auto timestamp_before_now = [] (auto step) {
            using namespace std::chrono;
            const auto now = gc_clock::now().time_since_epoch();
            return (now - duration_cast<microseconds>(step)).count();
        };

        auto tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);

        // One clustered cell written `step` ago with a TTL of `step + 5s`;
        // every call uses a fresh clustering key of the same partition.
        auto make_expiring_cell = [&] (std::chrono::hours step) {
            static thread_local int32_t value = 1;

            mutation m(s, partition_key::from_exploded(*s, {to_bytes(tokens[0].first)}));

            const int32_t ck_value = value++;
            auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(ck_value)});
            m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), timestamp_before_now(step), gc_clock::duration(step + 5s));
            return m;
        };

        auto cm = make_lw_shared<compaction_manager>();
        replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
        cfg.datadir = tmpdir_path;
        cfg.enable_disk_writes = true;
        cfg.enable_commitlog = false;
        cfg.enable_cache = false;
        cfg.enable_incremental_backups = false;

        auto tracker = make_lw_shared<cache_tracker>();
        cell_locker_stats cl_stats;
        auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, cl_stats, *tracker);
        cf->mark_ready_for_writes();
        cf->start();

        // Make 100 expiring cells which belong to different time windows
        std::vector<mutation> muts;
        muts.reserve(101);
        for (int hours_ago = 1; hours_ago <= 100; ++hours_ago) {
            muts.push_back(make_expiring_cell(std::chrono::hours(hours_ago)));
        }
        muts.push_back(make_expiring_cell(std::chrono::hours(110)));

        //
        // Reproduce issue 6472 by making an input set which causes both interposer and GC writer to be enabled
        //
        std::vector<shared_sstable> sstables_spanning_many_windows = {
            make_sstable_containing(sst_gen, muts),
            make_sstable_containing(sst_gen, muts),
        };
        // Both inputs are tagged with the same run identifier.
        const utils::UUID run_id = utils::make_random_uuid();
        for (auto& input_sst : sstables_spanning_many_windows) {
            sstables::test(input_sst).set_run_identifier(run_id);
        }

        // Make sure everything we wanted expired is expired by now.
        forward_jump_clocks(std::chrono::hours(101));

        auto descriptor = sstables::compaction_descriptor(sstables_spanning_many_windows, default_priority_class());
        auto ret = compact_sstables(*cm, std::move(descriptor), *cf, sst_gen, replacer_fn_no_op()).get0();
        BOOST_REQUIRE(ret.new_sstables.size() == 1);

        return make_ready_future<>();
    });
}
// Exercises needs_cleanup(): an sstable needs cleanup unless its key span is
// covered by one of the local token ranges. Sstables and ranges are described
// by indexes into a list of 10 generated (key, token) pairs.
SEASTAR_TEST_CASE(sstable_needs_cleanup_test) {
    return test_env::do_with([] (test_env& env) {
        auto s = make_shared_schema({}, some_keyspace, some_column_family,
            {{"p1", utf8_type}}, {}, {}, {}, utf8_type);

        auto tokens = token_generation_for_current_shard(10);

        // Sstable (with a fresh generation) spanning the keys at the given token indexes.
        auto sstable_spanning = [&env, s, &tokens, next_gen = make_lw_shared<unsigned>(1)] (size_t first, size_t last) mutable {
            sstring first_key = tokens[first].first;
            sstring last_key = tokens[last].first;
            return sstable_for_overlapping_test(env, s, (*next_gen)++, first_key, last_key);
        };

        // Token range bounded by the tokens at the given indexes.
        auto token_range = [&] (size_t first, size_t last) -> dht::token_range {
            return dht::token_range::make(tokens[first].second, tokens[last].second);
        };

        {
            // A single local range covering everything: nothing to clean up.
            auto local_ranges = { token_range(0, 9) };
            auto sst = sstable_spanning(0, 9);
            BOOST_REQUIRE(!needs_cleanup(sst, local_ranges, s));
        }

        {
            auto local_ranges = { token_range(0, 1), token_range(3, 4), token_range(5, 6) };

            // Exactly matches the first local range.
            auto sst = sstable_spanning(0, 1);
            BOOST_REQUIRE(!needs_cleanup(sst, local_ranges, s));

            // Falls in the gap between the first and second local ranges.
            auto sst2 = sstable_spanning(2, 2);
            BOOST_REQUIRE(needs_cleanup(sst2, local_ranges, s));

            // Spans several local ranges, including the gap between them.
            auto sst3 = sstable_spanning(0, 6);
            BOOST_REQUIRE(needs_cleanup(sst3, local_ranges, s));

            // Lies past the last local range.
            auto sst5 = sstable_spanning(7, 7);
            BOOST_REQUIRE(needs_cleanup(sst5, local_ranges, s));
        }

        return make_ready_future<>();
    });
}
// Compacts four single-partition sstables whose rows are written one hour
// apart, so that the input covers many one-hour TWCS windows. The main purpose
// (see the comment next to the final check) is that the compaction completes
// without tripping an assert in sstables::prepare_summary().
SEASTAR_TEST_CASE(test_twcs_partition_estimate) {
    return test_setup::do_with_tmp_directory([] (test_env& env, sstring tmpdir_path) {
        // Schema: TWCS with one-hour windows and gc_grace_seconds set to 0.
        auto builder = schema_builder("tests", "test_bug_6472")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("cl", int32_type, column_kind::clustering_key)
                .with_column("value", int32_type);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
        std::map<sstring, sstring> twcs_options = {
            { time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS" },
            { time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1" },
        };
        builder.set_compaction_strategy_options(std::move(twcs_options));
        builder.set_gc_grace_seconds(0);
        auto s = builder.build();

        const auto rows_per_partition = 200;
        constexpr int input_sstable_count = 4;

        // The generation counter is shared between all copies of this factory.
        auto sst_gen = [&env, s, tmpdir_path, gen = make_lw_shared<unsigned>(1)] () mutable {
            return env.make_sstable(s, tmpdir_path, (*gen)++, sstables::get_highest_sstable_version(), big);
        };

        // Row `ck_idx` of sstable `sstable_idx` is stamped
        // (sstable_idx * rows_per_partition + ck_idx) hours before the current time.
        auto timestamp_for = [] (int sstable_idx, int ck_idx) {
            const auto hours_ago = std::chrono::hours(sstable_idx * rows_per_partition + ck_idx);
            const auto since_epoch = gc_clock::now().time_since_epoch();
            return (since_epoch - std::chrono::duration_cast<std::chrono::microseconds>(hours_ago)).count();
        };

        auto keys = token_generation_for_shard(input_sstable_count, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);

        // Writes one sstable holding a single partition with rows_per_partition rows.
        auto make_single_partition_sstable = [&] (int sstable_idx) {
            static thread_local int32_t value = 1;
            auto key_str = keys[sstable_idx].first;
            auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
            mutation m(s, key);
            for (auto ck = 0; ck < rows_per_partition; ++ck) {
                // The clustering key takes the counter's current value; the
                // cell stores the value after the increment.
                const int32_t ck_value = value++;
                auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(ck_value)});
                m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), timestamp_for(sstable_idx, ck));
            }
            return make_sstable_containing(sst_gen, {m});
        };

        // Table backed by the temporary directory, with commitlog, cache and
        // incremental backups turned off.
        auto cm = make_lw_shared<compaction_manager>();
        replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
        cfg.datadir = tmpdir_path;
        cfg.enable_disk_writes = true;
        cfg.enable_commitlog = false;
        cfg.enable_cache = false;
        cfg.enable_incremental_backups = false;
        auto tracker = make_lw_shared<cache_tracker>();
        cell_locker_stats cl_stats;
        auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, cl_stats, *tracker);
        cf->mark_ready_for_writes();
        cf->start();

        // Input sstables are created in index order.
        std::vector<shared_sstable> sstables_spanning_many_windows;
        for (int sstable_idx = 0; sstable_idx < input_sstable_count; ++sstable_idx) {
            sstables_spanning_many_windows.push_back(make_single_partition_sstable(sstable_idx));
        }

        auto ret = compact_sstables(*cm, sstables::compaction_descriptor(sstables_spanning_many_windows,
                default_priority_class()), *cf, sst_gen, replacer_fn_no_op()).get0();

        // The real test here is that we don't assert() in
        // sstables::prepare_summary() with the compact_sstables() call above,
        // this is only here as a sanity check.
        BOOST_REQUIRE_EQUAL(ret.new_sstables.size(), std::min(sstables_spanning_many_windows.size() * rows_per_partition,
                    sstables::time_window_compaction_strategy::max_data_segregation_window_count));

        return make_ready_future<>();
    });
}
// Builds max_compaction_threshold() sstables of data size 1, sstable i spanning
// generated keys [i, i + 2], and requires size-tiered reshape to select a
// non-empty job in both strict and relaxed mode.
SEASTAR_TEST_CASE(stcs_reshape_test) {
    return test_env::do_with_async([] (test_env& env) {
        simple_schema ss;
        auto s = ss.schema();
        const auto threshold = s->max_compaction_threshold();

        std::vector<shared_sstable> candidates;
        candidates.reserve(threshold);

        // Two extra keys so that the last sstable can still reach key idx + 2.
        auto keys = token_generation_for_current_shard(threshold + 2);

        for (auto idx = 0; idx < threshold; idx++) {
            // Generations start at 1.
            auto sst = env.make_sstable(s, "", idx + 1);
            sstables::test(sst).set_data_file_size(1);
            sstables::test(sst).set_values(keys[idx].first, keys[idx + 2].first, stats_metadata{});
            candidates.push_back(std::move(sst));
        }

        auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered,
                                                      s->compaction_strategy_options());

        for (auto mode : {reshape_mode::strict, reshape_mode::relaxed}) {
            BOOST_REQUIRE(cs.get_reshaping_job(candidates, s, default_priority_class(), mode).sstables.size());
        }
    });
}
// Checks the size of the strict-mode reshape job chosen by the leveled strategy
// for three level-0 inputs: 256 sstables with distinct single keys, 256 sstables
// sharing one key, and a single sstable.
SEASTAR_TEST_CASE(lcs_reshape_test) {
    return test_env::do_with_async([] (test_env& env) {
        simple_schema ss;
        auto s = ss.schema();
        constexpr unsigned sstable_count = 256;
        auto keys = token_generation_for_current_shard(sstable_count);
        auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::leveled,
                                                      s->compaction_strategy_options());

        // Creates a level-0 sstable of size 1 whose first and last key are both `key`.
        auto make_l0_sstable = [&] (unsigned generation, sstring key) {
            auto sst = env.make_sstable(s, "", generation);
            sstables::test(sst).set_values_for_leveled_strategy(1 /* size */, 0 /* level */, 0 /* max ts */, key, key);
            return sst;
        };

        // non overlapping: every sstable gets its own key, all of them are selected
        {
            std::vector<shared_sstable> disjoint;
            for (unsigned i = 0; i < sstable_count; i++) {
                disjoint.push_back(make_l0_sstable(i + 1, keys[i].first));
            }
            BOOST_REQUIRE(cs.get_reshaping_job(disjoint, s, default_priority_class(), reshape_mode::strict).sstables.size() == sstable_count);
        }

        // all overlapping: every sstable holds the same key, selection is capped
        // at max_compaction_threshold()
        {
            std::vector<shared_sstable> overlapping;
            for (unsigned i = 0; i < sstable_count; i++) {
                overlapping.push_back(make_l0_sstable(i + 1, keys[0].first));
            }
            BOOST_REQUIRE(cs.get_reshaping_job(overlapping, s, default_priority_class(), reshape_mode::strict).sstables.size() == uint64_t(s->max_compaction_threshold()));
        }

        // single sstable: nothing is selected
        {
            auto lone = make_l0_sstable(1, keys[0].first);
            BOOST_REQUIRE(cs.get_reshaping_job({ lone }, s, default_priority_class(), reshape_mode::strict).sstables.size() == 0);
        }
    });
}
// Flushes a memtable of a TWCS table (one-hour windows) and checks how many
// sstables the flush leaves in the table: one per hour offset when the rows are
// written at 10 different hour offsets, and a single sstable when all rows
// share one hour offset.
SEASTAR_TEST_CASE(test_twcs_interposer_on_memtable_flush) {
    return test_env::do_with_async([] (test_env& env) {
        // First the splitting scenario, then the non-splitting one.
        for (bool split_during_flush : {true, false}) {
            auto builder = schema_builder("tests", "test_twcs_interposer_on_flush")
                    .with_column("id", utf8_type, column_kind::partition_key)
                    .with_column("cl", int32_type, column_kind::clustering_key)
                    .with_column("value", int32_type);
            builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
            std::map<sstring, sstring> twcs_options = {
                { time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS" },
                { time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1" },
            };
            builder.set_compaction_strategy_options(std::move(twcs_options));
            auto s = builder.build();

            // Write timestamp lying `step` before the current time.
            auto timestamp_before_now = [] (auto step) {
                const auto since_epoch = gc_clock::now().time_since_epoch();
                return (since_epoch - std::chrono::duration_cast<std::chrono::microseconds>(step)).count();
            };

            auto keys = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);

            // One-row mutation on the single generated partition key; every
            // call uses a fresh clustering key.
            auto make_row = [&] (std::chrono::hours step) {
                static thread_local int32_t value = 1;
                auto key_str = keys[0].first;
                auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
                mutation m(s, key);
                const int32_t ck_value = value++;
                auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(ck_value)});
                m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), timestamp_before_now(step));
                return m;
            };

            // Table writing into its own temporary directory, cache disabled.
            auto tmp = tmpdir();
            auto cm = make_lw_shared<compaction_manager>();
            replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
            cfg.datadir = tmp.path().string();
            cfg.enable_disk_writes = true;
            cfg.enable_cache = false;
            auto tracker = make_lw_shared<cache_tracker>();
            cell_locker_stats cl_stats;
            auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, cl_stats, *tracker);
            cf->mark_ready_for_writes();
            cf->start();

            const size_t target_windows_span = split_during_flush ? 10 : 1;
            constexpr size_t rows_per_window = 10;

            // rows_per_window rows for each hour offset 1..target_windows_span.
            auto mt = make_lw_shared<replica::memtable>(s);
            for (unsigned hours_ago = 1; hours_ago <= target_windows_span; hours_ago++) {
                for (unsigned row = 0; row < rows_per_window; row++) {
                    mt->apply(make_row(std::chrono::hours(hours_ago)));
                }
            }

            auto flush_result = column_family_test(cf).try_flush_memtable_to_sstable(mt).get0();
            BOOST_REQUIRE(flush_result == stop_iteration::yes);

            const size_t expected_ssts = split_during_flush ? target_windows_span : 1;
            testlog.info("split_during_flush={}, actual={}, expected={}", split_during_flush, cf->get_sstables()->size(), expected_ssts);
            BOOST_REQUIRE(cf->get_sstables()->size() == expected_ssts);
        }
    });
}
// Compacts, in one job and with tombstone purging disabled, ten sstables that
// each hold one row of the same partition (written 2, 4, ..., 20 hours ago)
// together with an sstable holding a partition tombstone stamped at the
// current time. Expects a single output sstable containing only the tombstone.
SEASTAR_TEST_CASE(test_twcs_compaction_across_buckets) {
    return test_env::do_with_async([] (test_env& env) {
        auto builder = schema_builder("tests", "test_twcs_compaction_across_buckets")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("cl", int32_type, column_kind::clustering_key)
                .with_column("value", int32_type);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
        std::map<sstring, sstring> twcs_options = {
            { time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS" },
            { time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1" },
        };
        builder.set_compaction_strategy_options(std::move(twcs_options));
        auto s = builder.build();

        // Write timestamp lying `step` hours before the current time (default: now).
        auto timestamp_before_now = [] (std::chrono::hours step = std::chrono::hours(0)) {
            const auto since_epoch = gc_clock::now().time_since_epoch();
            return (since_epoch - std::chrono::duration_cast<std::chrono::microseconds>(step)).count();
        };

        auto tmp = tmpdir();
        // The generation counter is shared between all copies of this factory.
        auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () mutable {
            return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
        };

        auto keys = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);
        auto pkey = partition_key::from_exploded(*s, {to_bytes(keys[0].first)});

        // One-row mutation on pkey; every call uses a fresh clustering key.
        auto make_row = [&] (std::chrono::hours step) {
            static thread_local int32_t value = 1;
            mutation m(s, pkey);
            const int32_t ck_value = value++;
            auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(ck_value)});
            m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), timestamp_before_now(step));
            return m;
        };

        column_family_for_tests cf(env.manager(), s);
        auto close_cf = deferred_stop(cf);

        constexpr unsigned windows = 10;
        std::vector<shared_sstable> input_sstables;
        // One extra slot for the sstable carrying the partition tombstone.
        input_sstables.reserve(windows + 1);
        for (unsigned w = 1; w <= windows; w++) {
            auto row = make_row(std::chrono::hours(w * 2));
            input_sstables.push_back(make_sstable_containing(sst_gen, {row}));
        }

        // Partition tombstone stamped at the current time, i.e. later than every row above.
        mutation deletion_mut(s, pkey);
        {
            tombstone tomb(timestamp_before_now(), gc_clock::now());
            deletion_mut.partition().apply(tomb);
        }
        input_sstables.push_back(make_sstable_containing(sst_gen, {deletion_mut}));

        auto ret = compact_sstables(cf.get_compaction_manager(), sstables::compaction_descriptor(std::move(input_sstables),
                default_priority_class()), *cf, sst_gen, replacer_fn_no_op(), can_purge_tombstones::no).get0();

        BOOST_REQUIRE(ret.new_sstables.size() == 1);
        assert_that(sstable_reader(ret.new_sstables[0], s, env.make_reader_permit()))
                .produces(deletion_mut)
                .produces_end_of_stream();
    });
}
// For every writable sstable format version: creates a table in a fresh
// temporary directory, adds max_compaction_threshold() sstables (all holding
// the same single-row mutation) through the off-strategy path, and requires
// perform_offstrategy_compaction() to resolve to a true value.
SEASTAR_TEST_CASE(test_offstrategy_sstable_compaction) {
// The tmpdir objects are kept in a vector owned by the lambda's capture, so
// they are not destroyed at the end of each loop iteration.
return test_env::do_with_async([tmpdirs = std::vector<decltype(tmpdir())>()] (test_env& env) mutable {
for (const auto version : writable_sstable_versions) {
tmpdirs.push_back(tmpdir());
auto& tmp = tmpdirs.back();
simple_schema ss;
auto s = ss.schema();
// A single partition with one row; every sstable below contains this mutation.
auto pk = ss.make_pkey(make_local_key(s));
auto mut = mutation(s, pk);
ss.add_row(mut, ss.make_ckey(0), "val");
auto cm = make_lw_shared<compaction_manager>();
// Declared before stop_cf, so the compaction manager is stopped after
// the table when the deferred actions unwind.
auto stop_cm = defer([cm] {
cm->stop().get();
});
replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
cfg.datadir = tmp.path().string();
cfg.enable_disk_writes = true;
cfg.enable_cache = false;
auto tracker = make_lw_shared<cache_tracker>();
cell_locker_stats cl_stats;
auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, cl_stats, *tracker);
// Make sure we release reference to all sstables, allowing them to be deleted before dir is destroyed
auto stop_cf = defer([cf] {
cf->stop().get();
});
// Sstable factory: the generation is obtained from the table itself and
// the sstable is written in the format version under test.
auto sst_gen = [&env, s, cf, path = tmp.path().string(), version] () mutable {
return env.make_sstable(s, path, column_family_test::calculate_generation_for_new_table(*cf), version, big);
};
cf->mark_ready_for_writes();
cf->start();
// Add the sstables flagged as off-strategy.
for (auto i = 0; i < cf->schema()->max_compaction_threshold(); i++) {
auto sst = make_sstable_containing(sst_gen, {mut});
cf->add_sstable_and_update_cache(std::move(sst), sstables::offstrategy::yes).get();
}
BOOST_REQUIRE(cf->perform_offstrategy_compaction().get0());
}
});
}
// Exercises reshape-job selection of the time-window compaction strategy
// (8-hour windows, random min threshold in [4, 8]) in five scenarios:
//   1. 256 disjoint sstables in one window: all of them are selected;
//   2. 256 disjoint sstables written one hour apart: nearly all are selected;
//   3. 256 disjoint sstables, each in a window of its own: none is selected;
//   4. 256 overlapping sstables in one window: max_compaction_threshold() are selected;
//   5. 64 interleaved small and big sstables in one window: only small ones are
//      selected, in both strict and relaxed mode.
SEASTAR_TEST_CASE(twcs_reshape_with_disjoint_set_test) {
    static constexpr unsigned disjoint_sstable_count = 256;

    return test_env::do_with_async([] (test_env& env) {
        auto builder = schema_builder("tests", "twcs_reshape_test")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("cl", ::timestamp_type, column_kind::clustering_key)
                .with_column("value", int32_type);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
        std::map <sstring, sstring> opts = {
            {time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
            {time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "8"},
            {"min_sstable_size", "1"},
        };
        builder.set_compaction_strategy_options(std::move(opts));
        size_t min_threshold = tests::random::get_int(4, 8);
        builder.set_min_compaction_threshold(min_threshold);
        auto s = builder.build();
        auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::time_window, s->compaction_strategy_options());

        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_int_distribution<> distrib(1, 3600*24);

        using namespace std::chrono;

        // Make it easier to reproduce timing-based issues by running this test multiple times.
        auto offset_duration = duration_cast<microseconds>(minutes(distrib(gen)));
        auto now = gc_clock::now().time_since_epoch() + offset_duration;

        // The twcs is configured with 8-hours time window. If the starting time
        // is not aligned with that then some buckets may get less than this
        // number of sstables in and potentially hit the minimal threshold of
        // 4 sstables. Align the starting time not to make this happen.
        auto now_in_minutes = duration_cast<minutes>(now);
        constexpr auto window_size_in_minutes = 8 * 60;
        forward_jump_clocks(minutes(window_size_in_minutes - now_in_minutes.count() % window_size_in_minutes));
        now = gc_clock::now().time_since_epoch() + offset_duration;
        // Checked with BOOST_REQUIRE rather than assert(): the check stays active
        // in builds that define NDEBUG, and a misaligned start is reported as a
        // failure of this test case instead of aborting the whole test binary.
        BOOST_REQUIRE(std::chrono::duration_cast<minutes>(now).count() % window_size_in_minutes == 0);

        // Timestamp lying `step` after the aligned starting time.
        auto next_timestamp = [now](auto step) {
            return (now + duration_cast<seconds>(step)).count();
        };

        auto tokens = token_generation_for_shard(disjoint_sstable_count, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);

        // One-row mutation on the partition key at `token_idx`; the same
        // timestamp is used as clustering key and as write time.
        auto make_row = [&](unsigned token_idx, auto step) {
            static thread_local int32_t value = 1;
            auto key_str = tokens[token_idx].first;
            auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});

            mutation m(s, key);
            auto next_ts = next_timestamp(step);
            auto c_key = clustering_key::from_exploded(*s, {::timestamp_type->decompose(next_ts)});
            m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value++)), next_ts);
            return m;
        };

        auto tmp = tmpdir();

        // The generation counter is shared between all copies of this factory.
        auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)]() {
            return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::sstable::version_types::md, big);
        };

        {
            // create set of 256 disjoint ssts that belong to the same time window and expect that twcs reshape allows them all to be compacted at once

            std::vector<sstables::shared_sstable> sstables;
            sstables.reserve(disjoint_sstable_count);
            for (unsigned i = 0; i < disjoint_sstable_count; i++) {
                auto sst = make_sstable_containing(sst_gen, {make_row(i, std::chrono::hours(1))});
                sstables.push_back(std::move(sst));
            }

            BOOST_REQUIRE_EQUAL(cs.get_reshaping_job(sstables, s, default_priority_class(), reshape_mode::strict).sstables.size(), disjoint_sstable_count);
        }

        {
            // create set of 256 disjoint ssts that belong to different windows and expect that twcs reshape allows them all to be compacted at once

            std::vector<sstables::shared_sstable> sstables;
            sstables.reserve(disjoint_sstable_count);
            for (unsigned i = 0; i < disjoint_sstable_count; i++) {
                auto sst = make_sstable_containing(sst_gen, {make_row(i, std::chrono::hours(i))});
                sstables.push_back(std::move(sst));
            }

            // Up to min_threshold - 1 sstables may be left out of the job.
            auto reshaping_count = cs.get_reshaping_job(sstables, s, default_priority_class(), reshape_mode::strict).sstables.size();
            BOOST_REQUIRE_GE(reshaping_count, disjoint_sstable_count - min_threshold + 1);
            BOOST_REQUIRE_LE(reshaping_count, disjoint_sstable_count);
        }

        {
            // create set of 256 disjoint ssts that belong to different windows with none over the threshold and expect that twcs reshape selects none of them

            std::vector<sstables::shared_sstable> sstables;
            sstables.reserve(disjoint_sstable_count);
            // Sstables are produced in pairs, hence the step of two: the first
            // of each pair is placed 24*i hours after the start, the second
            // 24*(i+1) + 1 hours after it.
            for (unsigned i = 0; i < disjoint_sstable_count; i += 2) {
                auto sst = make_sstable_containing(sst_gen, {make_row(i, std::chrono::hours(24*i))});
                sstables.push_back(std::move(sst));
                const unsigned next = i + 1;
                sst = make_sstable_containing(sst_gen, {make_row(next, std::chrono::hours(24*next + 1))});
                sstables.push_back(std::move(sst));
            }

            BOOST_REQUIRE_EQUAL(cs.get_reshaping_job(sstables, s, default_priority_class(), reshape_mode::strict).sstables.size(), 0);
        }

        {
            // create set of 256 overlapping ssts that belong to the same time window and expect that twcs reshape allows only 32 to be compacted at once

            std::vector<sstables::shared_sstable> sstables;
            sstables.reserve(disjoint_sstable_count);
            for (unsigned i = 0; i < disjoint_sstable_count; i++) {
                auto sst = make_sstable_containing(sst_gen, {make_row(0, std::chrono::hours(1))});
                sstables.push_back(std::move(sst));
            }

            BOOST_REQUIRE_EQUAL(cs.get_reshaping_job(sstables, s, default_priority_class(), reshape_mode::strict).sstables.size(), uint64_t(s->max_compaction_threshold()));
        }

        {
            // create set of 64 files which size is either small or big. as STCS reshape logic reused by TWCS favor compaction of smaller files
            // first, verify that only 32 small (similar-sized) files are returned

            // A small file holds one row; a big file holds one row per generated key.
            std::vector<mutation> mutations_for_small_files;
            mutations_for_small_files.push_back(make_row(0, std::chrono::hours(1)));

            std::vector<mutation> mutations_for_big_files;
            for (unsigned i = 0; i < tokens.size(); i++) {
                mutations_for_big_files.push_back(make_row(i, std::chrono::hours(1)));
            }

            std::unordered_set<int64_t> generations_for_small_files;

            std::vector<sstables::shared_sstable> sstables;
            sstables.reserve(64);
            for (unsigned i = 0; i < 64; i++) {
                sstables::shared_sstable sst;

                //
                // intermix big and small files, to make sure STCS logic is really applied to favor similar-sized reshape jobs.
                //

                if (i % 2 == 0) {
                    sst = make_sstable_containing(sst_gen, mutations_for_small_files);
                    generations_for_small_files.insert(sst->generation());
                } else {
                    sst = make_sstable_containing(sst_gen, mutations_for_big_files);
                }

                sstables.push_back(std::move(sst));
            }

            auto check_mode_correctness = [&] (reshape_mode mode) {
                auto ret = cs.get_reshaping_job(sstables, s, default_priority_class(), mode);
                BOOST_REQUIRE_EQUAL(ret.sstables.size(), uint64_t(s->max_compaction_threshold()));
                // fail if any file doesn't belong to set of small files
                bool has_big_sized_files = boost::algorithm::any_of(ret.sstables, [&] (const sstables::shared_sstable& sst) {
                    return !generations_for_small_files.contains(sst->generation());
                });
                BOOST_REQUIRE(!has_big_sized_files);
            };

            check_mode_correctness(reshape_mode::strict);
            check_mode_correctness(reshape_mode::relaxed);
        }
    });
}
// Checks the strict-mode reshape job chosen by the size-tiered strategy for
// 256 real single-row sstables: all of them when every sstable has its own
// partition key, max_compaction_threshold() of them when they all share one key.
SEASTAR_TEST_CASE(stcs_reshape_overlapping_test) {
    static constexpr unsigned disjoint_sstable_count = 256;

    return test_env::do_with_async([] (test_env& env) {
        auto builder = schema_builder("tests", "stcs_reshape_test")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("cl", ::timestamp_type, column_kind::clustering_key)
                .with_column("value", int32_type);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
        auto s = builder.build();

        // The strategy under test is built with an empty option map.
        std::map<sstring, sstring> strategy_options;
        auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered, std::move(strategy_options));

        auto keys = token_generation_for_shard(disjoint_sstable_count, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);

        // One-row mutation on the partition key at `token_idx`; clustering key,
        // cell value and write timestamp are all fixed to 1.
        auto make_row = [&](unsigned token_idx) {
            auto cell_value = 1;
            auto ts = 1;
            auto key_str = keys[token_idx].first;
            auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
            auto c_key = clustering_key::from_exploded(*s, {::timestamp_type->decompose(ts)});

            mutation m(s, key);
            m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(cell_value)), ts);
            return m;
        };

        auto tmp = tmpdir();

        // The generation counter is shared between all copies of this factory.
        auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)]() {
            return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::sstable::version_types::md, big);
        };

        // Writes disjoint_sstable_count sstables; sstable i holds the key at
        // index token_idx_for(i).
        auto write_sstables = [&] (auto token_idx_for) {
            std::vector<sstables::shared_sstable> written;
            written.reserve(disjoint_sstable_count);
            for (unsigned i = 0; i < disjoint_sstable_count; i++) {
                written.push_back(make_sstable_containing(sst_gen, {make_row(token_idx_for(i))}));
            }
            return written;
        };

        {
            // create set of 256 disjoint ssts and expect that stcs reshape allows them all to be compacted at once
            auto disjoint = write_sstables([] (unsigned i) { return i; });

            BOOST_REQUIRE(cs.get_reshaping_job(disjoint, s, default_priority_class(), reshape_mode::strict).sstables.size() == disjoint_sstable_count);
        }

        {
            // create set of 256 overlapping ssts and expect that stcs reshape allows only 32 to be compacted at once
            auto overlapping = write_sstables([] (unsigned) { return 0u; });

            BOOST_REQUIRE(cs.get_reshaping_job(overlapping, s, default_priority_class(), reshape_mode::strict).sstables.size() == uint64_t(s->max_compaction_threshold()));
        }
    });
}
// Regression test for #8432
//
// Builds two sstables holding different rows (ck 0 and ck 1) of the same
// partition, puts them in a TWCS sstable set, reads that partition through a
// single-key reader restricted to the clustering range [0, 1], and checks the
// table's clustering-filter counters afterwards.
SEASTAR_TEST_CASE(test_twcs_single_key_reader_filtering) {
return test_env::do_with_async([] (test_env& env) {
auto builder = schema_builder("tests", "twcs_single_key_reader_filtering")
.with_column("pk", int32_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("v", int32_type);
builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
auto s = builder.build();
auto tmp = tmpdir();
// Sstable factory writing 'md' format files into tmp; generations start at 1.
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)]() {
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::sstable::version_types::md, big);
};
// One-row mutation: partition `pk`, clustering key `ck`, cell v = 0,
// written with a freshly generated timestamp.
auto make_row = [&] (int32_t pk, int32_t ck) {
mutation m(s, partition_key::from_single_value(*s, int32_type->decompose(pk)));
m.set_clustered_cell(clustering_key::from_single_value(*s, int32_type->decompose(ck)), to_bytes("v"), int32_t(0), api::new_timestamp());
return m;
};
// Same partition (pk 0) in both sstables, different clustering keys.
auto sst1 = make_sstable_containing(sst_gen, {make_row(0, 0)});
auto sst2 = make_sstable_containing(sst_gen, {make_row(0, 1)});
// Taken before sst1 is moved into the set below.
auto dkey = sst1->get_first_decorated_key();
auto cm = make_lw_shared<compaction_manager>();
replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
// Local stats object handed to the table through its config, so that the
// counters can be inspected at the end of the test.
replica::cf_stats cf_stats{0};
cfg.cf_stats = &cf_stats;
cfg.datadir = tmp.path().string();
auto tracker = make_lw_shared<cache_tracker>();
cell_locker_stats cl_stats;
replica::column_family cf(s, cfg, replica::column_family::no_commitlog(), *cm, cl_stats, *tracker);
cf.mark_ready_for_writes();
cf.start();
// The sstable set is created directly from a TWCS strategy instance with
// default options; the sstables are inserted into this set, not into cf.
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::time_window, {});
auto set = cs.make_sstable_set(s);
set.insert(std::move(sst1));
set.insert(std::move(sst2));
reader_permit permit = env.make_reader_permit();
utils::estimated_histogram eh;
// Singular partition range on the key stored in the sstables.
auto pr = dht::partition_range::make_singular(dkey);
// Clustering range bounded by ck 0 and ck 1.
auto slice = partition_slice_builder(*s)
.with_range(query::clustering_range {
query::clustering_range::bound { clustering_key_prefix::from_single_value(*s, int32_type->decompose(0)) },
query::clustering_range::bound { clustering_key_prefix::from_single_value(*s, int32_type->decompose(1)) },
}).build();
auto reader = set.create_single_key_sstable_reader(
&cf, s, permit, eh, pr, slice, default_priority_class(),
tracing::trace_state_ptr(), ::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no);
auto close_reader = deferred_close(reader);
// Snapshot both counters after creating the reader and before consuming
// it, so that the checks below only see what changed during consumption.
auto checked_by_ck = cf_stats.sstables_checked_by_clustering_filter;
auto surviving_after_ck = cf_stats.surviving_sstables_after_clustering_filter;
// consume all fragments
while (reader().get());
// At least sst2 should be checked by the CK filter during fragment consumption and should pass.
// With the bug in #8432, sst2 wouldn't even be checked by the CK filter since it would pass right after checking the PK filter.
BOOST_REQUIRE_GE(cf_stats.sstables_checked_by_clustering_filter - checked_by_ck, 1);
// Every sstable checked by the clustering filter during consumption must
// also have survived it.
BOOST_REQUIRE_EQUAL(
cf_stats.surviving_sstables_after_clustering_filter - surviving_after_ck,
cf_stats.sstables_checked_by_clustering_filter - checked_by_ck);
});
}
// Creates 100 TWCS tables sharing one compaction manager, each holding a single
// sstable with one cell, jumps the clocks forward by 100 hours, triggers
// compaction on every table, and polls the manager's stats until all tables
// are empty. Requires that no compaction error was recorded and that the
// highest number of active tasks observed while polling was exactly 1.
SEASTAR_TEST_CASE(max_ongoing_compaction_test) {
return test_env::do_with_async([] (test_env& env) {
// The test is only meaningful when run on a single shard.
BOOST_REQUIRE(smp::count == 1);
// Schema factory: table name is the index; TWCS with one-hour windows,
// expired-sstable check frequency of 0 seconds and gc_grace_seconds of 0.
auto make_schema = [] (auto idx) {
auto builder = schema_builder("tests", std::to_string(idx))
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("cl", int32_type, column_kind::clustering_key)
.with_column("value", int32_type);
builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
std::map <sstring, sstring> opts = {
{time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
{time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1"},
{time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0"},
};
builder.set_compaction_strategy_options(std::move(opts));
builder.set_gc_grace_seconds(0);
return builder.build();
};
auto cm = make_lw_shared<compaction_manager>();
cm->enable();
// Declared before stop_tables, so the manager is stopped after the tables
// when the deferred actions unwind.
auto stop_cm = defer([&cm] {
cm->stop().get();
});
auto tmp = tmpdir();
auto cl_stats = make_lw_shared<cell_locker_stats>();
auto tracker = make_lw_shared<cache_tracker>();
auto tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);
// Write timestamp lying `step` before the current time.
auto next_timestamp = [] (auto step) {
using namespace std::chrono;
return (gc_clock::now().time_since_epoch() - duration_cast<microseconds>(step)).count();
};
// One-cell mutation written `step` in the past; the trailing duration
// argument (step + 5s) is presumably the cell's TTL -- confirm against
// mutation::set_clustered_cell().
auto make_expiring_cell = [&] (schema_ptr s, std::chrono::hours step) {
static thread_local int32_t value = 1;
auto key_str = tokens[0].first;
auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
mutation m(s, key);
auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(value++)});
m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), next_timestamp(step), gc_clock::duration(step + 5s));
return m;
};
// Creates table number `idx` in its own sub-directory of tmp, attached to
// the shared compaction manager, and adds one sstable holding a single
// cell written one hour in the past.
auto make_table_with_single_fully_expired_sstable = [&] (auto idx) {
auto s = make_schema(idx);
replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
cfg.datadir = tmp.path().string() + "/" + std::to_string(idx);
touch_directory(cfg.datadir).get();
cfg.enable_commitlog = false;
cfg.enable_incremental_backups = false;
auto sst_gen = [&env, s, dir = cfg.datadir, gen = make_lw_shared<unsigned>(1)] () mutable {
return env.make_sstable(s, dir, (*gen)++, sstables::sstable::version_types::md, big);
};
auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, *cl_stats, *tracker);
cf->start();
cf->mark_ready_for_writes();
auto muts = { make_expiring_cell(s, std::chrono::hours(1)) };
auto sst = make_sstable_containing(sst_gen, muts);
column_family_test(cf).add_sstable(sst);
return cf;
};
std::vector<lw_shared_ptr<replica::column_family>> tables;
// Stops every table created so far when the test body exits.
auto stop_tables = defer([&tables] {
for (auto& t : tables) {
t->stop().get();
}
});
for (auto i = 0; i < 100; i++) {
tables.push_back(make_table_with_single_fully_expired_sstable(i));
}
// Make sure everything is expired
forward_jump_clocks(std::chrono::hours(100));
for (auto& t : tables) {
BOOST_REQUIRE(t->sstables_count() == 1);
t->trigger_compaction();
}
// After triggering, the manager must report at least one pending or active task.
BOOST_REQUIRE(cm->get_stats().pending_tasks >= 1 || cm->get_stats().active_tasks >= 1);
size_t max_ongoing_compaction = 0;
// wait for submitted jobs to finish.
// Done once the manager reports no pending and no active tasks and every
// table has dropped its sstable.
auto end = [cm, &tables] {
return cm->get_stats().pending_tasks == 0 && cm->get_stats().active_tasks == 0
&& boost::algorithm::all_of(tables, [] (auto& t) { return t->sstables_count() == 0; });
};
while (!end()) {
// The manager is idle but some table still holds an sstable:
// trigger compaction again on those tables.
if (!cm->get_stats().pending_tasks && !cm->get_stats().active_tasks) {
for (auto& t : tables) {
if (t->sstables_count()) {
t->trigger_compaction();
}
}
}
// Track the highest number of active tasks seen at any polling point.
max_ongoing_compaction = std::max(cm->get_stats().active_tasks, max_ongoing_compaction);
// Yield so that other tasks can run between polls.
yield().get();
}
BOOST_REQUIRE(cm->get_stats().errors == 0);
BOOST_REQUIRE(max_ongoing_compaction == 1);
});
}
SEASTAR_TEST_CASE(compound_sstable_set_incremental_selector_test) {
return test_env::do_with([] (test_env& env) {
auto s = make_shared_schema({}, some_keyspace, some_column_family,
{{"p1", utf8_type}}, {}, {}, {}, utf8_type);
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::leveled, s->compaction_strategy_options());
auto key_and_token_pair = token_generation_for_current_shard(8);
auto decorated_keys = boost::copy_range<std::vector<dht::decorated_key>>(
key_and_token_pair | boost::adaptors::transformed([&s] (const std::pair<sstring, dht::token>& key_and_token) {
auto value = bytes(reinterpret_cast<const signed char*>(key_and_token.first.data()), key_and_token.first.size());
auto pk = sstables::key::from_bytes(value).to_partition_key(*s);
return dht::decorate_key(*s, std::move(pk));
}));
auto check = [] (sstable_set::incremental_selector& selector, const dht::decorated_key& key, std::unordered_set<int64_t> expected_gens) {
auto sstables = selector.select(key).sstables;
BOOST_REQUIRE_EQUAL(sstables.size(), expected_gens.size());
for (auto& sst : sstables) {
BOOST_REQUIRE(expected_gens.contains(sst->generation()));
}
};
{
auto set1 = make_lw_shared<sstable_set>(cs.make_sstable_set(s));
auto set2 = make_lw_shared<sstable_set>(cs.make_sstable_set(s));
set1->insert(sstable_for_overlapping_test(env, s, 1, key_and_token_pair[0].first, key_and_token_pair[1].first, 1));
set2->insert(sstable_for_overlapping_test(env, s, 2, key_and_token_pair[0].first, key_and_token_pair[1].first, 1));
set1->insert(sstable_for_overlapping_test(env, s, 3, key_and_token_pair[3].first, key_and_token_pair[4].first, 1));
set2->insert(sstable_for_overlapping_test(env, s, 4, key_and_token_pair[4].first, key_and_token_pair[4].first, 1));
set1->insert(sstable_for_overlapping_test(env, s, 5, key_and_token_pair[4].first, key_and_token_pair[5].first, 1));
sstable_set compound = sstables::make_compound_sstable_set(s, { set1, set2 });
sstable_set::incremental_selector sel = compound.make_incremental_selector();
check(sel, decorated_keys[0], {1, 2});
check(sel, decorated_keys[1], {1, 2});
check(sel, decorated_keys[2], {});
check(sel, decorated_keys[3], {3});
check(sel, decorated_keys[4], {3, 4, 5});
check(sel, decorated_keys[5], {5});
check(sel, decorated_keys[6], {});
check(sel, decorated_keys[7], {});
}
{
auto set1 = make_lw_shared<sstable_set>(cs.make_sstable_set(s));
auto set2 = make_lw_shared<sstable_set>(cs.make_sstable_set(s));
set1->insert(sstable_for_overlapping_test(env, s, 0, key_and_token_pair[0].first, key_and_token_pair[1].first, 0));
set2->insert(sstable_for_overlapping_test(env, s, 1, key_and_token_pair[0].first, key_and_token_pair[1].first, 1));
set1->insert(sstable_for_overlapping_test(env, s, 2, key_and_token_pair[0].first, key_and_token_pair[1].first, 1));
set2->insert(sstable_for_overlapping_test(env, s, 3, key_and_token_pair[3].first, key_and_token_pair[4].first, 1));
set1->insert(sstable_for_overlapping_test(env, s, 4, key_and_token_pair[4].first, key_and_token_pair[4].first, 1));
set2->insert(sstable_for_overlapping_test(env, s, 5, key_and_token_pair[4].first, key_and_token_pair[5].first, 1));
sstable_set compound = sstables::make_compound_sstable_set(s, { set1, set2 });
sstable_set::incremental_selector sel = compound.make_incremental_selector();
check(sel, decorated_keys[0], {0, 1, 2});
check(sel, decorated_keys[1], {0, 1, 2});
check(sel, decorated_keys[2], {0});
check(sel, decorated_keys[3], {0, 3});
check(sel, decorated_keys[4], {0, 3, 4, 5});
check(sel, decorated_keys[5], {0, 5});
check(sel, decorated_keys[6], {0});
check(sel, decorated_keys[7], {0});
}
{
// reproduces use-after-free failure in incremental reader selector with compound set where the next position
// returned by a set can be used after freed as selector position in another set, producing incorrect results.
enum class strategy_param : bool {
ICS = false,
LCS = true,
};
auto incremental_selection_test = [&] (strategy_param param) {
auto set1 = make_lw_shared<sstable_set>(sstables::make_partitioned_sstable_set(s, make_lw_shared<sstable_list>(), false));
auto set2 = make_lw_shared<sstable_set>(sstables::make_partitioned_sstable_set(s, make_lw_shared<sstable_list>(), bool(param)));
set1->insert(sstable_for_overlapping_test(env, s, 0, key_and_token_pair[1].first, key_and_token_pair[1].first, 1));
set2->insert(sstable_for_overlapping_test(env, s, 1, key_and_token_pair[0].first, key_and_token_pair[2].first, 1));
set2->insert(sstable_for_overlapping_test(env, s, 2, key_and_token_pair[3].first, key_and_token_pair[3].first, 1));
set2->insert(sstable_for_overlapping_test(env, s, 3, key_and_token_pair[4].first, key_and_token_pair[4].first, 1));
sstable_set compound = sstables::make_compound_sstable_set(s, { set1, set2 });
sstable_set::incremental_selector sel = compound.make_incremental_selector();
dht::ring_position_view pos = dht::ring_position_view::min();
std::unordered_set<sstables::shared_sstable> sstables;
do {
auto ret = sel.select(pos);
pos = ret.next_position;
sstables.insert(ret.sstables.begin(), ret.sstables.end());
} while (!pos.is_max());
BOOST_REQUIRE(sstables.size() == 4);
};
incremental_selection_test(strategy_param::ICS);
incremental_selection_test(strategy_param::LCS);
}
return make_ready_future<>();
});
}
// Reads one partition through a compound sstable set made of two sets produced by a
// time-window compaction strategy. Two sstables holding the same partition key, with row
// timestamps 1h and 5h ahead of now, are put in different sets; the single-key reader over the
// compound set must return exactly one mutation and leave cf_stats.clustering_filter_count
// above zero.
SEASTAR_TEST_CASE(twcs_single_key_reader_through_compound_set_test) {
    return test_env::do_with_async([] (test_env& env) {
        auto builder = schema_builder("tests", "single_key_reader_through_compound_set_test")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("cl", ::timestamp_type, column_kind::clustering_key)
                .with_column("value", int32_type);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
        std::map<sstring, sstring> opts = {
            {time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
            {time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1"},
        };
        // Give the builder its own copy: `opts` is consumed again by make_compaction_strategy()
        // below, and reading a moved-from map there would silently drop the window settings.
        builder.set_compaction_strategy_options(std::map<sstring, sstring>(opts));
        auto s = builder.build();
        // Last use of `opts`, so it is safe to move from it here.
        auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::time_window, std::move(opts));

        // Returns "now + step" as a microsecond count.
        auto next_timestamp = [](auto step) {
            using namespace std::chrono;
            return (gc_clock::now().time_since_epoch() + duration_cast<microseconds>(step)).count();
        };
        auto tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);

        // Builds a one-row mutation for the single generated key; the clustering key and the write
        // timestamp are both derived from next_timestamp(step), and the cell value is a running counter.
        auto make_row = [&](std::chrono::hours step) {
            static thread_local int32_t value = 1;
            auto key_str = tokens[0].first;
            auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
            mutation m(s, key);
            auto next_ts = next_timestamp(step);
            auto c_key = clustering_key::from_exploded(*s, {::timestamp_type->decompose(next_ts)});
            m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value++)), next_ts);
            return m;
        };

        auto tmp = tmpdir();
        auto cm = make_lw_shared<compaction_manager>();
        replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
        // Local stats object so the clustering filter counter can be inspected at the end.
        replica::cf_stats cf_stats{0};
        cfg.cf_stats = &cf_stats;
        cfg.datadir = tmp.path().string();
        cfg.enable_disk_writes = true;
        cfg.enable_cache = false;
        auto tracker = make_lw_shared<cache_tracker>();
        cell_locker_stats cl_stats;
        auto cf = make_lw_shared<replica::column_family>(s, cfg, replica::column_family::no_commitlog(), *cm, cl_stats, *tracker);
        cf->mark_ready_for_writes();
        cf->start();

        auto set1 = make_lw_shared<sstable_set>(cs.make_sstable_set(s));
        auto set2 = make_lw_shared<sstable_set>(cs.make_sstable_set(s));

        // Hands out sstables with increasing generation numbers, starting at 1.
        auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)]() {
            return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::sstable::version_types::md, big);
        };

        // sstables with same key but belonging to different windows
        auto sst1 = make_sstable_containing(sst_gen, {make_row(std::chrono::hours(1))});
        auto sst2 = make_sstable_containing(sst_gen, {make_row(std::chrono::hours(5))});
        BOOST_REQUIRE(sst1->get_first_decorated_key().token() == sst2->get_last_decorated_key().token());
        // Grab the key before the sstable pointers are moved into the sets.
        auto dkey = sst1->get_first_decorated_key();

        set1->insert(std::move(sst1));
        set2->insert(std::move(sst2));
        sstable_set compound = sstables::make_compound_sstable_set(s, {set1, set2});

        reader_permit permit = env.make_reader_permit();
        utils::estimated_histogram eh;
        auto pr = dht::partition_range::make_singular(dkey);

        auto reader = compound.create_single_key_sstable_reader(&*cf, s, permit, eh, pr, s->full_slice(), default_priority_class(),
                                                                tracing::trace_state_ptr(), ::streamed_mutation::forwarding::no,
                                                                ::mutation_reader::forwarding::no);
        auto close_reader = deferred_close(reader);

        // Exactly one mutation is expected for the key, followed by end of stream.
        auto mfopt = read_mutation_from_flat_mutation_reader(reader).get0();
        BOOST_REQUIRE(mfopt);
        mfopt = read_mutation_from_flat_mutation_reader(reader).get0();
        BOOST_REQUIRE(!mfopt);
        BOOST_REQUIRE(cf_stats.clustering_filter_count > 0);
    });
}
// Scenario: one sstable holding a single row is added to the table, then a partition
// tombstone for the same key is applied via cf->apply() (never written to an sstable by
// the test). After compact_all_sstables() the table must hold exactly one sstable with a
// different generation, and reading it must produce only the deletion mutation.
SEASTAR_TEST_CASE(test_major_does_not_miss_data_in_memtable) {
    return test_env::do_with_async([] (test_env& env) {
        auto s = schema_builder("tests", "test_major_does_not_miss_data_in_memtable")
                .with_column("id", utf8_type, column_kind::partition_key)
                .with_column("cl", int32_type, column_kind::clustering_key)
                .with_column("value", int32_type)
                .build();

        auto dir = tmpdir();
        auto local_tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);
        auto pkey = partition_key::from_exploded(*s, {to_bytes(local_tokens[0].first)});

        column_family_for_tests cf(env.manager(), s, dir.path().string());
        auto close_cf = deferred_stop(cf);

        // Factory for fresh sstables, using the generation the table computes for a new one.
        auto sst_gen = [&env, &cf, s, &dir] () mutable {
            auto generation = column_family_test::calculate_generation_for_new_table(*cf);
            return env.make_sstable(s, dir.path().string(), generation, sstables::get_highest_sstable_version(), big);
        };

        // Live row: the clustering key takes the counter's current value, the cell takes the
        // value after the increment.
        auto row_mut = [&] () {
            static thread_local int32_t value = 1;
            mutation row(s, pkey);
            auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(value++)});
            row.set_clustered_cell(ckey, bytes("value"), data_value(int32_t(value)), gc_clock::now().time_since_epoch().count());
            return row;
        }();

        auto sst = make_sstable_containing(sst_gen, {std::move(row_mut)});
        cf->add_sstable_and_update_cache(sst).get();
        BOOST_REQUIRE(cf->get_sstables()->size() == 1);

        // Partition tombstone for the same key, applied straight to the table.
        auto deletion_mut = [&] () {
            mutation del(s, pkey);
            tombstone partition_tomb(gc_clock::now().time_since_epoch().count(), gc_clock::now());
            del.partition().apply(partition_tomb);
            return del;
        }();
        cf->apply(deletion_mut);

        cf->compact_all_sstables().get();

        // A single, newly generated sstable must remain ...
        BOOST_REQUIRE(cf->get_sstables()->size() == 1);
        auto compacted = *cf->get_sstables()->begin();
        BOOST_REQUIRE(compacted->generation() != sst->generation());

        // ... and it must contain nothing but the deletion.
        assert_that(sstable_reader(compacted, s, env.make_reader_permit()))
                .produces(deletion_mut)
                .produces_end_of_stream();
    });
}
// For each compaction strategy under test, fills a simulated shard's disk budget with tables
// whose sstables follow a 4x-growing tier layout, records compaction_manager::backlog()
// normalized by available memory for several table counts, and requires that normalized
// backlog to be exactly zero every time.
SEASTAR_TEST_CASE(simple_backlog_controller_test) {
    auto run_controller_test = [] (sstables::compaction_strategy_type compaction_strategy_type, test_env& env) {
        /////////////
        // settings

        static constexpr float disk_memory_ratio = 78.125; /* AWS I3en is ~78.125 */
        static constexpr uint64_t available_memory_per_shard = 8'000'000'000; /* AWS I3en */
        static constexpr float target_disk_usage = 0.50;

        const uint64_t available_disk_size_per_shard = disk_memory_ratio * available_memory_per_shard;
        const uint64_t available_memory = available_memory_per_shard * 0.92; /* 8% is reserved for the OS */
        const uint64_t estimated_flush_size = double(available_memory) * 0.05; /* flush threshold is 5% of available shard mem */
        const uint64_t all_tables_disk_usage = double(available_disk_size_per_shard) * target_disk_usage;

        auto as = abort_source();
        compaction_manager::compaction_scheduling_group csg = { default_scheduling_group(), default_priority_class() };
        compaction_manager::maintenance_scheduling_group msg = { default_scheduling_group(), default_priority_class() };
        auto manager = compaction_manager(csg, msg, available_memory, as);

        // Registers an sstable of the requested data size with table `t`. The size, level 0 and
        // the first/last key are set through the sstables::test helper rather than by writing data.
        auto add_sstable = [&env, &manager, gen = make_lw_shared<unsigned>(1)] (replica::table& t, uint64_t data_size) {
            auto sst = env.make_sstable(t.schema(), "", (*gen)++, la, big);
            auto key = make_local_key(t.schema());
            sstables::test(sst).set_values_for_leveled_strategy(data_size, 0 /*level*/, 0 /*max ts*/, key, key);
            assert(sst->data_size() == data_size);
            auto backlog_before = t.get_compaction_strategy().get_backlog_tracker().backlog();
            t.add_sstable_and_update_cache(sst).get();
            testlog.debug("\tNew sstable of size={}; Backlog diff={};",
                          sstables::pretty_printed_data_size(data_size),
                          t.get_compaction_strategy().get_backlog_tracker().backlog() - backlog_before);
        };

        auto tracker = make_lw_shared<cache_tracker>();
        cell_locker_stats cl_stats;

        // Creates and starts a table bound to `manager`, using the strategy under test.
        auto create_table = [&] () {
            simple_schema ss;
            auto s = ss.schema();

            replica::column_family::config cfg = column_family_test_config(env.manager(), env.semaphore());
            cfg.datadir = "";
            cfg.enable_disk_writes = true;
            cfg.enable_cache = false;
            auto t = make_lw_shared<replica::table>(s, cfg, replica::table::no_commitlog(), manager, cl_stats, *tracker);
            t->mark_ready_for_writes();
            t->start();
            t->set_compaction_strategy(compaction_strategy_type);
            return t;
        };

        // Tier N holds one sstable of size 4^N * estimated_flush_size.
        auto get_size_for_tier = [&] (int tier) -> uint64_t {
            return std::pow(4, tier) * estimated_flush_size;
        };
        // ceil(log4(target_size / estimated_flush_size)).
        auto get_total_tiers = [&] (uint64_t target_size) -> unsigned {
            double inv_log_4 = 1.0f / std::log(4);
            return std::ceil(std::log(double(target_size) / estimated_flush_size) * inv_log_4);
        };
        auto normalize_backlog = [&] (double backlog) -> double {
            return backlog / available_memory;
        };

        struct result {
            unsigned table_count;
            uint64_t per_table_max_disk_usage;
            double normalized_backlog;
        };
        std::vector<result> results;

        std::vector<unsigned> target_table_count_s = { 1, 2, 5, 10, 20 };
        for (auto target_table_count : target_table_count_s) {
            // Integer ceiling division. Wrapping std::ceil() around `a / b` on integers would be a
            // no-op because the division has already truncated.
            const uint64_t per_table_max_disk_usage = (all_tables_disk_usage + target_table_count - 1) / target_table_count;

            testlog.info("Creating tables, with max size={}", sstables::pretty_printed_data_size(per_table_max_disk_usage));

            std::vector<lw_shared_ptr<replica::table>> tables;
            uint64_t tables_total_size = 0;

            // Keep creating tables until less than one flush worth of space is left.
            for (uint64_t t_idx = 0, available_space = all_tables_disk_usage; available_space >= estimated_flush_size; t_idx++) {
                // Named differently from the `target_disk_usage` setting above to avoid shadowing it.
                auto table_target_usage = std::min(available_space, per_table_max_disk_usage);
                auto tiers = get_total_tiers(table_target_usage);

                auto t = create_table();
                for (unsigned tier_idx = 0; tier_idx < tiers; tier_idx++) {
                    auto tier_size = get_size_for_tier(tier_idx);
                    if (tier_size > available_space) {
                        break;
                    }
                    add_sstable(*t, tier_size);
                    available_space -= std::min(available_space, uint64_t(tier_size));
                }

                auto table_size = t->get_stats().live_disk_space_used;
                testlog.debug("T{}: {} tiers, with total size={}", t_idx, tiers, sstables::pretty_printed_data_size(table_size));
                tables.push_back(t);
                tables_total_size += table_size;
            }
            testlog.debug("Created {} tables, with total size={}", tables.size(), sstables::pretty_printed_data_size(tables_total_size));
            // Sample the backlog while the tables are still running, then stop them.
            results.push_back(result{ tables.size(), per_table_max_disk_usage, normalize_backlog(manager.backlog()) });
            for (auto& t : tables) {
                t->stop().get();
            }
        }
        for (auto& r : results) {
            testlog.info("Tables={} with max size={} -> NormalizedBacklog={}", r.table_count, sstables::pretty_printed_data_size(r.per_table_max_disk_usage), r.normalized_backlog);
            // Expect 0 backlog as tiers are all perfectly compacted
            BOOST_REQUIRE(r.normalized_backlog == 0.0f);
        }
    };

    return test_env::do_with_async([run_controller_test] (test_env& env) {
        run_controller_test(sstables::compaction_strategy_type::size_tiered, env);
        run_controller_test(sstables::compaction_strategy_type::time_window, env);
        run_controller_test(sstables::compaction_strategy_type::leveled, env);
    });
}
// Exercises compaction_strategy::get_cleanup_compaction_jobs() for several strategies. For each
// one it checks the number of returned jobs, the number of sstables per job, and that every
// sstable listed in a job is a candidate that no earlier job already claimed.
SEASTAR_TEST_CASE(test_compaction_strategy_cleanup_method) {
    return test_env::do_with_async([] (test_env& env) {
        constexpr size_t all_files = 64;

        // Writes `all_files` single-row sstables (one distinct partition key each, timestamps spaced
        // by `step_base`, all tagged with `sstable_level`) and asks the table's strategy for cleanup
        // jobs over them. Returns {candidate sstables, cleanup jobs}.
        auto get_cleanup_jobs = [&env, &all_files] (sstables::compaction_strategy_type compaction_strategy_type,
                                                    std::map<sstring, sstring> strategy_options = {},
                                                    const api::timestamp_clock::duration step_base = 0ms,
                                                    unsigned sstable_level = 0) {
            auto builder = schema_builder("tests", "test_compaction_strategy_cleanup_method")
                    .with_column("id", utf8_type, column_kind::partition_key)
                    .with_column("cl", int32_type, column_kind::clustering_key)
                    .with_column("value", int32_type);
            builder.set_compaction_strategy(compaction_strategy_type);
            builder.set_compaction_strategy_options(std::move(strategy_options));
            auto s = builder.build();

            auto dir = tmpdir();
            auto keys = token_generation_for_shard(all_files, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);

            column_family_for_tests cf(env.manager(), s, dir.path().string());
            auto close_cf = deferred_stop(cf);

            auto sst_gen = [&env, &cf, s, &dir]() mutable {
                auto generation = column_family_test::calculate_generation_for_new_table(*cf);
                return env.make_sstable(s, dir.path().string(), generation, sstables::get_highest_sstable_version(), big);
            };

            using namespace std::chrono;
            // Base time: now plus a random offset of up to one day (in seconds).
            auto now = gc_clock::now().time_since_epoch() + duration_cast<microseconds>(seconds(tests::random::get_int(0, 3600*24)));

            // One-row mutation for the key at `pkey_idx`, written with timestamp `ts`.
            auto make_mutation = [&] (unsigned pkey_idx, api::timestamp_type ts) {
                mutation m(s, partition_key::from_exploded(*s, {to_bytes(keys[pkey_idx].first)}));
                auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(1)});
                m.set_clustered_cell(ckey, bytes("value"), data_value(int32_t(1)), ts);
                return m;
            };

            std::vector<sstables::shared_sstable> ssts;
            ssts.reserve(all_files);
            for (auto idx = 0; idx < all_files; idx++) {
                microseconds offset = duration_cast<microseconds>(step_base) * idx;
                api::timestamp_type ts = (now + offset).count();
                auto sst = make_sstable_containing(sst_gen, {make_mutation(idx, ts)});
                sst->set_sstable_level(sstable_level);
                ssts.push_back(std::move(sst));
            }

            auto strategy = cf->get_compaction_strategy();
            auto cleanup_jobs = strategy.get_cleanup_compaction_jobs(cf->as_table_state(), ssts);
            return std::make_pair(std::move(ssts), std::move(cleanup_jobs));
        };

        // Runs get_cleanup_jobs() for the given strategy (extra args are forwarded to it) and
        // verifies that all_files / per_job_files jobs come back, each with per_job_files sstables.
        auto run_cleanup_strategy_test = [&] (sstables::compaction_strategy_type compaction_strategy_type, size_t per_job_files, auto&&... args) {
            testlog.info("Running cleanup test for strategy type {}", compaction_strategy::name(compaction_strategy_type));

            const size_t target_job_count = all_files / per_job_files;
            auto [inputs, jobs] = get_cleanup_jobs(compaction_strategy_type, std::forward<decltype(args)>(args)...);
            testlog.info("get_cleanup_jobs() returned {} descriptors; expected={}", jobs.size(), target_job_count);
            BOOST_REQUIRE(jobs.size() == target_job_count);

            // Generations not yet claimed by any job.
            std::unordered_set<unsigned> unclaimed;
            for (auto& sst : inputs) {
                unclaimed.insert(sst->generation());
            }

            for (const auto& job : jobs) {
                BOOST_REQUIRE(job.sstables.size() == per_job_files);
                for (auto& sst : job.sstables) {
                    // erase() returns 0 for an unknown or already-claimed generation.
                    BOOST_REQUIRE(unclaimed.erase(sst->generation()));
                }
            }
        };

        // STCS: Check that 2 jobs are returned for a size tier containing 2x more files than max threshold.
        run_cleanup_strategy_test(sstables::compaction_strategy_type::size_tiered, 32);

        // Default implementation: check that it will return one job for each file
        run_cleanup_strategy_test(sstables::compaction_strategy_type::null, 1);

        // TWCS: Check that it will return one job for each time window
        std::map<sstring, sstring> twcs_opts = {
            {time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
            {time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1"},
        };
        run_cleanup_strategy_test(sstables::compaction_strategy_type::time_window, 1, std::move(twcs_opts), 1h);

        const std::map<sstring, sstring> empty_opts;
        // LCS: Check that 2 jobs are returned for all similar-sized files in level 0.
        run_cleanup_strategy_test(sstables::compaction_strategy_type::leveled, 32, empty_opts, 0ms, 0);
        // LCS: Check that 1 job is returned for all non-overlapping files in level 1, as incremental
        // compaction can be employed to limit memory usage and space requirement.
        run_cleanup_strategy_test(sstables::compaction_strategy_type::leveled, 64, empty_opts, 0ms, 1);
    });
}