Files
scylladb/test/boost/memtable_test.cc
Mikołaj Sielużycki 1d84a254c0 flat_mutation_reader: Split readers by file and remove unnecessary includes.
The flat_mutation_reader files were conflated and contained multiple
readers, which were not strictly necessary. Splitting optimizes both
iterative compilation times, as touching rarely used readers doesn't
recompile large chunks of codebase. Total compilation times are also
improved, as the size of flat_mutation_reader.hh and
flat_mutation_reader_v2.hh have been reduced and those files are
included by many files in the codebase.

With changes

real	29m14.051s
user	168m39.071s
sys	5m13.443s

Without changes

real	30m36.203s
user	175m43.354s
sys	5m26.376s

Closes #10194
2022-03-14 13:20:25 +02:00

818 lines
33 KiB
C++

/*
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <boost/test/unit_test.hpp>
#include "service/priority_manager.hh"
#include "replica/database.hh"
#include "db/config.hh"
#include "utils/UUID_gen.hh"
#include <seastar/testing/test_case.hh>
#include <seastar/testing/thread_test_case.hh>
#include "schema_builder.hh"
#include <seastar/util/closeable.hh>
#include "service/migration_manager.hh"
#include <seastar/core/thread.hh>
#include "replica/memtable.hh"
#include "test/lib/cql_test_env.hh"
#include "test/lib/cql_assertions.hh"
#include "test/lib/mutation_source_test.hh"
#include "test/lib/mutation_assertions.hh"
#include "test/lib/flat_mutation_reader_assertions.hh"
#include "readers/flat_mutation_reader.hh"
#include "test/lib/data_model.hh"
#include "test/lib/eventually.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/log.hh"
#include "test/lib/reader_concurrency_semaphore.hh"
#include "test/lib/simple_schema.hh"
#include "utils/error_injection.hh"
// Hands out increasing timestamps, starting at 1. The counter is
// thread_local, so every thread draws from its own sequence.
static api::timestamp_type next_timestamp() {
    static thread_local api::timestamp_type counter = 1;
    const api::timestamp_type issued = counter;
    ++counter;
    return issued;
}
// Returns the string form of a newly generated time-based UUID, converted to bytes.
static bytes make_unique_bytes() {
    auto uuid = utils::UUID_gen::get_time_UUID();
    return to_bytes(uuid.to_sstring());
}
// Sets column `column_name` of the row with the empty clustering key in `m`
// to a freshly generated unique bytes value, using the next timestamp from
// next_timestamp().
//
// The column must exist in the schema of `m` and have type bytes_type; both
// conditions are asserted.
static void set_column(mutation& m, const sstring& column_name) {
    // Convert the name once and reuse it for the lookup and the write.
    const auto name = to_bytes(column_name);
    [[maybe_unused]] const auto* def = m.schema()->get_column_definition(name);
    // Check for a missing column first, so that a wrong name trips the
    // assertion instead of dereferencing a null pointer.
    assert(def != nullptr);
    assert(def->type == bytes_type);
    auto value = data_value(make_unique_bytes());
    m.set_clustered_cell(clustering_key::make_empty(), name, value, next_timestamp());
}
// Creates a mutation for schema `s` whose partition key is built from a
// single, freshly generated unique bytes value.
static
mutation make_unique_mutation(schema_ptr s) {
    auto key = partition_key::from_single_value(*s, make_unique_bytes());
    return mutation(s, std::move(key));
}
// Builds `n_mutations` empty mutations with unique partition keys and returns
// them sorted in ring order.
std::vector<mutation> make_ring(schema_ptr s, int n_mutations) {
    std::vector<mutation> result;
    for (int remaining = n_mutations; remaining > 0; --remaining) {
        result.push_back(make_unique_mutation(s));
    }
    auto ring_order = mutation_decorated_key_less_comparator();
    std::sort(result.begin(), result.end(), ring_order);
    return result;
}
// Runs the generic mutation_source test suite against a memtable populated
// with the partitions supplied by the suite.
SEASTAR_TEST_CASE(test_memtable_conforms_to_mutation_source) {
    return seastar::async([] {
        auto populate = [] (schema_ptr sch, const std::vector<mutation>& partitions) {
            auto mem = make_lw_shared<replica::memtable>(sch);
            for (const mutation& partition : partitions) {
                mem->apply(partition);
            }
            // Run a full logalloc compaction before exposing the memtable as a source.
            logalloc::shard_tracker().full_compaction();
            return mem->as_data_source();
        };
        run_mutation_source_tests(populate);
    });
}
// Runs the generic mutation_source test suite against a memtable in which a
// reader is opened after every applied mutation and kept open, while
// logalloc full compaction is triggered repeatedly in the background.
SEASTAR_TEST_CASE(test_memtable_with_many_versions_conforms_to_mutation_source) {
return seastar::async([] {
tests::reader_concurrency_semaphore_wrapper semaphore;
lw_shared_ptr<replica::memtable> mt;
// Readers opened during population; they stay open until clear_readers() runs.
std::vector<flat_mutation_reader_v2> readers;
// Closes every held reader, waits for the closes to finish, then empties the vector.
auto clear_readers = [&readers] {
parallel_for_each(readers, [] (flat_mutation_reader_v2& rd) {
return rd.close();
}).finally([&readers] {
readers.clear();
}).get();
};
// Close whatever readers are still held when this scope is left.
auto cleanup_readers = defer([&] { clear_readers(); });
// Backing storage for the singular ranges passed to make_flat_reader() below.
// NOTE(review): presumably a deque so that references to earlier ranges stay
// valid while more ranges are appended -- confirm.
std::deque<dht::partition_range> ranges_storage;
// Stop flag for the background compaction loop; set after the suite has run.
lw_shared_ptr<bool> finished = make_lw_shared(false);
auto full_compaction_in_background = seastar::do_until([finished] {return *finished;}, [] {
// do_refresh_state is called when we detect a new partition snapshot version.
// If snapshot version changes in process of reading mutation fragments from a
// clustering range, the partition_snapshot_reader state is refreshed with saved
// last position of emitted row and range tombstone. full_compaction increases the
// change mark.
logalloc::shard_tracker().full_compaction();
return seastar::sleep(100us);
});
run_mutation_source_tests([&] (schema_ptr s, const std::vector<mutation>& muts) {
// Drop the readers belonging to the previous memtable before replacing it.
clear_readers();
mt = make_lw_shared<replica::memtable>(s);
for (auto&& m : muts) {
mt->apply(m);
// Create reader so that each mutation is in a separate version
auto rd = mt->make_flat_reader(s, semaphore.make_permit(), ranges_storage.emplace_back(dht::partition_range::make_singular(m.decorated_key())));
rd.set_max_buffer_size(1);
rd.fill_buffer().get();
readers.emplace_back(std::move(rd));
}
return mt->as_data_source();
});
// Stop the background loop and wait for it to finish.
*finished = true;
full_compaction_in_background.get();
});
}
// Exercises the memtable flush reader on randomly generated mutations:
// a plain read, a read with next_partition() between partitions, and a read
// with next_partition() issued inside partitions.
SEASTAR_TEST_CASE(test_memtable_flush_reader) {
// Memtable flush reader is severely limited, it always assumes that
// the full partition range is being read and that
// streamed_mutation::forwarding is set to no. Therefore, we cannot use
// run_mutation_source_tests() to test it.
return seastar::async([] {
tests::reader_concurrency_semaphore_wrapper semaphore;
// Builds a memtable (using the schema of the first mutation) and applies all of `muts` to it.
auto make_memtable = [] (dirty_memory_manager& mgr, replica::table_stats& tbl_stats, std::vector<mutation> muts) {
assert(!muts.empty());
auto mt = make_lw_shared<replica::memtable>(muts.front().schema(), mgr, tbl_stats);
for (auto& m : muts) {
mt->apply(m);
}
return mt;
};
// Runs the three read scenarios four times, each time on four freshly generated mutations.
auto test_random_streams = [&] (random_mutation_generator&& gen) {
for (auto i = 0; i < 4; i++) {
replica::table_stats tbl_stats;
dirty_memory_manager mgr;
const auto muts = gen(4);
const auto now = gc_clock::now();
// Compacted copies of the generated mutations, used as the expected output.
auto compacted_muts = muts;
for (auto& mut : compacted_muts) {
mut.partition().compact_for_compaction(*mut.schema(), always_gc, mut.decorated_key(), now);
}
testlog.info("Simple read");
auto mt = make_memtable(mgr, tbl_stats, muts);
assert_that(mt->make_flush_reader(gen.schema(), semaphore.make_permit(), default_priority_class()))
.produces_compacted(compacted_muts[0], now)
.produces_compacted(compacted_muts[1], now)
.produces_compacted(compacted_muts[2], now)
.produces_compacted(compacted_muts[3], now)
.produces_end_of_stream();
testlog.info("Read with next_partition() calls between partition");
// A fresh memtable is built for each scenario.
mt = make_memtable(mgr, tbl_stats, muts);
assert_that(mt->make_flush_reader(gen.schema(), semaphore.make_permit(), default_priority_class()))
.next_partition()
.produces_compacted(compacted_muts[0], now)
.next_partition()
.produces_compacted(compacted_muts[1], now)
.next_partition()
.produces_compacted(compacted_muts[2], now)
.next_partition()
.produces_compacted(compacted_muts[3], now)
.next_partition()
.produces_end_of_stream();
testlog.info("Read with next_partition() calls inside partitions");
mt = make_memtable(mgr, tbl_stats, muts);
// Partitions 1 and 3 are abandoned right after their partition start.
assert_that(mt->make_flush_reader(gen.schema(), semaphore.make_permit(), default_priority_class()))
.produces_compacted(compacted_muts[0], now)
.produces_partition_start(muts[1].decorated_key(), muts[1].partition().partition_tombstone())
.next_partition()
.produces_compacted(compacted_muts[2], now)
.next_partition()
.produces_partition_start(muts[3].decorated_key(), muts[3].partition().partition_tombstone())
.next_partition()
.produces_end_of_stream();
}
};
// Run once without and once with counter generation enabled.
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::no));
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::yes));
});
}
// Opens one reader per schema version, switches the memtable's schema after
// the first partition has been read, and checks that both readers keep
// returning the expected partitions with their own schema. Fresh readers
// created afterwards must return the same partitions.
SEASTAR_TEST_CASE(test_adding_a_column_during_reading_doesnt_affect_read_result) {
    return seastar::async([] {
        auto common_builder = schema_builder("ks", "cf")
                .with_column("pk", bytes_type, column_kind::partition_key);

        auto schema_v1 = common_builder
                .with_column("v2", bytes_type, column_kind::regular_column)
                .build();

        auto schema_v2 = common_builder
                .with_column("v1", bytes_type, column_kind::regular_column) // new column
                .with_column("v2", bytes_type, column_kind::regular_column)
                .build();

        tests::reader_concurrency_semaphore_wrapper semaphore;

        auto mt = make_lw_shared<replica::memtable>(schema_v1);

        std::vector<mutation> ring = make_ring(schema_v1, 3);
        for (mutation& m : ring) {
            set_column(m, "v2");
            mt->apply(m);
        }

        auto rd_v1 = assert_that(mt->make_flat_reader(schema_v1, semaphore.make_permit()));
        auto rd_v2 = assert_that(mt->make_flat_reader(schema_v2, semaphore.make_permit()));

        // Advances both readers by one partition and compares it with `expected`.
        auto expect_next = [&] (const mutation& expected) {
            rd_v1.next_mutation().has_schema(schema_v1).is_equal_to(expected);
            rd_v2.next_mutation().has_schema(schema_v2).is_equal_to(expected);
        };

        expect_next(ring[0]);
        // Change the memtable's schema while both readers are in the middle of the read.
        mt->set_schema(schema_v2);
        expect_next(ring[1]);
        expect_next(ring[2]);
        rd_v1.produces_end_of_stream();
        rd_v2.produces_end_of_stream();

        // Readers created after the schema change, first with the old and
        // then with the new schema.
        for (const auto& schema_version : {schema_v1, schema_v2}) {
            auto fresh_rd = assert_that(mt->make_flat_reader(schema_version, semaphore.make_permit()));
            for (const mutation& expected : ring) {
                fresh_rd.produces(expected);
            }
            fresh_rd.produces_end_of_stream();
        }
    });
}
// Samples dirty_memory_manager::virtual_dirty_memory() while a flush reader
// walks the memtable partition by partition and requires that the samples
// never increase. A regular reader is kept open over the first part of the
// flush, and partition 0 is overwritten with a much smaller value before the
// flush starts.
SEASTAR_TEST_CASE(test_virtual_dirty_accounting_on_flush) {
    return seastar::async([] {
        schema_ptr s = schema_builder("ks", "cf")
                .with_column("pk", bytes_type, column_kind::partition_key)
                .with_column("col", bytes_type, column_kind::regular_column)
                .build();
        tests::reader_concurrency_semaphore_wrapper semaphore;

        dirty_memory_manager mgr;
        replica::table_stats tbl_stats;

        auto mt = make_lw_shared<replica::memtable>(s, mgr, tbl_stats);

        std::vector<mutation> ring = make_ring(s, 3);
        // The mutations the flush reader is expected to produce, in ring order.
        std::vector<mutation> current_ring;

        for (auto&& m : ring) {
            auto m_with_cell = m;
            m_with_cell.set_clustered_cell(clustering_key::make_empty(), to_bytes("col"),
                                           data_value(bytes(bytes::initialized_later(), 4096)), next_timestamp());
            mt->apply(m_with_cell);
            current_ring.push_back(m_with_cell);
        }

        // Create a reader which will cause many partition versions to be created
        flat_mutation_reader_v2_opt rd1 = mt->make_flat_reader(s, semaphore.make_permit());
        auto close_rd1 = deferred_close(*rd1);
        rd1->set_max_buffer_size(1);
        rd1->fill_buffer().get();

        // Override large cell value with a short one
        {
            auto part0_update = ring[0];
            part0_update.set_clustered_cell(clustering_key::make_empty(), to_bytes("col"),
                                            data_value(bytes(bytes::initialized_later(), 8)), next_timestamp());
            // Apply first, then move into current_ring: the mutation must not
            // be read after it has been moved from.
            mt->apply(part0_update);
            current_ring[0] = std::move(part0_update);
        }

        // One sample before the flush read starts and one after each step of it.
        std::vector<size_t> virtual_dirty_values;
        virtual_dirty_values.push_back(mgr.virtual_dirty_memory());

        auto flush_reader_check = assert_that(mt->make_flush_reader(s, semaphore.make_permit(), service::get_local_priority_manager().memtable_flush_priority()));
        flush_reader_check.produces_partition(current_ring[0]);
        virtual_dirty_values.push_back(mgr.virtual_dirty_memory());
        flush_reader_check.produces_partition(current_ring[1]);
        virtual_dirty_values.push_back(mgr.virtual_dirty_memory());

        // Drain and close the regular reader, then run a full compaction
        // before the flush reader moves on to the last partition.
        while ((*rd1)().get0()) ;
        close_rd1.close_now();

        logalloc::shard_tracker().full_compaction();

        flush_reader_check.produces_partition(current_ring[2]);
        virtual_dirty_values.push_back(mgr.virtual_dirty_memory());
        flush_reader_check.produces_end_of_stream();
        virtual_dirty_values.push_back(mgr.virtual_dirty_memory());

        // Sorted when traversed back to front == non-increasing in sampling order.
        BOOST_REQUIRE(std::is_sorted(virtual_dirty_values.rbegin(), virtual_dirty_values.rend()));
    });
}
// Reproducer for #1753
//
// Applies three mutations to the same partition, opening a reader after each
// one, runs a full logalloc compaction, opens three more readers, and then
// checks what every reader returns while the earlier readers are destroyed
// one at a time.
SEASTAR_TEST_CASE(test_partition_version_consistency_after_lsa_compaction_happens) {
return seastar::async([] {
schema_ptr s = schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("ck", bytes_type, column_kind::clustering_key)
.with_column("col", bytes_type, column_kind::regular_column)
.build();
tests::reader_concurrency_semaphore_wrapper semaphore;
auto mt = make_lw_shared<replica::memtable>(s);
// m1, m2 and m3 are copies of empty_m, so they share its partition key;
// each adds one row under a different clustering key.
auto empty_m = make_unique_mutation(s);
auto ck1 = clustering_key::from_single_value(*s, serialized(make_unique_bytes()));
auto ck2 = clustering_key::from_single_value(*s, serialized(make_unique_bytes()));
auto ck3 = clustering_key::from_single_value(*s, serialized(make_unique_bytes()));
auto m1 = empty_m;
m1.set_clustered_cell(ck1, to_bytes("col"), data_value(bytes(bytes::initialized_later(), 8)), next_timestamp());
auto m2 = empty_m;
m2.set_clustered_cell(ck2, to_bytes("col"), data_value(bytes(bytes::initialized_later(), 8)), next_timestamp());
auto m3 = empty_m;
m3.set_clustered_cell(ck3, to_bytes("col"), data_value(bytes(bytes::initialized_later(), 8)), next_timestamp());
// rd1..rd3 are held in std::optional so they can be destroyed individually below.
mt->apply(m1);
std::optional<flat_reader_assertions_v2> rd1 = assert_that(mt->make_flat_reader(s, semaphore.make_permit()));
rd1->set_max_buffer_size(1);
rd1->fill_buffer().get();
mt->apply(m2);
std::optional<flat_reader_assertions_v2> rd2 = assert_that(mt->make_flat_reader(s, semaphore.make_permit()));
rd2->set_max_buffer_size(1);
rd2->fill_buffer().get();
mt->apply(m3);
std::optional<flat_reader_assertions_v2> rd3 = assert_that(mt->make_flat_reader(s, semaphore.make_permit()));
rd3->set_max_buffer_size(1);
rd3->fill_buffer().get();
// Full compaction happens while rd1..rd3 are still open.
logalloc::shard_tracker().full_compaction();
// rd4..rd6 are opened after the compaction, with all three mutations applied.
auto rd4 = assert_that(mt->make_flat_reader(s, semaphore.make_permit()));
rd4.set_max_buffer_size(1);
rd4.fill_buffer().get();
auto rd5 = assert_that(mt->make_flat_reader(s, semaphore.make_permit()));
rd5.set_max_buffer_size(1);
rd5.fill_buffer().get();
auto rd6 = assert_that(mt->make_flat_reader(s, semaphore.make_permit()));
rd6.set_max_buffer_size(1);
rd6.fill_buffer().get();
// Each early reader must see only the mutations applied before it was opened.
rd1->next_mutation().is_equal_to(m1);
rd2->next_mutation().is_equal_to(m1 + m2);
rd3->next_mutation().is_equal_to(m1 + m2 + m3);
// Destroy the early readers one by one (rd3, rd1, rd2), reading from a
// post-compaction reader after each.
rd3 = {};
rd4.next_mutation().is_equal_to(m1 + m2 + m3);
rd1 = {};
rd5.next_mutation().is_equal_to(m1 + m2 + m3);
rd2 = {};
rd6.next_mutation().is_equal_to(m1 + m2 + m3);
});
}
// Reproducer for #1746
//
// Reads a memtable through its flush reader while running a full logalloc
// compaction before every fragment read inside a partition, and requires
// that virtual dirty memory, sampled before the read and after each
// partition, never increases.
SEASTAR_TEST_CASE(test_segment_migration_during_flush) {
    return seastar::async([] {
        schema_ptr s = schema_builder("ks", "cf")
                .with_column("pk", bytes_type, column_kind::partition_key)
                .with_column("ck", bytes_type, column_kind::clustering_key)
                .with_column("col", bytes_type, column_kind::regular_column)
                .build();
        tests::reader_concurrency_semaphore_wrapper semaphore;

        replica::table_stats tbl_stats;
        dirty_memory_manager mgr;

        auto mt = make_lw_shared<replica::memtable>(s, mgr, tbl_stats);

        const int rows_per_partition = 300;
        const int partitions = 3;
        std::vector<mutation> ring = make_ring(s, partitions);

        for (auto& m : ring) {
            for (int i = 0; i < rows_per_partition; ++i) {
                auto ck = clustering_key::from_single_value(*s, serialized(make_unique_bytes()));
                auto col_value = data_value(bytes(bytes::initialized_later(), 8));
                m.set_clustered_cell(ck, to_bytes("col"), col_value, next_timestamp());
            }
            mt->apply(m);
        }

        std::vector<size_t> virtual_dirty_values;
        virtual_dirty_values.push_back(mgr.virtual_dirty_memory());

        auto rd = mt->make_flush_reader(s, semaphore.make_permit(), service::get_local_priority_manager().memtable_flush_priority());
        auto close_rd = deferred_close(rd);

        for (int i = 0; i < partitions; ++i) {
            auto mfopt = rd().get0();
            BOOST_REQUIRE(bool(mfopt));
            BOOST_REQUIRE(mfopt->is_partition_start());
            while (!mfopt->is_end_of_partition()) {
                logalloc::shard_tracker().full_compaction();
                mfopt = rd().get0();
                // The stream must not end in the middle of a partition; fail
                // the test here rather than dereference an empty optional in
                // the loop condition.
                BOOST_REQUIRE(bool(mfopt));
            }
            virtual_dirty_values.push_back(mgr.virtual_dirty_memory());
        }

        BOOST_REQUIRE(!rd().get0());

        // Sorted when traversed back to front == non-increasing in sampling order.
        BOOST_REQUIRE(std::is_sorted(virtual_dirty_values.rbegin(), virtual_dirty_values.rend()));
    });
}
// Reproducer for #2854
//
// Starts reading a memtable, marks it as flushed (with a second, identically
// populated memtable as the replacement source), then fast-forwards the same
// reader and checks it produces the partitions of the new range.
SEASTAR_TEST_CASE(test_fast_forward_to_after_memtable_is_flushed) {
    return seastar::async([] {
        schema_ptr s = schema_builder("ks", "cf")
                .with_column("pk", bytes_type, column_kind::partition_key)
                .with_column("col", bytes_type, column_kind::regular_column)
                .build();
        tests::reader_concurrency_semaphore_wrapper semaphore;

        auto flushed_mt = make_lw_shared<replica::memtable>(s);
        auto replacement_mt = make_lw_shared<replica::memtable>(s);

        std::vector<mutation> ring = make_ring(s, 5);
        for (const mutation& m : ring) {
            flushed_mt->apply(m);
            replacement_mt->apply(m);
        }

        auto rd = assert_that(flushed_mt->make_flat_reader(s, semaphore.make_permit()));
        rd.produces(ring[0]);

        // Mark the memtable as flushed while the reader is in the middle of its range.
        flushed_mt->mark_flushed(replacement_mt->as_data_source());
        rd.produces(ring[1]);

        // Skip ring[2] by jumping to the range that starts at ring[3].
        const auto& resume_key = ring[3].decorated_key();
        auto range = dht::partition_range::make_starting_with(dht::ring_position(resume_key));
        rd.fast_forward_to(range);
        rd.produces(ring[3]);
        rd.produces(ring[4]);
        rd.produces_end_of_stream();
    });
}
// Reads the full partition range of a memtable holding two random mutations
// under memory::with_allocation_failures() and expects both mutations back.
SEASTAR_TEST_CASE(test_exception_safety_of_partition_range_reads) {
    return seastar::async([] {
        random_mutation_generator generator(random_mutation_generator::generate_counters::no);
        auto sch = generator.schema();
        tests::reader_concurrency_semaphore_wrapper semaphore;
        std::vector<mutation> expected = generator(2);

        auto mt = make_lw_shared<replica::memtable>(sch);
        for (const mutation& m : expected) {
            mt->apply(m);
        }

        memory::with_allocation_failures([&] {
            assert_that(mt->make_flat_reader(sch, semaphore.make_permit(), query::full_partition_range))
                .produces(expected);
        });
    });
}
// Reads a memtable holding two random mutations through its flush reader
// under memory::with_allocation_failures() and expects both mutations back.
SEASTAR_TEST_CASE(test_exception_safety_of_flush_reads) {
    return seastar::async([] {
        random_mutation_generator generator(random_mutation_generator::generate_counters::no);
        auto sch = generator.schema();
        tests::reader_concurrency_semaphore_wrapper semaphore;
        std::vector<mutation> expected = generator(2);

        auto mt = make_lw_shared<replica::memtable>(sch);
        for (const mutation& m : expected) {
            mt->apply(m);
        }

        memory::with_allocation_failures([&] {
            // Calls revert_flushed_memory() whenever this lambda is left.
            auto undo_flush_accounting = defer([&] {
                mt->revert_flushed_memory();
            });
            assert_that(mt->make_flush_reader(sch, semaphore.make_permit(), default_priority_class()))
                .produces(expected);
        });
    });
}
// Reads a single partition (the second of two random mutations) from a
// memtable under memory::with_allocation_failures() and expects exactly that
// mutation back.
SEASTAR_TEST_CASE(test_exception_safety_of_single_partition_reads) {
    return seastar::async([] {
        random_mutation_generator generator(random_mutation_generator::generate_counters::no);
        auto sch = generator.schema();
        tests::reader_concurrency_semaphore_wrapper semaphore;
        std::vector<mutation> muts = generator(2);

        auto mt = make_lw_shared<replica::memtable>(sch);
        for (const mutation& m : muts) {
            mt->apply(m);
        }

        const mutation& target = muts[1];
        memory::with_allocation_failures([&] {
            assert_that(mt->make_flat_reader(sch, semaphore.make_permit(), dht::partition_range::make_singular(target.decorated_key())))
                .produces(target);
        });
    });
}
// Checks the presence of the hash of cell 0 in the first clustering row read
// from a memtable: absent on a plain read, present on a read whose slice has
// the with_digest option set, and still present on a following plain read.
// The same sequence is expected again after the cell is overwritten.
SEASTAR_TEST_CASE(test_hash_is_cached) {
    return seastar::async([] {
        auto s = schema_builder("ks", "cf")
                .with_column("pk", bytes_type, column_kind::partition_key)
                .with_column("v", bytes_type, column_kind::regular_column)
                .build();
        tests::reader_concurrency_semaphore_wrapper semaphore;

        auto mt = make_lw_shared<replica::memtable>(s);

        auto m = make_unique_mutation(s);
        set_column(m, "v");
        mt->apply(m);

        // Consumes the partition start and the first clustering row from `rd`,
        // closes the reader, and reports whether cell 0 of the row has a hash.
        auto first_row_has_hash = [] (flat_mutation_reader_v2 rd) {
            auto close_rd = deferred_close(rd);
            rd().get0()->as_partition_start();
            clustering_row row = std::move(*rd().get0()).as_clustering_row();
            return bool(row.cells().cell_hash_for(0));
        };

        // Read with the default range and slice.
        auto plain_read_has_hash = [&] {
            return first_row_has_hash(mt->make_flat_reader(s, semaphore.make_permit()));
        };

        // Read over the full range with the with_digest option set on the slice.
        auto digest_read_has_hash = [&] {
            auto slice = s->full_slice();
            slice.options.set<query::partition_slice::option::with_digest>();
            return first_row_has_hash(mt->make_flat_reader(s, semaphore.make_permit(), query::full_partition_range, slice));
        };

        BOOST_REQUIRE(!plain_read_has_hash());
        BOOST_REQUIRE(digest_read_has_hash());
        BOOST_REQUIRE(plain_read_has_hash());

        // Overwrite the cell and repeat the same three reads.
        set_column(m, "v");
        mt->apply(m);

        BOOST_REQUIRE(!plain_read_has_hash());
        BOOST_REQUIRE(digest_read_has_hash());
        BOOST_REQUIRE(plain_read_has_hash());
    });
}
// Applies three mutations to a memtable one after another and checks the
// minimum timestamp, minimum TTL and minimum local deletion time reported by
// get_encoding_stats() before the first apply and after each one.
SEASTAR_THREAD_TEST_CASE(test_collecting_encoding_stats) {
    namespace dm = tests::data_model;

    auto random_int32 = [] {
        return int32_type->decompose(tests::random::get_int<int32_t>());
    };

    auto now = gc_clock::now();

    auto base_table = dm::table_description({ { "pk", int32_type } }, { { "ck", utf8_type } });

    auto full_table = base_table;
    full_table.add_static_column("s1", int32_type);
    full_table.add_regular_column("v1", int32_type);
    full_table.add_regular_column("v2", int32_type);

    auto built_schema = full_table.build();
    auto s = built_schema.schema;

    // pk1: row marker and one cell, both without an explicit timestamp or TTL.
    auto plain_md = dm::mutation_description({ to_bytes("pk1") });
    plain_md.add_clustered_row_marker({ to_bytes("ck1") });
    plain_md.add_clustered_cell({ to_bytes("ck1") }, "v1", random_int32());
    auto plain_mut = plain_md.build(s);

    // pk2: row marker with timestamp -10 and a cell with a 1 second TTL.
    auto ttl_md = dm::mutation_description({ to_bytes("pk2") });
    auto short_ttl = gc_clock::duration(std::chrono::seconds(1));
    ttl_md.add_clustered_row_marker({ to_bytes("ck1") }, -10);
    ttl_md.add_clustered_cell({ to_bytes("ck1") }, "v1", random_int32());
    ttl_md.add_clustered_cell({ to_bytes("ck2") }, "v2",
            dm::mutation_description::atomic_value(random_int32(), dm::data_timestamp, short_ttl, now + short_ttl));
    auto ttl_mut = ttl_md.build(s);

    // pk3: static cell with a 2 second TTL and an expiry point 8 hours before `now`.
    auto past_md = dm::mutation_description({ to_bytes("pk3") });
    auto longer_ttl = gc_clock::duration(std::chrono::seconds(2));
    auto past_expiry = now - std::chrono::hours(8);
    past_md.add_static_cell("s1",
            dm::mutation_description::atomic_value(random_int32(), dm::data_timestamp, longer_ttl, past_expiry));
    auto past_mut = past_md.build(s);

    auto mt = make_lw_shared<replica::memtable>(s);

    // Empty memtable.
    auto stats = mt->get_encoding_stats();
    BOOST_CHECK(stats.min_local_deletion_time == gc_clock::time_point::max());
    BOOST_CHECK_EQUAL(stats.min_timestamp, api::max_timestamp);
    BOOST_CHECK(stats.min_ttl == gc_clock::duration::max());

    mt->apply(plain_mut);
    stats = mt->get_encoding_stats();
    BOOST_CHECK(stats.min_local_deletion_time == gc_clock::time_point::max());
    BOOST_CHECK_EQUAL(stats.min_timestamp, dm::data_timestamp);
    BOOST_CHECK(stats.min_ttl == gc_clock::duration::max());

    mt->apply(ttl_mut);
    stats = mt->get_encoding_stats();
    BOOST_CHECK(stats.min_local_deletion_time == now + short_ttl);
    BOOST_CHECK_EQUAL(stats.min_timestamp, -10);
    BOOST_CHECK(stats.min_ttl == short_ttl);

    mt->apply(past_mut);
    stats = mt->get_encoding_stats();
    BOOST_CHECK(stats.min_local_deletion_time == past_expiry);
    BOOST_CHECK_EQUAL(stats.min_timestamp, -10);
    BOOST_CHECK(stats.min_ttl == short_ttl);
}
// With the cache disabled, applies a row insert followed by a delete of the
// same row, flushes the table, and expects the table's mutation source to
// produce only the delete mutation.
SEASTAR_TEST_CASE(memtable_flush_compresses_mutations) {
    auto db_config = make_shared<db::config>();
    db_config->enable_cache.set(false);
    return do_with_cql_env_thread([](cql_test_env& env) {
        // Set up the keyspace and the table
        const char* const ks_name = "keyspace_name";
        const char* const table_name = "table_name";
        env.execute_cql(format("CREATE KEYSPACE {} WITH REPLICATION = {{'class' : 'SimpleStrategy', 'replication_factor' : 1}};", ks_name)).get();
        env.execute_cql(format("CREATE TABLE {}.{} (pk int, ck int, id int, PRIMARY KEY(pk, ck));", ks_name, table_name)).get();

        replica::database& db = env.local_db();
        replica::table& tbl = db.find_column_family(ks_name, table_name);
        tests::reader_concurrency_semaphore_wrapper semaphore;
        schema_ptr s = tbl.schema();

        // Row under test: partition key 1, clustering key 2
        dht::decorated_key pk = dht::decorate_key(*s, partition_key::from_single_value(*s, serialized(1)));
        clustering_key ck = clustering_key::from_single_value(*s, serialized(2));

        // Insert of id = 3 into that row
        mutation row_insert(s, pk);
        row_insert.set_clustered_cell(ck, to_bytes("id"), data_value(3), api::new_timestamp());

        // Delete of the same row; its timestamp is taken after the insert's
        mutation row_delete(s, pk);
        row_delete.partition().apply_delete(*s, clustering_key_prefix::from_singular(*s, 2), tombstone{api::new_timestamp(), gc_clock::now()});

        tbl.apply(row_insert);
        tbl.apply(row_delete);

        // Flush to make sure all the modifications make it to disk
        tbl.flush().get();

        // Read the table back as a mutation_source: only the delete is expected
        mutation_source source = tbl.as_mutation_source();
        assert_that(source.make_reader(s, semaphore.make_permit()))
            .produces(row_delete)
            .produces_end_of_stream();
    }, db_config);
}
// Flushes a row tombstone to an sstable, then applies an older write for the
// same row to the memtable and compacts all sstables. The final query must
// return only the unrelated live row, i.e. the older write must stay deleted.
SEASTAR_TEST_CASE(sstable_compaction_does_not_resurrect_data) {
auto db_config = make_shared<db::config>();
db_config->enable_cache.set(false);
return do_with_cql_env_thread([](cql_test_env& env) {
replica::database& db = env.local_db();
service::migration_manager& mm = env.migration_manager().local();
sstring ks_name = "ks";
sstring table_name = "table_name";
// gc_grace_seconds is set to 1 so that the clock jumps below are enough to exceed it.
schema_ptr s = schema_builder(ks_name, table_name)
.with_column(to_bytes("pk"), int32_type, column_kind::partition_key)
.with_column(to_bytes("ck"), int32_type, column_kind::clustering_key)
.with_column(to_bytes("id"), int32_type)
.set_gc_grace_seconds(1)
.build();
// Announce the new table through the migration manager.
auto group0_guard = mm.start_group0_operation().get();
auto ts = group0_guard.write_timestamp();
mm.announce(mm.prepare_new_column_family_announcement(s, ts).get(), std::move(group0_guard)).get();
replica::table& t = db.find_column_family(ks_name, table_name);
dht::decorated_key pk = dht::decorate_key(*s, partition_key::from_single_value(*s, serialized(1)));
clustering_key ck_to_delete = clustering_key::from_single_value(*s, serialized(2));
clustering_key ck = clustering_key::from_single_value(*s, serialized(3));
// Capture three timestamps, jumping the clocks forward by 1s between them:
// before the delete, the delete itself, and after the delete.
api::timestamp_type insertion_timestamp_before_delete = api::new_timestamp();
forward_jump_clocks(1s);
api::timestamp_type deletion_timestamp = api::new_timestamp();
forward_jump_clocks(1s);
api::timestamp_type insertion_timestamp_after_delete = api::new_timestamp();
mutation m_delete = mutation(s, pk);
m_delete.partition().apply_delete(
*s,
ck_to_delete,
tombstone{deletion_timestamp, gc_clock::now()});
t.apply(m_delete);
// Insert data that won't be removed by tombstone to prevent compaction from skipping whole partition
mutation m_insert = mutation(s, pk);
m_insert.set_clustered_cell(ck, to_bytes("id"), data_value(3), insertion_timestamp_after_delete);
t.apply(m_insert);
// Flush and wait until the gc_grace_seconds pass
t.flush().get();
forward_jump_clocks(2s);
// Apply the past mutation to memtable to simulate repair. This row should be deleted by tombstone
mutation m_past_insert = mutation(s, pk);
m_past_insert.set_clustered_cell(
ck_to_delete,
to_bytes("id"),
data_value(4),
insertion_timestamp_before_delete);
t.apply(m_past_insert);
// Trigger compaction. If all goes well, compaction should check if a relevant row is in the memtable
// and should not purge the tombstone.
t.compact_all_sstables().get();
// If we get additional row (1, 2, 4), that means the tombstone was purged and data was resurrected
assert_that(env.execute_cql(format("SELECT * FROM {}.{};", ks_name, table_name)).get0())
.is_rows()
.with_rows_ignore_order({
{serialized(1), serialized(3), serialized(3)},
});
}, db_config);
}
// Makes a memtable flush fail through the
// "table_seal_active_memtable_pre_flush" error injection and checks that the
// data stays in the active memtable; then disables the injection and checks
// that a newly triggered flush empties the memtable.
//
// Skipped (with a message on stderr) when SCYLLA_ENABLE_ERROR_INJECTION is
// not defined.
SEASTAR_TEST_CASE(failed_flush_prevents_writes) {
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
    std::cerr << "Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n";
    return make_ready_future<>();
#else
    return do_with_cql_env_thread([](cql_test_env& env) {
        replica::database& db = env.local_db();
        service::migration_manager& mm = env.migration_manager().local();

        simple_schema ss;
        schema_ptr s = ss.schema();
        auto group0_guard = mm.start_group0_operation().get();
        auto ts = group0_guard.write_timestamp();
        mm.announce(mm.prepare_new_column_family_announcement(s, ts).get(), std::move(group0_guard)).get();

        replica::table& t = db.find_column_family("ks", "cf");
        replica::memtable& m = t.active_memtable();
        dirty_memory_manager& dmm = m.get_dirty_memory_manager();

        // Insert something so that we have data in memtable to flush
        // it has to be somewhat large, as automatic flushing picks the
        // largest memtable to flush
        mutation mt = {s, ss.make_pkey(make_local_key(s))};
        for (uint32_t i = 0; i < 1000; ++i) {
            ss.add_row(mt, ss.make_ckey(i), format("{}", i));
        }
        t.apply(mt);

        utils::get_local_injector().enable("table_seal_active_memtable_pre_flush");

        // Trigger flush
        dmm.notify_soft_pressure();

        // The checks below use BOOST_REQUIRE rather than BOOST_ASSERT so that
        // they are always evaluated and a failure is reported as a test
        // failure.
        BOOST_REQUIRE(eventually_true([&db]() { return db.cf_stats()->failed_memtables_flushes_count != 0; }));
        // The flush failed, make sure there is still data in memtable.
        BOOST_REQUIRE(!t.active_memtable().empty());
        utils::get_local_injector().disable("table_seal_active_memtable_pre_flush");

        // Release pressure, so that we can trigger flush again
        dmm.notify_soft_relief();
        // Trigger pressure, the error above is no longer being injected, so flush
        // should be triggered and succeed
        dmm.notify_soft_pressure();
        BOOST_REQUIRE(eventually_true([&t]() { return t.active_memtable().empty(); }));
    });
#endif
}