mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-19 16:15:07 +00:00
No point in going through the vector<mutation> entry-point just to discover in run time that it was called with a single-element vector, when we know that in advance. Signed-off-by: Benny Halevy <bhalevy@scylladb.com> Closes #13733
887 lines
39 KiB
C++
887 lines
39 KiB
C++
/*
|
|
* Copyright (C) 2018-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#include <boost/test/unit_test.hpp>
|
|
|
|
#include "replica/database.hh"
|
|
#include "db/view/view_builder.hh"
|
|
#include "db/view/view_updating_consumer.hh"
|
|
#include "db/view/view_update_generator.hh"
|
|
#include "db/system_keyspace.hh"
|
|
#include "db/system_keyspace_view_types.hh"
|
|
#include "db/config.hh"
|
|
#include "cql3/query_options.hh"
|
|
|
|
#include "test/lib/scylla_test_case.hh"
|
|
#include <seastar/testing/thread_test_case.hh>
|
|
#include <seastar/util/closeable.hh>
|
|
|
|
#include "test/lib/cql_test_env.hh"
|
|
#include "test/lib/cql_assertions.hh"
|
|
#include "test/lib/sstable_utils.hh"
|
|
#include "schema/schema_builder.hh"
|
|
#include "service/priority_manager.hh"
|
|
#include "test/lib/test_services.hh"
|
|
#include "test/lib/data_model.hh"
|
|
#include "test/lib/log.hh"
|
|
#include "test/lib/random_utils.hh"
|
|
#include "test/lib/key_utils.hh"
|
|
#include "utils/ranges.hh"
|
|
|
|
#include "readers/from_mutations_v2.hh"
|
|
#include "readers/evictable.hh"
|
|
|
|
using namespace std::literals::chrono_literals;
|
|
|
|
// Returns the schema used by the tests below that build mutations by hand:
// keyspace "try1", table "data", one utf8 partition key column "p", one utf8
// clustering key column "c" and one utf8 regular column "v".
// The schema is built once per thread, on first use, and then reused.
schema_ptr test_table_schema() {
    static thread_local auto cached_schema = [] {
        schema_builder sb(make_shared_schema(
                generate_legacy_id("try1", "data"), "try1", "data",
                {{"p", utf8_type}},   // partition key
                {{"c", utf8_type}},   // clustering key
                {{"v", utf8_type}},   // regular columns
                {},                   // static columns
                utf8_type,            // regular column name type
                ""                    // comment
        ));
        return sb.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
|
|
|
|
// Fills a single base partition (p = 0) with 1024 clustering rows, creates a
// materialized view keyed by (v, c, p), waits for the view to be reported as
// built and checks that the view holds one row per base row.
SEASTAR_TEST_CASE(test_builder_with_large_partition) {
    return do_with_cql_env_thread([] (cql_test_env& env) {
        const int nrows = 1024;

        env.execute_cql("create table cf (p int, c int, v int, primary key (p, c))").get();
        for (int ck = 0; ck < nrows; ++ck) {
            env.execute_cql(format("insert into cf (p, c, v) values (0, {:d}, 0)", ck)).get();
        }

        // The wait is set up before the view is created.
        auto view_built = env.local_view_builder().wait_until_built("ks", "vcf");
        env.execute_cql("create materialized view vcf as select * from cf "
                        "where p is not null and c is not null and v is not null "
                        "primary key (v, c, p)").get();
        view_built.get();

        // The view must be recorded as built in the system keyspace.
        auto built = env.get_system_keyspace().local().load_built_views().get0();
        BOOST_REQUIRE_EQUAL(built.size(), 1);
        BOOST_REQUIRE_EQUAL(built[0].second, sstring("vcf"));

        auto count_result = env.execute_cql("select count(*) from vcf where v = 0").get0();
        assert_that(count_result).is_rows().with_size(1);
        assert_that(count_result).is_rows().with_rows({{{long_type->decompose(long(nrows))}}});
    });
}
|
|
|
|
// This test reproduces issue #4213. We have a large base partition with
|
|
// many rows, and the view has the *same* partition key as the base, so all
|
|
// the generated view rows will go to the same view partition. The view
|
|
// builder used to batch up to 128 (view_builder::batch_size) of these view
|
|
// rows into one view mutation, and if the individual rows are big (i.e.,
|
|
// contain very long strings or blobs), this 128-row mutation can be too
|
|
// large to be applied because of our limitation on commit-log segment size
|
|
// (commitlog_segment_size_in_mb, by default 32 MB). When applying the
|
|
// mutation fails, view building retries indefinitely and this test hung.
|
|
SEASTAR_TEST_CASE(test_builder_with_large_partition_2) {
|
|
return do_with_cql_env_thread([] (cql_test_env& e) {
|
|
const int nrows = 64; // meant to be lower than view_builder::batch_size
|
|
const int target_size = 33*1024*1024; // meant to be higher than commitlog_segment_size_in_mb
|
|
e.execute_cql("create table cf (p int, c int, s ascii, primary key (p, c))").get();
|
|
const sstring longstring = sstring(target_size / nrows, 'x');
|
|
for (auto i = 0; i < nrows; ++i) {
|
|
e.execute_cql(format("insert into cf (p, c, s) values (0, {:d}, '{}')", i, longstring)).get();
|
|
}
|
|
|
|
auto f = e.local_view_builder().wait_until_built("ks", "vcf");
|
|
e.execute_cql("create materialized view vcf as select * from cf "
|
|
"where p is not null and c is not null "
|
|
"primary key (p, c)").get();
|
|
|
|
f.get();
|
|
auto built = e.get_system_keyspace().local().load_built_views().get0();
|
|
BOOST_REQUIRE_EQUAL(built.size(), 1);
|
|
BOOST_REQUIRE_EQUAL(built[0].second, sstring("vcf"));
|
|
|
|
auto msg = e.execute_cql("select count(*) from vcf").get0();
|
|
assert_that(msg).is_rows().with_size(1);
|
|
assert_that(msg).is_rows().with_rows({{{long_type->decompose(long(nrows))}}});
|
|
});
|
|
}
|
|
|
|
|
|
// Spreads 1024 base rows over five partitions (p = row index modulo 5),
// builds a view keyed by (v, c, p) and checks that the view is reported as
// built and contains all 1024 rows.
SEASTAR_TEST_CASE(test_builder_with_multiple_partitions) {
    return do_with_cql_env_thread([] (cql_test_env& env) {
        const int nrows = 1024;

        env.execute_cql("create table cf (p int, c int, v int, primary key (p, c))").get();
        for (int row = 0; row < nrows; ++row) {
            env.execute_cql(format("insert into cf (p, c, v) values ({:d}, {:d}, 0)", row % 5, row)).get();
        }

        auto view_built = env.local_view_builder().wait_until_built("ks", "vcf");
        env.execute_cql("create materialized view vcf as select * from cf "
                        "where p is not null and c is not null and v is not null "
                        "primary key (v, c, p)").get();
        view_built.get();

        auto built = env.get_system_keyspace().local().load_built_views().get0();
        BOOST_REQUIRE_EQUAL(built.size(), 1);
        BOOST_REQUIRE_EQUAL(built[0].second, sstring("vcf"));

        auto count_result = env.execute_cql("select count(*) from vcf where v = 0").get0();
        assert_that(count_result).is_rows().with_size(1);
        assert_that(count_result).is_rows().with_rows({{{long_type->decompose(long(nrows))}}});
    });
}
|
|
|
|
// Like the multiple-partitions test, but the partition key of each of the
// 1024 base rows is the row index modulo db::view::view_builder::batch_size.
// Checks that the view is reported as built and contains all 1024 rows.
SEASTAR_TEST_CASE(test_builder_with_multiple_partitions_of_batch_size_rows) {
    return do_with_cql_env_thread([] (cql_test_env& env) {
        const int nrows = 1024;

        env.execute_cql("create table cf (p int, c int, v int, primary key (p, c))").get();
        for (int row = 0; row < nrows; ++row) {
            env.execute_cql(format("insert into cf (p, c, v) values ({:d}, {:d}, 0)", row % db::view::view_builder::batch_size, row)).get();
        }

        auto view_built = env.local_view_builder().wait_until_built("ks", "vcf");
        env.execute_cql("create materialized view vcf as select * from cf "
                        "where p is not null and c is not null and v is not null "
                        "primary key (v, c, p)").get();
        view_built.get();

        auto built = env.get_system_keyspace().local().load_built_views().get0();
        BOOST_REQUIRE_EQUAL(built.size(), 1);
        BOOST_REQUIRE_EQUAL(built[0].second, sstring("vcf"));

        auto count_result = env.execute_cql("select count(*) from vcf where v = 0").get0();
        assert_that(count_result).is_rows().with_size(1);
        assert_that(count_result).is_rows().with_rows({{{long_type->decompose(long(nrows))}}});
    });
}
|
|
|
|
// Creates two views over the same 5000-row base partition, the second one a
// second after the first (presumably while the first build is still running),
// and checks that both end up built and complete.
SEASTAR_TEST_CASE(test_builder_view_added_during_ongoing_build) {
    return do_with_cql_env_thread([] (cql_test_env& env) {
        const int nrows = 5000;

        env.execute_cql("create table cf (p int, c int, v int, primary key (p, c))").get();
        for (int ck = 0; ck < nrows; ++ck) {
            env.execute_cql(format("insert into cf (p, c, v) values (0, {:d}, 0)", ck)).get();
        }

        auto first_built = env.local_view_builder().wait_until_built("ks", "vcf1");
        auto second_built = env.local_view_builder().wait_until_built("ks", "vcf2");

        env.execute_cql("create materialized view vcf1 as select * from cf "
                        "where p is not null and c is not null and v is not null "
                        "primary key (v, c, p)").get();

        sleep(1s).get();

        env.execute_cql("create materialized view vcf2 as select * from cf "
                        "where p is not null and c is not null and v is not null "
                        "primary key (v, p, c)").get();

        // Wait for the later view first, then for the earlier one.
        second_built.get();
        first_built.get();

        auto built = env.get_system_keyspace().local().load_built_views().get0();
        BOOST_REQUIRE_EQUAL(built.size(), 2);

        // Runs a count(*) query and expects a single row holding `expected`.
        auto require_count = [&] (const char* query, long expected) {
            auto count_result = env.execute_cql(query).get0();
            assert_that(count_result).is_rows().with_size(1);
            assert_that(count_result).is_rows().with_rows({{{long_type->decompose(expected)}}});
        };
        require_count("select count(*) from vcf1 where v = 0", long(nrows));
        require_count("select count(*) from vcf2 where v = 0", long(nrows));
    });
}
|
|
|
|
std::mt19937 random_generator() {
|
|
// In case of errors, replace the seed with a fixed value to get a deterministic run.
|
|
auto seed = tests::random::get_int<uint32_t>();
|
|
std::cout << "Random seed: " << seed << "\n";
|
|
return std::mt19937(seed);
|
|
}
|
|
|
|
// Returns a buffer of `size` bytes, each drawn uniformly from [0, 255] using
// the caller's generator `gen` (which is advanced as a side effect).
bytes random_bytes(size_t size, std::mt19937& gen) {
    // One distribution object per thread, shared by all calls.
    static thread_local std::uniform_int_distribution<int> byte_dist(0, std::numeric_limits<uint8_t>::max());

    bytes buffer(bytes::initialized_later(), size);
    for (auto& b : buffer) {
        b = byte_dist(gen);
    }
    return buffer;
}
|
|
|
|
// Builds two views over a base table made of 4 partitions with random
// 128-byte blob keys and 1000 rows each. The second view is created one
// second after the first. Both views must end up built and hold all 4000 rows.
SEASTAR_TEST_CASE(test_builder_across_tokens_with_large_partitions) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        auto gen = random_generator();

        e.execute_cql("create table cf (p blob, c int, v int, primary key (p, c))").get();

        // A fresh random key (as a hex string) is produced each time the
        // transformed range is dereferenced, i.e. once per outer iteration.
        auto make_key = [&] (auto) { return to_hex(random_bytes(128, gen)); };
        for (auto&& k : boost::irange(0, 4) | boost::adaptors::transformed(make_key)) {
            for (auto i = 0; i < 1000; ++i) {
                e.execute_cql(format("insert into cf (p, c, v) values (0x{}, {:d}, 0)", k, i)).get();
            }
        }

        auto f1 = e.local_view_builder().wait_until_built("ks", "vcf1");
        auto f2 = e.local_view_builder().wait_until_built("ks", "vcf2");

        e.execute_cql("create materialized view vcf1 as select * from cf "
                      "where p is not null and c is not null and v is not null "
                      "primary key (v, c, p)").get();

        // Delay the second view, presumably so it is added while the first
        // one is still being built.
        sleep(1s).get();

        e.execute_cql("create materialized view vcf2 as select * from cf "
                      "where p is not null and c is not null and v is not null "
                      "primary key (v, p, c)").get();

        f2.get();
        f1.get();
        auto built = e.get_system_keyspace().local().load_built_views().get0();
        BOOST_REQUIRE_EQUAL(built.size(), 2);

        auto msg = e.execute_cql("select count(*) from vcf1 where v = 0").get0();
        assert_that(msg).is_rows().with_size(1);
        assert_that(msg).is_rows().with_rows({{{long_type->decompose(4000L)}}});

        msg = e.execute_cql("select count(*) from vcf2 where v = 0").get0();
        assert_that(msg).is_rows().with_size(1);
        assert_that(msg).is_rows().with_rows({{{long_type->decompose(4000L)}}});
    });
}
|
|
|
|
// Builds two views over a base table made of 1000 partitions with random
// 128-byte blob keys and 4 rows each. The second view is created one second
// after the first. Both views must end up built and hold all 4000 rows.
SEASTAR_TEST_CASE(test_builder_across_tokens_with_small_partitions) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        auto gen = random_generator();

        e.execute_cql("create table cf (p blob, c int, v int, primary key (p, c))").get();

        // A fresh random key (as a hex string) is produced each time the
        // transformed range is dereferenced, i.e. once per outer iteration.
        auto make_key = [&] (auto) { return to_hex(random_bytes(128, gen)); };
        for (auto&& k : boost::irange(0, 1000) | boost::adaptors::transformed(make_key)) {
            for (auto i = 0; i < 4; ++i) {
                e.execute_cql(format("insert into cf (p, c, v) values (0x{}, {:d}, 0)", k, i)).get();
            }
        }

        auto f1 = e.local_view_builder().wait_until_built("ks", "vcf1");
        auto f2 = e.local_view_builder().wait_until_built("ks", "vcf2");

        e.execute_cql("create materialized view vcf1 as select * from cf "
                      "where p is not null and c is not null and v is not null "
                      "primary key (v, c, p)").get();

        // Delay the second view, presumably so it is added while the first
        // one is still being built.
        sleep(1s).get();

        e.execute_cql("create materialized view vcf2 as select * from cf "
                      "where p is not null and c is not null and v is not null "
                      "primary key (v, p, c)").get();

        f2.get();
        f1.get();

        auto built = e.get_system_keyspace().local().load_built_views().get0();
        BOOST_REQUIRE_EQUAL(built.size(), 2);

        auto msg = e.execute_cql("select count(*) from vcf1 where v = 0").get0();
        assert_that(msg).is_rows().with_size(1);
        assert_that(msg).is_rows().with_rows({{{long_type->decompose(4000L)}}});

        msg = e.execute_cql("select count(*) from vcf2 where v = 0").get0();
        assert_that(msg).is_rows().with_size(1);
        assert_that(msg).is_rows().with_rows({{{long_type->decompose(4000L)}}});
    });
}
|
|
|
|
// Builds a view over a base partition in which part of the data was deleted
// before the view was created.
// 100 rows are inserted with c1 alternating between 0 and 1 and c2 = 0..99;
// then all rows with c1 = 0 and the c1 = 1 rows with 50 <= c2 < 101 are
// deleted. The view is expected to contain the 25 remaining rows.
SEASTAR_TEST_CASE(test_builder_with_tombstones) {
    return do_with_cql_env_thread([] (cql_test_env& env) {
        env.execute_cql("create table cf (p int, c1 int, c2 int, v int, primary key (p, c1, c2))").get();

        for (int row = 0; row < 100; ++row) {
            env.execute_cql(format("insert into cf (p, c1, c2, v) values (0, {:d}, {:d}, 1)", row % 2, row)).get();
        }

        env.execute_cql("delete from cf where p = 0 and c1 = 0").get();
        env.execute_cql("delete from cf where p = 0 and c1 = 1 and c2 >= 50 and c2 < 101").get();

        auto view_built = env.local_view_builder().wait_until_built("ks", "vcf");
        env.execute_cql("create materialized view vcf as select * from cf "
                        "where p is not null and c1 is not null and c2 is not null and v is not null "
                        "primary key ((v, p), c1, c2)").get();
        view_built.get();

        auto built = env.get_system_keyspace().local().load_built_views().get0();
        BOOST_REQUIRE_EQUAL(built.size(), 1);

        auto view_rows = env.execute_cql("select * from vcf").get0();
        assert_that(view_rows).is_rows().with_size(25);
    });
}
|
|
|
|
// Writes half of the base partitions before the view is created and the other
// half after the view creation statement returned, then checks that the view
// eventually contains every base row.
SEASTAR_TEST_CASE(test_builder_with_concurrent_writes) {
    return do_with_cql_env_thread([] (cql_test_env& env) {
        auto gen = random_generator();

        env.execute_cql("create table cf (p blob, c int, v int, primary key (p, c))").get();

        const size_t rows = 6864;
        const size_t rows_per_partition = 4;
        const size_t partitions = rows / rows_per_partition;

        // Keep drawing random keys until we have `partitions` distinct ones.
        std::unordered_set<sstring> keys;
        while (keys.size() != partitions) {
            keys.insert(to_hex(random_bytes(128, gen)));
        }

        // Inserts rows_per_partition rows for every key in [first, last).
        auto insert_partitions = [&] (auto first, auto last) {
            for (; first != last; ++first) {
                for (size_t ck = 0; ck < rows_per_partition; ++ck) {
                    env.execute_cql(format("insert into cf (p, c, v) values (0x{}, {:d}, 0)", *first, ck)).get();
                }
            }
        };

        const auto midpoint = std::next(keys.begin(), keys.size() / 2);
        insert_partitions(keys.begin(), midpoint);

        auto view_built = env.local_view_builder().wait_until_built("ks", "vcf");
        env.execute_cql("create materialized view vcf as select * from cf "
                        "where p is not null and c is not null and v is not null "
                        "primary key (v, c, p)").get();

        insert_partitions(midpoint, keys.end());

        view_built.get();
        eventually([&] {
            auto view_rows = env.execute_cql("select * from vcf").get0();
            assert_that(view_rows).is_rows().with_size(rows);
        });
    });
}
|
|
|
|
// Creates a view over 1000 partitions of 5 rows each and drops it right
// away, then waits until none of the view-build bookkeeping tables queried
// below has any rows left.
SEASTAR_TEST_CASE(test_builder_with_concurrent_drop) {
    return do_with_cql_env_thread([] (cql_test_env& env) {
        auto gen = random_generator();

        env.execute_cql("create table cf (p blob, c int, v int, primary key (p, c))").get();

        auto make_key = [&] (auto) { return to_hex(random_bytes(128, gen)); };
        for (auto&& pk : boost::irange(0, 1000) | boost::adaptors::transformed(make_key)) {
            for (int ck = 0; ck < 5; ++ck) {
                env.execute_cql(format("insert into cf (p, c, v) values (0x{}, {:d}, 0)", pk, ck)).get();
            }
        }

        env.execute_cql("create materialized view vcf as select * from cf "
                        "where p is not null and c is not null and v is not null "
                        "primary key (v, c, p)").get();

        env.execute_cql("drop materialized view vcf").get();

        // Queries the given table and expects an empty row set.
        auto require_empty = [&] (const char* query) {
            auto result = env.execute_cql(query).get0();
            assert_that(result).is_rows().is_empty();
        };

        eventually([&] {
            require_empty("select * from system.scylla_views_builds_in_progress");
            require_empty("select * from system.built_views");
            require_empty("select * from system.views_builds_in_progress");
            require_empty("select * from system_distributed.view_build_status");
        });
    });
}
|
|
|
|
// Exercises the view update generator with staging sstables.
//
// 1024 rows are written through CQL for key1 and 512 rows for key2. Rows
// c1024..c1279 for key1 are then written directly into staging sstables (a
// new sstable is started whenever the row index is a multiple of 10) and the
// sstables are registered with the view update generator in two rounds: the
// first ten, then the remainder. After each round, and again at the end, all
// registration units must be available. Finally the contents of table t are
// verified row by row for key1.
SEASTAR_TEST_CASE(test_view_update_generator) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        e.execute_cql("create table t (p text, c text, v text, primary key (p, c))").get();
        auto s = test_table_schema();

        auto insert_id = e.prepare("insert into t (p, c, v) values (?, ?, ?)").get();

        auto keys = tests::generate_partition_keys(2, s);
        const auto& key1 = keys[0];
        const auto& key2 = keys[1];

        // Serialized partition key values, computed once and reused by every
        // prepared statement execution below.
        const auto key1_raw = cql3::raw_value::make_value(key1.key().explode().front());
        const auto key2_raw = cql3::raw_value::make_value(key2.key().explode().front());

        for (auto i = 0; i < 1024; ++i) {
            e.execute_prepared(insert_id, {
                    key1_raw,
                    cql3::raw_value::make_value(serialized(format("c{}", i))),
                    cql3::raw_value::make_value(serialized("x"))}).get();
        }
        for (auto i = 0; i < 512; ++i) {
            e.execute_prepared(insert_id, {
                    key2_raw,
                    cql3::raw_value::make_value(serialized(format("c{}", i))),
                    cql3::raw_value::make_value(serialized(format("{}", i + 1)))}).get();
        }
        auto& view_update_generator = e.local_view_update_generator();

        std::vector<shared_sstable> ssts;

        lw_shared_ptr<replica::table> t = e.local_db().find_column_family("ks", "t").shared_from_this();

        // Writes mutation m into a new streaming staging sstable of table t,
        // opens it, adds it to the table and returns it.
        auto write_to_sstable = [&] (mutation m) {
            auto sst = t->make_streaming_staging_sstable();
            sstables::sstable_writer_config sst_cfg = e.db().local().get_user_sstables_manager().configure_writer("test");
            auto& pc = service::get_local_streaming_priority();

            auto permit = e.local_db().get_reader_concurrency_semaphore().make_tracking_only_permit(s.get(), "test", db::no_timeout, {});
            sst->write_components(make_flat_mutation_reader_from_mutations_v2(m.schema(), std::move(permit), m), 1ul, s, sst_cfg, {}, pc).get();
            sst->open_data().get();
            t->add_sstable_and_update_cache(sst).get();
            return sst;
        };

        mutation m(s, key1);
        auto col = s->get_column_definition("v");
        for (int i = 1024; i < 1280; ++i) {
            auto& row = m.partition().clustered_row(*s, clustering_key::from_exploded(*s, {to_bytes(fmt::format("c{}", i))}));
            row.cells().apply(*col, atomic_cell::make_live(*col->type, 2345, col->type->decompose(sstring(fmt::format("v{}", i)))));
            // Scatter the data in a bunch of different sstables, so we
            // can test the registration semaphore of the view update
            // generator
            if (!(i % 10)) {
                ssts.push_back(write_to_sstable(std::exchange(m, mutation(s, key1))));
            }
        }
        // Flush whatever rows were accumulated after the last multiple of 10.
        ssts.push_back(write_to_sstable(std::move(m)));

        BOOST_REQUIRE_EQUAL(view_update_generator.available_register_units(), db::view::view_update_generator::registration_queue_size);

        parallel_for_each(ssts.begin(), ssts.begin() + 10, [&] (shared_sstable& sst) {
            return view_update_generator.register_staging_sstable(sst, t);
        }).get();

        BOOST_REQUIRE_EQUAL(view_update_generator.available_register_units(), db::view::view_update_generator::registration_queue_size);

        parallel_for_each(ssts.begin() + 10, ssts.end(), [&] (shared_sstable& sst) {
            return view_update_generator.register_staging_sstable(sst, t);
        }).get();

        BOOST_REQUIRE_EQUAL(view_update_generator.available_register_units(), db::view::view_update_generator::registration_queue_size);

        auto select_by_p_id = e.prepare("SELECT * FROM t WHERE p = ?").get();
        auto select_by_p_and_c_id = e.prepare("SELECT * FROM t WHERE p = ? and c = ?").get();

        eventually([&] {
            auto msg = e.execute_prepared(select_by_p_id, {key1_raw}).get();
            assert_that(msg).is_rows().with_size(1280);
            msg = e.execute_prepared(select_by_p_id, {key2_raw}).get();
            assert_that(msg).is_rows().with_size(512);
            const auto key1_dv = key1.key().explode().front();

            // Rows written through CQL keep their value "x".
            for (int i = 0; i < 1024; ++i) {
                auto msg = e.execute_prepared(select_by_p_and_c_id, {key1_raw, cql3::raw_value::make_value(serialized(format("c{}", i)))}).get();
                assert_that(msg).is_rows().with_size(1).with_row({
                        {key1_dv},
                        {utf8_type->decompose(sstring(fmt::format("c{}", i)))},
                        {utf8_type->decompose(sstring("x"))}
                });
            }
            // Rows written through the staging sstables carry the value "v<i>".
            for (int i = 1024; i < 1280; ++i) {
                auto msg = e.execute_prepared(select_by_p_and_c_id, {key1_raw, cql3::raw_value::make_value(serialized(format("c{}", i)))}).get();
                assert_that(msg).is_rows().with_size(1).with_row({
                        {key1_dv},
                        {utf8_type->decompose(sstring(fmt::format("c{}", i)))},
                        {utf8_type->decompose(sstring(fmt::format("v{}", i)))}
                });
            }
        });

        BOOST_REQUIRE_EQUAL(view_update_generator.available_register_units(), db::view::view_update_generator::registration_queue_size);
    });
}
|
|
|
|
// Registers a staging sstable with the view update generator while the reader
// concurrency semaphore (looked up under the streaming scheduling group) has
// only enough resources left to admit a single reader, and waits until the
// semaphore reports at least one permit-based eviction.
// Cache and commitlog are disabled for this test.
SEASTAR_THREAD_TEST_CASE(test_view_update_generator_deadlock) {
    cql_test_config test_cfg;
    auto& db_cfg = *test_cfg.db_config;

    db_cfg.enable_cache(false);
    db_cfg.enable_commitlog(false);

    do_with_cql_env_thread([] (cql_test_env& e) {
        e.execute_cql("create table t (p text, c text, v text, primary key (p, c))").get();
        e.execute_cql("create materialized view tv as select * from t "
                      "where p is not null and c is not null and v is not null "
                      "primary key (v, c, p)").get();

        auto s = test_table_schema();
        const auto key = tests::generate_partition_key(s);
        const auto key1_raw = cql3::raw_value::make_value(key.key().explode().front());

        auto insert_id = e.prepare("insert into t (p, c, v) values (?, ?, 'x')").get();

        for (auto i = 0; i < 1024; ++i) {
            e.execute_prepared(insert_id, {key1_raw, cql3::raw_value::make_value(serialized(format("c{}", i)))}).get();
        }

        // We need data on the disk so that the pre-image reader is forced to go to disk.
        e.db().invoke_on_all([] (replica::database& db) {
            return db.flush_all_memtables();
        }).get();

        auto& view_update_generator = e.local_view_update_generator();

        lw_shared_ptr<replica::table> t = e.local_db().find_column_family("ks", "t").shared_from_this();

        // One mutation overwriting all 1024 rows with a 4 KiB value each.
        mutation m(s, key);
        auto col = s->get_column_definition("v");
        const auto filler_val_size = 4 * 1024;
        const auto filler_val = sstring(filler_val_size, 'a');
        for (int i = 0; i < 1024; ++i) {
            auto& row = m.partition().clustered_row(*s, clustering_key::from_exploded(*s, {to_bytes(fmt::format("c{}", i))}));
            row.cells().apply(*col, atomic_cell::make_live(*col->type, 2345, col->type->decompose(filler_val)));
        }

        // Write the mutation into a staging sstable and attach it to the table.
        auto sst = t->make_streaming_staging_sstable();
        sstables::sstable_writer_config sst_cfg = e.local_db().get_user_sstables_manager().configure_writer("test");
        auto& pc = service::get_local_streaming_priority();

        auto permit = e.local_db().get_reader_concurrency_semaphore().make_tracking_only_permit(s.get(), "test", db::no_timeout, {});
        sst->write_components(make_flat_mutation_reader_from_mutations_v2(m.schema(), std::move(permit), m), 1ul, s, sst_cfg, {}, pc).get();
        sst->open_data().get();
        t->add_sstable_and_update_cache(sst).get();

        // Look up the semaphore while running in the streaming scheduling group.
        auto& sem = *with_scheduling_group(e.local_db().get_streaming_scheduling_group(), [&] () {
            return &e.local_db().get_reader_concurrency_semaphore();
        }).get0();

        // consume all units except what is needed to admit a single reader.
        auto sponge_permit = sem.make_tracking_only_permit(s.get(), "sponge", db::no_timeout, {});
        // `resources` is not read again; it holds the consumed units until the
        // end of this scope.
        auto resources = sponge_permit.consume_resources(sem.available_resources() - reader_resources{1, replica::new_reader_base_cost});

        testlog.info("res = [.count={}, .memory={}]", sem.available_resources().count, sem.available_resources().memory);

        BOOST_REQUIRE_EQUAL(sem.get_stats().permit_based_evictions, 0);

        view_update_generator.register_staging_sstable(sst, t).get();

        eventually_true([&] {
            return sem.get_stats().permit_based_evictions > 0;
        });
    }, std::move(test_cfg)).get();
}
|
|
|
|
// Test that registered sstables (and semaphore units) are not leaked when
|
|
// sstables are register *while* a batch of sstables are processed.
|
|
SEASTAR_THREAD_TEST_CASE(test_view_update_generator_register_semaphore_unit_leak) {
|
|
cql_test_config test_cfg;
|
|
auto& db_cfg = *test_cfg.db_config;
|
|
|
|
db_cfg.enable_cache(false);
|
|
db_cfg.enable_commitlog(false);
|
|
|
|
do_with_cql_env_thread([] (cql_test_env& e) {
|
|
e.execute_cql("create table t (p text, c text, v text, primary key (p, c))").get();
|
|
e.execute_cql("create materialized view tv as select * from t "
|
|
"where p is not null and c is not null and v is not null "
|
|
"primary key (v, c, p)").get();
|
|
|
|
auto s = test_table_schema();
|
|
const auto key = tests::generate_partition_key(s);
|
|
const auto key1_raw = cql3::raw_value::make_value(key.key().explode().front());
|
|
|
|
auto insert_id = e.prepare("insert into t (p, c, v) values (?, ?, 'x')").get();
|
|
|
|
for (auto i = 0; i < 1024; ++i) {
|
|
e.execute_prepared(insert_id, {key1_raw, cql3::raw_value::make_value(serialized(format("c{}", i)))}).get();
|
|
}
|
|
|
|
// We need data on the disk so that the pre-image reader is forced to go to disk.
|
|
e.db().invoke_on_all([] (replica::database& db) {
|
|
return db.flush_all_memtables();
|
|
}).get();
|
|
|
|
auto& view_update_generator = e.local_view_update_generator();
|
|
|
|
lw_shared_ptr<replica::table> t = e.local_db().find_column_family("ks", "t").shared_from_this();
|
|
|
|
auto make_sstable = [&] {
|
|
mutation m(s, key);
|
|
auto col = s->get_column_definition("v");
|
|
const auto val = sstring(10, 'a');
|
|
for (int i = 0; i < 1024; ++i) {
|
|
auto& row = m.partition().clustered_row(*s, clustering_key::from_exploded(*s, {to_bytes(fmt::format("c{}", i))}));
|
|
row.cells().apply(*col, atomic_cell::make_live(*col->type, 2345, col->type->decompose(val)));
|
|
}
|
|
|
|
auto sst = t->make_streaming_staging_sstable();
|
|
sstables::sstable_writer_config sst_cfg = e.local_db().get_user_sstables_manager().configure_writer("test");
|
|
auto& pc = service::get_local_streaming_priority();
|
|
|
|
auto permit = e.local_db().get_reader_concurrency_semaphore().make_tracking_only_permit(s.get(), "test", db::no_timeout, {});
|
|
sst->write_components(make_flat_mutation_reader_from_mutations_v2(m.schema(), std::move(permit), m), 1ul, s, sst_cfg, {}, pc).get();
|
|
sst->open_data().get();
|
|
t->add_sstable_and_update_cache(sst).get();
|
|
return sst;
|
|
};
|
|
|
|
std::vector<sstables::shared_sstable> prepared_sstables;
|
|
|
|
// We need 2 * N + 1 sstables. N should be at least 5 (number of units
|
|
// on the register semaphore) + 1 (just to make sure the returned future
|
|
// blocks). While the initial batch is processed we want to register N
|
|
// more sstables, + 1 to detect the leak (N units will be returned from
|
|
// the initial batch). See below for more details.
|
|
const auto num_sstables = (view_update_generator.available_register_units() + 1) * 2 + 1;
|
|
for (auto i = 0; i < num_sstables; ++i) {
|
|
prepared_sstables.push_back(make_sstable());
|
|
}
|
|
|
|
// First batch: register N sstables.
|
|
while (view_update_generator.available_register_units()) {
|
|
auto fut = view_update_generator.register_staging_sstable(std::move(prepared_sstables.back()), t);
|
|
prepared_sstables.pop_back();
|
|
BOOST_REQUIRE(fut.available());
|
|
}
|
|
|
|
// Make sure we consumed all units and thus the register future blocks.
|
|
auto fut1 = view_update_generator.register_staging_sstable(std::move(prepared_sstables.back()), t);
|
|
prepared_sstables.pop_back();
|
|
BOOST_REQUIRE(!fut1.available());
|
|
|
|
std::vector<future<>> futures;
|
|
futures.reserve(prepared_sstables.size());
|
|
|
|
// While the first batch is processed, concurrently register the
|
|
// remaining N + 1 sstables, yielding in-between so the first batch
|
|
// processing can progress.
|
|
while (!prepared_sstables.empty()) {
|
|
thread::yield();
|
|
futures.emplace_back(view_update_generator.register_staging_sstable(std::move(prepared_sstables.back()), t));
|
|
prepared_sstables.pop_back();
|
|
}
|
|
|
|
// Make sure the first batch is processed.
|
|
fut1.get();
|
|
|
|
auto fut_res = when_all_succeed(futures.begin(), futures.end());
|
|
|
|
auto watchdog_timer_done = make_ready_future<>();
|
|
|
|
// Watchdog timer which will break out of the deadlock and fail the test.
|
|
timer watchdog_timer([&view_update_generator, &watchdog_timer_done] {
|
|
// Re-start it so stop() on shutdown doesn't crash.
|
|
watchdog_timer_done = watchdog_timer_done.then([&] {
|
|
return view_update_generator.stop().then([&] {
|
|
return view_update_generator.start();
|
|
});
|
|
});
|
|
});
|
|
|
|
watchdog_timer.arm(std::chrono::seconds(60));
|
|
|
|
// Wait on the second batch, will fail if the watchdog timer fails.
|
|
fut_res.get();
|
|
|
|
watchdog_timer.cancel();
|
|
|
|
watchdog_timer_done.get();
|
|
}, std::move(test_cfg)).get();
|
|
}
|
|
|
|
SEASTAR_THREAD_TEST_CASE(test_view_update_generator_buffering) {
|
|
using partition_size_map = std::map<dht::decorated_key, size_t, dht::ring_position_less_comparator>;
|
|
|
|
class consumer_verifier {
|
|
schema_ptr _schema;
|
|
reader_concurrency_semaphore& _semaphore;
|
|
const partition_size_map& _partition_rows;
|
|
std::vector<mutation>& _collected_muts;
|
|
std::unique_ptr<row_locker> _rl;
|
|
std::unique_ptr<row_locker::stats> _rl_stats;
|
|
clustering_key::less_compare _less_cmp;
|
|
const size_t _max_rows_soft;
|
|
const size_t _max_rows_hard;
|
|
size_t _buffer_rows = 0;
|
|
bool& _ok;
|
|
|
|
private:
|
|
static size_t rows_in_limit(size_t l) {
|
|
const size_t _100kb = 100 * 1024;
|
|
// round up
|
|
return l / _100kb + std::min(size_t(1), l % _100kb);
|
|
}
|
|
|
|
static size_t rows_in_mut(const mutation& m) {
|
|
return std::distance(m.partition().clustered_rows().begin(), m.partition().clustered_rows().end());
|
|
}
|
|
|
|
// Validates one mutation handed over by the consumer and folds it into
// _collected_muts:
//  - it must be non-empty and must not push the buffered row count past
//    the hard limit;
//  - if the last collected mutation is still short of its partition's
//    expected row count, the new mutation must continue that partition
//    with strictly greater clustering keys and is merged into it;
//  - otherwise it must belong to a different partition and is appended.
// Finally, the buffered row counter is reset when either limit is reached.
void check(mutation mut) {
    // A fresh permit has to be obtainable (the deadline is "now"), even
    // though the staging reader consumed all resources.
    auto permit = _semaphore.obtain_permit(_schema.get(), "consumer_verifier", replica::new_reader_base_cost, db::timeout_clock::now(), {}).get0();

    const size_t current_rows = rows_in_mut(mut);
    const auto total_rows = _partition_rows.at(mut.decorated_key());
    _buffer_rows += current_rows;

    testlog.trace("consumer_verifier::check(): key={}, rows={}/{}, _buffer={}",
            partition_key::with_schema_wrapper(*_schema, mut.key()),
            current_rows,
            total_rows,
            _buffer_rows);

    // Neither this mutation alone nor the rows accumulated since the last
    // reset may exceed the hard limit, and the mutation may not be empty.
    BOOST_REQUIRE(current_rows);
    BOOST_REQUIRE(current_rows <= _max_rows_hard);
    BOOST_REQUIRE(_buffer_rows <= _max_rows_hard);

    // True when the most recently collected mutation has fewer rows than
    // its partition is expected to have in total.
    const bool last_partition_incomplete = !_collected_muts.empty()
            && rows_in_mut(_collected_muts.back()) < _partition_rows.at(_collected_muts.back().decorated_key());

    if (!last_partition_incomplete) {
        // The new mutation is a new partition.
        if (!_collected_muts.empty()) {
            BOOST_REQUIRE(!_collected_muts.back().decorated_key().equal(*mut.schema(), mut.decorated_key()));
        }
        _collected_muts.push_back(std::move(mut));
    } else {
        // The current partition doesn't have all of its rows yet, verify
        // that the new mutation contains the next rows for the same
        // partition.
        BOOST_REQUIRE(_collected_muts.back().decorated_key().equal(*mut.schema(), mut.decorated_key()));
        const auto& previous_ckey = (--_collected_muts.back().partition().clustered_rows().end())->key();
        const auto& next_ckey = mut.partition().clustered_rows().begin()->key();
        BOOST_REQUIRE(_less_cmp(previous_ckey, next_ckey));

        // Merge the continuation into the mutation collected so far.
        auto& unfinished = _collected_muts.back();
        mutation_application_stats stats;
        unfinished.partition().apply(*_schema, mut.partition(), *mut.schema(), stats);
    }

    if (_buffer_rows >= _max_rows_hard) {
        // buffer flushed on hard limit
        _buffer_rows = 0;
        testlog.trace("consumer_verifier::check(): buffer ends on hard limit");
    } else if (_buffer_rows >= _max_rows_soft) {
        // buffer flushed on soft limit
        _buffer_rows = 0;
        testlog.trace("consumer_verifier::check(): buffer ends on soft limit");
    }
}
|
|
|
|
public:
|
|
// Builds a verifier for one consume run.
//   schema         - schema of the mutations being checked
//   sem            - semaphore from which check() obtains a permit
//   partition_rows - expected total row count per partition key
//   collected_muts - output vector that receives the reassembled mutations
//   ok             - flag cleared by operator() when a check throws
// The soft/hard row limits are derived from view_updating_consumer's buffer
// size limits via rows_in_limit().
consumer_verifier(
        schema_ptr schema,
        reader_concurrency_semaphore& sem,
        const partition_size_map& partition_rows,
        std::vector<mutation>& collected_muts,
        bool& ok)
    : _schema(std::move(schema)),
      _semaphore(sem),
      _partition_rows(partition_rows),
      _collected_muts(collected_muts),
      // These initializers read the _schema member, not the moved-from parameter.
      _rl(std::make_unique<row_locker>(_schema)),
      _rl_stats(std::make_unique<row_locker::stats>()),
      _less_cmp(*_schema),
      _max_rows_soft(rows_in_limit(db::view::view_updating_consumer::buffer_size_soft_limit)),
      _max_rows_hard(rows_in_limit(db::view::view_updating_consumer::buffer_size_hard_limit)),
      _ok(ok)
{ }
|
|
|
|
// Entry point invoked with each mutation: runs check() on it and then
// returns a lock on the partition key of the last collected mutation.
// Any exception escaping check() clears _ok and is reported via BOOST_FAIL.
future<row_locker::lock_holder> operator()(mutation mut) {
    try {
        check(std::move(mut));
    } catch (...) {
        // Record the failure in the caller-visible flag before reporting it.
        _ok = false;
        BOOST_FAIL(fmt::format("consumer_verifier::operator(): caught unexpected exception {}", std::current_exception()));
    }

    const auto& last_key = _collected_muts.back().decorated_key();
    const bool exclusive = true;
    return _rl->lock_pk(last_key, exclusive, db::no_timeout, *_rl_stats);
}
|
|
};
|
|
|
|
reader_concurrency_semaphore sem(reader_concurrency_semaphore::for_tests{}, get_name(), 1, replica::new_reader_base_cost);
|
|
auto stop_sem = deferred_stop(sem);
|
|
|
|
auto schema = schema_builder("ks", "cf")
|
|
.with_column("pk", int32_type, column_kind::partition_key)
|
|
.with_column("ck", int32_type, column_kind::clustering_key)
|
|
.with_column("v", bytes_type)
|
|
.build();
|
|
|
|
const auto blob_100kb = bytes(100 * 1024, bytes::value_type(0xab));
|
|
const abort_source as;
|
|
|
|
const auto partition_size_sets = std::vector<std::vector<int>>{{12}, {8, 4}, {8, 16}, {22}, {8, 8, 8, 8}, {8, 8, 8, 16, 8}, {8, 20, 16, 16}, {50}, {21}, {21, 2}};
|
|
const auto max_partition_set_size = std::ranges::max_element(partition_size_sets, [] (const std::vector<int>& a, const std::vector<int>& b) { return a.size() < b.size(); })->size();
|
|
auto pkeys = ranges::to<std::vector<dht::decorated_key>>(boost::irange(size_t{0}, max_partition_set_size) | boost::adaptors::transformed([schema] (int i) {
|
|
return dht::decorate_key(*schema, partition_key::from_single_value(*schema, int32_type->decompose(data_value(i))));
|
|
}));
|
|
std::ranges::sort(pkeys, dht::ring_position_less_comparator(*schema));
|
|
|
|
for (auto partition_sizes_100kb : partition_size_sets) {
|
|
testlog.debug("partition_sizes_100kb={}", partition_sizes_100kb);
|
|
partition_size_map partition_rows{dht::ring_position_less_comparator(*schema)};
|
|
std::vector<mutation> muts;
|
|
auto pk = 0;
|
|
for (auto partition_size_100kb : partition_sizes_100kb) {
|
|
auto mut_desc = tests::data_model::mutation_description(pkeys.at(pk++).key().explode(*schema));
|
|
for (auto ck = 0; ck < partition_size_100kb; ++ck) {
|
|
mut_desc.add_clustered_cell({int32_type->decompose(data_value(ck))}, "v", tests::data_model::mutation_description::value(blob_100kb));
|
|
}
|
|
muts.push_back(mut_desc.build(schema));
|
|
partition_rows.emplace(muts.back().decorated_key(), partition_size_100kb);
|
|
}
|
|
|
|
std::ranges::sort(muts, [less = dht::ring_position_less_comparator(*schema)] (const mutation& a, const mutation& b) {
|
|
return less(a.decorated_key(), b.decorated_key());
|
|
});
|
|
|
|
auto permit = sem.obtain_permit(schema.get(), get_name(), replica::new_reader_base_cost, db::no_timeout, {}).get0();
|
|
|
|
auto mt = make_lw_shared<replica::memtable>(schema);
|
|
for (const auto& mut : muts) {
|
|
mt->apply(mut);
|
|
}
|
|
|
|
auto p = make_manually_paused_evictable_reader_v2(
|
|
mt->as_data_source(),
|
|
schema,
|
|
permit,
|
|
query::full_partition_range,
|
|
schema->full_slice(),
|
|
service::get_local_streaming_priority(),
|
|
nullptr,
|
|
::mutation_reader::forwarding::no);
|
|
auto& staging_reader = std::get<0>(p);
|
|
auto& staging_reader_handle = std::get<1>(p);
|
|
auto close_staging_reader = deferred_close(staging_reader);
|
|
|
|
std::vector<mutation> collected_muts;
|
|
bool ok = true;
|
|
|
|
staging_reader.consume_in_thread(db::view::view_updating_consumer(schema, permit, as, staging_reader_handle,
|
|
consumer_verifier(schema, sem, partition_rows, collected_muts, ok)));
|
|
|
|
BOOST_REQUIRE(ok);
|
|
|
|
BOOST_REQUIRE_EQUAL(muts.size(), collected_muts.size());
|
|
for (size_t i = 0; i < muts.size(); ++i) {
|
|
testlog.trace("compare mutation {}", i);
|
|
BOOST_REQUIRE_EQUAL(muts[i], collected_muts[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Checking for view_builds_in_progress used to assume that certain values
|
|
// (e.g. first_token) are always present in every row, which led to using
|
|
// freed memory. In order to check for regressions, a row with missing
|
|
// values is inserted and ensured that it produces a status without any
|
|
// views in progress.
|
|
// Inserts a view-build-progress row that sets only keyspace_name, view_name
// and cpu_id, then requires that load_view_build_progress() returns an
// empty result.
SEASTAR_TEST_CASE(test_load_view_build_progress_with_values_missing) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        // Only the three listed columns are given a value; every other
        // column of the row is left unset.
        const auto insert_partial_row = format(
                "INSERT INTO system.{} (keyspace_name, view_name, cpu_id) VALUES ('ks', 'v', {})",
                db::system_keyspace::v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS,
                this_shard_id());
        cquery_nofail(e, insert_partial_row);

        BOOST_REQUIRE(e.get_system_keyspace().local().load_view_build_progress().get0().empty());
    });
}
|