/* * Copyright (C) 2015 ScyllaDB */ /* * This file is part of Scylla. * * Scylla is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Scylla is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Scylla. If not, see . */ #include #include #include #include #include "sstable_test.hh" #include "sstables/key.hh" #include #include #include "sstables/sstables.hh" #include "database.hh" #include "timestamp.hh" #include "schema_builder.hh" #include "mutation_reader.hh" #include "mutation_source_test.hh" #include "partition_slice_builder.hh" #include "tmpdir.hh" #include "memtable-sstable.hh" #include "tests/index_reader_assertions.hh" #include "tests/test_services.hh" #include "flat_mutation_reader_assertions.hh" #include "simple_schema.hh" #include "tests/sstable_utils.hh" #include "tests/make_random_string.hh" #include "data_model.hh" #include "random-utils.hh" using namespace sstables; using namespace std::chrono_literals; static db::nop_large_data_handler nop_lp_handler; SEASTAR_THREAD_TEST_CASE(nonexistent_key) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; env.reusable_sst(uncompressed_schema(), uncompressed_dir(), 1).then([] (auto sstp) { return do_with(make_dkey(uncompressed_schema(), "invalid_key"), [sstp] (auto& key) { auto s = uncompressed_schema(); auto rd = make_lw_shared(sstp->read_row_flat(s, key)); return (*rd)(db::no_timeout).then([sstp, s, &key, rd] (auto mutation) { BOOST_REQUIRE(!mutation); return make_ready_future<>(); }); }); }).get(); } future<> 
test_no_clustered(sstables::test_env& env, bytes&& key, std::unordered_map &&map) { return env.reusable_sst(uncompressed_schema(), uncompressed_dir(), 1).then([k = std::move(key), map = std::move(map)] (auto sstp) mutable { return do_with(make_dkey(uncompressed_schema(), std::move(k)), [sstp, map = std::move(map)] (auto& key) { auto s = uncompressed_schema(); auto rd = make_lw_shared(sstp->read_row_flat(s, key)); return read_mutation_from_flat_mutation_reader(*rd, db::no_timeout).then([sstp, s, &key, rd, map = std::move(map)] (auto mutation) { BOOST_REQUIRE(mutation); auto& mp = mutation->partition(); for (auto&& e : mp.range(*s, nonwrapping_range())) { BOOST_REQUIRE(to_bytes(e.key()) == to_bytes("")); BOOST_REQUIRE(e.row().cells().size() == map.size()); auto &row = e.row().cells(); for (auto&& c: map) { match_live_cell(row, *s, c.first, c.second); } } return make_ready_future<>(); }); }); }); } SEASTAR_THREAD_TEST_CASE(uncompressed_1) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; test_no_clustered(env, "vinna", {{ "col1", to_sstring("daughter") }, { "col2", 3 }}).get(); } SEASTAR_THREAD_TEST_CASE(uncompressed_2) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; test_no_clustered(env, "gustaf", {{ "col1", to_sstring("son") }, { "col2", 0 }}).get(); } SEASTAR_THREAD_TEST_CASE(uncompressed_3) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; test_no_clustered(env, "isak", {{ "col1", to_sstring("son") }, { "col2", 1 }}).get(); } SEASTAR_THREAD_TEST_CASE(uncompressed_4) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; test_no_clustered(env, "finna", {{ "col1", to_sstring("daughter") }, { "col2", 2 }}).get(); } /* * * insert into todata.complex_schema (key, clust1, clust2, reg_set, reg, static_obj) values ('key1', 'cl1.1', 'cl2.1', { '1', '2' }, 
'v1', 'static_value'); * insert into todata.complex_schema (key, clust1, clust2, reg_list, reg, static_obj) values ('key1', 'cl1.2', 'cl2.2', [ '2', '1'], 'v2','static_value'); * insert into todata.complex_schema (key, clust1, clust2, reg_map, reg, static_obj) values ('key2', 'kcl1.1', 'kcl2.1', { '3': '1', '4' : '2' }, 'v3', 'static_value'); * insert into todata.complex_schema (key, clust1, clust2, reg_fset, reg, static_obj) values ('key2', 'kcl1.2', 'kcl2.2', { '3', '1', '4' , '2' }, 'v4', 'static_value'); * insert into todata.complex_schema (key, static_collection) values ('key2', { '1', '2', '3' , '4' }); * (flush) * * delete reg from todata.complex_schema where key = 'key2' and clust1 = 'kcl1.2' and clust2 = 'kcl2.2'; * insert into todata.complex_schema (key, clust1, clust2, reg, static_obj) values ('key3', 'tcl1.1', 'tcl2.1', 'v5', 'static_value_3') using ttl 86400; * delete from todata.complex_schema where key = 'key1' and clust1='cl1.1'; * delete static_obj from todata.complex_schema where key = 'key2'; * delete reg_list[0] from todata.complex_schema where key = 'key1' and clust1='cl1.2' and clust2='cl2.2'; * delete reg_fset from todata.complex_schema where key = 'key2' and clust1='kcl1.2' and clust2='kcl2.2'; * delete reg_map['3'] from todata.complex_schema where key = 'key2' and clust1='kcl1.1' and clust2='kcl2.1'; * delete static_collection['1'] from todata.complex_schema where key = 'key2'; * (flush) * * insert into todata.complex_schema (key, static_obj) values('key2', 'final_static'); * update todata.complex_schema set reg_map = reg_map + { '6': '1' } where key = 'key2' and clust1='kcl1.1' and clust2='kcl2.1'; * update todata.complex_schema set reg_list = reg_list + [ '6' ] where key = 'key1' and clust1='cl1.2' and clust2='cl2.2'; * update todata.complex_schema set reg_set = reg_set + { '6' } where key = 'key1' and clust1='cl1.2' and clust2='cl2.2'; * (flush) */ // FIXME: we are lacking a full deletion test template future 
generate_clustered(sstables::test_env& env, bytes&& key) { return env.reusable_sst(complex_schema(), "tests/sstables/complex", Generation).then([k = std::move(key)] (auto sstp) mutable { return do_with(make_dkey(complex_schema(), std::move(k)), [sstp] (auto& key) { auto s = complex_schema(); auto rd = make_lw_shared(sstp->read_row_flat(s, key)); return read_mutation_from_flat_mutation_reader(*rd, db::no_timeout).then([sstp, s, &key, rd] (auto mutation) { BOOST_REQUIRE(mutation); return std::move(*mutation); }); }); }); } inline auto clustered_row(mutation& mutation, const schema& s, std::vector&& v) { auto exploded = exploded_clustering_prefix(std::move(v)); auto clustering_pair = clustering_key::from_clustering_prefix(s, exploded); return deletable_row(s, mutation.partition().clustered_row(s, clustering_pair)); } SEASTAR_THREAD_TEST_CASE(complex_sst1_k1) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; generate_clustered<1>(env, "key1").then([] (auto&& mutation) { auto s = complex_schema(); auto& sr = mutation.partition().static_row(); match_live_cell(sr, *s, "static_obj", data_value(to_bytes("static_value"))); auto row1 = clustered_row(mutation, *s, {"cl1.1", "cl2.1"}); match_live_cell(row1.cells(), *s, "reg", data_value(to_bytes("v1"))); match_absent(row1.cells(), *s, "reg_list"); match_absent(row1.cells(), *s, "reg_map"); match_absent(row1.cells(), *s, "reg_fset"); auto reg_set = match_collection(row1.cells(), *s, "reg_set", tombstone(deletion_time{1431451390, 1431451390209521l})); match_collection_element(reg_set.cells[0], to_bytes("1"), bytes_opt{}); match_collection_element(reg_set.cells[1], to_bytes("2"), bytes_opt{}); auto row2 = clustered_row(mutation, *s, {"cl1.2", "cl2.2"}); match_live_cell(row2.cells(), *s, "reg", data_value(to_bytes("v2"))); match_absent(row2.cells(), *s, "reg_set"); match_absent(row2.cells(), *s, "reg_map"); match_absent(row2.cells(), *s, "reg_fset"); auto reg_list = 
match_collection(row2.cells(), *s, "reg_list", tombstone(deletion_time{1431451390, 1431451390213471l})); match_collection_element(reg_list.cells[0], bytes_opt{}, to_bytes("2")); match_collection_element(reg_list.cells[1], bytes_opt{}, to_bytes("1")); return make_ready_future<>(); }).get(); } SEASTAR_THREAD_TEST_CASE(complex_sst1_k2) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; generate_clustered<1>(env, "key2").then([] (auto&& mutation) { auto s = complex_schema(); auto& sr = mutation.partition().static_row(); match_live_cell(sr, *s, "static_obj", data_value(to_bytes("static_value"))); auto static_set = match_collection(sr, *s, "static_collection", tombstone(deletion_time{1431451390, 1431451390225257l})); match_collection_element(static_set.cells[0], to_bytes("1"), bytes_opt{}); match_collection_element(static_set.cells[1], to_bytes("2"), bytes_opt{}); match_collection_element(static_set.cells[2], to_bytes("3"), bytes_opt{}); match_collection_element(static_set.cells[3], to_bytes("4"), bytes_opt{}); auto row1 = clustered_row(mutation, *s, {"kcl1.1", "kcl2.1"}); match_live_cell(row1.cells(), *s, "reg", data_value(to_bytes("v3"))); match_absent(row1.cells(), *s, "reg_list"); match_absent(row1.cells(), *s, "reg_set"); match_absent(row1.cells(), *s, "reg_fset"); auto reg_map = match_collection(row1.cells(), *s, "reg_map", tombstone(deletion_time{1431451390, 1431451390217436l})); match_collection_element(reg_map.cells[0], to_bytes("3"), to_bytes("1")); match_collection_element(reg_map.cells[1], to_bytes("4"), to_bytes("2")); auto row2 = clustered_row(mutation, *s, {"kcl1.2", "kcl2.2"}); match_live_cell(row2.cells(), *s, "reg", data_value(to_bytes("v4"))); match_absent(row2.cells(), *s, "reg_set"); match_absent(row2.cells(), *s, "reg_map"); match_absent(row2.cells(), *s, "reg_list"); return make_ready_future<>(); }).get(); } SEASTAR_THREAD_TEST_CASE(complex_sst2_k1) { auto wait_bg = seastar::defer([] { 
sstables::await_background_jobs().get(); }); sstables::test_env env; generate_clustered<2>(env, "key1").then([] (auto&& mutation) { auto s = complex_schema(); auto exploded = exploded_clustering_prefix({"cl1.1", "cl2.1"}); auto clustering = clustering_key::from_clustering_prefix(*s, exploded); auto t1 = mutation.partition().range_tombstone_for_row(*s, clustering); BOOST_REQUIRE(t1.timestamp == 1431451394600754l); BOOST_REQUIRE(t1.deletion_time == gc_clock::time_point(gc_clock::duration(1431451394))); auto row = clustered_row(mutation, *s, {"cl1.2", "cl2.2"}); auto reg_list = match_collection(row.cells(), *s, "reg_list", tombstone(deletion_time{0, api::missing_timestamp})); match_collection_element(reg_list.cells[0], bytes_opt{}, bytes_opt{}); return make_ready_future<>(); }).get(); } SEASTAR_THREAD_TEST_CASE(complex_sst2_k2) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; generate_clustered<2>(env, "key2").then([] (auto&& mutation) { auto s = complex_schema(); auto& sr = mutation.partition().static_row(); match_dead_cell(sr, *s, "static_obj"); auto static_set = match_collection(sr, *s, "static_collection", tombstone(deletion_time{0, api::missing_timestamp})); match_collection_element(static_set.cells[0], to_bytes("1"), bytes_opt{}); auto row1 = clustered_row(mutation, *s, {"kcl1.1", "kcl2.1"}); // map dead match_absent(row1.cells(), *s, "reg_list"); match_absent(row1.cells(), *s, "reg_set"); match_absent(row1.cells(), *s, "reg_fset"); match_absent(row1.cells(), *s, "reg"); match_collection(row1.cells(), *s, "reg_map", tombstone(deletion_time{0, api::missing_timestamp})); auto row2 = clustered_row(mutation, *s, {"kcl1.2", "kcl2.2"}); match_dead_cell(row2.cells(), *s, "reg"); match_absent(row2.cells(), *s, "reg_map"); match_absent(row2.cells(), *s, "reg_list"); match_absent(row2.cells(), *s, "reg_set"); match_dead_cell(row2.cells(), *s, "reg_fset"); match_dead_cell(row2.cells(), *s, "reg"); return 
make_ready_future<>(); }).get(); } SEASTAR_THREAD_TEST_CASE(complex_sst2_k3) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; generate_clustered<2>(env, "key3").then([] (auto&& mutation) { auto s = complex_schema(); auto& sr = mutation.partition().static_row(); match_expiring_cell(sr, *s, "static_obj", data_value(to_bytes("static_value_3")), 1431451394597062l, 1431537794); auto row1 = clustered_row(mutation, *s, {"tcl1.1", "tcl2.1"}); BOOST_REQUIRE(row1.created_at() == 1431451394597062l); match_expiring_cell(row1.cells(), *s, "reg", data_value(to_bytes("v5")), 1431451394597062l, 1431537794); match_absent(row1.cells(), *s, "reg_list"); match_absent(row1.cells(), *s, "reg_set"); match_absent(row1.cells(), *s, "reg_map"); match_absent(row1.cells(), *s, "reg_fset"); return make_ready_future<>(); }).get(); } SEASTAR_THREAD_TEST_CASE(complex_sst3_k1) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; generate_clustered<3>(env, "key1").then([] (auto&& mutation) { auto s = complex_schema(); auto row = clustered_row(mutation, *s, {"cl1.2", "cl2.2"}); auto reg_set = match_collection(row.cells(), *s, "reg_set", tombstone(deletion_time{0, api::missing_timestamp})); match_collection_element(reg_set.cells[0], to_bytes("6"), bytes_opt{}); auto reg_list = match_collection(row.cells(), *s, "reg_list", tombstone(deletion_time{0, api::missing_timestamp})); match_collection_element(reg_list.cells[0], bytes_opt{}, to_bytes("6")); match_absent(row.cells(), *s, "static_obj"); match_absent(row.cells(), *s, "reg_map"); match_absent(row.cells(), *s, "reg"); match_absent(row.cells(), *s, "reg_fset"); return make_ready_future<>(); }).get(); } SEASTAR_THREAD_TEST_CASE(complex_sst3_k2) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; generate_clustered<3>(env, "key2").then([] (auto&& mutation) { auto s = complex_schema(); auto& 
sr = mutation.partition().static_row(); match_live_cell(sr, *s, "static_obj", data_value(to_bytes("final_static"))); auto row = clustered_row(mutation, *s, {"kcl1.1", "kcl2.1"}); auto reg_map = match_collection(row.cells(), *s, "reg_map", tombstone(deletion_time{0, api::missing_timestamp})); match_collection_element(reg_map.cells[0], to_bytes("6"), to_bytes("1")); match_absent(row.cells(), *s, "reg_list"); match_absent(row.cells(), *s, "reg_set"); match_absent(row.cells(), *s, "reg"); match_absent(row.cells(), *s, "reg_fset"); return make_ready_future<>(); }).get(); } future<> test_range_reads(sstables::test_env& env, const dht::token& min, const dht::token& max, std::vector& expected) { return env.reusable_sst(uncompressed_schema(), uncompressed_dir(), 1).then([min, max, &expected] (auto sstp) mutable { auto s = uncompressed_schema(); auto count = make_lw_shared(0); auto expected_size = expected.size(); auto stop = make_lw_shared(false); return do_with(dht::partition_range::make(dht::ring_position::starting_at(min), dht::ring_position::ending_at(max)), [&, sstp, s] (auto& pr) { auto mutations = make_lw_shared(sstp->read_range_rows_flat(s, pr)); return do_until([stop] { return *stop; }, // Note: The data in the following lambda, including // "mutations", continues to live until after the last // iteration's future completes, so its lifetime is safe. 
[sstp, mutations = std::move(mutations), &expected, expected_size, count, stop] () mutable { return (*mutations)(db::no_timeout).then([&expected, expected_size, count, stop, mutations] (mutation_fragment_opt mfopt) mutable { if (mfopt) { BOOST_REQUIRE(mfopt->is_partition_start()); BOOST_REQUIRE(*count < expected_size); BOOST_REQUIRE(std::vector({expected.back()}) == mfopt->as_partition_start().key().key().explode()); expected.pop_back(); (*count)++; mutations->next_partition(); } else { *stop = true; } }); }).then([count, expected_size] { BOOST_REQUIRE(*count == expected_size); }); }); }); } SEASTAR_THREAD_TEST_CASE(read_range) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); std::vector expected = { to_bytes("finna"), to_bytes("isak"), to_bytes("gustaf"), to_bytes("vinna") }; do_with(sstables::test_env(), std::move(expected), [] (auto& env, auto& expected) { return test_range_reads(env, dht::minimum_token(), dht::maximum_token(), expected); }).get(); } SEASTAR_THREAD_TEST_CASE(read_partial_range) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); std::vector expected = { to_bytes("finna"), to_bytes("isak") }; do_with(sstables::test_env(), std::move(expected), [] (auto& env, auto& expected) { return test_range_reads(env, dht::global_partitioner().get_token(key_view(bytes_view(expected.back()))), dht::maximum_token(), expected); }).get(); } SEASTAR_THREAD_TEST_CASE(read_partial_range_2) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); std::vector expected = { to_bytes("gustaf"), to_bytes("vinna") }; do_with(sstables::test_env(), std::move(expected), [] (auto& env, auto& expected) { return test_range_reads(env, dht::minimum_token(), dht::global_partitioner().get_token(key_view(bytes_view(expected.front()))), expected); }).get(); } static mutation_source make_sstable_mutation_source(sstables::test_env& env, schema_ptr s, sstring dir, std::vector mutations, 
sstable_writer_config cfg, sstables::sstable::version_types version) { auto sst = env.make_sstable(s, dir, 1 /* generation */, version, sstables::sstable::format_types::big); auto mt = make_lw_shared(s); for (auto&& m : mutations) { mt->apply(m); } sst->write_components(mt->make_flat_reader(s), mutations.size(), s, cfg, mt->get_encoding_stats()).get(); sst->load().get(); return as_mutation_source(sst); } // Must be run in a seastar thread static void test_mutation_source(sstables::test_env& env, sstable_writer_config cfg, sstables::sstable::version_types version) { std::vector dirs; run_mutation_source_tests([&env, &dirs, &cfg, version] (schema_ptr s, const std::vector& partitions) -> mutation_source { dirs.emplace_back(); return make_sstable_mutation_source(env, s, dirs.back().path().string(), partitions, cfg, version); }); } SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); storage_service_for_tests ssft; sstables::test_env env; for (auto version : all_sstable_versions) { for (auto index_block_size : {1, 128, 64*1024}) { sstable_writer_config cfg; cfg.promoted_index_block_size = index_block_size; cfg.large_data_handler = &nop_lp_handler; test_mutation_source(env, cfg, version); } } }); } SEASTAR_TEST_CASE(test_sstable_can_write_and_read_range_tombstone) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); storage_service_for_tests ssft; auto dir = tmpdir(); auto s = make_lw_shared(schema({}, "ks", "cf", {{"p1", utf8_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type)); auto key = partition_key::from_exploded(*s, {to_bytes(make_local_key(s))}); auto c_key_start = clustering_key::from_exploded(*s, {int32_type->decompose(1)}); auto c_key_end = clustering_key::from_exploded(*s, {int32_type->decompose(2)}); mutation m(s, key); auto ttl = gc_clock::now() + std::chrono::seconds(1); 
m.partition().apply_delete(*s, range_tombstone(c_key_start, bound_kind::excl_start, c_key_end, bound_kind::excl_end, tombstone(9, ttl))); auto mt = make_lw_shared(s); mt->apply(std::move(m)); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, sstables::sstable::version_types::la, sstables::sstable::format_types::big); write_memtable_to_sstable_for_test(*mt, sst).get(); sst->load().get(); auto mr = sst->read_rows_flat(s); auto mut = read_mutation_from_flat_mutation_reader(mr, db::no_timeout).get0(); BOOST_REQUIRE(bool(mut)); auto& rts = mut->partition().row_tombstones(); BOOST_REQUIRE(rts.size() == 1); auto it = rts.begin(); BOOST_REQUIRE(it->equal(*s, range_tombstone( c_key_start, bound_kind::excl_start, c_key_end, bound_kind::excl_end, tombstone(9, ttl)))); }); } SEASTAR_THREAD_TEST_CASE(compact_storage_sparse_read) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; env.reusable_sst(compact_sparse_schema(), "tests/sstables/compact_sparse", 1).then([] (auto sstp) { return do_with(make_dkey(compact_sparse_schema(), "first_row"), [sstp] (auto& key) { auto s = compact_sparse_schema(); auto rd = make_lw_shared(sstp->read_row_flat(s, key)); return read_mutation_from_flat_mutation_reader(*rd, db::no_timeout).then([sstp, s, &key, rd] (auto mutation) { BOOST_REQUIRE(mutation); auto& mp = mutation->partition(); auto& row = mp.clustered_row(*s, clustering_key::make_empty()); match_live_cell(row.cells(), *s, "cl1", data_value(to_bytes("cl1"))); match_live_cell(row.cells(), *s, "cl2", data_value(to_bytes("cl2"))); return make_ready_future<>(); }); }); }).get(); } SEASTAR_THREAD_TEST_CASE(compact_storage_simple_dense_read) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; env.reusable_sst(compact_simple_dense_schema(), "tests/sstables/compact_simple_dense", 1).then([] (auto sstp) { return 
do_with(make_dkey(compact_simple_dense_schema(), "first_row"), [sstp] (auto& key) { auto s = compact_simple_dense_schema(); auto rd = make_lw_shared(sstp->read_row_flat(s, key)); return read_mutation_from_flat_mutation_reader(*rd, db::no_timeout).then([sstp, s, &key, rd] (auto mutation) { auto& mp = mutation->partition(); auto exploded = exploded_clustering_prefix({"cl1"}); auto clustering = clustering_key::from_clustering_prefix(*s, exploded); auto& row = mp.clustered_row(*s, clustering); match_live_cell(row.cells(), *s, "cl2", data_value(to_bytes("cl2"))); return make_ready_future<>(); }); }); }).get(); } SEASTAR_THREAD_TEST_CASE(compact_storage_dense_read) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; env.reusable_sst(compact_dense_schema(), "tests/sstables/compact_dense", 1).then([] (auto sstp) { return do_with(make_dkey(compact_dense_schema(), "first_row"), [sstp] (auto& key) { auto s = compact_dense_schema(); auto rd = make_lw_shared(sstp->read_row_flat(s, key)); return read_mutation_from_flat_mutation_reader(*rd, db::no_timeout).then([sstp, s, &key, rd] (auto mutation) { auto& mp = mutation->partition(); auto exploded = exploded_clustering_prefix({"cl1", "cl2"}); auto clustering = clustering_key::from_clustering_prefix(*s, exploded); auto& row = mp.clustered_row(*s, clustering); match_live_cell(row.cells(), *s, "cl3", data_value(to_bytes("cl3"))); return make_ready_future<>(); }); }); }).get(); } // We recently had an issue, documented at #188, where range-reading from an // sstable would break if collections were used. // // Make sure we don't regress on that. 
SEASTAR_THREAD_TEST_CASE(broken_ranges_collection) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); sstables::test_env env; env.reusable_sst(peers_schema(), "tests/sstables/broken_ranges", 2).then([] (auto sstp) { auto s = peers_schema(); auto reader = make_lw_shared(sstp->as_mutation_source().make_reader(s, query::full_partition_range)); return repeat([s, reader] { return read_mutation_from_flat_mutation_reader(*reader, db::no_timeout).then([s, reader] (mutation_opt mut) { auto key_equal = [s, &mut] (sstring ip) { return mut->key().equal(*s, partition_key::from_deeply_exploded(*s, { net::inet_address(ip) })); }; if (!mut) { return stop_iteration::yes; } else if (key_equal("127.0.0.1")) { auto& row = mut->partition().clustered_row(*s, clustering_key::make_empty()); match_absent(row.cells(), *s, "tokens"); } else if (key_equal("127.0.0.3")) { auto& row = mut->partition().clustered_row(*s, clustering_key::make_empty()); auto tokens = match_collection(row.cells(), *s, "tokens", tombstone(deletion_time{0x55E5F2D5, 0x051EB3FC99715Dl })); match_collection_element(tokens.cells[0], to_bytes("-8180144272884242102"), bytes_opt{}); } else { BOOST_REQUIRE(key_equal("127.0.0.2")); auto t = mut->partition().partition_tombstone(); BOOST_REQUIRE(t.timestamp == 0x051EB3FB016850l); } return stop_iteration::no; }); }); }).get(); } static schema_ptr tombstone_overlap_schema() { static thread_local auto s = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id("try1", "tab"), "try1", "tab", // partition key {{"pk", utf8_type}}, // clustering key {{"ck1", utf8_type}, {"ck2", utf8_type}}, // regular columns {{"data", utf8_type}}, // static columns {}, // regular column name type utf8_type, // comment "" ))); return builder.build(schema_builder::compact_storage::no); }(); return s; } static future ka_sst(schema_ptr schema, sstring dir, unsigned long generation) { sstables::test_env env; auto sst = env.make_sstable(std::move(schema), dir, 
generation, sstables::sstable::version_types::ka, big); auto fut = sst->load(); return std::move(fut).then([sst = std::move(sst)] { return make_ready_future(std::move(sst)); }); } // Considering the schema above, the sstable looks like: // {"key": "pk", // "cells": [["aaa:_","aaa:bbb:_",1459334681228103,"t",1459334681], // ["aaa:bbb:_","aaa:bbb:!",1459334681244989,"t",1459334681], // ["aaa:bbb:!","aaa:!",1459334681228103,"t",1459334681]]} // ] SEASTAR_THREAD_TEST_CASE(tombstone_in_tombstone) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); ka_sst(tombstone_overlap_schema(), "tests/sstables/tombstone_overlap", 1).then([] (auto sstp) { auto s = tombstone_overlap_schema(); return do_with(sstp->read_rows_flat(s), [sstp, s] (auto& reader) { return repeat([sstp, s, &reader] { return read_mutation_from_flat_mutation_reader(reader, db::no_timeout).then([s] (mutation_opt mut) { if (!mut) { return stop_iteration::yes; } auto make_pkey = [s] (sstring b) { return partition_key::from_deeply_exploded(*s, { data_value(b) }); }; auto make_ckey = [s] (sstring c1, sstring c2 = {}) { std::vector v; v.push_back(data_value(c1)); if (!c2.empty()) { v.push_back(data_value(c2)); } return clustering_key::from_deeply_exploded(*s, std::move(v)); }; BOOST_REQUIRE(mut->key().equal(*s, make_pkey("pk"))); // Somewhat counterintuitively, scylla represents // deleting a small row with all clustering keys set - not // as a "row tombstone" but rather as a deleted clustering row. 
auto& rts = mut->partition().row_tombstones(); BOOST_REQUIRE(rts.size() == 2); auto it = rts.begin(); BOOST_REQUIRE(it->equal(*s, range_tombstone( make_ckey("aaa"), bound_kind::incl_start, make_ckey("aaa", "bbb"), bound_kind::excl_end, tombstone(1459334681228103LL, it->tomb.deletion_time)))); ++it; BOOST_REQUIRE(it->equal(*s, range_tombstone( make_ckey("aaa", "bbb"), bound_kind::excl_start, make_ckey("aaa"), bound_kind::incl_end, tombstone(1459334681228103LL, it->tomb.deletion_time)))); auto& rows = mut->partition().clustered_rows(); BOOST_REQUIRE(rows.calculate_size() == 1); for (auto& e : rows) { BOOST_REQUIRE(e.key().equal(*s, make_ckey("aaa", "bbb"))); BOOST_REQUIRE(e.row().deleted_at().tomb().timestamp == 1459334681244989LL); } return stop_iteration::no; }); }); }); }).get(); } // Same schema as above, the sstable looks like: // {"key": "pk", // "cells": [["aaa:_","aaa:bbb:_",1459334681228103,"t",1459334681], // ["aaa:bbb:_","aaa:ccc:!",1459334681228103,"t",1459334681], // ["aaa:ccc:!","aaa:ddd:!",1459334681228103,"t",1459334681], // ["aaa:ddd:!","aaa:!",1459334681228103,"t",1459334681]]} // // We're not sure how this sort of sstable can be generated with Cassandra 2's // CQL, but we saw a similar thing is a real use case. 
SEASTAR_THREAD_TEST_CASE(range_tombstone_reading) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); ka_sst(tombstone_overlap_schema(), "tests/sstables/tombstone_overlap", 4).then([] (auto sstp) { auto s = tombstone_overlap_schema(); return do_with(sstp->read_rows_flat(s), [sstp, s] (auto& reader) { return repeat([sstp, s, &reader] { return read_mutation_from_flat_mutation_reader(reader, db::no_timeout).then([s] (mutation_opt mut) { if (!mut) { return stop_iteration::yes; } auto make_pkey = [s] (sstring b) { return partition_key::from_deeply_exploded(*s, { data_value(b) }); }; auto make_ckey = [s] (sstring c1, sstring c2 = {}) { std::vector v; v.push_back(data_value(c1)); if (!c2.empty()) { v.push_back(data_value(c2)); } return clustering_key::from_deeply_exploded(*s, std::move(v)); }; BOOST_REQUIRE(mut->key().equal(*s, make_pkey("pk"))); auto& rts = mut->partition().row_tombstones(); BOOST_REQUIRE(rts.size() == 1); auto it = rts.begin(); BOOST_REQUIRE(it->equal(*s, range_tombstone( make_ckey("aaa"), bound_kind::incl_start, make_ckey("aaa"), bound_kind::incl_end, tombstone(1459334681228103LL, it->tomb.deletion_time)))); auto& rows = mut->partition().clustered_rows(); BOOST_REQUIRE(rows.calculate_size() == 0); return stop_iteration::no; }); }); }); }).get(); } // In this test case we have *three* levels of of tombstones: // create COLUMNFAMILY tab2 (pk text, ck1 text, ck2 text, ck3 text, data text, primary key(pk, ck1, ck2, ck3)); // delete from tab2 where pk = 'pk' and ck1 = 'aaa'; // delete from tab2 where pk = 'pk' and ck1 = 'aaa' and ck2 = 'bbb'; // delete from tab2 where pk = 'pk' and ck1 = 'aaa' and ck2 = 'bbb' and ck3 = 'ccc'; // And then, to have more fun, I edited the resulting sstable manually (using // Cassandra's json2sstable and sstable2json tools) to further split the // resulting tombstones into even more tombstones: // {"key": "pk", // "cells": // [["aaa:_","aaa:bba:_",1459438519943668,"t",1459438519], // 
["aaa:bba:_","aaa:bbb:_",1459438519943668,"t",1459438519], // ["aaa:bbb:_","aaa:bbb:ccb:_",1459438519950348,"t",1459438519], // ["aaa:bbb:ccb:_","aaa:bbb:ccc:_",1459438519950348,"t",1459438519], // ["aaa:bbb:ccc:_","aaa:bbb:ccc:!",1459438519958850,"t",1459438519], // ["aaa:bbb:ccc:!","aaa:bbb:ddd:!",1459438519950348,"t",1459438519], // ["aaa:bbb:ddd:!","aaa:bbb:!",1459438519950348,"t",1459438519], // ["aaa:bbb:!","aaa:!",1459438519943668,"t",1459438519]]} static schema_ptr tombstone_overlap_schema2() { static thread_local auto s = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id("try1", "tab2"), "try1", "tab2", // partition key {{"pk", utf8_type}}, // clustering key {{"ck1", utf8_type}, {"ck2", utf8_type}, {"ck3", utf8_type}}, // regular columns {{"data", utf8_type}}, // static columns {}, // regular column name type utf8_type, // comment "" ))); return builder.build(schema_builder::compact_storage::no); }(); return s; } SEASTAR_THREAD_TEST_CASE(tombstone_in_tombstone2) { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); ka_sst(tombstone_overlap_schema2(), "tests/sstables/tombstone_overlap", 3).then([] (auto sstp) { auto s = tombstone_overlap_schema2(); return do_with(sstp->read_rows_flat(s), [sstp, s] (auto& reader) { return repeat([sstp, s, &reader] { return read_mutation_from_flat_mutation_reader(reader, db::no_timeout).then([s] (mutation_opt mut) { if (!mut) { return stop_iteration::yes; } auto make_pkey = [s] (sstring b) { return partition_key::from_deeply_exploded(*s, { data_value(b) }); }; auto make_ckey = [s] (sstring c1, sstring c2 = {}, sstring c3 = {}) { std::vector v; v.push_back(data_value(c1)); if (!c2.empty()) { v.push_back(data_value(c2)); } if (!c3.empty()) { v.push_back(data_value(c3)); } return clustering_key::from_deeply_exploded(*s, std::move(v)); }; BOOST_REQUIRE(mut->key().equal(*s, make_pkey("pk"))); auto& rows = mut->partition().clustered_rows(); auto& rts = 
mut->partition().row_tombstones(); auto it = rts.begin(); BOOST_REQUIRE(it->start_bound().equal(*s, bound_view(make_ckey("aaa"), bound_kind::incl_start))); BOOST_REQUIRE(it->end_bound().equal(*s, bound_view(make_ckey("aaa", "bbb"), bound_kind::excl_end))); BOOST_REQUIRE(it->tomb.timestamp == 1459438519943668L); ++it; BOOST_REQUIRE(it->start_bound().equal(*s, bound_view(make_ckey("aaa", "bbb"), bound_kind::incl_start))); BOOST_REQUIRE(it->end_bound().equal(*s, bound_view(make_ckey("aaa", "bbb", "ccc"), bound_kind::excl_end))); BOOST_REQUIRE(it->tomb.timestamp == 1459438519950348L); ++it; BOOST_REQUIRE(it->start_bound().equal(*s, bound_view(make_ckey("aaa", "bbb", "ccc"), bound_kind::excl_start))); BOOST_REQUIRE(it->end_bound().equal(*s, bound_view(make_ckey("aaa", "bbb"), bound_kind::incl_end))); BOOST_REQUIRE(it->tomb.timestamp == 1459438519950348L); ++it; BOOST_REQUIRE(it->start_bound().equal(*s, bound_view(make_ckey("aaa", "bbb"), bound_kind::excl_start))); BOOST_REQUIRE(it->end_bound().equal(*s, bound_view(make_ckey("aaa"), bound_kind::incl_end))); BOOST_REQUIRE(it->tomb.timestamp == 1459438519943668L); ++it; BOOST_REQUIRE(it == rts.end()); BOOST_REQUIRE(rows.calculate_size() == 1); for (auto& e : rows) { BOOST_REQUIRE(e.key().equal(*s, make_ckey("aaa", "bbb", "ccc"))); BOOST_REQUIRE(e.row().deleted_at().tomb().timestamp == 1459438519958850LL); } return stop_iteration::no; }); }); }); }).get(); } SEASTAR_TEST_CASE(test_non_compound_table_row_is_not_marked_as_static) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); schema_builder builder("ks", "cf"); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("c", int32_type, column_kind::clustering_key); builder.with_column("v", int32_type); auto s = builder.build(schema_builder::compact_storage::yes); auto k = 
partition_key::from_exploded(*s, {to_bytes(make_local_key(s))}); auto ck = clustering_key::from_exploded(*s, {int32_type->decompose(static_cast(0xffff0000))}); mutation m(s, k); auto cell = atomic_cell::make_live(*int32_type, 1, int32_type->decompose(17), { }); m.set_clustered_cell(ck, *s->get_column_definition("v"), std::move(cell)); auto mt = make_lw_shared(s); mt->apply(std::move(m)); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, version, sstables::sstable::format_types::big); write_memtable_to_sstable_for_test(*mt, sst).get(); sst->load().get(); auto mr = sst->read_rows_flat(s); auto mut = read_mutation_from_flat_mutation_reader(mr, db::no_timeout).get0(); BOOST_REQUIRE(bool(mut)); } }); } SEASTAR_TEST_CASE(test_has_partition_key) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); schema_builder builder("ks", "cf"); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("c", int32_type, column_kind::clustering_key); builder.with_column("v", int32_type); auto s = builder.build(schema_builder::compact_storage::yes); auto k = partition_key::from_exploded(*s, {to_bytes(make_local_key(s))}); auto ck = clustering_key::from_exploded(*s, {int32_type->decompose(static_cast(0xffff0000))}); mutation m(s, k); auto cell = atomic_cell::make_live(*int32_type, 1, int32_type->decompose(17), { }); m.set_clustered_cell(ck, *s->get_column_definition("v"), std::move(cell)); auto mt = make_lw_shared(s); mt->apply(std::move(m)); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, version, sstables::sstable::format_types::big); write_memtable_to_sstable_for_test(*mt, sst).get(); dht::decorated_key dk(dht::global_partitioner().decorate_key(*s, k)); auto hk = sstables::sstable::make_hashed_key(*s, 
dk.key()); sst->load().get(); auto mr = sst->read_rows_flat(s); auto res = sst->has_partition_key(hk, dk).get0(); BOOST_REQUIRE(bool(res)); auto dk2 = dht::global_partitioner().decorate_key(*s, partition_key::from_nodetool_style_string(s, "xx")); auto hk2 = sstables::sstable::make_hashed_key(*s, dk2.key()); res = sst->has_partition_key(hk2, dk2).get0(); BOOST_REQUIRE(! bool(res)); } }); } static std::unique_ptr get_index_reader(shared_sstable sst) { return std::make_unique(sst, default_priority_class()); } SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); storage_service_for_tests ssft; auto dir = tmpdir(); schema_builder builder("ks", "cf"); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("c1", int32_type, column_kind::clustering_key); builder.with_column("c2", int32_type, column_kind::clustering_key); builder.with_column("v", int32_type); auto s = builder.build(); auto k = partition_key::from_exploded(*s, {to_bytes(make_local_key(s))}); auto cell = atomic_cell::make_live(*int32_type, 1, int32_type->decompose(88), { }); mutation m(s, k); auto ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(2)}); m.set_clustered_cell(ck, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(4)}); m.set_clustered_cell(ck, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(6)}); m.set_clustered_cell(ck, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); ck = clustering_key::from_exploded(*s, {int32_type->decompose(3), int32_type->decompose(9)}); m.set_clustered_cell(ck, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); m.partition().apply_row_tombstone(*s, 
range_tombstone( clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}), bound_kind::excl_start, clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}), bound_kind::incl_end, {1, gc_clock::now()})); auto mt = make_lw_shared(s); mt->apply(std::move(m)); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, sstables::sstable::version_types::ka, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.promoted_index_block_size = 1; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 1, s, cfg, mt->get_encoding_stats()).get(); sst->load().get(); assert_that(get_index_reader(sst)).has_monotonic_positions(*s); }); } SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic_compound_dense) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); schema_builder builder("ks", "cf"); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("c1", int32_type, column_kind::clustering_key); builder.with_column("c2", int32_type, column_kind::clustering_key); builder.with_column("v", int32_type); auto s = builder.build(schema_builder::compact_storage::yes); auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes(make_local_key(s))})); auto cell = atomic_cell::make_live(*int32_type, 1, int32_type->decompose(88), { }); mutation m(s, dk); auto ck1 = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(2)}); m.set_clustered_cell(ck1, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); auto ck2 = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(4)}); m.set_clustered_cell(ck2, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); auto ck3 = 
clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(6)}); m.set_clustered_cell(ck3, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); auto ck4 = clustering_key::from_exploded(*s, {int32_type->decompose(3), int32_type->decompose(9)}); m.set_clustered_cell(ck4, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); m.partition().apply_row_tombstone(*s, range_tombstone( clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}), bound_kind::incl_start, clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}), bound_kind::incl_end, {1, gc_clock::now()})); auto mt = make_lw_shared(s); mt->apply(std::move(m)); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, version, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.promoted_index_block_size = 1; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 1, s, cfg, mt->get_encoding_stats()).get(); sst->load().get(); { assert_that(get_index_reader(sst)).has_monotonic_positions(*s); } { auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck1})).build(); assert_that(sst->as_mutation_source().make_reader(s, dht::partition_range::make_singular(dk), slice)) .produces(m) .produces_end_of_stream(); } } }); } SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic_non_compound_dense) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); schema_builder builder("ks", "cf"); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("c1", int32_type, column_kind::clustering_key); builder.with_column("v", int32_type); auto s = builder.build(schema_builder::compact_storage::yes); auto dk = dht::global_partitioner().decorate_key(*s, 
partition_key::from_exploded(*s, {to_bytes(make_local_key(s))})); auto cell = atomic_cell::make_live(*int32_type, 1, int32_type->decompose(88), { }); mutation m(s, dk); auto ck1 = clustering_key::from_exploded(*s, {int32_type->decompose(1)}); m.set_clustered_cell(ck1, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); auto ck2 = clustering_key::from_exploded(*s, {int32_type->decompose(2)}); m.set_clustered_cell(ck2, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); auto ck3 = clustering_key::from_exploded(*s, {int32_type->decompose(3)}); m.set_clustered_cell(ck3, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); m.partition().apply_row_tombstone(*s, range_tombstone( clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}), bound_kind::incl_start, clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}), bound_kind::incl_end, {1, gc_clock::now()})); auto mt = make_lw_shared(s); mt->apply(std::move(m)); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, version, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.promoted_index_block_size = 1; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 1, s, cfg, mt->get_encoding_stats()).get(); sst->load().get(); { assert_that(get_index_reader(sst)).has_monotonic_positions(*s); } { auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck1})).build(); assert_that(sst->as_mutation_source().make_reader(s, dht::partition_range::make_singular(dk), slice)) .produces(m) .produces_end_of_stream(); } } }); } SEASTAR_TEST_CASE(test_promoted_index_repeats_open_tombstones) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); int id = 0; for (auto& compact : { 
schema_builder::compact_storage::no, schema_builder::compact_storage::yes }) { const auto generation = id++; schema_builder builder("ks", format("cf{:d}", generation)); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("c1", bytes_type, column_kind::clustering_key); builder.with_column("v", int32_type); auto s = builder.build(compact); auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes(make_local_key(s))})); auto cell = atomic_cell::make_live(*int32_type, 1, int32_type->decompose(88), { }); mutation m(s, dk); m.partition().apply_row_tombstone(*s, range_tombstone( clustering_key_prefix::from_exploded(*s, {bytes_type->decompose(data_value(to_bytes("ck1")))}), bound_kind::incl_start, clustering_key_prefix::from_exploded(*s, {bytes_type->decompose(data_value(to_bytes("ck5")))}), bound_kind::incl_end, {1, gc_clock::now()})); auto ck = clustering_key::from_exploded(*s, {bytes_type->decompose(data_value(to_bytes("ck3")))}); m.set_clustered_cell(ck, *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); auto mt = make_lw_shared(s); mt->apply(m); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), generation, version, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.promoted_index_block_size = 1; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 1, s, cfg, mt->get_encoding_stats()).get(); sst->load().get(); { auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck})).build(); assert_that(sst->as_mutation_source().make_reader(s, dht::partition_range::make_singular(dk), slice)) .produces(m) .produces_end_of_stream(); } } } }); } SEASTAR_TEST_CASE(test_range_tombstones_are_correctly_seralized_for_non_compound_dense_schemas) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : 
all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); schema_builder builder("ks", "cf"); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("c", int32_type, column_kind::clustering_key); builder.with_column("v", int32_type); auto s = builder.build(schema_builder::compact_storage::yes); auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes(make_local_key(s))})); mutation m(s, dk); m.partition().apply_row_tombstone(*s, range_tombstone( clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}), bound_kind::incl_start, clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}), bound_kind::incl_end, {1, gc_clock::now()})); auto mt = make_lw_shared(s); mt->apply(m); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, version, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 1, s, cfg, mt->get_encoding_stats()).get(); sst->load().get(); { auto slice = partition_slice_builder(*s).build(); assert_that(sst->as_mutation_source().make_reader(s, dht::partition_range::make_singular(dk), slice)) .produces(m) .produces_end_of_stream(); } } }); } SEASTAR_TEST_CASE(test_promoted_index_is_absent_for_schemas_without_clustering_key) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); schema_builder builder("ks", "cf"); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("v", int32_type); auto s = builder.build(schema_builder::compact_storage::yes); auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes(make_local_key(s))})); mutation m(s, dk); for (auto&& v : { 1, 2, 3, 4 }) { auto cell = 
atomic_cell::make_live(*int32_type, 1, int32_type->decompose(v), { }); m.set_clustered_cell(clustering_key_prefix::make_empty(), *s->get_column_definition("v"), atomic_cell(*int32_type, cell)); } auto mt = make_lw_shared(s); mt->apply(m); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, version, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.promoted_index_block_size = 1; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 1, s, cfg, mt->get_encoding_stats()).get(); sst->load().get(); assert_that(get_index_reader(sst)).is_empty(*s); } }); } SEASTAR_TEST_CASE(test_can_write_and_read_non_compound_range_tombstone_as_compound) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); schema_builder builder("ks", "cf"); builder.with_column("p", utf8_type, column_kind::partition_key); builder.with_column("c", int32_type, column_kind::clustering_key); builder.with_column("v", int32_type); auto s = builder.build(schema_builder::compact_storage::yes); auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes(make_local_key(s))})); mutation m(s, dk); m.partition().apply_row_tombstone(*s, range_tombstone( clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}), bound_kind::incl_start, clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}), bound_kind::incl_end, {1, gc_clock::now()})); auto mt = make_lw_shared(s); mt->apply(m); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, version, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.correctly_serialize_non_compound_range_tombstones = false; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 1, s, cfg, 
mt->get_encoding_stats()).get(); sst->load().get(); { auto slice = partition_slice_builder(*s).build(); assert_that(sst->as_mutation_source().make_reader(s, dht::partition_range::make_singular(dk), slice)) .produces(m) .produces_end_of_stream(); } } }); } SEASTAR_TEST_CASE(test_writing_combined_stream_with_tombstones_at_the_same_position) { return seastar::async([] { auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); for (const auto version : all_sstable_versions) { storage_service_for_tests ssft; auto dir = tmpdir(); simple_schema ss; auto s = ss.schema(); auto rt1 = ss.make_range_tombstone(ss.make_ckey_range(1, 10)); auto rt2 = ss.make_range_tombstone(ss.make_ckey_range(1, 5)); // rt1 + rt2 = {[1, 5], (5, 10]} auto local_k = make_local_key(s); mutation m1 = ss.new_mutation(local_k); ss.add_row(m1, ss.make_ckey(0), "v0"); // So that we don't hit workaround for #1203, which would cover up bugs m1.partition().apply_delete(*s, rt1); m1.partition().apply_delete(*s, ss.make_ckey(4), ss.new_tombstone()); auto rt3 = ss.make_range_tombstone(ss.make_ckey_range(20, 21)); m1.partition().apply_delete(*s, ss.make_ckey(20), ss.new_tombstone()); m1.partition().apply_delete(*s, rt3); mutation m2 = ss.new_mutation(local_k); m2.partition().apply_delete(*s, rt2); ss.add_row(m2, ss.make_ckey(4), "v2"); // position inside rt2 auto mt1 = make_lw_shared(s); mt1->apply(m1); auto mt2 = make_lw_shared(s); mt2->apply(m2); sstables::test_env env; auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, version, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.large_data_handler = &nop_lp_handler; sst->write_components(make_combined_reader(s, mt1->make_flat_reader(s), mt2->make_flat_reader(s)), 1, s, cfg, encoding_stats{}).get(); sst->load().get(); assert_that(sst->as_mutation_source().make_reader(s)) .produces(m1 + m2) .produces_end_of_stream(); } }); } SEASTAR_TEST_CASE(test_no_index_reads_when_rows_fall_into_range_boundaries) 
{
    return seastar::async([] {
        auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); });
        for (const auto version : all_sstable_versions) {
            storage_service_for_tests ssft;
            simple_schema ss(simple_schema::with_static::yes);
            auto s = ss.schema();
            auto pks = make_local_keys(2, s);
            mutation m1 = ss.new_mutation(pks[0]);
            ss.add_row(m1, ss.make_ckey(1), "v");
            ss.add_row(m1, ss.make_ckey(2), "v");
            ss.add_row(m1, ss.make_ckey(5), "v");
            ss.add_row(m1, ss.make_ckey(6), "v");
            mutation m2 = ss.new_mutation(pks[1]);
            ss.add_static_row(m2, "svalue");
            ss.add_row(m2, ss.make_ckey(2), "v");
            ss.add_row(m2, ss.make_ckey(5), "v");
            ss.add_row(m2, ss.make_ckey(6), "v");
            sstable_writer_config cfg;
            cfg.large_data_handler = &nop_lp_handler;
            tmpdir dir;
            sstables::test_env env;
            auto ms = make_sstable_mutation_source(env, s, dir.path().string(), {m1, m2}, cfg, version);
            // Total number of index-list touches (hits + misses + blocks) so far.
            auto index_accesses = [] {
                auto&& stats = sstables::shared_index_lists::shard_stats();
                return stats.hits + stats.misses + stats.blocks;
            };
            auto before = index_accesses();
            {
                // A full scan should stream both partitions without consulting
                // the index at all.
                assert_that(ms.make_reader(s))
                    .produces(m1)
                    .produces(m2)
                    .produces_end_of_stream();
                BOOST_REQUIRE_EQUAL(index_accesses(), before);
            }
        }
    });
}

SEASTAR_THREAD_TEST_CASE(test_large_index_pages_do_not_cause_large_allocations) {
    auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); });
    // We create a sequence of partitions such that first we have a partition
    // with a very long key, then series of partitions with small keys. This
    // should result in large index page.
    storage_service_for_tests ssft;
    auto dir = tmpdir();
    simple_schema ss;
    auto s = ss.schema();
    const size_t large_key_pad_size = 9000;
    const size_t small_key_pad_size = 16;
    const size_t n_small_keys = 100000;
    auto make_pkey_text = [] (size_t pad_size) -> sstring {
        static int i = 0;
        return format("pkey_0x{:x}_{}", i++, make_random_string(pad_size));
    };
    // Choose min from several random keys
    // NOTE(review): template argument was lost in transit; restored as
    // dht::decorated_key (the type ss.make_pkey returns) -- confirm upstream.
    std::optional<dht::decorated_key> large_key;
    for (int i = 0; i < 10; ++i) {
        auto pk = ss.make_pkey(make_pkey_text(large_key_pad_size));
        if (!large_key || pk.less_compare(*s, *large_key)) {
            large_key = std::move(pk);
        }
    }
    std::vector<dht::decorated_key> small_keys; // only larger than *large_key
    while (small_keys.size() < n_small_keys) {
        auto pk = ss.make_pkey(make_pkey_text(small_key_pad_size));
        if (large_key->less_compare(*s, pk)) {
            small_keys.emplace_back(std::move(pk));
        }
    }
    std::sort(small_keys.begin(), small_keys.end(), dht::decorated_key::less_comparator(s));
    // Any allocation of at least one LSA segment counts as "large" below.
    seastar::memory::scoped_large_allocation_warning_threshold mtg(logalloc::segment_size);
    auto mt = make_lw_shared<memtable>(s);
    {
        mutation m(s, *large_key);
        ss.add_row(m, ss.make_ckey(0), "v");
        mt->apply(m);
    }
    for (auto&& key : small_keys) {
        mutation m(s, key);
        auto value = make_random_string(128);
        ss.add_row(m, ss.make_ckey(0), value);
        mt->apply(m);
    }
    sstables::test_env env;
    auto sst = env.make_sstable(s, dir.path().string(), 1 /* generation */, sstable_version_types::ka, sstables::sstable::format_types::big);
    sstable_writer_config cfg;
    cfg.large_data_handler = &nop_lp_handler;
    sst->write_components(mt->make_flat_reader(s), 1, s, cfg, mt->get_encoding_stats()).get();
    sst->load().get();
    auto pr = dht::partition_range::make_singular(small_keys[0]);
    auto mt_reader = mt->make_flat_reader(s, pr);
    mutation expected = *read_mutation_from_flat_mutation_reader(mt_reader, db::no_timeout).get0();
    auto t0 = std::chrono::steady_clock::now();
    auto large_allocs_before = memory::stats().large_allocations();
    auto sst_reader = sst->as_mutation_source().make_reader(s, pr);
    mutation actual = *read_mutation_from_flat_mutation_reader(sst_reader, db::no_timeout).get0();
    auto large_allocs_after = memory::stats().large_allocations();
    auto duration = std::chrono::steady_clock::now() - t0;
    BOOST_TEST_MESSAGE(format("Read took {:d} us", duration / 1us));
    assert_that(actual).is_equal_to(expected);
    // Reading through the huge index page must not have made any allocation
    // above the threshold set earlier.
    BOOST_REQUIRE_EQUAL(large_allocs_after - large_allocs_before, 0);
}

// For every (version, base schema) pair, writes mutations with the base schema
// and reads them back with a changed schema; results are cached per pair so
// each sstable is written only once.
SEASTAR_THREAD_TEST_CASE(test_schema_changes) {
    auto dir = tmpdir();
    storage_service_for_tests ssft;
    auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); });
    int gen = 1;
    // NOTE(review): the template arguments of this map were lost in transit; the
    // key/value tuple types are reconstructed from the find()/emplace()/std::get
    // usages below -- confirm against upstream.
    std::map<std::tuple<sstable_version_types, schema_ptr>, std::tuple<shared_sstable, int>> cache;
    for_each_schema_change([&] (schema_ptr base, const std::vector<mutation>& base_mutations,
                                schema_ptr changed, const std::vector<mutation>& changed_mutations) {
        for (auto version : all_sstable_versions) {
            auto it = cache.find(std::tuple { version, base });
            shared_sstable created_with_base_schema;
            shared_sstable created_with_changed_schema;
            sstables::test_env env;
            if (it == cache.end()) {
                auto mt = make_lw_shared<memtable>(base);
                for (auto& m : base_mutations) {
                    mt->apply(m);
                }
                created_with_base_schema = env.make_sstable(base, dir.path().string(), gen, version, sstables::sstable::format_types::big);
                sstable_writer_config cfg;
                cfg.large_data_handler = &nop_lp_handler;
                created_with_base_schema->write_components(mt->make_flat_reader(base), base_mutations.size(), base, cfg, mt->get_encoding_stats()).get();
                created_with_base_schema->load().get();
                // Second sstable object over the same files, but opened with the
                // changed schema.
                created_with_changed_schema = env.make_sstable(changed, dir.path().string(), gen, version, sstables::sstable::format_types::big);
                created_with_changed_schema->load().get();
                cache.emplace(std::tuple { version, base }, std::tuple { created_with_base_schema, gen });
                gen++;
            } else {
                created_with_base_schema = std::get<0>(it->second);
                created_with_changed_schema = env.make_sstable(changed, dir.path().string(), std::get<1>(it->second), version, sstables::sstable::format_types::big);
                created_with_changed_schema->load().get();
            }
            auto mr =
assert_that(created_with_base_schema->as_mutation_source() .make_reader(changed, dht::partition_range::make_open_ended_both_sides(), changed->full_slice())); for (auto& m : changed_mutations) { mr.produces(m); } mr.produces_end_of_stream(); mr = assert_that(created_with_changed_schema->as_mutation_source() .make_reader(changed, dht::partition_range::make_open_ended_both_sides(), changed->full_slice())); for (auto& m : changed_mutations) { mr.produces(m); } mr.produces_end_of_stream(); } }); } SEASTAR_THREAD_TEST_CASE(test_reading_serialization_header) { auto dir = tmpdir(); storage_service_for_tests ssft; auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); auto random_int32_value = [] { return int32_type->decompose(tests::random::get_int()); }; auto td = tests::data_model::table_description({ { "pk", int32_type } }, { { "ck", utf8_type } }); auto td1 = td; td1.add_static_column("s1", int32_type); td1.add_regular_column("v1", int32_type); td1.add_regular_column("v2", int32_type); auto built_schema = td1.build(); auto s = built_schema.schema; auto md1 = tests::data_model::mutation_description({ to_bytes("pk1") }); md1.add_clustered_row_marker({ to_bytes("ck1") }, -10); md1.add_clustered_cell({ to_bytes("ck1") }, "v1", random_int32_value()); auto m1 = md1.build(s); auto now = gc_clock::now(); auto ttl = gc_clock::duration(std::chrono::hours(1)); auto expiry_time = now + ttl; auto md2 = tests::data_model::mutation_description({ to_bytes("pk2") }); md2.add_static_expiring_cell("s1", random_int32_value(), ttl, expiry_time); auto m2 = md2.build(s); auto mt = make_lw_shared(s); mt->apply(m1); mt->apply(m2); auto md1_overwrite = tests::data_model::mutation_description({ to_bytes("pk1") }); md1_overwrite.add_clustered_row_marker({ to_bytes("ck1") }, 10); auto m1ow = md1_overwrite.build(s); mt->apply(m1ow); sstables::test_env env; { // SSTable class has way too many responsibilities. In particular, it mixes the reading and // writting parts. 
Let's use a separate objects for writing and reading to ensure that nothing // carries over that wouldn't normally be read from disk. auto sst = env.make_sstable(s, dir.path().string(), 1, sstable::version_types::mc, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 2, s, cfg, mt->get_encoding_stats()).get(); } auto sst = env.make_sstable(s, dir.path().string(), 1, sstable::version_types::mc, sstables::sstable::format_types::big); sst->load().get(); auto hdr = sst->get_serialization_header(); BOOST_CHECK_EQUAL(hdr.static_columns.elements.size(), 1); BOOST_CHECK_EQUAL(hdr.static_columns.elements[0].name.value, to_bytes("s1")); BOOST_CHECK_EQUAL(hdr.regular_columns.elements.size(), 2); BOOST_CHECK(hdr.regular_columns.elements[0].name.value == to_bytes("v1")); BOOST_CHECK(hdr.regular_columns.elements[1].name.value == to_bytes("v2")); BOOST_CHECK_EQUAL(hdr.get_min_timestamp(), -10); BOOST_CHECK_EQUAL(hdr.get_min_local_deletion_time(), expiry_time.time_since_epoch().count()); BOOST_CHECK_EQUAL(hdr.get_min_ttl(), ttl.count()); auto stats = sst->get_encoding_stats_for_compaction(); BOOST_CHECK(stats.min_local_deletion_time == expiry_time); BOOST_CHECK_EQUAL(stats.min_timestamp, 10); // Like Cassandra even if a row marker is not expiring we update the metadata with NO_TTL value // which is 0. 
BOOST_CHECK(stats.min_ttl == gc_clock::duration(0)); } SEASTAR_THREAD_TEST_CASE(test_merging_encoding_stats) { auto ecc = encoding_stats_collector{}; auto ec1 = encoding_stats{}; ecc.update(ec1); auto ec = ecc.get(); BOOST_CHECK_EQUAL(ec.min_timestamp, ec1.min_timestamp); BOOST_CHECK(ec.min_local_deletion_time == ec1.min_local_deletion_time); BOOST_CHECK(ec.min_ttl == ec1.min_ttl); ec1.min_timestamp = -10; ec1.min_local_deletion_time = gc_clock::now(); ec1.min_ttl = gc_clock::duration(std::chrono::hours(1)); ecc = encoding_stats_collector{}; ecc.update(ec1); ec = ecc.get(); BOOST_CHECK_EQUAL(ec.min_timestamp, ec1.min_timestamp); BOOST_CHECK(ec.min_local_deletion_time == ec1.min_local_deletion_time); BOOST_CHECK(ec.min_ttl == ec1.min_ttl); auto ec2 = encoding_stats{}; ec2.min_timestamp = -20; ec2.min_local_deletion_time = ec1.min_local_deletion_time - std::chrono::seconds(1); ec2.min_ttl = gc_clock::duration(std::chrono::hours(2)); ecc.update(ec2); ec = ecc.get(); BOOST_CHECK_EQUAL(ec.min_timestamp, -20); BOOST_CHECK(ec.min_local_deletion_time == ec2.min_local_deletion_time); BOOST_CHECK(ec.min_ttl == ec1.min_ttl); } // Reproducer for #4206 SEASTAR_THREAD_TEST_CASE(test_counter_header_size) { auto dir = tmpdir(); storage_service_for_tests ssft; auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); }); auto s = schema_builder("ks", "counter_test") .with_column("pk", int32_type, column_kind::partition_key) .with_column("ck", int32_type, column_kind::clustering_key) .with_column("c1", counter_type) .build(); auto pk = partition_key::from_single_value(*s, int32_type->decompose(0)); auto ck = clustering_key::from_single_value(*s, int32_type->decompose(0)); auto& col = *s->get_column_definition(utf8_type->decompose(sstring("c1"))); auto ids = std::vector(); for (auto i = 0; i < 128; i++) { ids.emplace_back(counter_id(utils::make_random_uuid())); } std::sort(ids.begin(), ids.end()); mutation m(s, pk); counter_cell_builder ccb; for (auto id : ids) { 
ccb.add_shard(counter_shard(id, 1, 1)); } m.set_clustered_cell(ck, col, ccb.build(api::new_timestamp())); auto mt = make_lw_shared(s); mt->apply(m); sstables::test_env env; for (const auto version : all_sstable_versions) { auto sst = env.make_sstable(s, dir.path().string(), 1, version, sstables::sstable::format_types::big); sstable_writer_config cfg; cfg.large_data_handler = &nop_lp_handler; sst->write_components(mt->make_flat_reader(s), 1, s, cfg, mt->get_encoding_stats()).get(); sst->load().get(); assert_that(sst->as_mutation_source().make_reader(s)) .produces(m) .produces_end_of_stream(); } }