/* * Copyright (C) 2017 ScyllaDB */ /* * This file is part of Scylla. * * Scylla is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Scylla is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Scylla. If not, see . */ #include #include "tests/cql_test_env.hh" #include "tests/perf/perf.hh" #include "core/app-template.hh" #include "schema_builder.hh" #include "database.hh" #include "db/config.hh" #include "partition_slice_builder.hh" #include #include "transport/messages/result_message.hh" #include "disk-error-handler.hh" thread_local disk_error_signal_type commit_error; thread_local disk_error_signal_type general_disk_error; using namespace std::chrono_literals; reactor::io_stats s; struct metrics_snapshot { std::chrono::high_resolution_clock::time_point hr_clock; steady_clock_type::duration busy_time; steady_clock_type::duration idle_time; reactor::io_stats io; sstables::shared_index_lists::stats index; metrics_snapshot() { reactor& r = *local_engine; io = r.get_io_stats(); busy_time = r.total_busy_time(); idle_time = r.total_idle_time(); hr_clock = std::chrono::high_resolution_clock::now(); index = sstables::shared_index_lists::shard_stats(); } }; class make_printable { using func_type = std::function; func_type _func; public: make_printable(func_type func) : _func(std::move(func)) {} friend std::ostream& operator<<(std::ostream& out, const make_printable& p) { p._func(out); return out; } }; struct test_result { uint64_t fragments_read; metrics_snapshot before; metrics_snapshot after; test_result(metrics_snapshot before, uint64_t fragments_read) : fragments_read(fragments_read) , before(before) { } double duration_in_seconds() const { return std::chrono::duration(after.hr_clock - before.hr_clock).count(); } double fragment_rate() const { return double(fragments_read) / duration_in_seconds(); } uint64_t aio_reads() const { return after.io.aio_reads - before.io.aio_reads; } uint64_t aio_read_bytes() const { return after.io.aio_read_bytes - before.io.aio_read_bytes; } uint64_t read_aheads_discarded() const { return after.io.fstream_read_aheads_discarded - before.io.fstream_read_aheads_discarded; } uint64_t reads_blocked() const { return after.io.fstream_reads_blocked - before.io.fstream_reads_blocked; } uint64_t index_hits() const { return after.index.hits - before.index.hits; } uint64_t index_misses() const { return after.index.misses - before.index.misses; } uint64_t index_blocks() const { return after.index.blocks - before.index.blocks; } float cpu_utilization() const { auto busy_delta = after.busy_time.count() - before.busy_time.count(); auto idle_delta = after.idle_time.count() - before.idle_time.count(); return float(busy_delta) / (busy_delta + idle_delta); } static auto table_header() { return make_printable([] (std::ostream& out) { out << sprint("%10s %9s %10s %6s %10s %7s %7s %8s %8s %8s %6s", "time [s]", "frags", "frag/s", "aio", "[KiB]", "blocked", "dropped", "idx hit", "idx miss", "idx blk", "cpu"); }); } auto table_row() { return make_printable([this] (std::ostream& out) { out << sprint("%10.6f %9d %10.0f %6d %10d %7d %7d %8d %8d %8d %5.1f%%", duration_in_seconds(), fragments_read, fragment_rate(), aio_reads(), aio_read_bytes() / 1024, reads_blocked(), read_aheads_discarded(), index_hits(), index_misses(), index_blocks(), cpu_utilization() * 100); }); } }; static uint64_t consume_all(streamed_mutation& sm) { uint64_t fragments = 0; while (1) { mutation_fragment_opt mfo = sm().get0(); if (!mfo) { break; } ++fragments; } return fragments; } static uint64_t consume_all(mutation_reader& rd) { uint64_t fragments = 0; while (1) { streamed_mutation_opt smo = rd().get0(); if (!smo) { break; } fragments += consume_all(*smo); } return fragments; } // cf should belong to ks.test static test_result scan_rows_with_stride(column_family& cf, int n_rows, int n_read = 1, int n_skip = 0) { auto rd = cf.make_reader(cf.schema(), query::full_partition_range, query::full_slice, default_priority_class(), nullptr, n_skip ? streamed_mutation::forwarding::yes : streamed_mutation::forwarding::no); metrics_snapshot before; streamed_mutation_opt smo = rd().get0(); assert(smo); streamed_mutation& sm = *smo; uint64_t fragments = 0; int ck = 0; while (ck < n_rows) { if (n_skip) { sm.fast_forward_to(position_range( position_in_partition(position_in_partition::clustering_row_tag_t(), clustering_key::from_singular(*cf.schema(), ck)), position_in_partition(position_in_partition::clustering_row_tag_t(), clustering_key::from_singular(*cf.schema(), ck + n_read)) )).get(); } fragments += consume_all(sm); ck += n_read + n_skip; } return {before, fragments}; } static dht::decorated_key make_pkey(const schema& s, int n) { return dht::global_partitioner().decorate_key(s, partition_key::from_singular(s, n)); } std::vector make_pkeys(schema_ptr s, int n) { std::vector keys; for (int i = 0; i < n; ++i) { keys.push_back(make_pkey(*s, i)); } std::sort(keys.begin(), keys.end(), dht::decorated_key::less_comparator(s)); return keys; } static test_result scan_with_stride_partitions(column_family& cf, int n, int n_read = 1, int n_skip = 0) { auto keys = make_pkeys(cf.schema(), n + n_read); int pk = 0; auto rd = cf.make_reader(cf.schema(), n_skip ? dht::partition_range::make_ending_with(dht::partition_range::bound(keys[0], false)) // covering none : query::full_partition_range, query::full_slice); metrics_snapshot before; uint64_t fragments = 0; while (pk < n) { // FIXME: fast_forward_to() cannot be called on a reader from which nothing was read yet. if (pk && n_skip) { rd.fast_forward_to(dht::partition_range( dht::partition_range::bound(keys[pk], true), dht::partition_range::bound(keys[pk + n_read], false) )).get(); } fragments += consume_all(rd); pk += n_read + n_skip; } return {before, fragments}; } static test_result slice_rows(column_family& cf, int offset = 0, int n_read = 1) { auto rd = cf.make_reader(cf.schema(), query::full_partition_range, query::full_slice, default_priority_class(), nullptr, streamed_mutation::forwarding::yes); metrics_snapshot before; streamed_mutation_opt smo = rd().get0(); assert(smo); streamed_mutation& sm = *smo; sm.fast_forward_to(position_range( position_in_partition::for_key(clustering_key::from_singular(*cf.schema(), offset)), position_in_partition::for_key(clustering_key::from_singular(*cf.schema(), offset + n_read)))).get(); uint64_t fragments = consume_all(sm); fragments += consume_all(rd); return {before, fragments}; } // cf is for ks.small_part static test_result slice_partitions(column_family& cf, int n, int offset = 0, int n_read = 1) { auto keys = make_pkeys(cf.schema(), n + n_read); auto pr = dht::partition_range( dht::partition_range::bound(keys[offset], true), dht::partition_range::bound(keys[offset + n_read], false) ); auto rd = cf.make_reader(cf.schema(), pr, query::full_slice); metrics_snapshot before; uint64_t fragments = consume_all(rd); return {before, fragments}; } static bytes make_blob(size_t blob_size) { static thread_local std::independent_bits_engine random_bytes; bytes big_blob(bytes::initialized_later(), blob_size); for (auto&& b : big_blob) { b = random_bytes(); } return big_blob; } struct table_config { sstring name; int n_rows; int value_size; }; static void drop_keyspace_if_exists(cql_test_env& env, sstring name) { try { env.local_db().find_keyspace(name); std::cout << "Dropping keyspace...\n"; env.execute_cql("drop keyspace ks;").get(); } catch (const no_such_keyspace&) { // expected } } static table_config read_config(cql_test_env& env, const sstring& name) { auto msg = env.execute_cql(sprint("select n_rows, value_size from ks.config where name = '%s'", name)).get0(); auto rows = dynamic_pointer_cast(msg); if (rows->rs().size() < 1) { throw std::runtime_error("config not found. Did you run --populate ?"); } const std::vector& config_row = rows->rs().rows()[0]; if (config_row.size() != 2) { throw std::runtime_error("config row has invalid size"); } auto n_rows = value_cast(int32_type->deserialize(*config_row[0])); auto value_size = value_cast(int32_type->deserialize(*config_row[1])); return {name, n_rows, value_size}; } static void populate(cql_test_env& env, table_config cfg) { drop_keyspace_if_exists(env, "ks"); env.execute_cql("CREATE KEYSPACE ks WITH REPLICATION = {'class' : 'SimpleStrategy', 'replication_factor' : 1};").get(); std::cout << "Saving test config...\n"; env.execute_cql("create table config (name text primary key, n_rows int, value_size int)").get(); env.execute_cql(sprint("insert into ks.config (name, n_rows, value_size) values ('%s', %d, %d)", cfg.name, cfg.n_rows, cfg.value_size)).get(); std::cout << "Creating test tables...\n"; // Large partition with lots of rows env.execute_cql("create table test (pk int, ck int, value blob, primary key (pk, ck))" " WITH compression = { 'sstable_compression' : '' };").get(); database& db = env.local_db(); { std::cout << "Populating ks.test with " << cfg.n_rows << " rows..."; auto insert_id = env.prepare("update test set \"value\" = ? where \"pk\" = 0 and \"ck\" = ?;").get0(); for (int ck = 0; ck < cfg.n_rows; ++ck) { env.execute_prepared(insert_id, {{ cql3::raw_value::make_value(data_value(make_blob(cfg.value_size)).serialize()), cql3::raw_value::make_value(data_value(ck).serialize()) }}).get(); } column_family& cf = db.find_column_family("ks", "test"); std::cout << "flushing...\n"; cf.flush().get(); std::cout << "compacting...\n"; cf.compact_all_sstables().get(); } // Small partitions, but lots env.execute_cql("create table small_part (pk int, value blob, primary key (pk))" " WITH compression = { 'sstable_compression' : '' };").get(); { std::cout << "Populating small_part with " << cfg.n_rows << " partitions..."; auto insert_id = env.prepare("update small_part set \"value\" = ? where \"pk\" = ?;").get0(); for (int pk = 0; pk < cfg.n_rows; ++pk) { env.execute_prepared(insert_id, {{ cql3::raw_value::make_value(data_value(make_blob(cfg.value_size)).serialize()), cql3::raw_value::make_value(data_value(pk).serialize()) }}).get(); } column_family& cf = db.find_column_family("ks", "small_part"); std::cout << "flushing...\n"; cf.flush().get(); std::cout << "compacting...\n"; cf.compact_all_sstables().get(); } } int main(int argc, char** argv) { namespace bpo = boost::program_options; app_template app; app.add_options() ("populate", "populate the table") ("verbose", "Enables more logging") ("trace", "Enables trace-level logging") ("rows", bpo::value()->default_value(1000000), "Number of CQL rows in a partition. Relevant only for population.") ("value-size", bpo::value()->default_value(100), "Size of value stored in a cell. Relevant only for population.") ("name", bpo::value()->default_value("default"), "Name of the configuration") ; return app.run(argc, argv, [&app] { db::config cfg; cfg.enable_cache = false; cfg.enable_commitlog = false; cfg.data_file_directories({ "./perf_large_partition_data" }, db::config::config_source::CommandLine); if (!app.configuration().count("verbose")) { logging::logger_registry().set_all_loggers_level(seastar::log_level::warn); } if (app.configuration().count("trace")) { logging::logger_registry().set_logger_level("sstable", seastar::log_level::trace); } std::cout << "Data directory: " << cfg.data_file_directories() << "\n"; return do_with_cql_env([&app] (cql_test_env& env) { return seastar::async([&app, &env] { sstring name = app.configuration()["name"].as(); if (app.configuration().count("populate")) { int n_rows = app.configuration()["rows"].as(); int value_size = app.configuration()["value-size"].as(); table_config cfg{name, n_rows, value_size}; populate(env, cfg); } else { database& db = env.local_db(); column_family& cf = db.find_column_family("ks", "test"); auto cfg = read_config(env, name); std::cout << "Config: rows: " << cfg.n_rows << ", value size: " << cfg.value_size << "\n"; ::sleep(1s).get(); // wait for system table flushes to quiesce cf.run_with_compaction_disabled([&] { return seastar::async([&] { { std::cout << "Testing scanning large partition with skips. \n" << "Reads whole range interleaving reads with skips according to read-skip pattern:\n"; std::cout << sprint("%-7s %-7s ", "read", "skip") << test_result::table_header() << "\n"; auto test = [&] (int n_read, int n_skip) { auto r = scan_rows_with_stride(cf, cfg.n_rows, n_read, n_skip); std::cout << sprint("%-7d %-7d ", n_read, n_skip) << r.table_row() << "\n"; }; test(1, 0); test(1, 1); test(1, 8); test(1, 16); test(1, 32); test(1, 64); test(1, 256); test(1, 1024); test(1, 4096); test(64, 1); test(64, 8); test(64, 16); test(64, 32); test(64, 64); test(64, 256); test(64, 1024); test(64, 4096); } { std::cout << "Testing slicing of large partition:\n"; std::cout << sprint("%-7s %-7s ", "offset", "read") << test_result::table_header() << "\n"; auto test = [&] (int offset, int read) { auto r = slice_rows(cf, offset, read); std::cout << sprint("%-7d %-7d ", offset, read) << r.table_row() << "\n"; }; test(0, 1); test(0, 32); test(0, 256); test(0, 4096); test(cfg.n_rows / 2, 1); test(cfg.n_rows / 2, 32); test(cfg.n_rows / 2, 256); test(cfg.n_rows / 2, 4096); } }); }).get(); column_family& cf2 = db.find_column_family("ks", "small_part"); cf2.run_with_compaction_disabled([&] { return seastar::async([&] { { std::cout << "Testing scanning small partitions with skips. \n" << "Reads whole range interleaving reads with skips according to read-skip pattern:\n"; std::cout << sprint("%-7s %-7s ", "read", "skip") << test_result::table_header() << "\n"; auto test = [&] (int n_read, int n_skip) { auto r = scan_with_stride_partitions(cf2, cfg.n_rows, n_read, n_skip); std::cout << sprint("%-7d %-7d ", n_read, n_skip) << r.table_row() << "\n"; }; test(1, 0); test(1, 1); test(1, 8); test(1, 16); test(1, 32); test(1, 64); test(1, 256); test(1, 1024); test(1, 4096); test(64, 1); test(64, 8); test(64, 16); test(64, 32); test(64, 64); test(64, 256); test(64, 1024); test(64, 4096); } { std::cout << "Testing slicing small partitions:\n"; std::cout << sprint("%-7s %-7s ", "offset", "read") << test_result::table_header() << "\n"; auto test = [&] (int offset, int read) { auto r = slice_partitions(cf2, cfg.n_rows, offset, read); std::cout << sprint("%-7d %-7d ", offset, read) << r.table_row() << "\n"; }; test(0, 1); test(0, 32); test(0, 256); test(0, 4096); test(cfg.n_rows / 2, 1); test(cfg.n_rows / 2, 32); test(cfg.n_rows / 2, 256); test(cfg.n_rows / 2, 4096); } }); }).get(); } }); }, cfg); }); }