/* * Copyright (C) 2023-present ScyllaDB */ /* * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 */ #include #undef SEASTAR_TESTING_MAIN #include #include "test/lib/cql_test_env.hh" #include "test/lib/log.hh" #include "db/config.hh" BOOST_AUTO_TEST_SUITE(cache_algorithm_test) // These tests are slow, and tuned to a particular amount of memory // (and --memory is ignored in debug mode). // Hence they are not run in debug. #ifndef SEASTAR_DEBUG // The problem with naive index caching is that every uncached read drags a full // index page into the cache. But the index page can be orders of magnitude bigger // than the ingested row. Depending on the workload, this can effectively bloat the // memory usage of every cached row by orders of magnitude, and ruin cache // effectiveness. // // This test checks for the above problem. // // The table created by the test has the following properties: // - There is only one column in the schema -- the partition key -- // because this results in the biggest `index page size`:`row size` ratio, // due to details of the SSTable format. // This makes the effects drastic, and so easy to test. // - The total size of all keys is at least 2 times greater than the size of RAM. // This ensures that most of the index is uncached. This is necessary for // the issue to become visible. // - The size of user data (1000 bytes per partition) is significantly bigger // than several hundred bytes, to make various constant overheads // (per-cell, per-row, per-partition) smaller than the size of user data. // This simplifies reasoning about the test. // In particular, it should ensure that each index page contains about 2000 keys, // so it has size about 2 MiB. // // After populating this table, the test reads (sequentially) a subset of 1000 rows // multiple times. Since the total size of this hot subset (including overheads) is // only about 1 MiB, the test expects it to be perfectly cached. 
// This should be true unless index cache is flooding the cache. SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) { cql_test_config cfg; // Prevents an unnecessary sleep at the end of the test. cfg.db_config->task_ttl_seconds.set(0); // Scylla refreshes query permissions periodically. // This causes background queries to system_auth.roles, which in turn can cause // spurious cache misses to appear in cache_tracker stats, which breaks // the assumptions of the test. // The lines below effectively disable the permission refresh to get rid of that problem. cfg.db_config->permissions_validity_in_ms.set(uint32_t(-1)); cfg.db_config->permissions_update_interval_in_ms.set(uint32_t(-1)); // As of this writing, uncommenting the below should make the test fail. // cfg.db_config->index_cache_fraction.set(1.0); return do_with_cql_env_thread([] (cql_test_env& e) { // We disable compactions because they cause confusing cache mispopulations. // We disable compression because the sstable writer targets a specific // (*compressed* data file size : summary file size) ratio, // so the number of keys per index page becomes hard to control, // and might be arbitrarily large. e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get(); auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get(); auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get(); constexpr uint64_t pk_number = 600000; constexpr uint64_t pk_size = 1000; // Sanity check. The test assumes that the total index size is significantly bigger than RAM. BOOST_REQUIRE_GT(pk_size * pk_number, 2 * seastar::memory::stats().total_memory()); // A bijection between uint64_t and blobs of size pk_size. auto make_key = [pk_size] (uint64_t x) { bytes b(std::max(pk_size, sizeof(x)), '\0'); auto i = b.begin(); write(i, x); return b; }; // Populate the table. 
for (size_t i = 0; i < pk_number; ++i) { e.execute_prepared(insert_query, {{cql3::raw_value::make_value(make_key(i))}}).get(); } // Flushing makes reasoning easier. e.db().invoke_on_all(&replica::database::flush_all_memtables).get(); int retries = 0; constexpr int max_retries = 100; retry: constexpr uint64_t hot_subset_size = 1000; // Sanity check. The test assumes that the total *hot* index size is significantly bigger than RAM. // // data_summary_ratio is the target `data file size : summary file size` ratio. // In a table containing only primary keys, the approximate size of an index page is `pk_size * data_summary_ratio`. // // The sanity check here is that the maximum total size of the touched index pages is much greater than RAM. // (Maximum is reached when each hot row lands on a different index page.) const uint64_t data_summary_ratio = static_cast(1 / e.local_db().get_config().sstable_summary_ratio()); BOOST_REQUIRE_GT(hot_subset_size * pk_size * data_summary_ratio, 2 * seastar::memory::stats().total_memory()); auto get_misses = [&e] { return e.local_db().row_cache_tracker().get_stats().partition_misses; }; uint64_t misses_before = get_misses(); for (size_t i = 0; i < hot_subset_size; ++i) { e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(hot_subset_size * retries + i))}}).get(); } misses_before = get_misses(); uint64_t reads_before = e.local_db().row_cache_tracker().get_stats().reads; uint64_t reads_expected = reads_before; // The rows we just read have a small total size. They should be perfectly cached. for (size_t repeat = 0; repeat < 3; ++repeat) { for (size_t i = 0; i < hot_subset_size; ++i) { e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(hot_subset_size * retries + i))}}).get(); ++reads_expected; // If the rows were perfectly cached, there were no new misses. 
uint64_t reads_after = e.local_db().row_cache_tracker().get_stats().reads_done; uint64_t misses_after = get_misses(); if (get_misses() != misses_before) { // Cache misses are allowed only if they were done by something outside of the test. BOOST_REQUIRE_GT(reads_after, reads_expected); if (retries < max_retries) { ++retries; testlog.warn("Detected extra cache misses (actual={}, expected={}, repeat={}, i={}), but they can be explained by extra reads (after={}, before={}, expected={}) done by something in the background, so retrying. (retries={})", misses_after, misses_before, repeat, i, reads_after, reads_before, reads_expected, retries); goto retry; } else { BOOST_FAIL("Test failed due to too much background noise"); } } BOOST_REQUIRE_EQUAL(get_misses(), misses_before); } } }, std::move(cfg)); } // The previous test checks that index_cache_fraction doesn't allow index cache to // flood the memory. // // This test checks that it doesn't completely kill caching. SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) { cql_test_config cfg; // Prevents an unnecessary sleep at the end of the test. cfg.db_config->task_ttl_seconds.set(0); // Scylla refreshes query permissions periodically. // This causes background queries to system_auth.roles, which in turn can cause // spurious cache misses to appear in cache_tracker stats, which breaks // the assumptions of the test. // The lines below effectively disable the permission refresh to get rid of that problem. cfg.db_config->permissions_validity_in_ms.set(uint32_t(-1)); cfg.db_config->permissions_update_interval_in_ms.set(uint32_t(-1)); // As of this writing, uncommenting the below should make the test fail. // cfg.db_config->index_cache_fraction.set(0.0); return do_with_cql_env_thread([] (cql_test_env& e) { // We disable compactions because they cause confusing cache mispopulations. 
// We disable compression because the sstable writer targets a specific // (*compressed* data file size : summary file size) ratio, // so the number of keys per index page becomes hard to control, // and might be arbitrarily large. e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get(); auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get(); auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get(); constexpr uint64_t pk_number = 10; constexpr uint64_t ck_number = 600; constexpr uint64_t v_size = 100000; // Sanity check. The test assumes that the total table size is significantly bigger than RAM. BOOST_REQUIRE_GT(pk_number * ck_number * v_size, 2 * seastar::memory::stats().total_memory()); // A bijection between uint64_t and blobs of size x. auto make_key = [] (uint64_t x) { bytes b(bytes::initialized_later(), sizeof(x)); auto i = b.begin(); write(i, x); return b; }; // Populate the table. for (size_t pk = 0; pk < pk_number; ++pk) { for (size_t ck = 0; ck < ck_number; ++ck) { e.execute_prepared(insert_query, {{cql3::raw_value::make_value(make_key(pk))}, {cql3::raw_value::make_value(make_key(ck))}, {cql3::raw_value::make_value(bytes(v_size, 0))}}).get(); } } // Flushing makes reasoning easier. e.db().invoke_on_all(&replica::database::flush_all_memtables).get(); // Populate the index cache. for (size_t ck = 0; ck < ck_number; ++ck) { for (size_t pk = 0; pk < pk_number; ++pk) { e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(pk))}, {cql3::raw_value::make_value(make_key(ck))}}).get(); } } int retries = 0; constexpr int max_retries = 100; retry: // The index is small and used once every few reads, so it should be perfectly cached. 
auto get_misses = [&e] { return e.local_db().row_cache_tracker().get_partition_index_cache_stats().misses; }; uint64_t misses_before = get_misses(); uint64_t reads_before = e.local_db().row_cache_tracker().get_stats().reads; uint64_t reads_expected = reads_before; for (size_t ck = 0; ck < ck_number; ++ck) { for (size_t pk = 0; pk < pk_number; ++pk) { e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(pk))}, {cql3::raw_value::make_value(make_key(ck))}}).get(); ++reads_expected; } } uint64_t reads_after = e.local_db().row_cache_tracker().get_stats().reads_done; uint64_t misses_after = get_misses(); if (misses_after != misses_before) { // Cache misses are allowed only if they were done by something outside of the test. BOOST_REQUIRE_GT(reads_after, reads_expected); if (retries < max_retries) { ++retries; testlog.warn("Detected extra cache misses (actual={}, expected={}), but they can be explained by extra reads (after={}, before={}, expected={}) done by something in the background, so retrying. (retries={})", misses_after, misses_before, reads_after, reads_before, reads_expected, retries); goto retry; } else { BOOST_FAIL("Test failed due to too much background noise"); } } BOOST_REQUIRE_EQUAL(get_misses(), misses_before); }, std::move(cfg)); } #endif BOOST_AUTO_TEST_SUITE_END()