sstables: Fix partition key count estimation for a range
The method sstable::estimated_keys_for_range() was severely
under-estimating the number of partitions in an sstable for a given
token range.
The first reason is that it underestimated the number of sstable index
pages covered by the range, by one. In extreme, if the requested range
falls into a single index page, we will assume 0 pages, and report 1
partition. The reason is that we were using
get_sample_indexes_for_range(), which returns entries with the keys
falling into the range, not entries for pages which may contain the
keys.
A single page can have a lot of partitions though. By default, there
is a 1:20000 ratio between summary entry size and the data file size
covered by it. If partitions are small, that can be many hundreds of
partitions.
Another reason is that we underestimate the number of partitions in an
index page. We multiply the number of pages by:
(downsampling::BASE_SAMPLING_LEVEL * _components->summary.header.min_index_interval)
/ _components->summary.header.sampling_level
Using defaults, that means multiplying by 128. In the cassandra-stress
workload a single partition takes about 300 bytes in the data file and
summary entry is 22 bytes. That means a single page covers 22 * 20'000
= 440'000 bytes of the data file, which contains about 1'466
partitions. So we underestimate by an order of magnitude.
Underestimating the number of partitions will result in too small
bloom filters being generated for the sstables which are the output of
repair or streaming. This will make the bloom filters ineffective
which results in reads selecting more sstables than necessary.
The fix is to base the estimation on the number of index pages which
may contain keys for the range, and multiply that by the average key
count per index page.
Fixes #5112.
Refs #4994.
The output of test_key_count_estimation:
Before:
count = 10000
est = 10112
est([-inf; +inf]) = 512
est([0; 0]) = 128
est([0; 63]) = 128
est([0; 255]) = 128
est([0; 511]) = 128
est([0; 1023]) = 128
est([0; 4095]) = 256
est([0; 9999]) = 512
est([5000; 5000]) = 1
est([5000; 5063]) = 1
est([5000; 5255]) = 1
est([5000; 5511]) = 1
est([5000; 6023]) = 128
est([5000; 9095]) = 256
est([5000; 9999]) = 256
est(non-overlapping to the left) = 1
est(non-overlapping to the right) = 1
After:
count = 10000
est = 10112
est([-inf; +inf]) = 10112
est([0; 0]) = 2528
est([0; 63]) = 2528
est([0; 255]) = 2528
est([0; 511]) = 2528
est([0; 1023]) = 2528
est([0; 4095]) = 5056
est([0; 9999]) = 10112
est([5000; 5000]) = 2528
est([5000; 5063]) = 2528
est([5000; 5255]) = 2528
est([5000; 5511]) = 2528
est([5000; 6023]) = 5056
est([5000; 9095]) = 7584
est([5000; 9999]) = 7584
est(non-overlapping to the left) = 0
est(non-overlapping to the right) = 0
Tests:
- unit (dev)
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190927141339.31315-1-tgrabiec@scylladb.com>
This commit is contained in:
committed by
Avi Kivity
parent
10f90d0e25
commit
b93cc21a94
@@ -3091,6 +3091,56 @@ std::optional<std::pair<uint64_t, uint64_t>> sstable::get_sample_indexes_for_ran
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a pair of positions [p1, p2) in the summary file corresponding to
|
||||
* pages which may include keys covered by the specified range, or a disengaged
|
||||
* optional if the sstable does not include any keys from the range.
|
||||
*/
|
||||
std::optional<std::pair<uint64_t, uint64_t>> sstable::get_index_pages_for_range(const dht::token_range& range) {
|
||||
const auto& entries = _components->summary.entries;
|
||||
auto entries_size = entries.size();
|
||||
index_comparator cmp(*_schema);
|
||||
dht::ring_position_comparator rp_cmp(*_schema);
|
||||
uint64_t left = 0;
|
||||
if (range.start()) {
|
||||
dht::ring_position_view pos = range.start()->is_inclusive()
|
||||
? dht::ring_position_view::starting_at(range.start()->value())
|
||||
: dht::ring_position_view::ending_at(range.start()->value());
|
||||
|
||||
// There is no summary entry for the last key, so in order to determine
|
||||
// if pos overlaps with the sstable or not we have to compare with the
|
||||
// last key.
|
||||
if (rp_cmp(pos, get_last_decorated_key()) > 0) {
|
||||
// left is past the end of the sampling.
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
left = std::distance(std::begin(entries),
|
||||
std::lower_bound(entries.begin(), entries.end(), pos, cmp));
|
||||
|
||||
if (left) {
|
||||
--left;
|
||||
}
|
||||
}
|
||||
uint64_t right = entries_size;
|
||||
if (range.end()) {
|
||||
dht::ring_position_view pos = range.end()->is_inclusive()
|
||||
? dht::ring_position_view::ending_at(range.end()->value())
|
||||
: dht::ring_position_view::starting_at(range.end()->value());
|
||||
|
||||
right = std::distance(std::begin(entries),
|
||||
std::lower_bound(entries.begin(), entries.end(), pos, cmp));
|
||||
if (right == 0) {
|
||||
// The first key is strictly greater than right.
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
if (left < right) {
|
||||
return std::optional<std::pair<uint64_t, uint64_t>>(std::in_place_t(), left, right);
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::vector<dht::decorated_key> sstable::get_key_samples(const schema& s, const dht::token_range& range) {
|
||||
auto index_range = get_sample_indexes_for_range(range);
|
||||
std::vector<dht::decorated_key> res;
|
||||
@@ -3104,10 +3154,15 @@ std::vector<dht::decorated_key> sstable::get_key_samples(const schema& s, const
|
||||
}
|
||||
|
||||
uint64_t sstable::estimated_keys_for_range(const dht::token_range& range) {
|
||||
auto sample_index_range = get_sample_indexes_for_range(range);
|
||||
uint64_t sample_key_count = sample_index_range ? sample_index_range->second - sample_index_range->first : 0;
|
||||
// adjust for the current sampling level
|
||||
uint64_t estimated_keys = sample_key_count * ((downsampling::BASE_SAMPLING_LEVEL * _components->summary.header.min_index_interval) / _components->summary.header.sampling_level);
|
||||
auto page_range = get_index_pages_for_range(range);
|
||||
if (!page_range) {
|
||||
return 0;
|
||||
}
|
||||
using uint128_t = unsigned __int128;
|
||||
uint64_t range_pages = page_range->second - page_range->first;
|
||||
auto total_keys = get_estimated_key_count();
|
||||
auto total_pages = _components->summary.entries.size();
|
||||
uint64_t estimated_keys = (uint128_t)range_pages * total_keys / total_pages;
|
||||
return std::max(uint64_t(1), estimated_keys);
|
||||
}
|
||||
|
||||
|
||||
@@ -656,6 +656,7 @@ private:
|
||||
composite::eoc marker = composite::eoc::none);
|
||||
|
||||
std::optional<std::pair<uint64_t, uint64_t>> get_sample_indexes_for_range(const dht::token_range& range);
|
||||
std::optional<std::pair<uint64_t, uint64_t>> get_index_pages_for_range(const dht::token_range& range);
|
||||
|
||||
std::vector<unsigned> compute_shards_for_this_sstable() const;
|
||||
template <typename Components>
|
||||
|
||||
@@ -393,7 +393,7 @@ SEASTAR_THREAD_TEST_CASE(read_partial_range_2) {
|
||||
}
|
||||
|
||||
static
|
||||
mutation_source make_sstable_mutation_source(sstables::test_env& env, schema_ptr s, sstring dir, std::vector<mutation> mutations,
|
||||
shared_sstable make_sstable(sstables::test_env& env, schema_ptr s, sstring dir, std::vector<mutation> mutations,
|
||||
sstable_writer_config cfg, sstables::sstable::version_types version, gc_clock::time_point query_time = gc_clock::now()) {
|
||||
auto sst = env.make_sstable(s,
|
||||
dir,
|
||||
@@ -412,7 +412,13 @@ mutation_source make_sstable_mutation_source(sstables::test_env& env, schema_ptr
|
||||
sst->write_components(mt->make_flat_reader(s), mutations.size(), s, cfg, mt->get_encoding_stats()).get();
|
||||
sst->load().get();
|
||||
|
||||
return as_mutation_source(sst);
|
||||
return sst;
|
||||
}
|
||||
|
||||
static
|
||||
mutation_source make_sstable_mutation_source(sstables::test_env& env, schema_ptr s, sstring dir, std::vector<mutation> mutations,
|
||||
sstable_writer_config cfg, sstables::sstable::version_types version, gc_clock::time_point query_time = gc_clock::now()) {
|
||||
return as_mutation_source(make_sstable(env, s, dir, std::move(mutations), cfg, version, query_time));
|
||||
}
|
||||
|
||||
// Must be run in a seastar thread
|
||||
@@ -1411,6 +1417,73 @@ SEASTAR_TEST_CASE(test_no_index_reads_when_rows_fall_into_range_boundaries) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_key_count_estimation) {
|
||||
return seastar::async([] {
|
||||
auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); });
|
||||
for (const auto version : all_sstable_versions) {
|
||||
storage_service_for_tests ssft;
|
||||
simple_schema ss;
|
||||
auto s = ss.schema();
|
||||
|
||||
int count = 10'000;
|
||||
std::vector<dht::decorated_key> all_pks = ss.make_pkeys(count + 2);
|
||||
std::vector<dht::decorated_key> pks(all_pks.begin() + 1, all_pks.end() - 1);
|
||||
std::vector<mutation> muts;
|
||||
for (auto pk : pks) {
|
||||
mutation m(ss.schema(), pk);
|
||||
ss.add_row(m, ss.make_ckey(1), "v");
|
||||
muts.push_back(std::move(m));
|
||||
}
|
||||
|
||||
tmpdir dir;
|
||||
sstables::test_env env;
|
||||
shared_sstable sst = make_sstable(env, s, dir.path().string(), muts, sstable_writer_config{}, version);
|
||||
|
||||
auto max_est = sst->get_estimated_key_count();
|
||||
BOOST_TEST_MESSAGE(format("count = {}", count));
|
||||
BOOST_TEST_MESSAGE(format("est = {}", max_est));
|
||||
|
||||
{
|
||||
auto est = sst->estimated_keys_for_range(dht::token_range::make_open_ended_both_sides());
|
||||
BOOST_TEST_MESSAGE(format("est([-inf; +inf]) = {}", est));
|
||||
BOOST_REQUIRE_EQUAL(est, sst->get_estimated_key_count());
|
||||
}
|
||||
|
||||
for (int size : {1, 64, 256, 512, 1024, 4096, count}) {
|
||||
auto r = dht::token_range::make(pks[0].token(), pks[size - 1].token());
|
||||
auto est = sst->estimated_keys_for_range(r);
|
||||
BOOST_TEST_MESSAGE(format("est([0; {}] = {}", size - 1, est));
|
||||
BOOST_REQUIRE_GE(est, size);
|
||||
BOOST_REQUIRE_LE(est, max_est);
|
||||
}
|
||||
|
||||
for (int size : {1, 64, 256, 512, 1024, 4096, count}) {
|
||||
auto lower = 5000;
|
||||
auto upper = std::min(count - 1, lower + size - 1);
|
||||
auto r = dht::token_range::make(pks[lower].token(), pks[upper].token());
|
||||
auto est = sst->estimated_keys_for_range(r);
|
||||
BOOST_TEST_MESSAGE(format("est([{}; {}]) = {}", lower, upper, est));
|
||||
BOOST_REQUIRE_GE(est, upper - lower + 1);
|
||||
BOOST_REQUIRE_LE(est, max_est);
|
||||
}
|
||||
|
||||
{
|
||||
auto r = dht::token_range::make(all_pks[0].token(), all_pks[0].token());
|
||||
auto est = sst->estimated_keys_for_range(r);
|
||||
BOOST_TEST_MESSAGE(format("est(non-overlapping to the left) = {}", est));
|
||||
BOOST_REQUIRE_EQUAL(est, 0);
|
||||
}
|
||||
|
||||
{
|
||||
auto r = dht::token_range::make(all_pks[all_pks.size() - 1].token(), all_pks[all_pks.size() - 1].token());
|
||||
auto est = sst->estimated_keys_for_range(r);
|
||||
BOOST_TEST_MESSAGE(format("est(non-overlapping to the right) = {}", est));
|
||||
BOOST_REQUIRE_EQUAL(est, 0);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_large_index_pages_do_not_cause_large_allocations) {
|
||||
auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); });
|
||||
|
||||
|
||||
Reference in New Issue
Block a user