mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-21 17:10:35 +00:00
Historically, we had two index_readers per a sstable_mutation_reader, one for the lower bound and one for the upper bound. Most of public members of the index_reader class were only called on either of those. With the changes introduced in #2981, two readers are even more tied together as they now have a shared-per-pair list of index pages that needs proper cleanup and was protruding woefully into the caller code. This fix re-structures index_reader so that it now keeps track of both lower and upper bounds. The shared_index_lists structure is encapsulated within index_reader and becomes an internal detail rather than a liability. Fixes #3220. Tests: unit (debug, release) + Tested using cassandra-stress commands from #3189. perf_fast_forward results indicate there is no performance degradation caused by thix fix. =========================== Baseline =================================== running: large-partition-skips Testing scanning large partition with skips. Reads whole range interleaving reads with skips according to read-skip pattern: read skip time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 1 0 0.494458 1000000 2022418 1018 126960 27 0 0 0 0 0 0 0 97.6% 1 1 1.754717 500000 284946 997 127064 6 0 0 3 3 0 0 0 99.9% 1 8 0.551664 111112 201413 997 127064 6 0 0 3 3 0 0 0 99.7% 1 16 0.383888 58824 153232 1001 127080 10 0 0 5 5 0 0 0 99.5% 1 32 0.289073 30304 104832 997 127064 28 0 0 3 3 0 0 0 99.3% 1 64 0.236963 15385 64926 997 127064 122 0 0 3 3 0 0 0 99.2% 1 256 0.172901 3892 22510 997 127064 217 0 0 3 3 0 0 0 95.5% 1 1024 0.117570 976 8301 997 127064 235 0 0 3 3 0 0 0 49.0% 1 4096 0.085811 245 2855 664 27172 375 274 0 3 3 0 0 0 21.4% 64 1 0.512781 984616 1920149 1142 127064 139 0 0 3 3 0 0 0 98.7% 64 8 0.479232 888896 1854833 1001 127080 10 0 0 5 5 0 0 0 99.6% 64 16 0.451193 800000 1773078 997 127064 6 0 0 3 3 0 0 0 99.6% 64 32 0.408684 666688 1631305 997 127064 6 0 0 3 3 0 0 0 99.5% 64 64 0.351906 500032 1420924 997 127064 14 0 0 3 3 0 0 0 99.5% 64 256 0.227008 200000 881026 997 127064 211 0 0 3 3 0 0 0 99.1% 64 1024 0.125803 58880 468032 997 127064 290 0 0 3 3 0 0 0 65.1% 64 4096 0.098155 15424 157139 703 27856 401 267 0 3 3 0 0 0 25.8% running: large-partition-slicing Testing slicing of large partition: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000701 1 1427 9 296 6 4 0 3 3 0 0 0 12.4% 0 32 0.000698 32 45827 9 296 6 3 0 3 3 0 0 0 13.9% 0 256 0.000808 256 316920 10 328 6 3 0 3 3 0 0 0 24.9% 0 4096 0.004368 4096 937697 25 808 14 3 0 3 3 0 0 0 45.9% 500000 1 0.001196 1 836 13 412 9 4 0 3 3 0 0 0 22.7% 500000 32 0.001200 32 26664 13 412 9 4 0 3 3 0 0 0 22.2% 500000 256 0.001503 256 170338 14 444 10 4 0 3 3 0 0 0 25.3% 500000 4096 0.004351 4096 941465 30 956 20 4 0 3 3 0 0 0 50.7% running: large-partition-slicing-clustering-keys Testing slicing of large partition using clustering keys: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000625 1 1601 7 176 6 0 0 3 3 0 0 0 23.2% 0 32 0.000604 32 53016 7 176 6 0 0 3 3 0 0 0 24.7% 0 256 0.000695 256 368498 8 180 6 0 0 3 3 0 0 0 36.4% 0 4096 0.004083 4096 1003106 20 692 12 1 0 3 3 0 0 0 47.0% 500000 1 0.001198 1 835 12 516 9 3 0 3 3 0 0 0 22.8% 500000 32 0.000981 32 32631 12 388 9 3 0 3 3 0 0 0 29.2% 500000 256 0.001320 256 194011 13 384 10 3 0 3 3 0 0 0 29.0% 500000 4096 0.003944 4096 1038567 25 840 17 2 0 3 3 0 0 0 52.2% running: large-partition-slicing-single-key-reader Testing slicing of large partition, single-partition reader: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000849 1 1178 9 488 6 0 0 3 3 0 0 0 16.5% 0 32 0.000661 32 48415 9 296 6 0 0 3 3 0 0 0 22.2% 0 256 0.000756 256 338648 10 328 6 0 0 3 3 0 0 0 33.3% 0 4096 0.004147 4096 987610 22 840 12 1 0 3 3 0 0 0 47.9% 500000 1 0.001041 1 960 13 476 9 3 0 3 3 0 0 0 25.9% 500000 32 0.001020 32 31375 13 412 9 3 0 3 3 0 0 0 29.1% 500000 256 0.001265 256 202373 14 444 10 3 0 3 3 0 0 0 32.0% 500000 4096 0.004121 4096 994014 30 988 18 3 0 3 3 0 0 0 52.7% running: large-partition-select-few-rows Testing selecting few rows from a large partition: stride rows time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 1000000 1 0.000668 1 1498 9 296 6 4 0 3 3 0 0 0 19.8% 500000 2 0.000976 2 2048 13 412 9 4 0 3 3 0 0 0 29.0% 250000 4 0.001408 4 2842 18 572 12 6 0 3 3 0 0 0 28.8% 125000 8 0.002004 8 3993 29 912 19 10 0 3 3 0 0 0 34.0% 62500 16 0.002883 16 5551 50 1584 32 18 0 3 3 0 0 0 41.9% 2 500000 1.053215 500000 474737 1138 127080 120 0 0 5 5 0 0 0 99.7% running: large-partition-forwarding Testing forwarding with clustering restriction in a large partition: pk-scan time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu yes 0.002717 2 736 24 2684 8 16 0 3 3 0 0 0 19.7% no 0.001004 2 1992 13 412 8 2 0 3 3 0 0 0 30.2% running: small-partition-skips Testing scanning small partitions with skips. Reads whole range interleaving reads with skips according to read-skip pattern: read skip time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu -> 1 0 1.466523 1000000 681885 1369 139732 33 1 0 0 0 0 0 0 99.7% -> 1 1 12.792183 500000 39086 6235 177736 5155 0 0 5123 7663 0 0 0 96.4% -> 1 8 3.451431 111112 32193 6235 177736 5155 0 0 5123 9673 0 0 0 84.8% -> 1 16 2.223815 58824 26452 6234 177704 5154 0 0 5122 9965 0 0 0 75.0% -> 1 32 1.512511 30304 20036 6233 177680 5155 1 0 5123 10090 0 0 0 61.8% -> 1 64 1.129465 15385 13621 6227 177464 5154 0 0 5122 10159 0 0 0 49.5% -> 1 256 0.733282 3892 5308 6211 175464 5178 24 0 5122 10220 0 0 0 33.8% -> 1 1024 0.397302 976 2457 5946 142152 5369 217 0 5120 10235 0 0 0 32.1% -> 1 4096 0.187746 245 1305 5499 81992 5296 142 0 5122 10240 0 0 0 46.8% -> 64 1 2.428488 984616 405444 7332 177736 5155 25 0 5123 5208 0 0 0 79.9% -> 64 8 2.262876 888896 392817 6235 177736 5155 0 0 5123 5654 0 0 0 78.1% -> 64 16 2.137544 800000 374261 6234 177732 5154 0 0 5122 6110 0 0 0 77.1% -> 64 32 1.862466 666688 357960 6235 177736 5155 0 0 5123 6844 0 0 0 73.7% -> 64 64 1.547757 500032 323069 6234 177728 5155 0 0 5123 7651 0 0 0 68.7% -> 64 256 0.914612 200000 218672 6233 177704 5154 0 0 5122 9202 0 0 0 55.5% -> 64 1024 0.475472 58880 123835 6229 177492 5154 5 0 5122 9930 0 0 0 45.4% -> 64 4096 0.271239 15424 56865 6158 169480 5257 114 0 5115 10142 0 0 0 44.1% running: small-partition-slicing Testing slicing small partitions: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.003209 1 312 3 260 2 7 0 1 1 0 0 0 15.5% 0 32 0.004205 32 7610 16 1428 10 0 0 5 5 0 0 0 15.7% 0 256 0.009830 256 26042 97 8572 62 0 0 31 31 0 0 0 18.7% 0 4096 0.015471 4096 264748 100 8704 64 0 0 32 32 0 0 0 48.4% 500000 1 0.003654 1 274 34 492 33 0 0 32 64 0 0 0 28.7% 500000 32 0.004287 32 7464 40 1260 36 0 0 32 64 0 0 0 26.0% 500000 256 0.009598 256 26673 100 8748 64 4 0 32 64 0 0 0 20.6% 500000 4096 0.014151 4096 289449 119 7892 85 0 0 53 64 0 0 0 54.1% ======================== With the patch ================================ running: large-partition-skips Testing scanning large partition with skips. Reads whole range interleaving reads with skips according to read-skip pattern: read skip time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 1 0 0.468887 1000000 2132711 1018 126960 29 0 0 0 0 0 0 0 98.4% 1 1 1.735113 500000 288166 1001 127080 10 0 0 5 5 0 0 0 99.9% 1 8 0.535616 111112 207447 997 127064 6 0 0 3 3 0 0 0 99.6% 1 16 0.365487 58824 160947 1001 127080 15 0 0 5 5 0 0 0 99.5% 1 32 0.272208 30304 111326 997 127064 21 0 0 3 3 0 0 0 99.3% 1 64 0.224049 15385 68668 997 127064 208 0 0 3 3 0 0 0 99.1% 1 256 0.159247 3892 24440 997 127064 250 0 0 3 3 0 0 0 94.7% 1 1024 0.102107 976 9559 997 127064 292 0 0 3 3 0 0 0 53.6% 1 4096 0.084310 245 2906 664 27172 371 273 0 3 3 0 0 0 20.2% 64 1 0.508340 984616 1936923 1142 127064 129 0 0 3 3 0 0 0 98.1% 64 8 0.470369 888896 1889786 997 127064 6 0 0 3 3 0 0 0 99.6% 64 16 0.439917 800000 1818526 1001 127080 10 0 0 5 5 0 0 0 99.6% 64 32 0.397938 666688 1675358 997 127064 6 0 0 3 3 0 0 0 99.5% 64 64 0.344144 500032 1452972 997 127064 18 0 0 3 3 0 0 0 99.4% 64 256 0.219996 200000 909107 997 127064 251 0 0 3 3 0 0 0 99.1% 64 1024 0.124294 58880 473715 997 127064 284 1 0 3 3 0 0 0 62.2% 64 4096 0.097580 15424 158065 703 27856 400 267 0 3 3 0 0 0 25.3% running: large-partition-slicing Testing slicing of large partition: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000733 1 1365 9 296 6 4 0 3 3 0 0 0 19.3% 0 32 0.000705 32 45417 9 296 6 3 0 3 3 0 0 0 15.3% 0 256 0.000830 256 308364 10 328 6 3 0 3 3 0 0 0 26.7% 0 4096 0.004631 4096 884529 25 808 14 3 0 3 3 0 0 0 48.1% 500000 1 0.001184 1 845 13 412 9 4 0 3 3 0 0 0 23.7% 500000 32 0.001199 32 26690 13 412 9 4 0 3 3 0 0 0 21.9% 500000 256 0.001530 256 167296 14 444 10 4 0 3 3 0 0 0 26.8% 500000 4096 0.004379 4096 935474 30 956 19 4 0 3 3 0 0 0 51.5% running: large-partition-slicing-clustering-keys Testing slicing of large partition using clustering keys: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000620 1 1614 7 176 6 0 0 3 3 0 0 0 27.4% 0 32 0.000625 32 51218 7 176 6 0 0 3 3 0 0 0 27.0% 0 256 0.000701 256 365148 8 180 6 0 0 3 3 0 0 0 35.2% 0 4096 0.004063 4096 1008130 20 692 12 1 0 3 3 0 0 0 47.6% 500000 1 0.001208 1 827 12 516 9 3 0 3 3 0 0 0 24.3% 500000 32 0.000973 32 32876 12 388 9 3 0 3 3 0 0 0 28.7% 500000 256 0.001315 256 194612 13 384 10 3 0 3 3 0 0 0 29.0% 500000 4096 0.003950 4096 1037068 25 840 17 2 0 3 3 0 0 0 52.7% running: large-partition-slicing-single-key-reader Testing slicing of large partition, single-partition reader: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000844 1 1185 9 488 6 0 0 3 3 0 0 0 16.5% 0 32 0.000656 32 48753 9 296 6 0 0 3 3 0 0 0 23.1% 0 256 0.000751 256 341011 10 328 6 0 0 3 3 0 0 0 34.0% 0 4096 0.004173 4096 981632 22 840 12 1 0 3 3 0 0 0 47.0% 500000 1 0.001036 1 966 13 476 9 3 0 3 3 0 0 0 25.4% 500000 32 0.001014 32 31573 13 412 9 3 0 3 3 0 0 0 27.4% 500000 256 0.001280 256 200044 14 444 10 3 0 3 3 0 0 0 31.8% 500000 4096 0.004081 4096 1003746 30 988 18 3 0 3 3 0 0 0 51.6% running: large-partition-select-few-rows Testing selecting few rows from a large partition: stride rows time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 1000000 1 0.000668 1 1498 9 296 6 3 0 3 3 0 0 0 21.7% 500000 2 0.000958 2 2088 13 412 9 4 0 3 3 0 0 0 27.7% 250000 4 0.001495 4 2676 18 572 12 6 0 3 3 0 0 0 25.8% 125000 8 0.002069 8 3866 29 912 19 10 0 3 3 0 0 0 30.8% 62500 16 0.002856 16 5603 50 1584 32 18 0 3 3 0 0 0 41.7% 2 500000 1.063129 500000 470310 1138 127080 120 0 0 5 5 0 0 0 99.7% running: large-partition-forwarding Testing forwarding with clustering restriction in a large partition: pk-scan time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu yes 0.002567 2 779 24 2684 8 16 0 3 3 0 0 0 21.5% no 0.001013 2 1975 13 412 8 2 0 3 3 0 0 0 28.9% running: small-partition-skips Testing scanning small partitions with skips. Reads whole range interleaving reads with skips according to read-skip pattern: read skip time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu -> 1 0 1.349959 1000000 740763 1369 139732 33 1 0 0 0 0 0 0 99.7% -> 1 1 12.640751 500000 39555 8144 191168 7064 0 0 7032 11481 0 0 0 96.2% -> 1 8 3.404269 111112 32639 6651 180660 5571 0 0 5539 10505 0 0 0 84.5% -> 1 16 2.175424 58824 27040 6434 179116 5354 0 0 5322 10365 0 0 0 74.3% -> 1 32 1.493365 30304 20292 6335 178404 5257 0 0 5225 10294 0 0 0 61.1% -> 1 64 1.112168 15385 13833 6256 177672 5183 0 0 5151 10217 0 0 0 48.7% -> 1 256 0.719282 3892 5411 6211 175464 5178 24 0 5122 10220 0 0 0 33.3% -> 1 1024 0.393236 976 2482 5946 142152 5369 217 0 5120 10235 0 0 0 30.7% -> 1 4096 0.185284 245 1322 5499 81992 5296 142 0 5122 10240 0 0 0 44.7% -> 64 1 2.356711 984616 417792 7361 177944 5184 21 0 5152 5266 0 0 0 79.1% -> 64 8 2.192331 888896 405457 6253 177868 5173 0 0 5141 5690 0 0 0 77.2% -> 64 16 2.029835 800000 394121 6245 177812 5165 0 0 5133 6132 0 0 0 75.7% -> 64 32 1.806448 666688 369060 6245 177808 5165 0 0 5133 6864 0 0 0 72.6% -> 64 64 1.508492 500032 331478 6242 177788 5163 0 0 5131 7667 0 0 0 67.7% -> 64 256 0.892881 200000 223994 6233 177704 5154 0 0 5122 9202 0 0 0 54.2% -> 64 1024 0.465715 58880 126429 6229 177492 5154 0 0 5122 9930 0 0 0 44.0% -> 64 4096 0.266582 15424 57858 6158 169480 5257 114 0 5115 10142 0 0 0 42.3% running: small-partition-slicing Testing slicing small partitions: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.003113 1 321 3 260 2 0 0 1 1 0 0 0 13.4% 0 32 0.004166 32 7682 16 1428 10 0 0 5 5 0 0 0 14.9% 0 256 0.009813 256 26088 97 8572 62 0 0 31 31 0 0 0 18.4% 0 4096 0.014798 4096 276794 100 8704 64 0 0 32 32 0 0 0 46.3% 500000 1 0.003700 1 270 34 492 33 0 0 32 64 0 0 0 28.4% 500000 32 0.004030 32 7940 40 1260 36 0 0 32 64 0 0 0 27.8% 500000 256 0.009514 256 26908 100 8748 64 0 0 32 64 0 0 0 20.2% 500000 4096 0.013368 4096 306413 119 7892 85 0 0 53 64 0 0 0 53.6% Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com> Message-Id: <a72818f79ca4081a606424545b0053fa581d49e7.1522173144.git.vladimir@scylladb.com>
91 lines
3.2 KiB
C++
91 lines
3.2 KiB
C++
/*
|
|
* Copyright (C) 2017 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "dht/i_partitioner.hh"
|
|
#include "schema.hh"
|
|
#include "sstables/index_reader.hh"
|
|
|
|
class index_reader_assertions {
|
|
std::unique_ptr<sstables::index_reader> _r;
|
|
public:
|
|
index_reader_assertions(std::unique_ptr<sstables::index_reader> r)
|
|
: _r(std::move(r))
|
|
{ }
|
|
|
|
index_reader_assertions& has_monotonic_positions(const schema& s) {
|
|
auto pos_cmp = position_in_partition::composite_less_compare(s);
|
|
auto rp_cmp = dht::ring_position_comparator(s);
|
|
auto prev = dht::ring_position::min();
|
|
_r->read_lower_partition_data().get();
|
|
while (!_r->eof()) {
|
|
auto& e = _r->current_lower_partition_entry();
|
|
auto k = e.get_decorated_key();
|
|
auto token = dht::token(k.token());
|
|
auto rp = dht::ring_position(token, k.key().to_partition_key(s));
|
|
|
|
if (!rp_cmp(prev, rp)) {
|
|
BOOST_FAIL(sprint("Partitions have invalid order: %s >= %s", prev, rp));
|
|
}
|
|
|
|
prev = rp;
|
|
|
|
while (e.get_read_pi_blocks_count() < e.get_total_pi_blocks_count()) {
|
|
e.get_next_pi_blocks().get();
|
|
auto* infos = e.get_pi_blocks();
|
|
if (infos->empty()) {
|
|
continue;
|
|
}
|
|
auto& prev = (*infos)[0];
|
|
for (size_t i = 1; i < infos->size(); ++i) {
|
|
auto& cur = (*infos)[i];
|
|
if (pos_cmp(cur.start(s), prev.end(s))) {
|
|
std::cout << "promoted index:\n";
|
|
for (auto& e : *infos) {
|
|
std::cout << " " << e.start(s) << "-" << e.end(s)
|
|
<< ": +" << e.offset() << " len=" << e.width() << std::endl;
|
|
}
|
|
BOOST_FAIL(sprint("Index blocks are not monotonic: %s >= %s", prev.end(s), cur.start(s)));
|
|
}
|
|
cur = prev;
|
|
}
|
|
}
|
|
_r->advance_lower_to_next_partition().get();
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
index_reader_assertions& is_empty(const schema& s) {
|
|
_r->read_lower_partition_data().get();
|
|
while (!_r->eof()) {
|
|
BOOST_REQUIRE(_r->current_lower_partition_entry().get_total_pi_blocks_count() == 0);
|
|
_r->advance_lower_to_next_partition().get();
|
|
}
|
|
return *this;
|
|
}
|
|
};
|
|
|
|
inline
|
|
index_reader_assertions assert_that(std::unique_ptr<sstables::index_reader> r) {
|
|
return { std::move(r) };
|
|
}
|