mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-19 16:15:07 +00:00
Historically, we had two index_readers per a sstable_mutation_reader, one for the lower bound and one for the upper bound. Most of public members of the index_reader class were only called on either of those. With the changes introduced in #2981, two readers are even more tied together as they now have a shared-per-pair list of index pages that needs proper cleanup and was protruding woefully into the caller code. This fix re-structures index_reader so that it now keeps track of both lower and upper bounds. The shared_index_lists structure is encapsulated within index_reader and becomes an internal detail rather than a liability. Fixes #3220. Tests: unit (debug, release) + Tested using cassandra-stress commands from #3189. perf_fast_forward results indicate there is no performance degradation caused by thix fix. =========================== Baseline =================================== running: large-partition-skips Testing scanning large partition with skips. Reads whole range interleaving reads with skips according to read-skip pattern: read skip time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 1 0 0.494458 1000000 2022418 1018 126960 27 0 0 0 0 0 0 0 97.6% 1 1 1.754717 500000 284946 997 127064 6 0 0 3 3 0 0 0 99.9% 1 8 0.551664 111112 201413 997 127064 6 0 0 3 3 0 0 0 99.7% 1 16 0.383888 58824 153232 1001 127080 10 0 0 5 5 0 0 0 99.5% 1 32 0.289073 30304 104832 997 127064 28 0 0 3 3 0 0 0 99.3% 1 64 0.236963 15385 64926 997 127064 122 0 0 3 3 0 0 0 99.2% 1 256 0.172901 3892 22510 997 127064 217 0 0 3 3 0 0 0 95.5% 1 1024 0.117570 976 8301 997 127064 235 0 0 3 3 0 0 0 49.0% 1 4096 0.085811 245 2855 664 27172 375 274 0 3 3 0 0 0 21.4% 64 1 0.512781 984616 1920149 1142 127064 139 0 0 3 3 0 0 0 98.7% 64 8 0.479232 888896 1854833 1001 127080 10 0 0 5 5 0 0 0 99.6% 64 16 0.451193 800000 1773078 997 127064 6 0 0 3 3 0 0 0 99.6% 64 32 0.408684 666688 1631305 997 127064 6 0 0 3 3 0 0 0 99.5% 64 64 0.351906 500032 1420924 997 
127064 14 0 0 3 3 0 0 0 99.5% 64 256 0.227008 200000 881026 997 127064 211 0 0 3 3 0 0 0 99.1% 64 1024 0.125803 58880 468032 997 127064 290 0 0 3 3 0 0 0 65.1% 64 4096 0.098155 15424 157139 703 27856 401 267 0 3 3 0 0 0 25.8% running: large-partition-slicing Testing slicing of large partition: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000701 1 1427 9 296 6 4 0 3 3 0 0 0 12.4% 0 32 0.000698 32 45827 9 296 6 3 0 3 3 0 0 0 13.9% 0 256 0.000808 256 316920 10 328 6 3 0 3 3 0 0 0 24.9% 0 4096 0.004368 4096 937697 25 808 14 3 0 3 3 0 0 0 45.9% 500000 1 0.001196 1 836 13 412 9 4 0 3 3 0 0 0 22.7% 500000 32 0.001200 32 26664 13 412 9 4 0 3 3 0 0 0 22.2% 500000 256 0.001503 256 170338 14 444 10 4 0 3 3 0 0 0 25.3% 500000 4096 0.004351 4096 941465 30 956 20 4 0 3 3 0 0 0 50.7% running: large-partition-slicing-clustering-keys Testing slicing of large partition using clustering keys: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000625 1 1601 7 176 6 0 0 3 3 0 0 0 23.2% 0 32 0.000604 32 53016 7 176 6 0 0 3 3 0 0 0 24.7% 0 256 0.000695 256 368498 8 180 6 0 0 3 3 0 0 0 36.4% 0 4096 0.004083 4096 1003106 20 692 12 1 0 3 3 0 0 0 47.0% 500000 1 0.001198 1 835 12 516 9 3 0 3 3 0 0 0 22.8% 500000 32 0.000981 32 32631 12 388 9 3 0 3 3 0 0 0 29.2% 500000 256 0.001320 256 194011 13 384 10 3 0 3 3 0 0 0 29.0% 500000 4096 0.003944 4096 1038567 25 840 17 2 0 3 3 0 0 0 52.2% running: large-partition-slicing-single-key-reader Testing slicing of large partition, single-partition reader: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000849 1 1178 9 488 6 0 0 3 3 0 0 0 16.5% 0 32 0.000661 32 48415 9 296 6 0 0 3 3 0 0 0 22.2% 0 256 0.000756 256 338648 10 328 6 0 0 3 3 0 0 0 33.3% 0 4096 0.004147 4096 987610 22 840 12 1 0 3 3 0 0 0 47.9% 500000 1 0.001041 1 960 13 476 9 3 0 3 3 0 0 0 25.9% 
500000 32 0.001020 32 31375 13 412 9 3 0 3 3 0 0 0 29.1% 500000 256 0.001265 256 202373 14 444 10 3 0 3 3 0 0 0 32.0% 500000 4096 0.004121 4096 994014 30 988 18 3 0 3 3 0 0 0 52.7% running: large-partition-select-few-rows Testing selecting few rows from a large partition: stride rows time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 1000000 1 0.000668 1 1498 9 296 6 4 0 3 3 0 0 0 19.8% 500000 2 0.000976 2 2048 13 412 9 4 0 3 3 0 0 0 29.0% 250000 4 0.001408 4 2842 18 572 12 6 0 3 3 0 0 0 28.8% 125000 8 0.002004 8 3993 29 912 19 10 0 3 3 0 0 0 34.0% 62500 16 0.002883 16 5551 50 1584 32 18 0 3 3 0 0 0 41.9% 2 500000 1.053215 500000 474737 1138 127080 120 0 0 5 5 0 0 0 99.7% running: large-partition-forwarding Testing forwarding with clustering restriction in a large partition: pk-scan time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu yes 0.002717 2 736 24 2684 8 16 0 3 3 0 0 0 19.7% no 0.001004 2 1992 13 412 8 2 0 3 3 0 0 0 30.2% running: small-partition-skips Testing scanning small partitions with skips. 
Reads whole range interleaving reads with skips according to read-skip pattern: read skip time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu -> 1 0 1.466523 1000000 681885 1369 139732 33 1 0 0 0 0 0 0 99.7% -> 1 1 12.792183 500000 39086 6235 177736 5155 0 0 5123 7663 0 0 0 96.4% -> 1 8 3.451431 111112 32193 6235 177736 5155 0 0 5123 9673 0 0 0 84.8% -> 1 16 2.223815 58824 26452 6234 177704 5154 0 0 5122 9965 0 0 0 75.0% -> 1 32 1.512511 30304 20036 6233 177680 5155 1 0 5123 10090 0 0 0 61.8% -> 1 64 1.129465 15385 13621 6227 177464 5154 0 0 5122 10159 0 0 0 49.5% -> 1 256 0.733282 3892 5308 6211 175464 5178 24 0 5122 10220 0 0 0 33.8% -> 1 1024 0.397302 976 2457 5946 142152 5369 217 0 5120 10235 0 0 0 32.1% -> 1 4096 0.187746 245 1305 5499 81992 5296 142 0 5122 10240 0 0 0 46.8% -> 64 1 2.428488 984616 405444 7332 177736 5155 25 0 5123 5208 0 0 0 79.9% -> 64 8 2.262876 888896 392817 6235 177736 5155 0 0 5123 5654 0 0 0 78.1% -> 64 16 2.137544 800000 374261 6234 177732 5154 0 0 5122 6110 0 0 0 77.1% -> 64 32 1.862466 666688 357960 6235 177736 5155 0 0 5123 6844 0 0 0 73.7% -> 64 64 1.547757 500032 323069 6234 177728 5155 0 0 5123 7651 0 0 0 68.7% -> 64 256 0.914612 200000 218672 6233 177704 5154 0 0 5122 9202 0 0 0 55.5% -> 64 1024 0.475472 58880 123835 6229 177492 5154 5 0 5122 9930 0 0 0 45.4% -> 64 4096 0.271239 15424 56865 6158 169480 5257 114 0 5115 10142 0 0 0 44.1% running: small-partition-slicing Testing slicing small partitions: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.003209 1 312 3 260 2 7 0 1 1 0 0 0 15.5% 0 32 0.004205 32 7610 16 1428 10 0 0 5 5 0 0 0 15.7% 0 256 0.009830 256 26042 97 8572 62 0 0 31 31 0 0 0 18.7% 0 4096 0.015471 4096 264748 100 8704 64 0 0 32 32 0 0 0 48.4% 500000 1 0.003654 1 274 34 492 33 0 0 32 64 0 0 0 28.7% 500000 32 0.004287 32 7464 40 1260 36 0 0 32 64 0 0 0 26.0% 500000 256 0.009598 256 26673 100 8748 64 4 0 32 
64 0 0 0 20.6% 500000 4096 0.014151 4096 289449 119 7892 85 0 0 53 64 0 0 0 54.1% ======================== With the patch ================================ running: large-partition-skips Testing scanning large partition with skips. Reads whole range interleaving reads with skips according to read-skip pattern: read skip time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 1 0 0.468887 1000000 2132711 1018 126960 29 0 0 0 0 0 0 0 98.4% 1 1 1.735113 500000 288166 1001 127080 10 0 0 5 5 0 0 0 99.9% 1 8 0.535616 111112 207447 997 127064 6 0 0 3 3 0 0 0 99.6% 1 16 0.365487 58824 160947 1001 127080 15 0 0 5 5 0 0 0 99.5% 1 32 0.272208 30304 111326 997 127064 21 0 0 3 3 0 0 0 99.3% 1 64 0.224049 15385 68668 997 127064 208 0 0 3 3 0 0 0 99.1% 1 256 0.159247 3892 24440 997 127064 250 0 0 3 3 0 0 0 94.7% 1 1024 0.102107 976 9559 997 127064 292 0 0 3 3 0 0 0 53.6% 1 4096 0.084310 245 2906 664 27172 371 273 0 3 3 0 0 0 20.2% 64 1 0.508340 984616 1936923 1142 127064 129 0 0 3 3 0 0 0 98.1% 64 8 0.470369 888896 1889786 997 127064 6 0 0 3 3 0 0 0 99.6% 64 16 0.439917 800000 1818526 1001 127080 10 0 0 5 5 0 0 0 99.6% 64 32 0.397938 666688 1675358 997 127064 6 0 0 3 3 0 0 0 99.5% 64 64 0.344144 500032 1452972 997 127064 18 0 0 3 3 0 0 0 99.4% 64 256 0.219996 200000 909107 997 127064 251 0 0 3 3 0 0 0 99.1% 64 1024 0.124294 58880 473715 997 127064 284 1 0 3 3 0 0 0 62.2% 64 4096 0.097580 15424 158065 703 27856 400 267 0 3 3 0 0 0 25.3% running: large-partition-slicing Testing slicing of large partition: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000733 1 1365 9 296 6 4 0 3 3 0 0 0 19.3% 0 32 0.000705 32 45417 9 296 6 3 0 3 3 0 0 0 15.3% 0 256 0.000830 256 308364 10 328 6 3 0 3 3 0 0 0 26.7% 0 4096 0.004631 4096 884529 25 808 14 3 0 3 3 0 0 0 48.1% 500000 1 0.001184 1 845 13 412 9 4 0 3 3 0 0 0 23.7% 500000 32 0.001199 32 26690 13 412 9 4 0 3 3 0 0 0 21.9% 500000 256 
0.001530 256 167296 14 444 10 4 0 3 3 0 0 0 26.8% 500000 4096 0.004379 4096 935474 30 956 19 4 0 3 3 0 0 0 51.5% running: large-partition-slicing-clustering-keys Testing slicing of large partition using clustering keys: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000620 1 1614 7 176 6 0 0 3 3 0 0 0 27.4% 0 32 0.000625 32 51218 7 176 6 0 0 3 3 0 0 0 27.0% 0 256 0.000701 256 365148 8 180 6 0 0 3 3 0 0 0 35.2% 0 4096 0.004063 4096 1008130 20 692 12 1 0 3 3 0 0 0 47.6% 500000 1 0.001208 1 827 12 516 9 3 0 3 3 0 0 0 24.3% 500000 32 0.000973 32 32876 12 388 9 3 0 3 3 0 0 0 28.7% 500000 256 0.001315 256 194612 13 384 10 3 0 3 3 0 0 0 29.0% 500000 4096 0.003950 4096 1037068 25 840 17 2 0 3 3 0 0 0 52.7% running: large-partition-slicing-single-key-reader Testing slicing of large partition, single-partition reader: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 0.000844 1 1185 9 488 6 0 0 3 3 0 0 0 16.5% 0 32 0.000656 32 48753 9 296 6 0 0 3 3 0 0 0 23.1% 0 256 0.000751 256 341011 10 328 6 0 0 3 3 0 0 0 34.0% 0 4096 0.004173 4096 981632 22 840 12 1 0 3 3 0 0 0 47.0% 500000 1 0.001036 1 966 13 476 9 3 0 3 3 0 0 0 25.4% 500000 32 0.001014 32 31573 13 412 9 3 0 3 3 0 0 0 27.4% 500000 256 0.001280 256 200044 14 444 10 3 0 3 3 0 0 0 31.8% 500000 4096 0.004081 4096 1003746 30 988 18 3 0 3 3 0 0 0 51.6% running: large-partition-select-few-rows Testing selecting few rows from a large partition: stride rows time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 1000000 1 0.000668 1 1498 9 296 6 3 0 3 3 0 0 0 21.7% 500000 2 0.000958 2 2088 13 412 9 4 0 3 3 0 0 0 27.7% 250000 4 0.001495 4 2676 18 572 12 6 0 3 3 0 0 0 25.8% 125000 8 0.002069 8 3866 29 912 19 10 0 3 3 0 0 0 30.8% 62500 16 0.002856 16 5603 50 1584 32 18 0 3 3 0 0 0 41.7% 2 500000 1.063129 500000 470310 1138 127080 120 0 0 5 5 0 0 0 99.7% running: 
large-partition-forwarding Testing forwarding with clustering restriction in a large partition: pk-scan time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu yes 0.002567 2 779 24 2684 8 16 0 3 3 0 0 0 21.5% no 0.001013 2 1975 13 412 8 2 0 3 3 0 0 0 28.9% running: small-partition-skips Testing scanning small partitions with skips. Reads whole range interleaving reads with skips according to read-skip pattern: read skip time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu -> 1 0 1.349959 1000000 740763 1369 139732 33 1 0 0 0 0 0 0 99.7% -> 1 1 12.640751 500000 39555 8144 191168 7064 0 0 7032 11481 0 0 0 96.2% -> 1 8 3.404269 111112 32639 6651 180660 5571 0 0 5539 10505 0 0 0 84.5% -> 1 16 2.175424 58824 27040 6434 179116 5354 0 0 5322 10365 0 0 0 74.3% -> 1 32 1.493365 30304 20292 6335 178404 5257 0 0 5225 10294 0 0 0 61.1% -> 1 64 1.112168 15385 13833 6256 177672 5183 0 0 5151 10217 0 0 0 48.7% -> 1 256 0.719282 3892 5411 6211 175464 5178 24 0 5122 10220 0 0 0 33.3% -> 1 1024 0.393236 976 2482 5946 142152 5369 217 0 5120 10235 0 0 0 30.7% -> 1 4096 0.185284 245 1322 5499 81992 5296 142 0 5122 10240 0 0 0 44.7% -> 64 1 2.356711 984616 417792 7361 177944 5184 21 0 5152 5266 0 0 0 79.1% -> 64 8 2.192331 888896 405457 6253 177868 5173 0 0 5141 5690 0 0 0 77.2% -> 64 16 2.029835 800000 394121 6245 177812 5165 0 0 5133 6132 0 0 0 75.7% -> 64 32 1.806448 666688 369060 6245 177808 5165 0 0 5133 6864 0 0 0 72.6% -> 64 64 1.508492 500032 331478 6242 177788 5163 0 0 5131 7667 0 0 0 67.7% -> 64 256 0.892881 200000 223994 6233 177704 5154 0 0 5122 9202 0 0 0 54.2% -> 64 1024 0.465715 58880 126429 6229 177492 5154 0 0 5122 9930 0 0 0 44.0% -> 64 4096 0.266582 15424 57858 6158 169480 5257 114 0 5115 10142 0 0 0 42.3% running: small-partition-slicing Testing slicing small partitions: offset read time (s) frags frag/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 0 1 
0.003113 1 321 3 260 2 0 0 1 1 0 0 0 13.4% 0 32 0.004166 32 7682 16 1428 10 0 0 5 5 0 0 0 14.9% 0 256 0.009813 256 26088 97 8572 62 0 0 31 31 0 0 0 18.4% 0 4096 0.014798 4096 276794 100 8704 64 0 0 32 32 0 0 0 46.3% 500000 1 0.003700 1 270 34 492 33 0 0 32 64 0 0 0 28.4% 500000 32 0.004030 32 7940 40 1260 36 0 0 32 64 0 0 0 27.8% 500000 256 0.009514 256 26908 100 8748 64 0 0 32 64 0 0 0 20.2% 500000 4096 0.013368 4096 306413 119 7892 85 0 0 53 64 0 0 0 53.6% Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com> Message-Id: <a72818f79ca4081a606424545b0053fa581d49e7.1522173144.git.vladimir@scylladb.com>
730 lines
29 KiB
C++
730 lines
29 KiB
C++
/*
|
|
* Copyright (C) 2015 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#pragma once
|
|
#include "sstables.hh"
|
|
#include "consumer.hh"
|
|
#include "downsampling.hh"
|
|
#include "sstables/shared_index_lists.hh"
|
|
#include <seastar/util/bool_class.hh>
|
|
#include "utils/buffer_input_stream.hh"
|
|
#include "sstables/prepended_input_stream.hh"
|
|
|
|
namespace sstables {
|
|
|
|
class index_consumer {
|
|
uint64_t max_quantity;
|
|
public:
|
|
index_list indexes;
|
|
|
|
index_consumer(uint64_t q) : max_quantity(q) {
|
|
indexes.reserve(q);
|
|
}
|
|
|
|
void consume_entry(index_entry&& ie, uint64_t offset) {
|
|
indexes.push_back(std::move(ie));
|
|
}
|
|
void reset() {
|
|
indexes.clear();
|
|
}
|
|
};
|
|
|
|
// See #2993
|
|
class trust_promoted_index_tag;
|
|
using trust_promoted_index = bool_class<trust_promoted_index_tag>;
|
|
|
|
// IndexConsumer is a concept that implements:
|
|
//
|
|
// bool should_continue();
|
|
// void consume_entry(index_entry&& ie, uint64_t offset);
|
|
// Parses the raw on-disk representation of index entries and hands each
// decoded index_entry to an IndexConsumer.
//
// Implemented as a resumable state machine on top of
// data_consumer::continuous_data_consumer: process_state() is re-entered as
// input becomes available and _state records where parsing left off.
// The switch cases deliberately fall through when a read completes
// synchronously, so a fully buffered entry is parsed in one pass.
template <class IndexConsumer>
class index_consume_entry_context : public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
    using proceed = data_consumer::proceed;
    using processing_result = data_consumer::processing_result;
    using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
private:
    IndexConsumer& _consumer;           // receives every parsed entry
    file _index_file;                   // used to stream promoted-index bytes not already buffered
    file_input_stream_options _options;
    uint64_t _entry_offset;             // index-file offset of the entry currently being parsed

    // Parsing states, in on-disk field order of an index entry.
    enum class state {
        START,
        KEY_SIZE,
        KEY_BYTES,
        POSITION,
        PROMOTED_SIZE,
        LOCAL_DELETION_TIME,
        MARKED_FOR_DELETE_AT,
        NUM_PROMOTED_INDEX_BLOCKS,
        CONSUME_ENTRY,
    } _state = state::START;

    // Fields of the entry being assembled across process_state() invocations.
    temporary_buffer<char> _key;
    uint32_t _promoted_index_size;
    uint64_t _position;
    stdx::optional<deletion_time> _deletion_time;
    uint32_t _num_pi_blocks = 0;

    // Whether the promoted index can be trusted (see trust_promoted_index / #2993).
    trust_promoted_index _trust_pi;
    const schema& _s;

public:
    // Invoked by continuous_data_consumer when input is exhausted; a partial
    // entry at that point means the index file is truncated/corrupt.
    void verify_end_state() {
        if (this->_remain > 0) {
            throw std::runtime_error("index_consume_entry_context - no more data but parsing is incomplete");
        }
    }

    // States which do not consume input bytes by themselves.
    bool non_consuming() const {
        return ((_state == state::CONSUME_ENTRY) || (_state == state::START));
    }

    processing_result process_state(temporary_buffer<char>& data) {
        switch (_state) {
        // START comes first, to make the handling of the 0-quantity case simpler
        case state::START:
            _state = state::KEY_SIZE;
            break;
        case state::KEY_SIZE:
            if (this->read_16(data) != continuous_data_consumer::read_status::ready) {
                _state = state::KEY_BYTES;
                break;
            }
            // read completed synchronously - fall through
        case state::KEY_BYTES:
            if (this->read_bytes(data, this->_u16, _key) != continuous_data_consumer::read_status::ready) {
                _state = state::POSITION;
                break;
            }
            // fall through
        case state::POSITION:
            if (this->read_64(data) != continuous_data_consumer::read_status::ready) {
                _state = state::PROMOTED_SIZE;
                break;
            }
            // fall through
        case state::PROMOTED_SIZE:
            _position = this->_u64;
            if (this->read_32(data) != continuous_data_consumer::read_status::ready) {
                _state = state::LOCAL_DELETION_TIME;
                break;
            }
            // fall through
        case state::LOCAL_DELETION_TIME:
            _promoted_index_size = this->_u32;
            if (_promoted_index_size == 0) {
                // No promoted index: skip the deletion-time fields entirely.
                _state = state::CONSUME_ENTRY;
                goto state_CONSUME_ENTRY;
            }
            _deletion_time.emplace();
            if (this->read_32(data) != continuous_data_consumer::read_status::ready) {
                _state = state::MARKED_FOR_DELETE_AT;
                break;
            }
            // fall through
        case state::MARKED_FOR_DELETE_AT:
            _deletion_time->local_deletion_time = this->_u32;
            if (this->read_64(data) != continuous_data_consumer::read_status::ready) {
                _state = state::NUM_PROMOTED_INDEX_BLOCKS;
                break;
            }
            // fall through
        case state::NUM_PROMOTED_INDEX_BLOCKS:
            _deletion_time->marked_for_delete_at = this->_u64;
            if (this->read_32(data) != continuous_data_consumer::read_status::ready) {
                _state = state::CONSUME_ENTRY;
                break;
            }
            // fall through
        state_CONSUME_ENTRY:
        case state::CONSUME_ENTRY: {
            // On-disk length of the whole entry: key + promoted index +
            // 14 fixed bytes (2 key-length + 8 position + 4 promoted-size).
            // Computed before _promoted_index_size is shrunk below, so it
            // still covers the 16 deletion-info bytes when present.
            auto len = (_key.size() + _promoted_index_size + 14);
            if (_deletion_time) {
                _num_pi_blocks = this->_u32;
                // Exclude the already-parsed deletion time (12 bytes) and
                // block count (4 bytes) from the remaining payload size.
                _promoted_index_size -= 16;
            }
            auto data_size = data.size();
            stdx::optional<input_stream<char>> promoted_index_stream;
            if ((_trust_pi == trust_promoted_index::yes) && (_promoted_index_size > 0)) {
                if (_promoted_index_size <= data_size) {
                    // Whole promoted index is already buffered; share it.
                    auto buf = data.share();
                    buf.trim(_promoted_index_size);
                    promoted_index_stream = make_buffer_input_stream(std::move(buf));
                } else {
                    // Tail of the promoted index is beyond the buffer; chain
                    // the buffered part with a file stream for the rest.
                    // 30 = 2 (key length) + 8 (position) + 4 (promoted size)
                    //    + 16 (deletion info + block count) header bytes.
                    promoted_index_stream = make_prepended_input_stream(
                        std::move(data),
                        make_file_input_stream(_index_file, _entry_offset + _key.size() + 30 + data_size,
                            _promoted_index_size - data_size, _options).detach());
                }
            } else {
                _num_pi_blocks = 0;
            }
            _consumer.consume_entry(index_entry{std::move(_key), _position, std::move(promoted_index_stream),
                _promoted_index_size, std::move(_deletion_time), _num_pi_blocks, _s}, _entry_offset);
            _entry_offset += len;
            _deletion_time = stdx::nullopt;
            _num_pi_blocks = 0;
            _state = state::START;
            if (_promoted_index_size <= data_size) {
                data.trim_front(_promoted_index_size);
            } else {
                // data was moved into the prepended stream above.
                assert(data.empty());
                return skip_bytes{_promoted_index_size - data_size};
            }
        }
            break;
        default:
            throw malformed_sstable_exception("unknown state");
        }
        return proceed::yes;
    }

    // Parses entries from [start, start + maxlen) of index_file, forwarding
    // each to `consumer`.
    index_consume_entry_context(IndexConsumer& consumer, trust_promoted_index trust_pi, const schema& s,
        file index_file, file_input_stream_options options, uint64_t start, uint64_t maxlen)
        : continuous_data_consumer(make_file_input_stream(index_file, start, maxlen, options), start, maxlen)
        , _consumer(consumer), _index_file(index_file), _options(options)
        , _entry_offset(start), _trust_pi(trust_pi), _s(s)
    {}

    // Restarts parsing from the given index-file offset, discarding any
    // partially parsed entry and the consumer's accumulated entries.
    void reset(uint64_t offset) {
        _state = state::START;
        _entry_offset = offset;
        _consumer.reset();
    }
};
|
|
|
|
// Less-comparator for lookups in the partition index.
|
|
class index_comparator {
|
|
dht::ring_position_comparator _tri_cmp;
|
|
public:
|
|
index_comparator(const schema& s) : _tri_cmp(s) {}
|
|
|
|
bool operator()(const summary_entry& e, dht::ring_position_view rp) const {
|
|
return _tri_cmp(e.get_decorated_key(), rp) < 0;
|
|
}
|
|
|
|
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
|
|
return _tri_cmp(e.get_decorated_key(), rp) < 0;
|
|
}
|
|
|
|
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
|
|
return _tri_cmp(e.get_decorated_key(), rp) > 0;
|
|
}
|
|
|
|
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
|
|
return _tri_cmp(e.get_decorated_key(), rp) > 0;
|
|
}
|
|
};
|
|
|
|
inline static
|
|
future<> close_index_list(shared_index_lists::list_ptr& list) {
|
|
if (list) {
|
|
return parallel_for_each(*list, [](index_entry &ie) {
|
|
return ie.close_pi_stream();
|
|
}).finally([&list] {
|
|
list = {};
|
|
});
|
|
}
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
// Provides access to sstable indexes.
|
|
//
|
|
// Maintains logical cursors to sstable elements (partitions, cells).
|
|
// Holds two cursors pointing to the range within sstable (upper cursor may be not set).
|
|
// Initially the lower cursor is positioned on the first partition in the sstable.
|
|
// Cursors can be advanced forward using advance_to().
|
|
//
|
|
// If eof() then the lower bound cursor is positioned past all partitions in the sstable.
|
|
class index_reader {
|
|
shared_sstable _sstable;
|
|
const io_priority_class& _pc;
|
|
shared_index_lists _index_lists;
|
|
|
|
    // Bundles an index_consumer with the consume-entry context that feeds it.
    // Used to parse one summary-indexed chunk [begin, end) of the index file.
    struct reader {
        index_consumer _consumer;
        // NOTE: _context holds a reference to _consumer, so member order
        // (consumer first) matters for initialization.
        index_consume_entry_context<index_consumer> _context;

        // Stream options for reading the index file: sstable-sized buffers,
        // read-ahead of 2, and the caller's I/O priority class.
        inline static file_input_stream_options get_file_input_stream_options(shared_sstable sst, const io_priority_class& pc) {
            file_input_stream_options options;
            options.buffer_size = sst->sstable_buffer_size;
            options.read_ahead = 2;
            options.io_priority_class = pc;
            return options;
        }

        // Parses index entries from [begin, end) of sst's index file.
        // `quantity` is the expected entry count, used to pre-reserve the
        // consumer's result list.
        reader(shared_sstable sst, const io_priority_class& pc, uint64_t begin, uint64_t end, uint64_t quantity)
            : _consumer(quantity)
            , _context(_consumer,
                trust_promoted_index(sst->has_correct_promoted_index_entries()), *sst->_schema, sst->_index_file,
                get_file_input_stream_options(sst, pc), begin, end - begin)
        { }
    };
|
|
|
|
    // Contains information about index_reader position in the index file
    struct index_bound {
        // Reference to the index page (list of entries) the cursor is on.
        // Disengaged until a page is loaded; see partition_data_ready().
        shared_index_lists::list_ptr current_list;
        // Summary index where the previous advance_to() lookup ended;
        // later lookups only search forward from here.
        uint64_t previous_summary_idx = 0;
        // Summary entry (index page) the cursor currently points into.
        uint64_t current_summary_idx = 0;
        // Entry within the current page the cursor points at.
        uint64_t current_index_idx = 0;
        uint64_t current_pi_idx = 0; // Points to upper bound of the cursor.
        // Position in the data file corresponding to the cursor.
        uint64_t data_file_position = 0;
        // What kind of element data_file_position refers to.
        indexable_element element = indexable_element::partition;
    };
|
|
|
|
index_bound _lower_bound;
|
|
// Upper bound may remain uninitialized
|
|
std::optional<index_bound> _upper_bound;
|
|
|
|
private:
|
|
void advance_to_end(index_bound& bound) {
|
|
sstlog.trace("index {}: advance_to_end() bound {}", this, &bound);
|
|
bound.data_file_position = data_file_end();
|
|
bound.element = indexable_element::partition;
|
|
bound.current_list = {};
|
|
}
|
|
|
|
    // Loads the index page belonging to the given summary entry into `bound`
    // and positions the bound on the first entry of that page.
    // Must be called for non-decreasing summary_idx.
    // A summary_idx past the last summary entry advances the bound to
    // end-of-sstable instead.
    future<> advance_to_page(index_bound& bound, uint64_t summary_idx) {
        sstlog.trace("index {}: advance_to_page({}), bound {}", this, summary_idx, &bound);
        assert(!bound.current_list || bound.current_summary_idx <= summary_idx);
        if (bound.current_list && bound.current_summary_idx == summary_idx) {
            // Already on the requested page.
            sstlog.trace("index {}: same page", this);
            return make_ready_future<>();
        }

        auto& summary = _sstable->get_summary();
        if (summary_idx >= summary.header.size) {
            sstlog.trace("index {}: eof", this);
            advance_to_end(bound);
            return make_ready_future<>();
        }
        // Reads and parses the page from disk; invoked by _index_lists only
        // when the page is not already cached/shared.
        auto loader = [this] (uint64_t summary_idx) -> future<index_list> {
            auto& summary = _sstable->get_summary();
            uint64_t position = summary.entries[summary_idx].position;
            // Expected number of index entries in this page, used to
            // pre-size the result list.
            uint64_t quantity = downsampling::get_effective_index_interval_after_index(summary_idx, summary.header.sampling_level,
                summary.header.min_index_interval);

            // The page spans [position, end) of the index file, where `end`
            // is the next summary entry's position, or the end of the index
            // file for the last page.
            uint64_t end;
            if (summary_idx + 1 >= summary.header.size) {
                end = _sstable->index_size();
            } else {
                end = summary.entries[summary_idx + 1].position;
            }

            return do_with(std::make_unique<reader>(_sstable, _pc, position, end, quantity), [this, summary_idx] (auto& entries_reader) {
                return entries_reader->_context.consume_input().then([this, summary_idx, &entries_reader] {
                    // Move the parsed entries out before closing the context.
                    auto indexes = std::move(entries_reader->_consumer.indexes);
                    return entries_reader->_context.close().then([indexes = std::move(indexes)] () mutable {
                        return std::move(indexes);
                    });

                });
            });
        };

        return _index_lists.get_or_load(summary_idx, loader).then([this, &bound, summary_idx] (shared_index_lists::list_ptr ref) {
            // Reset the bound to the beginning of the freshly loaded page.
            bound.current_list = std::move(ref);
            bound.current_summary_idx = summary_idx;
            bound.current_index_idx = 0;
            bound.current_pi_idx = 0;
            bound.data_file_position = (*bound.current_list)[0].position();
            bound.element = indexable_element::partition;

            if (sstlog.is_enabled(seastar::log_level::trace)) {
                sstlog.trace("index {} bound {}: page:", this, &bound);
                for (const index_entry& e : *bound.current_list) {
                    auto dk = dht::global_partitioner().decorate_key(*_sstable->_schema,
                        e.get_key().to_partition_key(*_sstable->_schema));
                    sstlog.trace(" {} -> {}", dk, e.position());
                }
            }
        });
    }
|
|
|
|
future<> advance_lower_to_start(const dht::partition_range &range) {
|
|
if (range.start()) {
|
|
return advance_to(_lower_bound,
|
|
dht::ring_position_view(range.start()->value(),
|
|
dht::ring_position_view::after_key(!range.start()->is_inclusive())));
|
|
}
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
future<> advance_upper_to_end(const dht::partition_range &range) {
|
|
if (!_upper_bound) {
|
|
_upper_bound.emplace();
|
|
}
|
|
if (range.end()) {
|
|
return advance_to(*_upper_bound,
|
|
dht::ring_position_view(range.end()->value(),
|
|
dht::ring_position_view::after_key(range.end()->is_inclusive())));
|
|
}
|
|
advance_to_end(*_upper_bound);
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
// Tells whether details about current partition can be accessed.
|
|
// If this returns false, you have to call read_partition_data().
|
|
//
|
|
// Calling read_partition_data() may involve doing I/O. The reason
|
|
// why control over this is exposed and not done under the hood is that
|
|
// in some cases it only makes sense to access partition details from index
|
|
// if it is readily available, and if it is not, we're better off obtaining
|
|
// them by continuing reading from sstable.
|
|
bool partition_data_ready(const index_bound& bound) const {
|
|
return static_cast<bool>(bound.current_list);
|
|
}
|
|
|
|
    // Returns the index entry the given bound currently points at.
    // Valid if partition_data_ready(bound); the returned reference is into
    // the page held by bound.current_list.
    index_entry& current_partition_entry(index_bound& bound) {
        assert(bound.current_list);
        return (*bound.current_list)[bound.current_index_idx];
    }
|
|
|
|
future<> advance_to_next_partition(index_bound& bound) {
|
|
sstlog.trace("index {} bound {}: advance_to_next_partition()", &bound, this);
|
|
if (!partition_data_ready(bound)) {
|
|
return advance_to_page(bound, 0).then([this, &bound] {
|
|
return advance_to_next_partition(bound);
|
|
});
|
|
}
|
|
if (bound.current_index_idx + 1 < bound.current_list->size()) {
|
|
++bound.current_index_idx;
|
|
bound.current_pi_idx = 0;
|
|
bound.data_file_position = (*bound.current_list)[bound.current_index_idx].position();
|
|
bound.element = indexable_element::partition;
|
|
return make_ready_future<>();
|
|
}
|
|
auto& summary = _sstable->get_summary();
|
|
if (bound.current_summary_idx + 1 < summary.header.size) {
|
|
return advance_to_page(bound, bound.current_summary_idx + 1);
|
|
}
|
|
advance_to_end(bound);
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
    // Forwards the given bound to the first partition with a key not smaller
    // than pos. Must be called with non-decreasing pos across calls (the
    // summary search below relies on it). May involve loading an index page.
    future<> advance_to(index_bound& bound, dht::ring_position_view pos) {
        sstlog.trace("index {} bound {}: advance_to({}), _previous_summary_idx={}, _current_summary_idx={}",
            this, &bound, pos, bound.previous_summary_idx, bound.current_summary_idx);

        if (pos.is_min()) {
            sstlog.trace("index {}: first entry", this);
            return make_ready_future<>();
        } else if (pos.is_max()) {
            advance_to_end(bound);
            return make_ready_future<>();
        }

        auto& summary = _sstable->get_summary();
        // Search only forward from the previous lookup position, exploiting
        // the non-decreasing pos requirement.
        bound.previous_summary_idx = std::distance(std::begin(summary.entries),
            std::lower_bound(summary.entries.begin() + bound.previous_summary_idx, summary.entries.end(), pos, index_comparator(*_sstable->_schema)));

        if (bound.previous_summary_idx == 0) {
            sstlog.trace("index {}: first entry", this);
            return make_ready_future<>();
        }

        // The candidate page is the one preceding the first summary entry
        // that is >= pos.
        auto summary_idx = bound.previous_summary_idx - 1;

        sstlog.trace("index {}: summary_idx={}", this, summary_idx);

        // Despite the requirement that the values of 'pos' in subsequent calls
        // are increasing we still may encounter a situation when we try to read
        // the previous bucket.
        // For example, let's say we have index like this:
        // summary: A K ...
        // index: A C D F K M N O ...
        // Now, we want to get positions for range [G, J]. We start with [G,
        // summary look up will tell us to check the first bucket. However, there
        // is no G in that bucket so we read the following one to get the
        // position (see the advance_to_page() call below). After we've got it, it's time to
        // get J] position. Again, summary points us to the first bucket and we
        // hit an assert since the reader is already at the second bucket and we
        // cannot go backward.
        // The solution is this condition below. If our lookup requires reading
        // the previous bucket we assume that the entry doesn't exist and return
        // the position of the first one in the current index bucket.
        if (summary_idx + 1 == bound.current_summary_idx) {
            return make_ready_future<>();
        }

        return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
            sstlog.trace("index {}: old page index = {}", this, bound.current_index_idx);
            // Binary-search within the page, starting from the current
            // position (pos is non-decreasing, so no need to look back).
            auto& entries = *bound.current_list;
            auto i = std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos, index_comparator(*_sstable->_schema));
            if (i == std::end(entries)) {
                // pos falls past this page; the match is the first entry of
                // the next page.
                sstlog.trace("index {}: not found", this);
                return advance_to_page(bound, summary_idx + 1);
            }
            bound.current_index_idx = std::distance(std::begin(entries), i);
            bound.current_pi_idx = 0;
            bound.data_file_position = i->position();
            bound.element = indexable_element::partition;
            sstlog.trace("index {}: new page index = {}, pos={}", this, bound.current_index_idx, bound.data_file_position);
            return make_ready_future<>();
        });
    }
|
|
|
|
    // Returns position right after all partitions in the sstable
    // (i.e. the size of the data file).
    uint64_t data_file_end() const {
        return _sstable->data_size();
    }
|
|
|
|
public:
|
|
    // Creates an index reader for the given sstable, with the lower bound
    // cursor initially positioned at the first partition.
    // `pc` is the I/O priority class used for index-file reads.
    index_reader(shared_sstable sst, const io_priority_class& pc)
        : _sstable(std::move(sst))
        , _pc(pc)
    {
        sstlog.trace("index {}: index_reader for {}", this, _sstable->get_filename());
    }
|
|
|
|
// Ensures that lower_partition_data_ready() returns true.
|
|
// Can be called only when !eof(_lower_bound)
|
|
future<> read_lower_partition_data() {
|
|
assert(!eof());
|
|
if (partition_data_ready(_lower_bound)) {
|
|
return make_ready_future<>();
|
|
}
|
|
// The only case when _current_list may be missing is when the cursor is at the beginning
|
|
assert(_lower_bound.current_summary_idx == 0);
|
|
return advance_to_page(_lower_bound, 0);
|
|
}
|
|
|
|
// Advance index_reader bounds to the bounds of the supplied range
|
|
future<> advance_to(const dht::partition_range& range) {
|
|
return seastar::when_all_succeed(
|
|
advance_lower_to_start(range),
|
|
advance_upper_to_end(range));
|
|
}
|
|
|
|
// Returns the index entry the lower bound cursor currently points at.
// NOTE(review): presumably valid only while the current index page stays
// cached by this reader (same lifetime as lower_partition_key()) — confirm.
index_entry& current_lower_partition_entry() {
    return current_partition_entry(_lower_bound);
}
|
|
|
|
// Returns tombstone for the current lower partition if it was recorded in the sstable.
|
|
// It may be unavailable for old sstables for which this information was not generated.
|
|
// Can be called only when lower_partition_data_ready().
|
|
stdx::optional<sstables::deletion_time> lower_partition_tombstone() {
|
|
return current_partition_entry(_lower_bound).get_deletion_time();
|
|
}
|
|
|
|
// Returns the key for current lower partition.
|
|
// Can be called only when lower_partition_data_ready().
|
|
// The result is valid as long as index_reader is valid.
|
|
key_view lower_partition_key() {
|
|
index_entry& e = current_partition_entry(_lower_bound);
|
|
return e.get_key();
|
|
}
|
|
|
|
// Tells whether the index data for the partition under the lower bound
// cursor has been loaded (see partition_data_ready()).
bool lower_partition_data_ready() const {
    return partition_data_ready(_lower_bound);
}
|
|
|
|
// Forwards the lower bound cursor to given position in current partition.
|
|
//
|
|
// Note that the index within partition, unlike the partition index, doesn't cover all keys.
|
|
// So this may forward the cursor to some position pos' which precedes pos, even though
|
|
// there exist rows with positions in the range [pos', pos].
|
|
//
|
|
// Must be called for non-decreasing positions.
|
|
// Must be called only after advanced to some partition and !eof().
|
|
future<> advance_lower_to(position_in_partition_view pos) {
|
|
sstlog.trace("index {}: advance_lower_to({}), current data_file_pos={}",
|
|
this, pos, _lower_bound.data_file_position);
|
|
|
|
if (!lower_partition_data_ready()) {
|
|
return read_lower_partition_data().then([this, pos] {
|
|
sstlog.trace("index {}: page done", this);
|
|
assert(partition_data_ready(_lower_bound));
|
|
return advance_lower_to(pos);
|
|
});
|
|
}
|
|
|
|
index_entry& e = current_lower_partition_entry();
|
|
if (e.get_total_pi_blocks_count() == 0) {
|
|
sstlog.trace("index {}: no promoted index", this);
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
promoted_index_blocks* pi_blocks = e.get_pi_blocks();
|
|
assert(pi_blocks);
|
|
|
|
if ((e.get_total_pi_blocks_count() == e.get_read_pi_blocks_count())
|
|
&& _lower_bound.current_pi_idx >= pi_blocks->size() - 1) {
|
|
sstlog.trace("index {}: position in current block (all blocks are read)", this);
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
const schema& s = *_sstable->_schema;
|
|
auto cmp_with_start = [pos_cmp = position_in_partition::composite_less_compare(s), s]
|
|
(position_in_partition_view pos, const promoted_index_block& info) -> bool {
|
|
return pos_cmp(pos, info.start(s));
|
|
};
|
|
|
|
|
|
if (!pi_blocks->empty() && cmp_with_start(pos, (*pi_blocks)[_lower_bound.current_pi_idx])) {
|
|
sstlog.trace("index {}: position in current block (exact match)", this);
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
auto i = std::upper_bound(pi_blocks->begin() + _lower_bound.current_pi_idx, pi_blocks->end(), pos, cmp_with_start);
|
|
_lower_bound.current_pi_idx = std::distance(pi_blocks->begin(), i);
|
|
if ((i != pi_blocks->end()) || (e.get_read_pi_blocks_count() == e.get_total_pi_blocks_count())) {
|
|
if (i != pi_blocks->begin()) {
|
|
--i;
|
|
}
|
|
|
|
_lower_bound.data_file_position = e.position() + i->offset();
|
|
_lower_bound.element = indexable_element::cell;
|
|
sstlog.trace("index {}: lower bound skipped to cell, _current_pi_idx={}, _data_file_position={}",
|
|
this, _lower_bound.current_pi_idx, _lower_bound.data_file_position);
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
return e.get_pi_blocks_until(pos).then([this, &e, pi_blocks] (size_t current_pi_idx) {
|
|
_lower_bound.current_pi_idx = current_pi_idx;
|
|
auto i = std::begin(*pi_blocks);
|
|
if (_lower_bound.current_pi_idx > 0) {
|
|
std::advance(i, _lower_bound.current_pi_idx - 1);
|
|
}
|
|
_lower_bound.data_file_position = e.position() + i->offset();
|
|
_lower_bound.element = indexable_element::cell;
|
|
sstlog.trace("index {}: skipped to cell, _current_pi_idx={}, _data_file_position={}",
|
|
this, _lower_bound.current_pi_idx, _lower_bound.data_file_position);
|
|
return make_ready_future<>();
|
|
});
|
|
}
|
|
|
|
// Forwards the upper blound cursor to a position which is greater than given position in current partition.
|
|
//
|
|
// Note that the index within partition, unlike the partition index, doesn't cover all keys.
|
|
// So this may not forward to the smallest position which is greater than pos.
|
|
//
|
|
// May advance to the next partition if it's not possible to find a suitable position inside
|
|
// current partition.
|
|
//
|
|
// Must be called only when !eof().
|
|
future<> advance_upper_past(position_in_partition_view pos) {
|
|
sstlog.trace("index {}: advance_upper_past({})", this, pos);
|
|
|
|
// We advance cursor within the current lower bound partition
|
|
// So need to make sure first that it is read
|
|
if (!partition_data_ready(_lower_bound)) {
|
|
return read_lower_partition_data().then([this, pos] {
|
|
assert(lower_partition_data_ready());
|
|
return advance_upper_past(pos);
|
|
});
|
|
}
|
|
|
|
if (!_upper_bound) {
|
|
_upper_bound = _lower_bound;
|
|
}
|
|
|
|
index_entry& e = current_partition_entry(*_upper_bound);
|
|
if (e.get_total_pi_blocks_count() == 0) {
|
|
sstlog.trace("index {}: no promoted index", this);
|
|
return advance_to_next_partition(*_upper_bound);
|
|
}
|
|
|
|
if (e.get_read_pi_blocks_count() == 0) {
|
|
return e.get_next_pi_blocks().then([this, pos] {
|
|
return advance_upper_past(pos);
|
|
});
|
|
}
|
|
|
|
const schema& s = *_sstable->_schema;
|
|
auto cmp_with_start = [pos_cmp = position_in_partition::composite_less_compare(s), s]
|
|
(position_in_partition_view pos, const promoted_index_block& info) -> bool {
|
|
return pos_cmp(pos, info.start(s));
|
|
};
|
|
promoted_index_blocks* pi_blocks = e.get_pi_blocks();
|
|
assert(pi_blocks);
|
|
auto i = std::upper_bound(pi_blocks->begin() + _upper_bound->current_pi_idx, pi_blocks->end(), pos, cmp_with_start);
|
|
_upper_bound->current_pi_idx = std::distance(pi_blocks->begin(), i);
|
|
if (i == pi_blocks->end()) {
|
|
return advance_to_next_partition(*_upper_bound);
|
|
}
|
|
|
|
_upper_bound->data_file_position = e.position() + i->offset();
|
|
_upper_bound->element = indexable_element::cell;
|
|
sstlog.trace("index {} upper bound: skipped to cell, _current_pi_idx={}, _data_file_position={}",
|
|
this, _upper_bound->current_pi_idx, _upper_bound->data_file_position);
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
// Like advance_to(dht::ring_position_view), but returns information whether the key was found
|
|
future<bool> advance_lower_and_check_if_present(dht::ring_position_view key) {
|
|
return advance_to(_lower_bound, key).then([this, key] {
|
|
if (eof()) {
|
|
return make_ready_future<bool>(false);
|
|
}
|
|
return read_lower_partition_data().then([this, key] {
|
|
index_comparator cmp(*_sstable->_schema);
|
|
return cmp(key, current_partition_entry(_lower_bound)) == 0;
|
|
});
|
|
});
|
|
}
|
|
|
|
// Moves the lower bound cursor to the beginning of next partition.
// Can be called only when !eof().
future<> advance_lower_to_next_partition() {
    return advance_to_next_partition(_lower_bound);
}
|
|
|
|
// Positions the lower bound cursor on the first partition which is not smaller than pos (like std::lower_bound).
// Must be called for non-decreasing positions.
// Delegates to the per-bound advance_to() overload acting on _lower_bound.
future<> advance_lower_to(dht::ring_position_view pos) {
    return advance_to(_lower_bound, pos);
}
|
|
|
|
// A span of positions in the sstable data file covered by the reader's bounds.
// `end` is disengaged while the upper bound has not been established yet
// (see data_file_positions()).
// NOTE(review): uses std::optional while lower_partition_tombstone() uses
// stdx::optional — presumably intentional during a migration; confirm.
struct data_file_positions_range {
    uint64_t start;
    std::optional<uint64_t> end;
};
|
|
|
|
// Returns positions in the data file of the cursor.
|
|
// End position may not be set
|
|
data_file_positions_range data_file_positions() const {
|
|
data_file_positions_range result;
|
|
result.start = _lower_bound.data_file_position;
|
|
if (_upper_bound) {
|
|
result.end = _upper_bound->data_file_position;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// Returns the kind of sstable element the lower bound cursor is pointing at
// (partition start, or a cell within a partition when the promoted index
// was used to skip inside it).
indexable_element lower_element_kind() const {
    return _lower_bound.element;
}
|
|
|
|
bool eof() const {
|
|
return _lower_bound.data_file_position == data_file_end();
|
|
}
|
|
|
|
// Releases the index pages held by both bounds.
// Closes sequentially: close_current_list_ptr must not run in parallel
// for the two bounds.
future<> close() {
    return close_index_list(_lower_bound.current_list).then([this] () -> future<> {
        if (!_upper_bound) {
            return make_ready_future<>();
        }
        return close_index_list(_upper_bound->current_list);
    });
}
|
|
};
|
|
|
|
}
|