diff --git a/sstables/sstables.cc b/sstables/sstables.cc index 4014cd8aa3..523d021a36 100644 --- a/sstables/sstables.cc +++ b/sstables/sstables.cc @@ -1334,6 +1334,18 @@ future sstable::get_open_info() & { }); } +static composite::eoc bound_kind_to_start_marker(bound_kind start_kind) { + return start_kind == bound_kind::excl_start + ? composite::eoc::end + : composite::eoc::start; +} + +static composite::eoc bound_kind_to_end_marker(bound_kind end_kind) { + return end_kind == bound_kind::excl_end + ? composite::eoc::start + : composite::eoc::end; +} + static void output_promoted_index_entry(bytes_ostream& promoted_index, const bytes& first_col, const bytes& last_col, @@ -1390,8 +1402,9 @@ static bytes serialize_colname(const composite& clustering_key, // (which might be gone later). void sstable::maybe_flush_pi_block(file_writer& out, const composite& clustering_key, - const std::vector& column_names) { - bytes colname = serialize_colname(clustering_key, column_names, composite::eoc::none); + const std::vector& column_names, + composite::eoc marker) { + bytes colname = serialize_colname(clustering_key, column_names, marker); if (_pi_write.block_first_colname.empty()) { // This is the first column in the partition, or first column since we // closed a promoted-index block. Remember its name and position - @@ -1423,7 +1436,9 @@ void sstable::maybe_flush_pi_block(file_writer& out, auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start); auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end); write_range_tombstone(out, - start, rt.start_kind, end, rt.end_kind, {}, rt.tomb); + start, bound_kind_to_start_marker(rt.start_kind), + end, bound_kind_to_end_marker(rt.end_kind), + {}, rt.tomb); } } _pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size; @@ -1605,24 +1620,18 @@ void sstable::write_row_tombstone(file_writer& out, const composite& key, const void sstable::write_range_tombstone(file_writer& out, const composite& start, - bound_kind start_kind, + composite::eoc start_marker, const composite& end, - bound_kind end_kind, + composite::eoc end_marker, std::vector suffix, const tombstone t) { if (!t) { return; } - auto start_marker = start_kind == bound_kind::excl_start - ? composite::eoc::end - : composite::eoc::start; write_column_name(out, start, suffix, start_marker); column_mask mask = column_mask::range_tombstone; write(out, mask); - auto end_marker = end_kind == bound_kind::excl_end - ? composite::eoc::start - : composite::eoc::end; write_column_name(out, end, suffix, end_marker); write_deletion_time(out, t); } @@ -1987,9 +1996,11 @@ stop_iteration components_writer::consume(range_tombstone&& rt) { // already closed by rt.start, so the accumulator doesn't grow boundless. _sst._pi_write.tombstone_accumulator->apply(rt); auto start = composite::from_clustering_element(_schema, std::move(rt.start)); + auto start_marker = bound_kind_to_start_marker(rt.start_kind); auto end = composite::from_clustering_element(_schema, std::move(rt.end)); - _sst.maybe_flush_pi_block(_out, start, {}); - _sst.write_range_tombstone(_out, std::move(start), rt.start_kind, std::move(end), rt.end_kind, {}, rt.tomb); + auto end_marker = bound_kind_to_end_marker(rt.end_kind); + _sst.maybe_flush_pi_block(_out, start, {}, start_marker); + _sst.write_range_tombstone(_out, std::move(start), start_marker, std::move(end), end_marker, {}, rt.tomb); return stop_iteration::no; } diff --git a/sstables/sstables.hh b/sstables/sstables.hh index ab5326bbe9..76d6bea44e 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -512,7 +512,8 @@ private: void maybe_flush_pi_block(file_writer& out, const composite& clustering_key, - const std::vector& column_names); + const std::vector& column_names, + composite::eoc marker = composite::eoc::none); schema_ptr _schema; sstring _dir; @@ -617,9 +618,9 @@ private: void write_cell(file_writer& out, atomic_cell_view cell, const column_definition& cdef); void write_column_name(file_writer& out, const composite& clustering_key, const std::vector& column_names, composite::eoc marker = composite::eoc::none); void write_column_name(file_writer& out, bytes_view column_names); - void write_range_tombstone(file_writer& out, const composite& start, bound_kind start_kind, const composite& end, bound_kind stop_kind, std::vector suffix, const tombstone t); + void write_range_tombstone(file_writer& out, const composite& start, composite::eoc start_marker, const composite& end, composite::eoc end_marker, std::vector suffix, const tombstone t); void write_range_tombstone(file_writer& out, const composite& start, const composite& end, std::vector suffix, const tombstone t) { - write_range_tombstone(out, start, bound_kind::incl_start, end, bound_kind::incl_end, std::move(suffix), std::move(t)); + write_range_tombstone(out, start, composite::eoc::start, end, composite::eoc::end, std::move(suffix), std::move(t)); } void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection); void write_row_tombstone(file_writer& out, const composite& key, const row_tombstone t); diff --git a/tests/sstable_assertions.hh b/tests/sstable_assertions.hh new file mode 100644 index 0000000000..4379f2cad1 --- /dev/null +++ b/tests/sstable_assertions.hh @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include "dht/i_partitioner.hh" +#include "schema.hh" +#include "sstables/index_reader.hh" + +class index_reader_assertions { + std::unique_ptr _r; +public: + index_reader_assertions(std::unique_ptr r) + : _r(std::move(r)) + { } + + index_reader_assertions& has_monotonic_positions(const schema& s) { + auto pos_cmp = position_in_partition::composite_less_compare(s); + auto rp_cmp = dht::ring_position_comparator(s); + auto prev = dht::ring_position::min(); + _r->read_partition_data().get(); + while (!_r->eof()) { + auto k = _r->current_partition_entry().get_decorated_key(); + auto rp = dht::ring_position(k.token(), k.key().to_partition_key(s)); + + if (!rp_cmp(prev, rp)) { + BOOST_FAIL(sprint("Partitions have invalid order: %s >= %s", prev, rp)); + } + + prev = rp; + + auto* pi = _r->current_partition_entry().get_promoted_index(s); + if (!pi->entries.empty()) { + auto& prev = pi->entries[0]; + for (size_t i = 1; i < pi->entries.size(); ++i) { + auto& cur = pi->entries[i]; + if (!pos_cmp(prev.end, cur.start)) { + std::cout << "promoted index:\n"; + for (auto& e : pi->entries) { + std::cout << " " << e.start << "-" << e.end << ": +" << e.offset << " len=" << e.width << std::endl; + } + BOOST_FAIL(sprint("Index blocks are not monotonic: %s >= %s", prev.end, cur.start)); + } + cur = prev; + } + } + _r->advance_to_next_partition().get(); + } + return *this; + } +}; + +inline +index_reader_assertions assert_that(std::unique_ptr r) { + return { std::move(r) }; +} diff --git a/tests/sstable_mutation_test.cc b/tests/sstable_mutation_test.cc index 6bdcddd240..c33cd210e3 100644 --- a/tests/sstable_mutation_test.cc +++ b/tests/sstable_mutation_test.cc @@ -35,6 +35,7 @@ #include "tmpdir.hh" #include "memtable-sstable.hh" #include "disk-error-handler.hh" +#include "tests/sstable_assertions.hh" thread_local disk_error_signal_type commit_error; thread_local disk_error_signal_type general_disk_error; @@ -801,3 +802,52 @@ SEASTAR_TEST_CASE(test_non_compound_table_row_is_not_marked_as_static) { BOOST_REQUIRE(bool(mut)); }); } + +SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic) { + return seastar::async([] { + auto dir = make_lw_shared(); + schema_builder builder("ks", "cf"); + builder.with_column("p", utf8_type, column_kind::partition_key); + builder.with_column("c1", int32_type, column_kind::clustering_key); + builder.with_column("c2", int32_type, column_kind::clustering_key); + builder.with_column("v", int32_type); + auto s = builder.build(); + + auto k = partition_key::from_exploded(*s, {to_bytes("key1")}); + auto cell = atomic_cell::make_live(1, int32_type->decompose(88), { }); + mutation m(k, s); + + auto ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(2)}); + m.set_clustered_cell(ck, *s->get_column_definition("v"), cell); + + ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(4)}); + m.set_clustered_cell(ck, *s->get_column_definition("v"), std::move(cell)); + + ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(6)}); + m.set_clustered_cell(ck, *s->get_column_definition("v"), std::move(cell)); + + ck = clustering_key::from_exploded(*s, {int32_type->decompose(3), int32_type->decompose(9)}); + m.set_clustered_cell(ck, *s->get_column_definition("v"), std::move(cell)); + + m.partition().apply_row_tombstone(*s, range_tombstone( + clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}), + bound_kind::excl_start, + clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}), + bound_kind::incl_end, + {1, gc_clock::now()})); + + auto mt = make_lw_shared(s); + mt->apply(std::move(m)); + + auto sst = make_lw_shared(s, + dir->path, + 1 /* generation */, + sstables::sstable::version_types::ka, + sstables::sstable::format_types::big); + sstable_writer_config cfg; + cfg.promoted_index_block_size = 1; + sst->write_components(mt->make_reader(s), 1, s, cfg).get(); + sst->load().get(); + assert_that(sst->get_index_reader(default_priority_class())).has_monotonic_positions(*s); + }); +}