diff --git a/bytes.hh b/bytes.hh index 78552b32c7..d1c05a3b6a 100644 --- a/bytes.hh +++ b/bytes.hh @@ -35,6 +35,10 @@ using bytes_mutable_view = basic_mutable_view; using bytes_opt = std::experimental::optional; using sstring_view = std::experimental::string_view; +inline sstring_view to_sstring_view(bytes_view view) { + return {reinterpret_cast(view.data()), view.size()}; +} + namespace std { template <> diff --git a/database.cc b/database.cc index ef7438972a..f2c9ce2ce4 100644 --- a/database.cc +++ b/database.cc @@ -88,7 +88,9 @@ logging::logger dblog("database"); namespace { sstables::sstable::version_types get_highest_supported_format() { - if (service::get_local_storage_service().cluster_supports_la_sstable()) { + if (service::get_local_storage_service().cluster_supports_mc_sstable()) { + return sstables::sstable::version_types::mc; + } else if (service::get_local_storage_service().cluster_supports_la_sstable()) { return sstables::sstable::version_types::la; } else { return sstables::sstable::version_types::ka; diff --git a/db/config.hh b/db/config.hh index 698ddd88b1..197a519c25 100644 --- a/db/config.hh +++ b/db/config.hh @@ -739,6 +739,7 @@ public: " Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.") \ val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \ val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \ + val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format; FOR TESTING PURPOSES ONLY - TO BE REMOVED BEFORE RELEASE") \ /* done! */ #define _make_value_member(name, type, deflt, status, desc, ...) \ diff --git a/service/storage_service.cc b/service/storage_service.cc index 74e52d1e43..b4576169f0 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -100,6 +100,7 @@ static const sstring XXHASH_FEATURE = "XXHASH"; static const sstring ROLES_FEATURE = "ROLES"; static const sstring LA_SSTABLE_FEATURE = "LA_SSTABLE_FORMAT"; static const sstring STREAM_WITH_RPC_STREAM = "STREAM_WITH_RPC_STREAM"; +static const sstring MC_SSTABLE_FEATURE = "MC_SSTABLE_FORMAT"; distributed _the_storage_service; @@ -209,7 +210,11 @@ sstring storage_service::get_config_supported_features() { LA_SSTABLE_FEATURE, STREAM_WITH_RPC_STREAM, }; - if (service::get_local_storage_service()._db.local().get_config().experimental()) { + auto& config = service::get_local_storage_service()._db.local().get_config(); + if (config.enable_sstables_mc_format()) { + features.push_back(MC_SSTABLE_FEATURE); + } + if (config.experimental()) { features.push_back(MATERIALIZED_VIEWS_FEATURE); features.push_back(INDEXES_FEATURE); } @@ -426,6 +431,7 @@ void storage_service::register_features() { _roles_feature = gms::feature(ROLES_FEATURE); _la_sstable_feature = gms::feature(LA_SSTABLE_FEATURE); _stream_with_rpc_stream_feature = gms::feature(STREAM_WITH_RPC_STREAM); + _mc_sstable_feature = gms::feature(MC_SSTABLE_FEATURE); if (_db.local().get_config().experimental()) { _materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE); diff --git a/service/storage_service.hh b/service/storage_service.hh index e5c23b0507..278095245e 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -289,6 +289,7 @@ private: gms::feature _roles_feature; gms::feature _la_sstable_feature; gms::feature _stream_with_rpc_stream_feature; + gms::feature _mc_sstable_feature; public: void enable_all_features() { _range_tombstones_feature.enable(); @@ -305,6 +306,7 @@ public: _roles_feature.enable(); _la_sstable_feature.enable(); _stream_with_rpc_stream_feature.enable(); + _mc_sstable_feature.enable(); } void finish_bootstrapping() { @@ -2287,6 +2289,10 @@ public: bool cluster_supports_stream_with_rpc_stream() const { return bool(_stream_with_rpc_stream_feature); } + + bool cluster_supports_mc_sstable() const { + return bool(_mc_sstable_feature); + } }; inline future<> init_storage_service(distributed& db, sharded& auth_service, sharded& sys_dist_ks) { diff --git a/sstables/column_translation.hh b/sstables/column_translation.hh index e19ac72520..e0945d3a66 100644 --- a/sstables/column_translation.hh +++ b/sstables/column_translation.hh @@ -28,6 +28,7 @@ #include "schema.hh" #include "sstables/types.hh" #include "utils/UUID.hh" +#include "db/marshal/type_parser.hh" namespace sstables { @@ -40,7 +41,7 @@ inline column_values_fixed_lengths get_clustering_values_fixed_lengths(const ser column_values_fixed_lengths lengths; lengths.reserve(header.clustering_key_types_names.elements.size()); for (auto&& t : header.clustering_key_types_names.elements) { - auto type = abstract_type::parse_type(sstring(std::cbegin(t.value), std::cend(t.value))); + auto type = db::marshal::type_parser::parse(to_sstring_view(t.value)); lengths.push_back(type->value_length_if_fixed()); } diff --git a/sstables/m_format_read_helpers.hh b/sstables/m_format_read_helpers.hh index 0ecc4ad6fe..b9d9588a6d 100644 --- a/sstables/m_format_read_helpers.hh +++ b/sstables/m_format_read_helpers.hh @@ -125,40 +125,33 @@ inline future<> read_vint(random_access_reader& in, T& value) { } } -inline api::timestamp_type parse_timestamp(uint64_t value) { - if (value > api::max_timestamp) { - throw malformed_sstable_exception("Too big timestamp: " + value); - } - return static_cast(value); -} - inline api::timestamp_type parse_timestamp(const serialization_header& header, uint64_t delta) { - return parse_timestamp(header.min_timestamp.value + delta); + return static_cast(header.get_min_timestamp() + delta); } inline gc_clock::duration parse_ttl(uint32_t value) { if (value > std::numeric_limits::max()) { - throw malformed_sstable_exception("Too big ttl: " + value); + throw malformed_sstable_exception(format("Too big ttl: {}", value)); } return gc_clock::duration(value); } inline gc_clock::duration parse_ttl(const serialization_header& header, uint32_t delta) { - return parse_ttl(header.min_ttl.value + delta); + return parse_ttl(header.get_min_ttl() + delta); } inline gc_clock::time_point parse_expiry(uint32_t value) { if (value > std::numeric_limits::max()) { - throw malformed_sstable_exception("Too big expiry: " + value); + throw malformed_sstable_exception(format("Too big expiry: {}", value)); } return gc_clock::time_point(gc_clock::duration(value)); } inline gc_clock::time_point parse_expiry(const serialization_header& header, uint32_t delta) { - return parse_expiry(header.min_local_deletion_time.value + delta); + return parse_expiry(header.get_min_local_deletion_time() + delta); } }; // namespace sstables diff --git a/sstables/m_format_write_helpers.cc b/sstables/m_format_write_helpers.cc index 4be0dd9b6b..ea4b2d3de7 100644 --- a/sstables/m_format_write_helpers.cc +++ b/sstables/m_format_write_helpers.cc @@ -298,7 +298,7 @@ void write_missing_columns(file_writer& out, const schema& s, const row& row) { template void write_unsigned_delta_vint(file_writer& out, T value, T base) { using unsigned_type = std::make_unsigned_t; - unsigned_type delta = value - base; + unsigned_type delta = static_cast(value) - base; write_vint(out, delta); } diff --git a/sstables/mp_row_consumer.hh b/sstables/mp_row_consumer.hh index 5415541d6d..eea012c5d0 100644 --- a/sstables/mp_row_consumer.hh +++ b/sstables/mp_row_consumer.hh @@ -1014,6 +1014,10 @@ public: } std::optional fast_forward_to(position_range r, db::timeout_clock::time_point) { + if (!_mf_filter) { + return {}; + } + return _mf_filter->fast_forward_to(std::move(r)); } diff --git a/sstables/row.hh b/sstables/row.hh index 4aea0bb876..5fa3268176 100644 --- a/sstables/row.hh +++ b/sstables/row.hh @@ -1173,6 +1173,14 @@ public: } case state::COMPLEX_COLUMN_SIZE_2: _subcolumns_to_read = _u64; + if (_subcolumns_to_read == 0) { + auto id = get_column_id(); + move_to_next_column(); + if (_consumer.consume_complex_column_end(std::move(id)) != consumer_m::proceed::yes) { + _state = state::COLUMN; + return consumer_m::proceed::no; + } + } goto column_label; case state::RANGE_TOMBSTONE_MARKER: range_tombstone_marker_label: @@ -1282,6 +1290,13 @@ public: { } void verify_end_state() { + // If reading a partial row (i.e., when we have a clustering row + // filter and using a promoted index), we may be in FLAGS or FLAGS_2 + // state instead of PARTITION_START. + if (_state == state::FLAGS || _state == state::FLAGS_2) { + _consumer.consume_partition_end(); + return; + } if (_state != state::PARTITION_START || _prestate != prestate::NONE) { throw malformed_sstable_exception("end of input, but not end of partition"); } diff --git a/sstables/sstables.cc b/sstables/sstables.cc index cd5c4bb451..28e8d89f69 100644 --- a/sstables/sstables.cc +++ b/sstables/sstables.cc @@ -1299,11 +1299,7 @@ double sstable::estimate_droppable_tombstone_ratio(gc_clock::time_point gc_befor } future<> sstable::read_statistics(const io_priority_class& pc) { - return read_simple(_components->statistics, pc).then([this] { - if (_version == version_types::mc) { - adjust_serialization_header(); - } - }); + return read_simple(_components->statistics, pc); } void sstable::write_statistics(const io_priority_class& pc) { @@ -2118,9 +2114,9 @@ static sstring pk_type_to_string(const schema& s) { static serialization_header make_serialization_header(const schema& s, const encoding_stats& enc_stats) { serialization_header header; - header.min_timestamp.value = enc_stats.min_timestamp - encoding_stats::timestamp_epoch; - header.min_local_deletion_time.value = enc_stats.min_local_deletion_time - encoding_stats::deletion_time_epoch; - header.min_ttl.value = enc_stats.min_ttl - encoding_stats::ttl_epoch; + header.min_timestamp_base.value = static_cast(enc_stats.min_timestamp) - encoding_stats::timestamp_epoch; + header.min_local_deletion_time_base.value = enc_stats.min_local_deletion_time - encoding_stats::deletion_time_epoch; + header.min_ttl_base.value = enc_stats.min_ttl - encoding_stats::ttl_epoch; header.pk_type_name = to_bytes_array_vint_size(pk_type_to_string(s)); @@ -3759,7 +3755,7 @@ future<> sstable::set_generation(int64_t new_generation) { } entry_descriptor entry_descriptor::make_descriptor(sstring sstdir, sstring fname) { - static std::regex la("la-(\\d+)-(\\w+)-(.*)"); + static std::regex la_mc("(la|mc)-(\\d+)-(\\w+)-(.*)"); static std::regex ka("(\\w+)-(\\w+)-ka-(\\d+)-(.*)"); static std::regex dir(".*/([^/]*)/(\\w+)-[\\da-fA-F]+(?:/upload|/snapshots/[^/]+)?/?"); @@ -3776,7 +3772,7 @@ entry_descriptor entry_descriptor::make_descriptor(sstring sstdir, sstring fname sstlog.debug("Make descriptor sstdir: {}; fname: {}", sstdir, fname); std::string s(fname); - if (std::regex_match(s, match, la)) { + if (std::regex_match(s, match, la_mc)) { std::string sdir(sstdir); std::smatch dirmatch; if (std::regex_match(sdir, dirmatch, dir)) { @@ -3785,10 +3781,10 @@ entry_descriptor entry_descriptor::make_descriptor(sstring sstdir, sstring fname } else { throw malformed_sstable_exception(seastar::sprint("invalid version for file %s with path %s. Path doesn't match known pattern.", fname, sstdir)); } - version = sstable::version_types::la; - generation = match[1].str(); - format = sstring(match[2].str()); - component = sstring(match[3].str()); + version = (match[1].str() == "la") ? sstable::version_types::la : sstable::version_types::mc; + generation = match[2].str(); + format = sstring(match[3].str()); + component = sstring(match[4].str()); } else if (std::regex_match(s, match, ka)) { ks = match[1].str(); cf = match[2].str(); diff --git a/sstables/sstables.hh b/sstables/sstables.hh index 06051d416b..79395f91e9 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -592,9 +592,6 @@ private: serialization_header& s = *static_cast(p.get()); return s; } - void adjust_serialization_header() { - get_mutable_serialization_header(*_components).adjust(); - } public: future<> read_toc(); diff --git a/sstables/types.hh b/sstables/types.hh index 730ad084cf..65bc36ea22 100644 --- a/sstables/types.hh +++ b/sstables/types.hh @@ -358,9 +358,9 @@ struct stats_metadata : public metadata_base { using bytes_array_vint_size = disk_string_vint_size; struct serialization_header : public metadata_base { - vint min_timestamp; - vint min_local_deletion_time; - vint min_ttl; + vint min_timestamp_base; + vint min_local_deletion_time_base; + vint min_ttl_base; bytes_array_vint_size pk_type_name; disk_array_vint_size clustering_key_types_names; struct column_desc { @@ -381,9 +381,9 @@ struct serialization_header : public metadata_base { switch (v) { case sstable_version_types::mc: return f( - min_timestamp, - min_local_deletion_time, - min_ttl, + min_timestamp_base, + min_local_deletion_time_base, + min_ttl_base, pk_type_name, clustering_key_types_names, static_columns, @@ -397,10 +397,17 @@ struct serialization_header : public metadata_base { // Should never reach here - compiler will complain if switch above does not cover all sstable versions abort(); } - void adjust() { - min_timestamp.value += encoding_stats::timestamp_epoch; - min_local_deletion_time.value += encoding_stats::deletion_time_epoch; - min_ttl.value += encoding_stats::ttl_epoch; + + uint64_t get_min_timestamp() const { + return min_timestamp_base.value + encoding_stats::timestamp_epoch; + } + + uint32_t get_min_ttl() const { + return min_ttl_base.value + encoding_stats::ttl_epoch; + } + + uint32_t get_min_local_deletion_time() const { + return min_local_deletion_time_base.value + encoding_stats::deletion_time_epoch; } }; diff --git a/tests/sstable_3_x_test.cc b/tests/sstable_3_x_test.cc index e4032c5571..59fc952bf2 100644 --- a/tests/sstable_3_x_test.cc +++ b/tests/sstable_3_x_test.cc @@ -46,6 +46,7 @@ #include "types.hh" #include "partition_slice_builder.hh" #include "schema.hh" +#include "utils/UUID_gen.hh" using namespace sstables; @@ -4408,3 +4409,52 @@ SEASTAR_THREAD_TEST_CASE(test_read_table_empty_clustering_key) { assert_that(sst.read_rows_flat()).produces(keys); } +/* + * Test for a bug discovered with Scylla's compaction_history tables + * containing complex columns with zero subcolumns. + */ +SEASTAR_THREAD_TEST_CASE(test_complex_column_zero_subcolumns_read) { + using utils::UUID; + const sstring path = + "tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns"; + + schema_ptr s = schema_builder("test_ks", "test_table") + .with_column("id", uuid_type, column_kind::partition_key) + .with_column("bytes_in", long_type) + .with_column("bytes_out", long_type) + .with_column("columnfamily_name", utf8_type) + .with_column("compacted_at", timestamp_type) + .with_column("keyspace_name", utf8_type) + .with_column("rows_merged", map_type_impl::get_instance(int32_type, long_type, true)) + .build(); + + sstable_assertions sst(s, path); + sst.load(); + + auto to_pkey = [&s] (const UUID& key) { + auto bytes = uuid_type->decompose(key); + auto pk = partition_key::from_single_value(*s, bytes); + return dht::global_partitioner().decorate_key(*s, pk); + }; + + std::vector keys { + UUID{"09fea990-b320-11e8-83a7-000000000000"}, + UUID{"0a310430-b320-11e8-83a7-000000000000"}, + UUID{"0a214cc0-b320-11e8-83a7-000000000000"}, + UUID{"0a00a560-b320-11e8-83a7-000000000000"}, + UUID{"0a0a6960-b320-11e8-83a7-000000000000"}, + UUID{"0a147b80-b320-11e8-83a7-000000000000"}, + UUID{"0a187320-b320-11e8-83a7-000000000000"}, + }; + + auto rd = sst.read_rows_flat(); + rd.set_max_buffer_size(1); + auto r = assert_that(std::move(rd)); + for (const auto& key : keys) { + r.produces_partition_start(to_pkey(key)) + .produces_row_with_key(clustering_key::make_empty()) + .produces_partition_end(); + } + r.produces_end_of_stream(); +} + diff --git a/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Data.db b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Data.db new file mode 100644 index 0000000000..f5d2ff7fd1 Binary files /dev/null and b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Data.db differ diff --git a/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Index.db b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Index.db new file mode 100644 index 0000000000..f19b85715e Binary files /dev/null and b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Index.db differ diff --git a/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Statistics.db b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Statistics.db new file mode 100644 index 0000000000..4389e4871a Binary files /dev/null and b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Statistics.db differ diff --git a/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Summary.db b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Summary.db new file mode 100644 index 0000000000..818d03d170 Binary files /dev/null and b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-Summary.db differ diff --git a/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-TOC.txt b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-TOC.txt new file mode 100644 index 0000000000..f3121935fb --- /dev/null +++ b/tests/sstables/3.x/uncompressed/complex_column_zero_subcolumns/mc-1-big-TOC.txt @@ -0,0 +1,5 @@ +Index.db +Statistics.db +TOC.txt +Data.db +Summary.db